def data_stat(query):
    # Fetch the questions, their syntactic parses, and the gold semantic
    # annotations for the given query.
    questions = geoserver_interface.download_questions(query)
    syntax_parses = questions_to_syntax_parses(questions, parser=False)
    annotations = geoserver_interface.download_semantics(query)
    unary_rules = []
    binary_rules = []
    semantic_trees = []
    for pk, local_syntax_parses in syntax_parses.iteritems():
        print pk
        for number, syntax_parse in local_syntax_parses.iteritems():
            # Convert every annotation of this sentence into a semantic tree.
            local_semantic_trees = [annotation_to_semantic_tree(syntax_parse, annotation)
                                    for annotation in annotations[pk][number].values()]
            semantic_trees.extend(local_semantic_trees)
            print local_semantic_trees
            # Accumulate the grammar rules induced by each semantic tree.
            for semantic_tree in local_semantic_trees:
                unary_rules.extend(semantic_tree.get_unary_rules())
                binary_rules.extend(semantic_tree.get_binary_rules())
    tag_model = train_tag_model(syntax_parses, annotations)
    # Report corpus-level statistics and the learned lexicon.
    print "sentences: %d" % sum(len(question.sentence_words) for _, question in questions.iteritems())
    print "words: %d" % sum(len(words) for _, question in questions.iteritems()
                            for _, words in question.sentence_words.iteritems())
    print "literals: %d" % len(semantic_trees)
    print "unary rules: %d" % len(unary_rules)
    print "binary rules: %d" % len(binary_rules)
    print ""
    print "LEXICON"
    for key, s in tag_model.lexicon.iteritems():
        print "%s: %s" % ("_".join(key), ", ".join(" ".join(ss) for ss in s))
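
# A hedged sketch, not in the original module: the unary_rules and
# binary_rules lists built inside data_stat count rule *occurrences*
# across all semantic trees. If distinct-rule frequencies are wanted
# instead, collections.Counter gives them directly, assuming the objects
# returned by get_unary_rules/get_binary_rules are hashable.
from collections import Counter

def rule_frequencies(rules):
    # Map each distinct rule to its total number of occurrences.
    return Counter(rules)
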
def full_test():
    start = time.time()
    # Hand-picked question id batches; ids in the trailing comments were
    # excluded from the runs.
    ids1 = [963, 968, 969, 971, 973, 974, 977, 985, 990, 993, 995, 1000,
            1003, 1004, 1006, 1014, 1017, 1018, 1020]  # 1011
    ids2 = [1025, 1030, 1031, 1032, 1035, 1038, 1039, 1040, 1042, 1043, 1045,
            1047, 1050, 1051, 1052, 1054, 1056, 1058]  # 1027, 1037
    ids3 = [1063, 1065, 1067, 1076, 1089, 1095, 1096, 1097, 1099, 1102, 1105,
            1106, 1107, 1108, 1110, 1111, 1119, 1120, 1121]  # 1103
    ids4 = [1122, 1123, 1124, 1127, 1141, 1142, 1143, 1145, 1146, 1147, 1149,
            1150, 1151, 1152, 1070, 1083, 1090, 1092, 1144, 1148]
    ids5 = [975, 979, 981, 988, 989, 997, 1005, 1019, 1029, 1044, 1046, 1057,
            1059, 1064, 1087, 1104, 1113, 1114, 1129, 1071]
    ids6 = [1100, 1101, 1109, 1140, 1053]
    tr_ids = ids4 + ids5 + ids6
    te_ids = ids1 + ids2 + ids3
    te_ids = ids4 + ids6
    load = False  # True reuses the pickled parses and semantic model below
    tr_questions = geoserver_interface.download_questions('aaai')
    te_questions = geoserver_interface.download_questions('official')
    te_keys = te_questions.keys()  # [968, 971, 973, 1018]
    all_questions = dict(tr_questions.items() + te_questions.items())
    # The downloaded question sets supersede the hand-picked batches above.
    tr_ids = tr_questions.keys()
    te_ids = te_questions.keys()
    if not load:
        all_syntax_parses = questions_to_syntax_parses(all_questions)
        pickle.dump(all_syntax_parses, open('syntax_parses.p', 'wb'))
    else:
        all_syntax_parses = pickle.load(open('syntax_parses.p', 'rb'))
    all_annotations = geoserver_interface.download_semantics()
    all_labels = geoserver_interface.download_labels()
    correct = 0
    penalized = 0
    error = 0
    total = len(te_keys)
    #(te_s, te_a, te_l), (tr_s, tr_a, trl_l) = split([all_syntax_parses, all_annotations, all_labels], 0.7)
    tr_s = {id_: all_syntax_parses[id_] for id_ in tr_ids}
    tr_a = {id_: all_annotations[id_] for id_ in tr_ids}
    te_s = {id_: all_syntax_parses[id_] for id_ in te_ids}
    if not load:
        # Train the tag model on everything, the semantic model on the
        # training split, and cache the result.
        tm = train_tag_model(all_syntax_parses, all_annotations)
        cm = train_semantic_model(tm, tr_s, tr_a)
        pickle.dump(cm, open('cm.p', 'wb'))
    else:
        cm = pickle.load(open('cm.p', 'rb'))
    print "test ids: %s" % ", ".join(str(k) for k in te_s.keys())
    for idx, id_ in enumerate(te_keys):
        question = all_questions[id_]
        label = all_labels[id_]
        id_ = str(id_)
        print "-" * 80
        print "id: %s" % id_
        result = full_unit_test(cm, question, label)
        print result.message
        print result
        if result.error:
            error += 1
        if result.penalized:
            penalized += 1
        if result.correct:
            correct += 1
        print "-" * 80
        print "%d/%d complete, %d correct, %d penalized, %d error" % (
            idx + 1, len(te_keys), correct, penalized, error)
    end = time.time()
    print "-" * 80
    print "duration:\t%.1f" % (end - start)
    out = "total:\t\t%d\npenalized:\t%d\ncorrect:\t%d\nerror:\t\t%d" % (
        total, penalized, correct, error)
    print out
    # Persist the evaluated question ids for the demo front end.
    dirs_path = os.path.join(demo_path, 'dirs.json')
    json.dump([str(x) for x in te_keys], open(dirs_path, 'wb'))
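
# A minimal runner sketch, not in the original file: execute the full
# evaluation (or the corpus statistics) only when this module is run
# directly. The 'aaai' tag mirrors the one full_test uses for its
# training questions; both calls assume a reachable geoserver backend.
if __name__ == "__main__":
    # data_stat('aaai')  # uncomment for corpus statistics instead
    full_test()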