def data_stat(query):
    # Fetch the questions, their syntactic parses, and the gold semantic
    # annotations for the given query.
    questions = geoserver_interface.download_questions(query)
    syntax_parses = questions_to_syntax_parses(questions, parser=False)
    annotations = geoserver_interface.download_semantics(query)
    unary_rules = []
    binary_rules = []
    semantic_trees = []
    for pk, local_syntax_parses in syntax_parses.iteritems():
        print pk
        for number, syntax_parse in local_syntax_parses.iteritems():
            # Convert every annotation of this sentence into a semantic tree.
            local_semantic_trees = [annotation_to_semantic_tree(syntax_parse, annotation)
                                    for annotation in annotations[pk][number].values()]
            semantic_trees.extend(local_semantic_trees)
            print local_semantic_trees
            # Accumulate the grammar rules induced by each semantic tree.
            for semantic_tree in local_semantic_trees:
                unary_rules.extend(semantic_tree.get_unary_rules())
                binary_rules.extend(semantic_tree.get_binary_rules())
    tag_model = train_tag_model(syntax_parses, annotations)
    # Report corpus-level statistics and the learned lexicon.
    print "sentences: %d" % sum(len(question.sentence_words) for _, question in questions.iteritems())
    print "words: %d" % sum(len(words) for _, question in questions.iteritems()
                            for _, words in question.sentence_words.iteritems())
    print "literals: %d" % len(semantic_trees)
    print "unary rules: %d" % len(unary_rules)
    print "binary rules: %d" % len(binary_rules)
    print ""
    print "LEXICON"
    for key, s in tag_model.lexicon.iteritems():
        print "%s: %s" % ("_".join(key), ", ".join(" ".join(ss) for ss in s))
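
# A hedged sketch, not in the original module: the unary_rules and
# binary_rules lists built inside data_stat count rule *occurrences*
# across all semantic trees. If distinct-rule frequencies are wanted
# instead, collections.Counter gives them directly, assuming the objects
# returned by get_unary_rules/get_binary_rules are hashable.
from collections import Counter

def rule_frequencies(rules):
    # Map each distinct rule to its total number of occurrences.
    return Counter(rules)
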
def full_test():
    start = time.time()
    # Hand-picked question id batches; ids in the trailing comments were
    # excluded from the runs.
    ids1 = [963, 968, 969, 971, 973, 974, 977, 985, 990, 993, 995, 1000,
            1003, 1004, 1006, 1014, 1017, 1018, 1020]  # 1011
    ids2 = [1025, 1030, 1031, 1032, 1035, 1038, 1039, 1040, 1042, 1043, 1045,
            1047, 1050, 1051, 1052, 1054, 1056, 1058]  # 1027, 1037
    ids3 = [1063, 1065, 1067, 1076, 1089, 1095, 1096, 1097, 1099, 1102, 1105,
            1106, 1107, 1108, 1110, 1111, 1119, 1120, 1121]  # 1103
    ids4 = [1122, 1123, 1124, 1127, 1141, 1142, 1143, 1145, 1146, 1147, 1149,
            1150, 1151, 1152, 1070, 1083, 1090, 1092, 1144, 1148]
    ids5 = [975, 979, 981, 988, 989, 997, 1005, 1019, 1029, 1044, 1046, 1057,
            1059, 1064, 1087, 1104, 1113, 1114, 1129, 1071]
    ids6 = [1100, 1101, 1109, 1140, 1053]
    tr_ids = ids4 + ids5 + ids6
    te_ids = ids1 + ids2 + ids3
    te_ids = ids4 + ids6
    load = False  # True reuses the pickled parses and semantic model below
    tr_questions = geoserver_interface.download_questions('aaai')
    te_questions = geoserver_interface.download_questions('official')
    te_keys = te_questions.keys()  # [968, 971, 973, 1018]
    all_questions = dict(tr_questions.items() + te_questions.items())
    # The downloaded question sets supersede the hand-picked batches above.
    tr_ids = tr_questions.keys()
    te_ids = te_questions.keys()
    if not load:
        all_syntax_parses = questions_to_syntax_parses(all_questions)
        pickle.dump(all_syntax_parses, open('syntax_parses.p', 'wb'))
    else:
        all_syntax_parses = pickle.load(open('syntax_parses.p', 'rb'))
    all_annotations = geoserver_interface.download_semantics()
    all_labels = geoserver_interface.download_labels()
    correct = 0
    penalized = 0
    error = 0
    total = len(te_keys)
    #(te_s, te_a, te_l), (tr_s, tr_a, trl_l) = split([all_syntax_parses, all_annotations, all_labels], 0.7)
    tr_s = {id_: all_syntax_parses[id_] for id_ in tr_ids}
    tr_a = {id_: all_annotations[id_] for id_ in tr_ids}
    te_s = {id_: all_syntax_parses[id_] for id_ in te_ids}
    if not load:
        # Train the tag model on everything, the semantic model on the
        # training split, and cache the result.
        tm = train_tag_model(all_syntax_parses, all_annotations)
        cm = train_semantic_model(tm, tr_s, tr_a)
        pickle.dump(cm, open('cm.p', 'wb'))
    else:
        cm = pickle.load(open('cm.p', 'rb'))
    print "test ids: %s" % ", ".join(str(k) for k in te_s.keys())
    for idx, id_ in enumerate(te_keys):
        question = all_questions[id_]
        label = all_labels[id_]
        id_ = str(id_)
        print "-" * 80
        print "id: %s" % id_
        result = full_unit_test(cm, question, label)
        print result.message
        print result
        if result.error:
            error += 1
        if result.penalized:
            penalized += 1
        if result.correct:
            correct += 1
        print "-" * 80
        print "%d/%d complete, %d correct, %d penalized, %d error" % (
            idx + 1, len(te_keys), correct, penalized, error)
    end = time.time()
    print "-" * 80
    print "duration:\t%.1f" % (end - start)
    out = "total:\t\t%d\npenalized:\t%d\ncorrect:\t%d\nerror:\t\t%d" % (
        total, penalized, correct, error)
    print out
    # Persist the evaluated question ids for the demo front end.
    dirs_path = os.path.join(demo_path, 'dirs.json')
    json.dump([str(x) for x in te_keys], open(dirs_path, 'wb'))
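
# A minimal runner sketch, not in the original file: execute the full
# evaluation (or the corpus statistics) only when this module is run
# directly. The 'aaai' tag mirrors the one full_test uses for its
# training questions; both calls assume a reachable geoserver backend.
if __name__ == "__main__":
    # data_stat('aaai')  # uncomment for corpus statistics instead
    full_test()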