示例#1
0
def predict_sentence():
    """Parse the blind Swedish test corpus and save the predicted graphs.

    Reads ``swedish_talbanken05_test_blind.conll`` and, for every sentence,
    repeats the following until the queue is empty: extract a feature row,
    transform it with ``vec``, predict with ``model``, decode the prediction
    with ``label.inverse_transform`` and apply the first decoded transition
    through ``parse_ml``. The stack is then emptied, the head and dependency
    relation stored in the graph are copied onto every word, and the whole
    corpus is saved under the name ``test``.

    Uses names that are not defined inside this function: ``conll``,
    ``features``, ``transition``, ``vec``, ``model``, ``label`` and
    ``parse_ml``.

    Raises:
        KeyError: if the final graph has no head or deprel entry for a word.
    """
    test_file = 'swedish_talbanken05_test_blind.conll'
    # Columns used when saving the corpus.
    column_names_2006 = [
        'id', 'form', 'lemma', 'cpostag', 'postag', 'feats', 'head', 'deprel',
        'phead', 'pdeprel'
    ]
    # Columns used when splitting the rows of the blind test file.
    column_names_2006_test = [
        'id', 'form', 'lemma', 'cpostag', 'postag', 'feats'
    ]
    # Feature names handed to features.extract_3.
    features3 = [
        'word_s0', 'pos_s0', 'word_s1', 'pos_s1', 'word_q0', 'pos_q0',
        'word_q1', 'pos_q1', 'word_n0', 'pos_n0', 'word_n1', 'pos_n1',
        'can_re', 'can_ra'
    ]

    sentences = conll.read_sentences(test_file)
    formatted_corpus = conll.split_rows(sentences, column_names_2006_test)

    for sent_cnt, sentence in enumerate(formatted_corpus, start=1):
        # Progress report every 1000 sentences.
        if sent_cnt % 1000 == 0:
            print(sent_cnt, 'sentences on', len(formatted_corpus), flush=True)

        # Fresh parser state: empty stack, all words queued, and a graph
        # seeded with the entry for id '0'.
        stack = []
        queue = list(sentence)
        graph = {'heads': {'0': '0'}, 'deprels': {'0': 'ROOT'}}

        while queue:
            feat = features.extract_3(stack, queue, graph, features3, sentence)
            feat = vec.transform(feat)
            trans_nr = model.predict(feat)
            trans = label.inverse_transform(trans_nr)
            print(trans)
            # Only the first decoded transition is applied.
            stack, queue, graph, trans = parse_ml(stack, queue, graph,
                                                  trans[0])

        stack, graph = transition.empty_stack(stack, graph)

        # Copy the predicted head and relation back onto each word so they
        # are included when the corpus is saved.
        for word in sentence:
            word['head'] = graph['heads'][word['id']]
            word['deprel'] = graph['deprels'][word['id']]

    conll.save("test", formatted_corpus, column_names_2006)
示例#2
0
    for word in sentence:
        word['head'] = graph['heads'][word['id']]
        word['deprel'] = graph['deprels'][word['id']]
    return X


if __name__ == '__main__':
    # Script entry point: read the blind test corpus, load the pickled
    # classifier, class dictionary and vectorizer, run extract_features over
    # the corpus and save the result.
    test_file = 'swedish_talbanken05_test_blind.conll'
    sentences = conll.read_sentences(test_file)
    column_names_2006 = [
        'id', 'form', 'lemma', 'cpostag', 'postag', 'feats', 'head', 'deprel',
        'phead', 'pdeprel'
    ]
    column_names_2006_test = [
        'id', 'form', 'lemma', 'cpostag', 'postag', 'feats'
    ]
    formatted_corpus = conll.split_rows(sentences, column_names_2006)
    feature_names = [
        'stack0_POS', 'stack1_POS', 'stack0_word', 'stack1_word', 'queue0_POS',
        'queue1_POS', 'queue0_word', 'queue1_word', 'can-re', 'can-la',
        'before_word', 'before_POS', 'after_word', 'after_POS'
    ]

    # Each pickle file is opened in a `with` block so its handle is closed
    # as soon as the object is loaded.
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open('model3.pkl', 'rb') as model_file:
        classifier = pickle.load(model_file)
    with open('dict_classes3.pkl', 'rb') as classes_file:
        dict_classes = pickle.load(classes_file)
    with open('vec3.pkl', 'rb') as vec_file:
        vec = pickle.load(vec_file)

    X_dict, y_dict = extract_features(formatted_corpus, feature_names,
                                      classifier, dict_classes, vec)
    conll.save("parsedTestSentences3", formatted_corpus, column_names_2006)
def extract_features(formatted_corpus,
                     mode,
                     test_mode=False,
                     vec=None,
                     classifier=None):
    """Run the transition parser over a corpus and collect feature rows.

    For every sentence the parser state (stack, queue, graph) is initialised
    and stepped until the queue is empty. At each step a feature row is built
    by the ``extract_mode_<mode>`` helper. The next transition then comes
    either from ``reference`` (when ``test_mode`` is false) or from
    ``classifier.predict`` applied through ``parse_ml`` (when ``test_mode``
    is true).

    Args:
        formatted_corpus: iterable of sentences; each sentence is an iterable
            of word dicts with at least an ``'id'`` key.
        mode: 1, 2 or 3; selects ``extract_mode_1``, ``extract_mode_2`` or
            ``extract_mode_3``.
        test_mode: when true, transitions are predicted, the resulting heads
            and deprels are written back onto the words, and the corpus is
            saved to ``out_test_mode_<mode>.conll``.
        vec: object whose ``transform`` is applied to each feature row;
            required when ``test_mode`` is true.
        classifier: object whose ``predict`` is applied to the transformed
            row; required when ``test_mode`` is true.

    Returns:
        A tuple ``(X, transitions)``: the feature rows and, in the same
        order, the transition returned for each row.

    Raises:
        ValueError: if ``mode`` is not 1, 2 or 3.
    """
    # The whole mapping is handed to the extract_mode_* helper.
    feature_names = {
        'mode1': [
            'stack0_POS', 'stack0_word', 'queue0_POS', 'queue0_word',
            'can-re', 'can-la'
        ],
        'mode2': ['stack1_POS', 'stack1_word', 'queue1_POS', 'queue1_word'],
        'mode3': ['left_POS', 'left_word', 'right_POS', 'right_word'],
    }

    # Choose the extractor once. An unknown mode fails here with a clear
    # message instead of leaving the feature row unbound inside the loop.
    if mode == 3:
        extract_row = extract_mode_3
    elif mode == 2:
        extract_row = extract_mode_2
    elif mode == 1:
        extract_row = extract_mode_1
    else:
        raise ValueError('unsupported mode: {!r}'.format(mode))

    X = []
    transitions = []
    for sentence in formatted_corpus:
        # Fresh parser state, with the graph seeded with the entry for '0'.
        stack = []
        queue = list(sentence)
        graph = {'heads': {'0': '0'}, 'deprels': {'0': 'ROOT'}}

        while queue:
            X_row = extract_row(stack, queue, graph, feature_names, sentence)
            if test_mode:
                X_row_vec = vec.transform(X_row)
                trans_nr = classifier.predict(X_row_vec)
                stack, queue, graph, trans = parse_ml(stack, queue, graph,
                                                      trans_nr)
            else:
                stack, queue, graph, trans = reference(stack, queue, graph)
            X.append(X_row)
            transitions.append(trans)
        stack, graph = transition.empty_stack(stack, graph)

        # In test mode, copy the predicted graph back onto the words so it
        # is included when the corpus is saved.
        if test_mode:
            for word in sentence:
                word['head'] = graph['heads'][word['id']]
                word['deprel'] = graph['deprels'][word['id']]

    # Show the first few (row, transition) pairs.
    for row, trans in zip(X[:6], transitions):
        print("x = {}, y= {}".format(row, trans))

    if test_mode:
        # column_names_2006 is not defined in this function; it is looked up
        # in the enclosing (module) scope.
        conll.save('out_{}_mode_{}.conll'.format("test", mode),
                   formatted_corpus, column_names_2006)
    return X, transitions
示例#4
0
            # Build new graph
            stack, queue, graph, trans = execute_transition(
                stack, queue, graph, predicted_trans)

            # Save the predicted trans
            y_predicted_symbols.append(trans)

        stack, graph = transition.empty_stack(stack, graph)

        for word in sentence:
            word_id = word['id']
            try:
                word['head'] = graph['heads'][word_id]
                word['phead'] = graph['heads'][word_id]
            except KeyError:
                word['head'] = '_'
                word['phead'] = '_'

            try:
                word['deprel'] = graph['deprels'][word_id]
                word['pdeprel'] = graph['deprels'][word_id]
            except KeyError:
                word['deprel'] = '_'
                word['pdeprel'] = '_'

    conll.save('results.txt', formatted_corpus, column_names_2006)

    # print("Classification report for classifier %s:\n%s\n"
    # % (classifier, metrics.classification_report(y_train_symbols, list(map(lambda y_pred: dict_classes[y_pred], y_predicted_symbols)) )))
示例#5
0
    for sentence in formatted_corpus_test:
        queue = list(sentence)
        graph = {}
        graph['heads'] = {}
        graph['heads']['0'] = '0'
        graph['deprels'] = {}
        graph['deprels']['0'] = 'ROOT'
        word_names = ["form", "trans"]
        stack = []
        while queue:
            y_predict = []
            x_predict = features.extract(stack, queue, graph, feature_names,
                                         sentence)
            x_predict_fitted = vec.transform(
                (dict(zip(feature_names, x_predict))))
            y_predict = classifier.predict(x_predict_fitted)
            stack, queue, graph, trans = parse_ml(stack, queue, graph,
                                                  y_predict[-1])

        #print(transition.equal_graphs(words,graph))

        stack, graph = transition.empty_stack(stack, graph)
        for word in sentence:

            #print(words['id'])

            word['head'] = graph['heads'][word['id']]
            word['deprel'] = graph['deprels'][word["id"]]

    conll.save("testf3.txt", formatted_corpus_test, column_names_2006)
示例#6
0
    test_sentences = conll.read_sentences(test_corpus)

    # Take the sentences and break them up into a useful format
    # Each corpus is a list of lists of dictionaries
    # Corpus = dictionary, each sentence is a list, each word is a dictionary
    split_training_sentences = conll.split_rows(training_sentences,
                                                training_column_names)
    split_test_sentences = conll.split_rows(test_sentences,
                                            training_column_names)

    i = 1
    for feature_set in feature_sets:
        vec = DictVectorizer(sparse=True)
        training_setup(split_training_sentences, feature_set, vec, i)

        classifier_filename = 'ml_classifier' + str(i) + '.pickle'
        model_filename = 'ml_model' + str(i) + '.pickle'
        classifier_file = open(classifier_filename, 'rb')
        model_file = open(model_filename, 'rb')
        classifier = pickle.load(classifier_file)
        model = pickle.load(model_file)

        output_filename = 'ml_classification_output' + str(i) + '.out'
        perform_prediction(split_test_sentences, feature_set, classifier,
                           model, i)
        conll.save(output_filename, split_test_sentences,
                   training_column_names)
        print('Finished the Prediction of the corpus with Classifier/Model ',
              i)
        i += 1
示例#7
0
    (vec, model) = pickle.load(open('training.pkl', 'rb'))
    for sentence in formatted_corpus_test:
        sent_cnt += 1
        if sent_cnt % 1000 == 0:
            print(sent_cnt, 'sentences on', len(formatted_corpus), flush=True)
        stack = []
        queue = list(sentence)
        graph = {}
        graph['heads'] = {}
        graph['heads']['0'] = '0'
        graph['deprels'] = {}
        graph['deprels']['0'] = 'ROOT'
        transitions = []

        while queue:
            X_vec = feats.extract3(stack, queue, graph, third_set, sentence)
            # X.append(X_vec)
            X_test = vec.transform(X_vec)
            trans = model.predict(X_test)[0]
            y_test_pred.append(trans)
            stack, queue, graph, trans = parse_ml(stack, queue, graph, trans)
            transitions.append(trans)
        all_transitions.extend(transitions)
        stack, graph = transition.empty_stack(stack, graph)
        # print('Equal graphs:', transition.equal_graphs(sentence, graph))
        # Poorman's projectivization to have well-formed graphs.
        for word in sentence:
            word['head'] = graph['heads'][word['id']]
            word['deprel'] = graph['deprels'][word['id']]
    conll.save('out_file', formatted_corpus_test, column_names_2006)