def predict_sentence():
    """Parse the blind Swedish test corpus with the trained classifier.

    For every sentence of the test file the parser state (stack, queue,
    graph) is initialised, and while the queue is non-empty a feature row is
    extracted with ``features.extract_3``, vectorized with ``vec``, classified
    with ``model`` and decoded with ``label``; the first decoded transition is
    applied through ``parse_ml``.  After ``transition.empty_stack`` the heads
    and dependency relations stored in the graph are copied onto the words,
    and the whole corpus is finally written to the file ``test``.

    The names ``conll``, ``features``, ``transition``, ``parse_ml``, ``vec``,
    ``model`` and ``label`` are not defined here and must be available from
    the enclosing scope.
    """
    test_file = 'swedish_talbanken05_test_blind.conll'
    column_names_2006 = [
        'id', 'form', 'lemma', 'cpostag', 'postag', 'feats',
        'head', 'deprel', 'phead', 'pdeprel',
    ]
    # The test file is read with the six-column layout (no head/deprel).
    column_names_2006_test = [
        'id', 'form', 'lemma', 'cpostag', 'postag', 'feats',
    ]
    # Feature names handed to features.extract_3.
    features3 = [
        'word_s0', 'pos_s0', 'word_s1', 'pos_s1',
        'word_q0', 'pos_q0', 'word_q1', 'pos_q1',
        'word_n0', 'pos_n0', 'word_n1', 'pos_n1',
        'can_re', 'can_ra',
    ]

    sentences = conll.read_sentences(test_file)
    formatted_corpus = conll.split_rows(sentences, column_names_2006_test)

    for sent_cnt, sentence in enumerate(formatted_corpus, start=1):
        if sent_cnt % 1000 == 0:
            print(sent_cnt, 'sentences on', len(formatted_corpus), flush=True)

        # Fresh parser state: empty stack, all words queued, and a graph
        # that initially only contains the root entry '0'.
        stack = []
        queue = list(sentence)
        graph = {'heads': {'0': '0'}, 'deprels': {'0': 'ROOT'}}

        while queue:
            feat = features.extract_3(stack, queue, graph, features3, sentence)
            feat = vec.transform(feat)
            trans_nr = model.predict(feat)
            trans = label.inverse_transform(trans_nr)
            # Trace of every predicted transition (kept from the original).
            print(trans)
            # Original author's remark (translated from Swedish): wrong graph.
            stack, queue, graph, trans = parse_ml(stack, queue, graph, trans[0])

        stack, graph = transition.empty_stack(stack, graph)

        # Poorman's projectivization to have well-formed graphs.
        for word in sentence:
            word['head'] = graph['heads'][word['id']]
            word['deprel'] = graph['deprels'][word['id']]

    conll.save("test", formatted_corpus, column_names_2006)
for word in sentence: word['head'] = graph['heads'][word['id']] word['deprel'] = graph['deprels'][word['id']] return X if __name__ == '__main__': test_file = 'swedish_talbanken05_test_blind.conll' sentences = conll.read_sentences(test_file) column_names_2006 = [ 'id', 'form', 'lemma', 'cpostag', 'postag', 'feats', 'head', 'deprel', 'phead', 'pdeprel' ] column_names_2006_test = [ 'id', 'form', 'lemma', 'cpostag', 'postag', 'feats' ] formatted_corpus = conll.split_rows(sentences, column_names_2006) feature_names = [ 'stack0_POS', 'stack1_POS', 'stack0_word', 'stack1_word', 'queue0_POS', 'queue1_POS', 'queue0_word', 'queue1_word', 'can-re', 'can-la', 'before_word', 'before_POS', 'after_word', 'after_POS' ] classifier = pickle.load(open('model3.pkl', 'rb')) #print(classifier) dict_classes = pickle.load(open('dict_classes3.pkl', 'rb')) vec = pickle.load(open('vec3.pkl', 'rb')) #print(vec) X_dict, y_dict = extract_features(formatted_corpus, feature_names, classifier, dict_classes, vec) conll.save("parsedTestSentences3", formatted_corpus, column_names_2006)
def extract_features(formatted_corpus, mode, test_mode=False, vec=None, classifier=None):
    """Run the transition parser over a corpus and collect its feature rows.

    Args:
        formatted_corpus: iterable of sentences; each sentence is a sequence
            of word dictionaries (at least the key ``'id'`` is read here).
        mode: feature-set selector, one of ``1``, ``2`` or ``3``; it picks
            ``extract_mode_1``, ``extract_mode_2`` or ``extract_mode_3``.
        test_mode: when false, transitions come from ``reference``; when true
            they are predicted with ``vec``/``classifier`` and applied through
            ``parse_ml``, the resulting heads/deprels are written onto the
            words and the corpus is saved to ``out_test_mode_<mode>.conll``.
        vec: vectorizer with a ``transform`` method (used in test mode only).
        classifier: model with a ``predict`` method (used in test mode only).

    Returns:
        A tuple ``(X, transitions)``: the extracted feature rows and the
        transition produced for each of them, in parsing order.

    Raises:
        ValueError: if ``mode`` is not 1, 2 or 3 (raised when the first
            feature row would be extracted).

    Note:
        In test mode the save call reads ``column_names_2006`` from the
        enclosing scope; it is not defined inside this function.
    """
    # EXTRACT FEATURES
    feature_names_1 = [
        'stack0_POS', 'stack0_word', 'queue0_POS', 'queue0_word',
        'can-re', 'can-la',
    ]
    feature_names_2 = [
        'stack1_POS', 'stack1_word', 'queue1_POS', 'queue1_word',
    ]
    feature_names_3 = ['left_POS', 'left_word', 'right_POS', 'right_word']
    # The whole mapping is handed to the extract_mode_* helpers.
    feature_names = {
        'mode1': feature_names_1,
        'mode2': feature_names_2,
        'mode3': feature_names_3,
    }

    X = []
    transitions = []
    for sentence in formatted_corpus:
        # Fresh parser state: empty stack, all words queued, root-only graph.
        stack = []
        queue = list(sentence)
        graph = {'heads': {'0': '0'}, 'deprels': {'0': 'ROOT'}}

        while queue:
            if mode == 3:
                X_row = extract_mode_3(stack, queue, graph, feature_names, sentence)
            elif mode == 2:
                X_row = extract_mode_2(stack, queue, graph, feature_names, sentence)
            elif mode == 1:
                X_row = extract_mode_1(stack, queue, graph, feature_names, sentence)
            else:
                # Previously an unknown mode left X_row unbound and failed
                # later with an obscure UnboundLocalError.
                raise ValueError(
                    'mode must be 1, 2 or 3, got {!r}'.format(mode))

            if test_mode:
                X_row_vec = vec.transform(X_row)
                trans_nr = classifier.predict(X_row_vec)
                stack, queue, graph, trans = parse_ml(stack, queue, graph, trans_nr)
            else:
                stack, queue, graph, trans = reference(stack, queue, graph)

            X.append(X_row)
            transitions.append(trans)

        stack, graph = transition.empty_stack(stack, graph)

        # Poorman's projectivization to have well-formed graphs.
        if test_mode:
            for word in sentence:
                word['head'] = graph['heads'][word['id']]
                word['deprel'] = graph['deprels'][word['id']]

    # Preview of the first rows together with their transitions.
    for row, row_trans in zip(X[:6], transitions):
        print("x = {}, y= {}".format(row, row_trans))

    if test_mode:
        conll.save('out_{}_mode_{}.conll'.format("test", mode),
                   formatted_corpus, column_names_2006)
    return X, transitions
# Build new graph stack, queue, graph, trans = execute_transition( stack, queue, graph, predicted_trans) # Save the predicted trans y_predicted_symbols.append(trans) stack, graph = transition.empty_stack(stack, graph) for word in sentence: word_id = word['id'] try: word['head'] = graph['heads'][word_id] word['phead'] = graph['heads'][word_id] except KeyError: word['head'] = '_' word['phead'] = '_' try: word['deprel'] = graph['deprels'][word_id] word['pdeprel'] = graph['deprels'][word_id] except KeyError: word['deprel'] = '_' word['pdeprel'] = '_' conll.save('results.txt', formatted_corpus, column_names_2006) # print("Classification report for classifier %s:\n%s\n" # % (classifier, metrics.classification_report(y_train_symbols, list(map(lambda y_pred: dict_classes[y_pred], y_predicted_symbols)) )))
# Parse every test sentence with the trained classifier and store the
# predicted heads and dependency relations on the words.
for sentence in formatted_corpus_test:
    # Fresh parser state: all words queued, empty stack, root-only graph.
    stack = []
    queue = list(sentence)
    graph = {
        'heads': {'0': '0'},
        'deprels': {'0': 'ROOT'},
    }
    word_names = ["form", "trans"]

    while queue:
        x_predict = features.extract(stack, queue, graph, feature_names,
                                     sentence)
        # Pair each feature name with its value before vectorizing.
        feature_dict = dict(zip(feature_names, x_predict))
        x_predict_fitted = vec.transform(feature_dict)
        y_predict = classifier.predict(x_predict_fitted)
        # The last predicted label drives the next transition.
        stack, queue, graph, trans = parse_ml(stack, queue, graph,
                                              y_predict[-1])

    stack, graph = transition.empty_stack(stack, graph)

    heads = graph['heads']
    deprels = graph['deprels']
    for word in sentence:
        word['head'] = heads[word['id']]
        word['deprel'] = deprels[word["id"]]

conll.save("testf3.txt", formatted_corpus_test, column_names_2006)
test_sentences = conll.read_sentences(test_corpus)

# Take the sentences and break them up into a useful format.
# Per the original notes, each corpus is a list of sentences, each sentence
# is a list of words, and each word is a dictionary.
split_training_sentences = conll.split_rows(training_sentences,
                                            training_column_names)
split_test_sentences = conll.split_rows(test_sentences,
                                        training_column_names)

# For every feature set: train, reload the pickled classifier/model, predict
# on the test corpus and save the result.  The 1-based counter ``i`` names
# the files that belong to each feature set.
i = 1
for feature_set in feature_sets:
    vec = DictVectorizer(sparse=True)
    training_setup(split_training_sentences, feature_set, vec, i)

    # Presumably written by training_setup above — TODO confirm.
    classifier_filename = 'ml_classifier' + str(i) + '.pickle'
    model_filename = 'ml_model' + str(i) + '.pickle'

    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    # Context managers make sure the handles are closed after loading (the
    # original left both files open on every iteration).
    with open(classifier_filename, 'rb') as classifier_file:
        classifier = pickle.load(classifier_file)
    with open(model_filename, 'rb') as model_file:
        model = pickle.load(model_file)

    output_filename = 'ml_classification_output' + str(i) + '.out'
    perform_prediction(split_test_sentences, feature_set, classifier, model, i)
    conll.save(output_filename, split_test_sentences, training_column_names)
    print('Finished the Prediction of the corpus with Classifier/Model ', i)
    i += 1
# Load the vectorizer and the model that were pickled together.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
# The context manager closes the file once the objects have been read.
with open('training.pkl', 'rb') as training_pickle:
    (vec, model) = pickle.load(training_pickle)

for sentence in formatted_corpus_test:
    sent_cnt += 1
    if sent_cnt % 1000 == 0:
        # Report progress against the corpus that is actually being parsed
        # (the original printed the length of a different corpus).
        print(sent_cnt, 'sentences on', len(formatted_corpus_test), flush=True)

    # Fresh parser state: empty stack, all words queued, root-only graph.
    stack = []
    queue = list(sentence)
    graph = {'heads': {'0': '0'}, 'deprels': {'0': 'ROOT'}}
    transitions = []

    while queue:
        X_vec = feats.extract3(stack, queue, graph, third_set, sentence)
        X_test = vec.transform(X_vec)
        # Keep the raw prediction, then record the transition that parse_ml
        # reports back after applying it.
        trans = model.predict(X_test)[0]
        y_test_pred.append(trans)
        stack, queue, graph, trans = parse_ml(stack, queue, graph, trans)
        transitions.append(trans)

    all_transitions.extend(transitions)
    stack, graph = transition.empty_stack(stack, graph)

    # Poorman's projectivization to have well-formed graphs.
    for word in sentence:
        word['head'] = graph['heads'][word['id']]
        word['deprel'] = graph['deprels'][word['id']]

conll.save('out_file', formatted_corpus_test, column_names_2006)