def extract_features_sent(sentence, feature_names, classifier, dict_classes, vec):
    """Parse one sentence with a trained classifier and annotate its words.

    At every step a six-value feature row is built from the top of the
    stack and the front of the queue, the classifier predicts a class for
    it, the class is mapped to a transition through ``dict_classes`` and
    ``parse_ml`` applies that transition to the parser state.

    Args:
        sentence: Iterable of word dicts.  Words are read through their
            ``'id'``, ``'form'`` and ``'cpostag'`` keys and are mutated in
            place: ``'head'`` and ``'deprel'`` are written at the end.
        feature_names: Names zipped, in order, with the feature values
            (stack POS, stack form, queue POS, queue form, can-reduce,
            can-leftarc).
        classifier: Object whose ``predict`` result is indexable; element 0
            is used as the key into ``dict_classes``.
        dict_classes: Mapping from predicted class to the transition handed
            to ``parse_ml``.
        vec: Object whose ``transform`` encodes one feature dict for
            ``classifier``.

    Returns:
        The graph dict holding the ``'heads'`` and ``'deprels'`` mappings.
    """
    stack = []
    queue = list(sentence)
    graph = {'heads': {'0': '0'}, 'deprels': {'0': 'ROOT'}}

    while queue:
        if stack:
            row = [stack[0]['cpostag'], stack[0]['form']]
        else:
            row = ['nil', 'nil']
        # The loop condition guarantees a non-empty queue, so no 'nil'
        # fallback is needed for the queue features.
        row.append(queue[0]['cpostag'])
        row.append(queue[0]['form'])
        row.append(transition.can_reduce(stack, graph))
        row.append(transition.can_leftarc(stack, graph))
        X = dict(zip(feature_names, row))

        # Predict the next action instead of consulting the reference oracle.
        trans_nr = classifier.predict(vec.transform(X))
        print(trans_nr[0])
        trans = dict_classes[trans_nr[0]]
        stack, queue, graph, trans = parse_ml(stack, queue, graph, trans)

    # The return value is deliberately not rebound here; `graph` is read
    # right after, so this presumably updates it in place -- TODO confirm
    # against transition.empty_stack.
    transition.empty_stack(stack, graph)

    for word in sentence:
        word['head'] = graph['heads'][word['id']]
        word['deprel'] = graph['deprels'][word['id']]
    return graph
def extract_all_features(formatted_corpus):
    """Collect feature rows and reference transitions for a whole corpus.

    Each sentence is parsed with ``reference``; before every transition a
    feature row is extracted with ``features.extract`` using the
    module-level ``FEATURE_NAMES``.

    Args:
        formatted_corpus: Iterable of sentences, each an iterable of word
            dicts with at least an ``'id'`` key.  The ``'head'`` of every
            word is overwritten with the head found by the parse.

    Returns:
        A tuple ``(X_dict, y_symbols)``: the feature rows and, aligned with
        them, the transitions returned by ``reference``.
    """
    X_dict = []      # one feature row per parsing step
    y_symbols = []   # transition taken at that step
    for sentence in formatted_corpus:
        stack = []
        queue = list(sentence)
        graph = {'heads': {'0': '0'}, 'deprels': {'0': 'ROOT'}}
        while queue:
            X_dict.append(
                features.extract(stack, queue, graph, FEATURE_NAMES, sentence))
            stack, queue, graph, trans = reference(stack, queue, graph)
            y_symbols.append(trans)
        stack, graph = transition.empty_stack(stack, graph)
        # Poor man's projectivization to have well-formed graphs.
        for word in sentence:
            word['head'] = graph['heads'][word['id']]
    return X_dict, y_symbols
def predict_sentence():
    """Parse the test corpus with the trained model and save the result.

    Reads ``swedish_talbanken05_test_blind.conll`` with the six-column test
    layout, predicts one transition at a time and writes the resulting
    ``'head'`` and ``'deprel'`` back into every word before saving the
    corpus through ``conll.save``.

    Relies on names that are not defined in this function and must exist
    at module level: ``vec`` (feature encoder), ``model`` (classifier) and
    ``label`` (object providing ``inverse_transform``).
    """
    test_file = 'swedish_talbanken05_test_blind.conll'
    column_names_2006 = [
        'id', 'form', 'lemma', 'cpostag', 'postag', 'feats',
        'head', 'deprel', 'phead', 'pdeprel',
    ]
    column_names_2006_test = [
        'id', 'form', 'lemma', 'cpostag', 'postag', 'feats',
    ]
    features3 = [
        'word_s0', 'pos_s0', 'word_s1', 'pos_s1',
        'word_q0', 'pos_q0', 'word_q1', 'pos_q1',
        'word_n0', 'pos_n0', 'word_n1', 'pos_n1',
        'can_re', 'can_ra',
    ]

    sentences = conll.read_sentences(test_file)
    formatted_corpus = conll.split_rows(sentences, column_names_2006_test)

    for sent_cnt, sentence in enumerate(formatted_corpus, start=1):
        if sent_cnt % 1000 == 0:
            print(sent_cnt, 'sentences on', len(formatted_corpus), flush=True)
        stack = []
        queue = list(sentence)
        graph = {'heads': {'0': '0'}, 'deprels': {'0': 'ROOT'}}
        while queue:
            feat = features.extract_3(stack, queue, graph, features3, sentence)
            trans_nr = model.predict(vec.transform(feat))
            trans = label.inverse_transform(trans_nr)
            print(trans)
            # NOTE(review): the original author flagged this step with
            # "wrong graph" -- the predicted graph may be malformed; verify.
            stack, queue, graph, trans = parse_ml(stack, queue, graph, trans[0])
        stack, graph = transition.empty_stack(stack, graph)
        # Poor man's projectivization to have well-formed graphs.
        for word in sentence:
            word['head'] = graph['heads'][word['id']]
            word['deprel'] = graph['deprels'][word['id']]

    conll.save("test", formatted_corpus, column_names_2006)
def train_model():
    """Build the training set from the annotated training corpus.

    Reads ``swedish_talbanken05_train.conll`` with the ten-column layout,
    parses every sentence with ``reference`` and records, for each step,
    the feature row from ``features.extract_3`` and the transition taken.
    The ``'head'`` of every word is overwritten with the head found by the
    parse.

    Returns:
        A tuple ``(x_vect, y_vect)`` of aligned feature rows and
        transitions.
    """
    train_file = 'swedish_talbanken05_train.conll'
    column_names_2006 = [
        'id', 'form', 'lemma', 'cpostag', 'postag', 'feats',
        'head', 'deprel', 'phead', 'pdeprel',
    ]
    features3 = [
        'word_s0', 'pos_s0', 'word_s1', 'pos_s1',
        'word_q0', 'pos_q0', 'word_q1', 'pos_q1',
        'word_n0', 'pos_n0', 'word_n1', 'pos_n1',
        'can_re', 'can_ra',
    ]

    sentences = conll.read_sentences(train_file)
    formatted_corpus = conll.split_rows(sentences, column_names_2006)

    x_vect = []
    y_vect = []
    for sent_cnt, sentence in enumerate(formatted_corpus, start=1):
        if sent_cnt % 1000 == 0:
            print(sent_cnt, 'sentences on', len(formatted_corpus), flush=True)
        stack = []
        queue = list(sentence)
        graph = {'heads': {'0': '0'}, 'deprels': {'0': 'ROOT'}}
        while queue:
            # Features describe the state *before* the transition is applied.
            feat = features.extract_3(stack, queue, graph, features3, sentence)
            stack, queue, graph, trans = reference(stack, queue, graph)
            x_vect.append(feat)
            y_vect.append(trans)
        stack, graph = transition.empty_stack(stack, graph)
        # Poor man's projectivization to have well-formed graphs.
        for word in sentence:
            word['head'] = graph['heads'][word['id']]
    return x_vect, y_vect
def extract_features(formatted_corpus, feature_names, training=True, model=None):
    """Extract feature rows and reference transitions from a corpus.

    Every sentence is parsed with ``reference``; before each transition a
    feature row is produced by ``features.extract``.

    Args:
        formatted_corpus: Iterable of sentences, each an iterable of word
            dicts with at least an ``'id'`` key.  The ``'head'`` of every
            word is overwritten with the head found by the parse.
        feature_names: Feature names forwarded to ``features.extract``.
        training: Accepted for interface compatibility; not used.
        model: Accepted for interface compatibility; not used.

    Returns:
        A tuple ``(X_1, y_1)`` of aligned feature rows and transitions.
    """
    X_1 = []
    y_1 = []
    for sent_cnt, sentence in enumerate(formatted_corpus, start=1):
        if sent_cnt % 1000 == 0:
            print(sent_cnt, 'sentences on', len(formatted_corpus), flush=True)
        stack = []
        queue = list(sentence)
        graph = {'heads': {'0': '0'}, 'deprels': {'0': 'ROOT'}}
        while queue:
            X_1.append(
                features.extract(stack, queue, graph, feature_names, sentence))
            stack, queue, graph, trans = reference(stack, queue, graph)
            y_1.append(trans)
        stack, graph = transition.empty_stack(stack, graph)
        # Poor man's projectivization to have well-formed graphs.
        for word in sentence:
            word['head'] = graph['heads'][word['id']]
    return (X_1, y_1)
def calculateSomething(filen, model=None, dict_vect=None, label_enc = None):
    """Parse a CoNLL file and collect feature rows with their transitions.

    When ``model``, ``dict_vect`` and ``label_enc`` are all supplied, each
    transition is predicted from the encoded feature row; otherwise the
    transitions come from ``reference``.

    Args:
        filen: Path handed to ``conll.read_sentences``; the file is split
            with the ten-column layout.
        model: Optional classifier exposing ``predict``.
        dict_vect: Optional encoder exposing ``transform`` for one feature
            row.
        label_enc: Optional label encoder exposing ``inverse_transform``.

    Returns:
        A tuple ``(X_unEncoded, y_unEncoded)``: the un-encoded feature rows
        and, aligned with them, the transition returned for each step by
        ``reference`` or ``parse_ml``.

    Side effects:
        The ``'head'`` of every word is overwritten with the head found by
        the parse; in prediction mode each predicted transition is printed.
    """
    column_names_2006 = ['id', 'form', 'lemma', 'cpostag', 'postag', 'feats',
                         'head', 'deprel', 'phead', 'pdeprel']

    sentences = conll.read_sentences(filen)
    formatted_corpus = conll.split_rows(sentences, column_names_2006)

    # Prediction needs all three components; otherwise follow the oracle.
    use_oracle = model is None or dict_vect is None or label_enc is None

    X_unEncoded = []
    y_unEncoded = []
    for sentence in formatted_corpus:
        stack = []
        queue = list(sentence)
        state = {'heads': {'0': '0'}, 'deprels': {'0': 'ROOT'}}
        while queue:
            featureRow = extract(stack, queue, state, [], sentence)
            if use_oracle:
                stack, queue, state, trans = reference(stack, queue, state)
            else:
                featureRow_encoded = dict_vect.transform(featureRow)
                trans_nr = model.predict(featureRow_encoded)
                # Decode with the encoder that was passed in, and apply the
                # single decoded transition to this function's own state.
                trans = label_enc.inverse_transform(trans_nr)
                print(trans[0])
                stack, queue, state, trans = parse_ml(stack, queue, state, trans[0])
            X_unEncoded.append(featureRow)
            y_unEncoded.append(trans)
        stack, state = transition.empty_stack(stack, state)
        # Poor man's projectivization to have well-formed graphs.
        for word in sentence:
            word['head'] = state['heads'][word['id']]
    return X_unEncoded, y_unEncoded
def extract_features(formatted_corpus, feature_names):
    """Gather training rows from the sentences whose parse matches the gold graph.

    Each sentence is parsed with ``reference``; the feature row produced by
    ``extract`` before every transition is buffered together with that
    transition.  The buffered rows are kept only when
    ``transition.equal_graphs`` reports that the built graph equals the
    sentence's own annotation.

    Args:
        formatted_corpus: Iterable of sentences (iterables of word dicts).
        feature_names: Feature names forwarded to ``extract``.

    Returns:
        A tuple ``(X, Y)`` of aligned feature rows and transitions.
    """
    X = []
    Y = []
    for sentence in formatted_corpus:
        stack, queue = [], list(sentence)
        graph = {'heads': {'0': '0'}, 'deprels': {'0': 'ROOT'}}

        rows, actions = [], []
        while queue:
            # One row of X, taken before the transition changes the state.
            row = extract(stack, queue, graph, feature_names, sentence)
            stack, queue, graph, action = reference(stack, queue, graph)
            rows.append(row)
            actions.append(action)

        stack, graph = transition.empty_stack(stack, graph)
        if not transition.equal_graphs(sentence, graph):
            continue  # discard sentences the parse could not reproduce
        X.extend(rows)
        Y.extend(actions)
    return X, Y
def extract_features_sent(sentence, feature_names, classifier, dict_classes, vec):
    """Parse one sentence with a trained classifier using ten features.

    At every step a feature row is built from the two topmost stack items
    and the two first queue items, the classifier predicts a class for it,
    the class is mapped to a transition through ``dict_classes`` and
    ``parse_ml`` applies that transition to the parser state.

    Args:
        sentence: Iterable of word dicts.  Words are read through their
            ``'id'``, ``'form'`` and ``'cpostag'`` keys and are mutated in
            place: ``'head'`` and ``'deprel'`` are written at the end.
        feature_names: Names zipped, in order, with the feature values
            (stack0 POS, stack1 POS, stack0 form, stack1 form, queue0 POS,
            queue1 POS, queue0 form, queue1 form, can-reduce, can-leftarc).
        classifier: Object whose ``predict`` result is indexable; element 0
            is used as the key into ``dict_classes``.
        dict_classes: Mapping from predicted class to the transition handed
            to ``parse_ml``.
        vec: Object whose ``transform`` encodes one feature dict for
            ``classifier``.

    Returns:
        The feature dict of the last parsing step, or an empty list when
        the sentence is empty.
    """
    stack = []
    queue = list(sentence)
    graph = {'heads': {'0': '0'}, 'deprels': {'0': 'ROOT'}}
    X = []  # returned unchanged when there is nothing to parse

    while queue:
        # queue[0] always exists inside the loop; only the second queue
        # item and the stack items need a 'nil' fallback.
        row = [
            stack[0]['cpostag'] if len(stack) > 0 else 'nil',
            stack[1]['cpostag'] if len(stack) > 1 else 'nil',
            stack[0]['form'] if len(stack) > 0 else 'nil',
            stack[1]['form'] if len(stack) > 1 else 'nil',
            queue[0]['cpostag'],
            queue[1]['cpostag'] if len(queue) > 1 else 'nil',
            queue[0]['form'],
            queue[1]['form'] if len(queue) > 1 else 'nil',
            transition.can_reduce(stack, graph),
            transition.can_leftarc(stack, graph),
        ]
        X = dict(zip(feature_names, row))

        trans_nr = classifier.predict(vec.transform(X))[0]
        trans = dict_classes[trans_nr]
        stack, queue, graph, trans = parse_ml(stack, queue, graph, trans)

    # The return value is deliberately not rebound here; `graph` is read
    # right after, so this presumably updates it in place -- TODO confirm
    # against transition.empty_stack.
    transition.empty_stack(stack, graph)

    for word in sentence:
        word['head'] = graph['heads'][word['id']]
        word['deprel'] = graph['deprels'][word['id']]
    return X
# Column layouts used to split corpus rows: the full ten-column layout and
# a shorter one without the head/deprel columns.
column_names_2006 = [
    'id', 'form', 'lemma', 'cpostag', 'postag', 'feats',
    'head', 'deprel', 'phead', 'pdeprel',
]
column_names_2006_test = [
    'id', 'form', 'lemma', 'cpostag', 'postag', 'feats',
]

sentences = conll.read_sentences(train_file)
formatted_corpus = conll.split_rows(sentences, column_names_2006)

sent_cnt = 0
for sent_cnt, sentence in enumerate(formatted_corpus, start=1):
    # Progress report every thousand sentences.
    if not sent_cnt % 1000:
        print(sent_cnt, 'sentences on', len(formatted_corpus), flush=True)

    # Fresh parser state: empty stack, full queue, graph holding only the root.
    stack, queue = [], list(sentence)
    graph = {'heads': {'0': '0'}, 'deprels': {'0': 'ROOT'}}
    transitions = []

    # Follow the reference parse until every word has left the queue.
    while queue:
        stack, queue, graph, trans = reference(stack, queue, graph)
        transitions.append(trans)
    stack, graph = transition.empty_stack(stack, graph)
    print('Equal graphs:', transition.equal_graphs(sentence, graph))

    # Poor man's projectivization to have well-formed graphs.
    for word in sentence:
        word['head'] = graph['heads'][word['id']]
    print(transitions)
    print(graph)
def extract_features(formatted_corpus, mode, test_mode=False, vec=None, classifier=None):
    """Extract feature rows and transitions, from the oracle or a classifier.

    In training mode (``test_mode=False``) every transition comes from
    ``reference``.  In test mode each feature row is encoded with ``vec``,
    ``classifier`` predicts the transition, ``parse_ml`` applies it, the
    resulting ``'head'`` and ``'deprel'`` are written into every word and
    the corpus is saved through ``conll.save`` (using the module-level
    ``column_names_2006``).

    Args:
        formatted_corpus: Iterable of sentences (iterables of word dicts
            with at least an ``'id'`` key).
        mode: Feature set to use: 1, 2 or 3, selecting ``extract_mode_1``,
            ``extract_mode_2`` or ``extract_mode_3``.
        test_mode: Predict transitions instead of following the oracle.
        vec: Encoder exposing ``transform``; required in test mode.
        classifier: Model exposing ``predict``; required in test mode.

    Returns:
        A tuple ``(X, transitions)`` of aligned feature rows and the
        transitions returned by ``reference`` or ``parse_ml``.

    Raises:
        ValueError: If test mode is requested without ``vec`` and
            ``classifier``, or if ``mode`` is not 1, 2 or 3 when a feature
            row has to be extracted.
    """
    if test_mode and (vec is None or classifier is None):
        raise ValueError('test_mode requires both vec and classifier')

    feature_names_1 = [
        'stack0_POS', 'stack0_word', 'queue0_POS', 'queue0_word',
        'can-re', 'can-la',
    ]
    feature_names_2 = [
        'stack1_POS', 'stack1_word', 'queue1_POS', 'queue1_word',
    ]
    feature_names_3 = ['left_POS', 'left_word', 'right_POS', 'right_word']
    feature_names = {
        'mode1': feature_names_1,
        'mode2': feature_names_2,
        'mode3': feature_names_3,
    }

    X = []
    transitions = []
    for sentence in formatted_corpus:
        stack = []
        queue = list(sentence)
        graph = {'heads': {'0': '0'}, 'deprels': {'0': 'ROOT'}}
        while queue:
            if mode == 3:
                X_row = extract_mode_3(stack, queue, graph, feature_names, sentence)
            elif mode == 2:
                X_row = extract_mode_2(stack, queue, graph, feature_names, sentence)
            elif mode == 1:
                X_row = extract_mode_1(stack, queue, graph, feature_names, sentence)
            else:
                # Fail with a clear message instead of an unbound X_row.
                raise ValueError('mode must be 1, 2 or 3, got {!r}'.format(mode))

            if test_mode:
                trans_nr = classifier.predict(vec.transform(X_row))
                # predict() answers with one label per input row; hand the
                # single label, not the whole result, to the parser.
                stack, queue, graph, trans = parse_ml(stack, queue, graph, trans_nr[0])
            else:
                stack, queue, graph, trans = reference(stack, queue, graph)
            X.append(X_row)
            transitions.append(trans)
        stack, graph = transition.empty_stack(stack, graph)

        # Only predicted parses are written back into the corpus.
        if test_mode:
            for word in sentence:
                word['head'] = graph['heads'][word['id']]
                word['deprel'] = graph['deprels'][word['id']]

    # Preview of the first rows and their transitions.
    for pos, e in enumerate(X[:6]):
        print("x = {}, y= {}".format(e, transitions[pos]))

    if test_mode:
        conll.save('out_{}_mode_{}.conll'.format("test", mode),
                   formatted_corpus, column_names_2006)
    return X, transitions
# Feature rows (X) and the transition taken at each step (Y), collected
# over the whole corpus.
X = []
Y = []
for sentence in formatted_corpus:
    sent_cnt += 1
    # Progress report every thousand sentences.
    if not sent_cnt % 1000:
        print(sent_cnt, 'sentences on', len(formatted_corpus), flush=True)

    # Fresh parser state: empty stack, full queue, graph holding only the root.
    stack, queue = [], list(sentence)
    state = {'heads': {'0': '0'}, 'deprels': {'0': 'ROOT'}}
    transitions = []

    while queue:
        # The features describe the state before the transition is applied.
        X.append(features.extract2(stack, queue, state, feature_names2, sentence))
        stack, queue, state, trans = reference(stack, queue, state)
        transitions.append(trans)
        Y.append(trans)
    stack, state = transition.empty_stack(stack, state)
    # The projectivization step (copying state['heads'] back into the
    # words) is intentionally disabled in this fragment.
print(X)
# Column layouts used to split corpus rows: the full ten-column layout and
# a shorter one without the head/deprel columns.
column_names_2006 = [
    'id', 'form', 'lemma', 'cpostag', 'postag', 'feats',
    'head', 'deprel', 'phead', 'pdeprel',
]
column_names_2006_test = [
    'id', 'form', 'lemma', 'cpostag', 'postag', 'feats',
]

sentences = conll.read_sentences(train_file)
formatted_corpus = conll.split_rows(sentences, column_names_2006)

sent_cnt = 0
for sent_cnt, sentence in enumerate(formatted_corpus, start=1):
    # Progress report every thousand sentences.
    if not sent_cnt % 1000:
        print(sent_cnt, 'sentences on', len(formatted_corpus), flush=True)

    # Fresh parser state: empty stack, full queue, graph holding only the root.
    stack, queue = [], list(sentence)
    graph = {'heads': {'0': '0'}, 'deprels': {'0': 'ROOT'}}
    transitions = []

    # Follow the reference parse until every word has left the queue.
    while queue:
        stack, queue, graph, trans = reference(stack, queue, graph)
        transitions.append(trans)
    stack, graph = transition.empty_stack(stack, graph)
    print('Equal graphs:', transition.equal_graphs(sentence, graph))

    # Poor man's projectivization to have well-formed graphs.
    for word in sentence:
        word['head'] = graph['heads'][word['id']]
    print(transitions)
    print(graph)