def train_perceptron(all_sentences, feature_dict, tbank, history):
    """Pre-process every sentence, then train the perceptron weights.

    Phase 1 (pre-processing): each sentence is parsed with ``tbank.parse``.
    For every word the gold feature vector is built with
    ``dp.construct_feature_vector`` and a history tuple
    ``(prev_idx, history_words, history_pos_tags, distance)`` is recorded.
    The module-level flag ``golinear`` selects the kind of history: the
    preceding words in iteration order, or the chain of dependency heads
    above the word.

    Phase 2 (training): ``iters`` (module-level) passes are made over the
    pre-processed sentences with ``train_perceptron_once``.  After each pass
    the weights are blended with the weights the pass started from as
    ``(i * start + end) / (i + 1)``.

    Args:
        all_sentences: iterable of sentence objects exposing ``raw_sentence``
            and ``words_tags`` (read as ``words_tags[idx][1]`` for the tag).
        feature_dict: feature lookup; its length sizes the weight matrix and
            it is handed on to ``dp.construct_feature_vector``.
        tbank: object providing ``parse(raw_sentence)``.
        history: how many preceding words / ancestors are used as context.

    Returns:
        The weight matrix after the final pass.

    A sentence whose pre-processing raises is reported through
    ``pipeline.log('train', sentence)`` and left out of training.
    """
    weight_matrix = init_weights(len(feature_dict))
    pre_pros = []
    t1 = time()
    # Each print receives one pre-formatted string so the output is identical
    # under the Python 2 print statement and the print function.
    for current_sen, sentence in enumerate(all_sentences, 1):
        print("train sentence: " + str(current_sen))
        try:
            raw = sentence.raw_sentence
            parsed_tree = tbank.parse(raw)
            # How the tree is walked depends on the global boolean
            # ``golinear`` and on the iterator function ``iterloop`` (which
            # comes from depTree).  There is probably a neater way to share
            # this with the other code than copy-pasting it.
            tokens = list(iterloop(parsed_tree))
            histories = []
            target_feature_vectors = []
            if golinear:
                context_words = [tok.orth_ for tok in tokens]
                context_pos_tags = [tok.tag_ for tok in tokens]
                context_tags = [
                    sentence.words_tags[dt.sen_idx(raw, tok)][1]
                    for tok in tokens
                ]
                for i, wrd in enumerate(context_words):
                    if i < history:
                        # Not enough left context yet: pad with start markers.
                        history_tags = tuple(['-TAGSTART-'] + context_tags[0:i])
                        history_words = ['-START-'] + context_words[0:i]
                        history_pos_tags = (['-POSTAGSTART-'] +
                                            context_pos_tags[0:i])
                    else:
                        history_tags = context_tags[i - history:i]
                        history_words = context_words[i - history:i]
                        history_pos_tags = context_pos_tags[i - history:i]
                    history_vectors = ('ph', [history_tags])
                    prev_idx = i - 1
                    distance = 0
                    if prev_idx >= 0:
                        distance = parsed_tree[i].similarity(
                            parsed_tree[prev_idx])
                    target_feature_vectors.append(
                        dp.construct_feature_vector(
                            wrd, context_tags[i], feature_dict, history_words,
                            history, history_vectors, history_pos_tags,
                            distance))
                    histories.append(
                        (prev_idx, history_words, history_pos_tags, distance))
            else:
                for wrd in tokens:
                    # Climb at most ``history`` heads; stop at the root, which
                    # is its own head, after adding the start markers.
                    cur = wrd
                    history_words = []
                    history_tags = []
                    history_pos_tags = []
                    for _ in range(history):
                        par = cur.head
                        if cur == par:
                            history_tags.insert(0, '-TAGSTART-')
                            history_words.insert(0, '-START-')
                            history_pos_tags.insert(0, '-POSTAGSTART-')
                            break
                        par_idx = dt.sen_idx(raw, par)
                        history_tags.insert(0, sentence.words_tags[par_idx][1])
                        history_words.insert(0, par.orth_)
                        history_pos_tags.insert(0, par.tag_)
                        cur = par
                    history_vectors = ('ph', [history_tags])
                    cur_idx = dt.sen_idx(raw, wrd)
                    # Position of the head in traversal order (-1 for the root).
                    for prev_idx, w in enumerate(tokens):
                        if w == wrd.head:
                            break
                    if wrd.head == wrd:
                        prev_idx = -1
                    distance = 0
                    if prev_idx >= 0:
                        # NOTE(review): prev_idx is a traversal position while
                        # parsed_tree is indexed directly here -- confirm both
                        # index spaces agree for non-linear traversals.
                        distance = parsed_tree[cur_idx].similarity(
                            parsed_tree[prev_idx])
                    # The gold tag belongs to the current word.  The previous
                    # code indexed with the leftover ancestor index instead.
                    cur_tag = sentence.words_tags[cur_idx][1]
                    target_feature_vectors.append(
                        dp.construct_feature_vector(
                            wrd.orth_, cur_tag, feature_dict, history_words,
                            history, history_vectors, history_pos_tags,
                            distance))
                    histories.append(
                        (prev_idx, history_words, history_pos_tags, distance))
            # Stored via v2d between passes and restored with d2v below.
            dict_target_feature_vectors = [
                v2d(vectors[0][0]) for vectors in target_feature_vectors
            ]
            pre_pros.append(
                (parsed_tree, dict_target_feature_vectors, histories))
        except Exception:
            # Best effort: report the sentence and carry on with the rest.
            print("error")
            pipeline.log('train', sentence)
    print("pre_pros %s" % (time() - t1))
    t2 = time()
    print(len(pre_pros))
    for i in range(iters):
        iter_time = time()
        print("at iter %s" % i)
        cum_weights = i * weight_matrix
        for parsed_tree, dict_target_feature_vectors, histories in pre_pros:
            target_feature_vectors = [
                d2v(stored) for stored in dict_target_feature_vectors
            ]
            weight_matrix = train_perceptron_once(
                parsed_tree, target_feature_vectors, feature_dict, history,
                weight_matrix, histories)
        weight_matrix = (cum_weights + weight_matrix) / (i + 1)
        print("one iter:  %s" % (time() - iter_time))
    print("train %s" % (time() - t2))
    return weight_matrix
def train_perceptron(all_sentences, feature_dict, tbank, history):
    """Build per-sentence training data, then run the perceptron passes.

    Pre-processing parses each sentence with ``tbank.parse`` and, per word,
    builds the gold feature vector (``dp.construct_feature_vector``) plus a
    history tuple ``(prev_idx, history_words, history_pos_tags, distance)``.
    With the module-level flag ``golinear`` set, the history is the preceding
    words in iteration order; otherwise it is the chain of dependency heads.

    Training makes ``iters`` (module-level) passes calling
    ``train_perceptron_once`` per sentence.  After pass ``i`` the weights
    become ``(i * weights_at_pass_start + weights_at_pass_end) / (i + 1)``.

    Args:
        all_sentences: iterable of sentence objects with ``raw_sentence`` and
            ``words_tags`` (the tag is read as ``words_tags[idx][1]``).
        feature_dict: feature lookup; ``len(feature_dict)`` sizes the weights.
        tbank: object providing ``parse(raw_sentence)``.
        history: number of context words / ancestors to look back.

    Returns:
        The weight matrix after the last pass.

    Sentences that raise during pre-processing are logged with
    ``pipeline.log('train', sentence)`` and skipped.
    """
    weight_matrix = init_weights(len(feature_dict))
    pre_pros = []
    t1 = time()
    # Single-string prints behave the same as a Python 2 print statement and
    # as the print function.
    for current_sen, sentence in enumerate(all_sentences, 1):
        print("train sentence: " + str(current_sen))
        try:
            raw = sentence.raw_sentence
            parsed_tree = tbank.parse(raw)
            tokens = list(iterloop(parsed_tree))
            histories = []
            target_feature_vectors = []
            if golinear:
                context_words = [tok.orth_ for tok in tokens]
                context_pos_tags = [tok.tag_ for tok in tokens]
                context_tags = [
                    sentence.words_tags[dt.sen_idx(raw, tok)][1]
                    for tok in tokens
                ]
                for i, wrd in enumerate(context_words):
                    if i < history:
                        # Too close to the sentence start: pad with markers.
                        history_tags = tuple(['-TAGSTART-'] + context_tags[0:i])
                        history_words = ['-START-'] + context_words[0:i]
                        history_pos_tags = (['-POSTAGSTART-'] +
                                            context_pos_tags[0:i])
                    else:
                        history_tags = context_tags[i - history:i]
                        history_words = context_words[i - history:i]
                        history_pos_tags = context_pos_tags[i - history:i]
                    history_vectors = ('ph', [history_tags])
                    prev_idx = i - 1
                    distance = 0
                    if prev_idx >= 0:
                        distance = parsed_tree[i].similarity(
                            parsed_tree[prev_idx])
                    target_feature_vectors.append(
                        dp.construct_feature_vector(
                            wrd, context_tags[i], feature_dict, history_words,
                            history, history_vectors, history_pos_tags,
                            distance))
                    histories.append(
                        (prev_idx, history_words, history_pos_tags, distance))
            else:
                for wrd in tokens:
                    # Walk up the heads, at most ``history`` steps; the root
                    # is its own head and ends the walk with start markers.
                    cur = wrd
                    history_words = []
                    history_tags = []
                    history_pos_tags = []
                    for _ in range(history):
                        par = cur.head
                        if cur == par:
                            history_tags.insert(0, '-TAGSTART-')
                            history_words.insert(0, '-START-')
                            history_pos_tags.insert(0, '-POSTAGSTART-')
                            break
                        par_idx = dt.sen_idx(raw, par)
                        history_tags.insert(0, sentence.words_tags[par_idx][1])
                        history_words.insert(0, par.orth_)
                        history_pos_tags.insert(0, par.tag_)
                        cur = par
                    history_vectors = ('ph', [history_tags])
                    cur_idx = dt.sen_idx(raw, wrd)
                    # Traversal position of the head; -1 marks the root.
                    for prev_idx, w in enumerate(tokens):
                        if w == wrd.head:
                            break
                    if wrd.head == wrd:
                        prev_idx = -1
                    distance = 0
                    if prev_idx >= 0:
                        # NOTE(review): prev_idx counts traversal steps, yet it
                        # indexes parsed_tree directly -- confirm this is the
                        # intended token for non-linear traversals.
                        distance = parsed_tree[cur_idx].similarity(
                            parsed_tree[prev_idx])
                    # Use the current word's own index for its gold tag; the
                    # old code reused the ancestor loop's leftover index.
                    cur_tag = sentence.words_tags[cur_idx][1]
                    target_feature_vectors.append(
                        dp.construct_feature_vector(
                            wrd.orth_, cur_tag, feature_dict, history_words,
                            history, history_vectors, history_pos_tags,
                            distance))
                    histories.append(
                        (prev_idx, history_words, history_pos_tags, distance))
            # Stored via v2d between passes and restored with d2v below.
            dict_target_feature_vectors = [
                v2d(vectors[0][0]) for vectors in target_feature_vectors
            ]
            pre_pros.append(
                (parsed_tree, dict_target_feature_vectors, histories))
        except Exception:
            # Best effort: note the failing sentence and keep going.
            print("error")
            pipeline.log('train', sentence)
    print("pre_pros %s" % (time() - t1))
    t2 = time()
    print(len(pre_pros))
    for i in range(iters):
        iter_time = time()
        print("at iter %s" % i)
        cum_weights = i * weight_matrix
        for parsed_tree, dict_target_feature_vectors, histories in pre_pros:
            target_feature_vectors = [
                d2v(stored) for stored in dict_target_feature_vectors
            ]
            weight_matrix = train_perceptron_once(
                parsed_tree, target_feature_vectors, feature_dict, history,
                weight_matrix, histories)
        weight_matrix = (cum_weights + weight_matrix) / (i + 1)
        print("one iter:  %s" % (time() - iter_time))
    print("train %s" % (time() - t2))
    return weight_matrix
def flaws(dt,
          all_sentences,
          feature_dict,
          tbank,
          history,
          weight_matrix=None,
          with_tags=True):
    """Run ``sp.test_perceptron_once`` over every sentence and return ``E``.

    For each sentence the parse tree is walked (``dt.linear`` when
    ``sp.golinear`` is set, ``dt.dfirst`` otherwise) and a history tuple
    ``(prev_idx, history_words, history_pos_tags, distance)`` is built per
    word.  The histories, together with the gold tags in traversal order,
    are passed to ``sp.test_perceptron_once``, which returns the updated
    accumulator ``E``.

    Args:
        dt: object providing ``linear``, ``dfirst`` and ``sen_idx``.
        all_sentences: iterable of sentence objects with ``raw_sentence`` and
            ``words_tags`` (the tag is read as ``words_tags[idx][1]``).
        feature_dict: feature lookup handed to the feature construction and
            to ``sp.test_perceptron_once``.
        tbank: object providing ``parse(raw_sentence)``.
        history: number of context words / ancestors to look back.
        weight_matrix: weights to evaluate; when ``None`` a fresh matrix from
            ``sp.init_weights(len(feature_dict))`` is used.
        with_tags: when false, ``None`` is passed instead of the gold tags.

    Returns:
        The accumulator ``E`` (starts at ``0.0``).

    A sentence that raises is reported with ``log('flaw', sentence)`` and
    contributes nothing to ``E``.
    """
    if weight_matrix is None:
        weight_matrix = sp.init_weights(len(feature_dict))
    E = 0.0
    # Each print receives one pre-formatted string so the output is identical
    # under the Python 2 print statement and the print function.
    for counter_flaw, sentence in enumerate(all_sentences, 1):
        current_time = time()
        print("at flaws, sentence  %s" % counter_flaw)
        try:
            raw = sentence.raw_sentence
            parsed_tree = tbank.parse(raw)
            histories = []
            # Not consumed below; the construction calls are kept so that a
            # sentence failing feature construction is still skipped.
            target_feature_vectors = []
            if sp.golinear:
                tokens = list(dt.linear(parsed_tree))
                context_words = [tok.orth_ for tok in tokens]
                context_pos_tags = [tok.tag_ for tok in tokens]
                context_tags = [
                    sentence.words_tags[dt.sen_idx(raw, tok)][1]
                    for tok in tokens
                ]
                for i, wrd in enumerate(context_words):
                    if i < history:
                        # Not enough left context yet: pad with start markers.
                        history_tags = tuple(['-TAGSTART-'] + context_tags[0:i])
                        history_words = ['-START-'] + context_words[0:i]
                        history_pos_tags = (['-POSTAGSTART-'] +
                                            context_pos_tags[0:i])
                    else:
                        history_tags = context_tags[i - history:i]
                        history_words = context_words[i - history:i]
                        history_pos_tags = context_pos_tags[i - history:i]
                    history_vectors = ('ph', [history_tags])
                    prev_idx = i - 1
                    distance = 0
                    if prev_idx >= 0:
                        distance = parsed_tree[i].similarity(
                            parsed_tree[prev_idx])
                    target_feature_vectors.append(
                        dp.construct_feature_vector(
                            wrd, context_tags[i], feature_dict, history_words,
                            history, history_vectors, history_pos_tags,
                            distance))
                    histories.append(
                        (prev_idx, history_words, history_pos_tags, distance))
            else:
                tokens = list(dt.dfirst(parsed_tree))
                context_tags = [
                    sentence.words_tags[dt.sen_idx(raw, tok)][1]
                    for tok in tokens
                ]
                for wrd in tokens:
                    # Climb at most ``history`` heads; the root is its own
                    # head and ends the climb with the start markers.
                    cur = wrd
                    history_words = []
                    history_tags = []
                    history_pos_tags = []
                    for _ in range(history):
                        par = cur.head
                        if cur == par:
                            history_tags.insert(0, '-TAGSTART-')
                            history_words.insert(0, '-START-')
                            history_pos_tags.insert(0, '-POSTAGSTART-')
                            break
                        par_idx = dt.sen_idx(raw, par)
                        history_tags.insert(0, sentence.words_tags[par_idx][1])
                        history_words.insert(0, par.orth_)
                        history_pos_tags.insert(0, par.tag_)
                        cur = par
                    history_vectors = ('ph', [history_tags])
                    cur_idx = dt.sen_idx(raw, wrd)
                    # Position of the head in traversal order (-1 for the root).
                    for prev_idx, w in enumerate(tokens):
                        if w == wrd.head:
                            break
                    if wrd.head == wrd:
                        prev_idx = -1
                    distance = 0
                    if prev_idx >= 0:
                        # NOTE(review): prev_idx is a traversal position while
                        # parsed_tree is indexed directly -- confirm both index
                        # spaces agree for the depth-first traversal.
                        distance = parsed_tree[cur_idx].similarity(
                            parsed_tree[prev_idx])
                    # The gold tag belongs to the current word.  The previous
                    # code indexed with the leftover ancestor index instead.
                    cur_tag = sentence.words_tags[cur_idx][1]
                    target_feature_vectors.append(
                        dp.construct_feature_vector(
                            wrd.orth_, cur_tag, feature_dict, history_words,
                            history, history_vectors, history_pos_tags,
                            distance))
                    histories.append(
                        (prev_idx, history_words, history_pos_tags, distance))
            if not with_tags:
                context_tags = None
            E = sp.test_perceptron_once(E, parsed_tree, feature_dict, history,
                                        weight_matrix, histories, context_tags)
        except Exception:
            # Best effort: report the sentence and move on to the next one.
            log('flaw', sentence)
        print(time() - current_time)
    return E
def viterbi(parsed_tree, feature_dict, history, weight_matrix, histories):
    """Select one feature vector per word with a forward/backward search.

    Forward pass: for each word yielded by ``iterloop(parsed_tree)`` and each
    tag in the module-level ``all_tags``, ``dp.construct_feature_vector``
    returns candidate tuples.  Each candidate's first element is scored with
    a dot product against ``weight_matrix``; the best candidate's score,
    vector and second element (stored as its history) are kept per tag.

    Backward pass: starting at the highest-scoring tag of the last word, the
    chosen tag's stored history is used to pick the tag index for the
    preceding entry (``tag_idxes[history[-2]]``).

    Args:
        parsed_tree: parsed sentence, iterated with ``iterloop``.
        feature_dict: feature lookup passed to the feature construction.
        history: how far back to look; passed to the feature construction.
        weight_matrix: weights used to score candidate feature vectors.
        histories: per-word tuples; element 0 is the key of the word whose
            forward results serve as context, elements 1-3 are passed on to
            ``dp.construct_feature_vector``.

    Returns:
        A list with one feature vector per word, appended while walking
        backwards (so the last word's vector comes first).
    """
    sentence_dict = {}  # word position -> (scores, vectors, histories) per tag
    no_tags = len(all_tags)
    weights_t = weight_matrix.transpose()  # loop-invariant, computed once

    # ------------------------- Viterbi forward path ------------------------ #
    for i, wrd in enumerate(iterloop(parsed_tree)):
        feature_vector_array = np.zeros((no_tags, SIZE))
        tag_score_array = np.zeros((no_tags))
        history_list = []
        # Forward results of the context word, or a start placeholder when
        # that word has no entry (e.g. prev_idx == -1).
        history_vectors = sentence_dict.get(
            histories[i][0], (0, 0, [('-TAGSTART-', )]))[1:3]
        calc_feat = [None]  # one shared list handed to every call for this word
        for j, tag in enumerate(all_tags):
            feature_vectors_tag = dp.construct_feature_vector(
                wrd.orth_, tag, feature_dict, histories[i][1], history,
                history_vectors, histories[i][2], histories[i][3], calc_feat)
            best_tag_score = float('-inf')
            best_feature_vector = np.zeros(SIZE)
            history_word = 'Um'  # placeholder kept from the original code
            for tple in feature_vectors_tag:
                tag_score = np.dot(tple[0], weights_t)
                if tag_score > best_tag_score:
                    best_tag_score = tag_score
                    best_feature_vector = tple[0]
                    history_word = tple[1]
            tag_score_array[j] = best_tag_score
            feature_vector_array[j, :] = best_feature_vector
            history_list.append(history_word)
        sentence_dict[i] = (tag_score_array, feature_vector_array,
                            history_list)

    # ------------------------ Viterbi backward path ------------------------ #
    final_feature_vectors = []
    last_entry = len(sentence_dict) - 1
    best_idx = None
    for entry in range(last_entry, -1, -1):
        score, vector, history_list = sentence_dict[entry]
        if entry == last_entry:
            best_idx = score.argmax()
        final_feature_vectors.append(vector[best_idx])
        # Follow the back-pointer of the tag chosen for THIS word.  The old
        # code kept reusing the last word's argmax here.
        best_idx = tag_idxes[history_list[best_idx][-2]]
    return final_feature_vectors
def train_perceptron(all_sentences, feature_dict, tbank, history):
    """Pre-process every sentence, then train the perceptron weights.

    Phase 1 (pre-processing): each sentence is parsed with ``tbank.parse``.
    For every word the gold feature vector is built with
    ``dp.construct_feature_vector`` and a history tuple
    ``(prev_idx, history_words, history_pos_tags, distance)`` is recorded.
    The module-level flag ``golinear`` selects the kind of history: the
    preceding words in iteration order, or the chain of dependency heads
    above the word.

    Phase 2 (training): ``iters`` (module-level) passes are made over the
    pre-processed sentences with ``train_perceptron_once``.  After each pass
    the weights are blended with the weights the pass started from as
    ``(i * start + end) / (i + 1)``.

    Args:
        all_sentences: iterable of sentence objects exposing ``raw_sentence``
            and ``words_tags`` (read as ``words_tags[idx][1]`` for the tag).
        feature_dict: feature lookup; its length sizes the weight matrix and
            it is handed on to ``dp.construct_feature_vector``.
        tbank: object providing ``parse(raw_sentence)``.
        history: how many preceding words / ancestors are used as context.

    Returns:
        The weight matrix after the final pass.

    A sentence whose pre-processing raises is reported through
    ``pipeline.log('train', sentence)`` and left out of training.
    """
    weight_matrix = init_weights(len(feature_dict))
    pre_pros = []
    t1 = time()
    # Each print receives one pre-formatted string so the output is identical
    # under the Python 2 print statement and the print function.
    for current_sen, sentence in enumerate(all_sentences, 1):
        print("train sentence: " + str(current_sen))
        try:
            raw = sentence.raw_sentence
            parsed_tree = tbank.parse(raw)
            # How the tree is walked depends on the global boolean
            # ``golinear`` and on the iterator function ``iterloop`` (which
            # comes from depTree).  There is probably a neater way to share
            # this with the other code than copy-pasting it.
            tokens = list(iterloop(parsed_tree))
            histories = []
            target_feature_vectors = []
            if golinear:
                context_words = [tok.orth_ for tok in tokens]
                context_pos_tags = [tok.tag_ for tok in tokens]
                context_tags = [
                    sentence.words_tags[dt.sen_idx(raw, tok)][1]
                    for tok in tokens
                ]
                for i, wrd in enumerate(context_words):
                    if i < history:
                        # Not enough left context yet: pad with start markers.
                        history_tags = tuple(['-TAGSTART-'] + context_tags[0:i])
                        history_words = ['-START-'] + context_words[0:i]
                        history_pos_tags = (['-POSTAGSTART-'] +
                                            context_pos_tags[0:i])
                    else:
                        history_tags = context_tags[i - history:i]
                        history_words = context_words[i - history:i]
                        history_pos_tags = context_pos_tags[i - history:i]
                    history_vectors = ('ph', [history_tags])
                    prev_idx = i - 1
                    distance = 0
                    if prev_idx >= 0:
                        distance = parsed_tree[i].similarity(
                            parsed_tree[prev_idx])
                    target_feature_vectors.append(
                        dp.construct_feature_vector(
                            wrd, context_tags[i], feature_dict, history_words,
                            history, history_vectors, history_pos_tags,
                            distance))
                    histories.append(
                        (prev_idx, history_words, history_pos_tags, distance))
            else:
                for wrd in tokens:
                    # Climb at most ``history`` heads; stop at the root, which
                    # is its own head, after adding the start markers.
                    cur = wrd
                    history_words = []
                    history_tags = []
                    history_pos_tags = []
                    for _ in range(history):
                        par = cur.head
                        if cur == par:
                            history_tags.insert(0, '-TAGSTART-')
                            history_words.insert(0, '-START-')
                            history_pos_tags.insert(0, '-POSTAGSTART-')
                            break
                        par_idx = dt.sen_idx(raw, par)
                        history_tags.insert(0, sentence.words_tags[par_idx][1])
                        history_words.insert(0, par.orth_)
                        history_pos_tags.insert(0, par.tag_)
                        cur = par
                    history_vectors = ('ph', [history_tags])
                    cur_idx = dt.sen_idx(raw, wrd)
                    # Position of the head in traversal order (-1 for the root).
                    for prev_idx, w in enumerate(tokens):
                        if w == wrd.head:
                            break
                    if wrd.head == wrd:
                        prev_idx = -1
                    distance = 0
                    if prev_idx >= 0:
                        # NOTE(review): prev_idx is a traversal position while
                        # parsed_tree is indexed directly here -- confirm both
                        # index spaces agree for non-linear traversals.
                        distance = parsed_tree[cur_idx].similarity(
                            parsed_tree[prev_idx])
                    # The gold tag belongs to the current word.  The previous
                    # code indexed with the leftover ancestor index instead.
                    cur_tag = sentence.words_tags[cur_idx][1]
                    target_feature_vectors.append(
                        dp.construct_feature_vector(
                            wrd.orth_, cur_tag, feature_dict, history_words,
                            history, history_vectors, history_pos_tags,
                            distance))
                    histories.append(
                        (prev_idx, history_words, history_pos_tags, distance))
            # Stored via v2d between passes and restored with d2v below.
            dict_target_feature_vectors = [
                v2d(vectors[0][0]) for vectors in target_feature_vectors
            ]
            pre_pros.append(
                (parsed_tree, dict_target_feature_vectors, histories))
        except Exception:
            # Best effort: report the sentence and carry on with the rest.
            print("error")
            pipeline.log('train', sentence)
    print("pre_pros %s" % (time() - t1))
    t2 = time()
    print(len(pre_pros))
    for i in range(iters):
        iter_time = time()
        print("at iter %s" % i)
        cum_weights = i * weight_matrix
        for parsed_tree, dict_target_feature_vectors, histories in pre_pros:
            target_feature_vectors = [
                d2v(stored) for stored in dict_target_feature_vectors
            ]
            weight_matrix = train_perceptron_once(
                parsed_tree, target_feature_vectors, feature_dict, history,
                weight_matrix, histories)
        weight_matrix = (cum_weights + weight_matrix) / (i + 1)
        print("one iter:  %s" % (time() - iter_time))
    print("train %s" % (time() - t2))
    return weight_matrix
def train_perceptron(all_sentences, feature_dict, tbank, history):
    """Build per-sentence training data, then run the perceptron passes.

    Pre-processing parses each sentence with ``tbank.parse`` and, per word,
    builds the gold feature vector (``dp.construct_feature_vector``) plus a
    history tuple ``(prev_idx, history_words, history_pos_tags, distance)``.
    With the module-level flag ``golinear`` set, the history is the preceding
    words in iteration order; otherwise it is the chain of dependency heads.

    Training makes ``iters`` (module-level) passes calling
    ``train_perceptron_once`` per sentence.  After pass ``i`` the weights
    become ``(i * weights_at_pass_start + weights_at_pass_end) / (i + 1)``.

    Args:
        all_sentences: iterable of sentence objects with ``raw_sentence`` and
            ``words_tags`` (the tag is read as ``words_tags[idx][1]``).
        feature_dict: feature lookup; ``len(feature_dict)`` sizes the weights.
        tbank: object providing ``parse(raw_sentence)``.
        history: number of context words / ancestors to look back.

    Returns:
        The weight matrix after the last pass.

    Sentences that raise during pre-processing are logged with
    ``pipeline.log('train', sentence)`` and skipped.
    """
    weight_matrix = init_weights(len(feature_dict))
    pre_pros = []
    t1 = time()
    # Single-string prints behave the same as a Python 2 print statement and
    # as the print function.
    for current_sen, sentence in enumerate(all_sentences, 1):
        print("train sentence: " + str(current_sen))
        try:
            raw = sentence.raw_sentence
            parsed_tree = tbank.parse(raw)
            tokens = list(iterloop(parsed_tree))
            histories = []
            target_feature_vectors = []
            if golinear:
                context_words = [tok.orth_ for tok in tokens]
                context_pos_tags = [tok.tag_ for tok in tokens]
                context_tags = [
                    sentence.words_tags[dt.sen_idx(raw, tok)][1]
                    for tok in tokens
                ]
                for i, wrd in enumerate(context_words):
                    if i < history:
                        # Too close to the sentence start: pad with markers.
                        history_tags = tuple(['-TAGSTART-'] + context_tags[0:i])
                        history_words = ['-START-'] + context_words[0:i]
                        history_pos_tags = (['-POSTAGSTART-'] +
                                            context_pos_tags[0:i])
                    else:
                        history_tags = context_tags[i - history:i]
                        history_words = context_words[i - history:i]
                        history_pos_tags = context_pos_tags[i - history:i]
                    history_vectors = ('ph', [history_tags])
                    prev_idx = i - 1
                    distance = 0
                    if prev_idx >= 0:
                        distance = parsed_tree[i].similarity(
                            parsed_tree[prev_idx])
                    target_feature_vectors.append(
                        dp.construct_feature_vector(
                            wrd, context_tags[i], feature_dict, history_words,
                            history, history_vectors, history_pos_tags,
                            distance))
                    histories.append(
                        (prev_idx, history_words, history_pos_tags, distance))
            else:
                for wrd in tokens:
                    # Walk up the heads, at most ``history`` steps; the root
                    # is its own head and ends the walk with start markers.
                    cur = wrd
                    history_words = []
                    history_tags = []
                    history_pos_tags = []
                    for _ in range(history):
                        par = cur.head
                        if cur == par:
                            history_tags.insert(0, '-TAGSTART-')
                            history_words.insert(0, '-START-')
                            history_pos_tags.insert(0, '-POSTAGSTART-')
                            break
                        par_idx = dt.sen_idx(raw, par)
                        history_tags.insert(0, sentence.words_tags[par_idx][1])
                        history_words.insert(0, par.orth_)
                        history_pos_tags.insert(0, par.tag_)
                        cur = par
                    history_vectors = ('ph', [history_tags])
                    cur_idx = dt.sen_idx(raw, wrd)
                    # Traversal position of the head; -1 marks the root.
                    for prev_idx, w in enumerate(tokens):
                        if w == wrd.head:
                            break
                    if wrd.head == wrd:
                        prev_idx = -1
                    distance = 0
                    if prev_idx >= 0:
                        # NOTE(review): prev_idx counts traversal steps, yet it
                        # indexes parsed_tree directly -- confirm this is the
                        # intended token for non-linear traversals.
                        distance = parsed_tree[cur_idx].similarity(
                            parsed_tree[prev_idx])
                    # Use the current word's own index for its gold tag; the
                    # old code reused the ancestor loop's leftover index.
                    cur_tag = sentence.words_tags[cur_idx][1]
                    target_feature_vectors.append(
                        dp.construct_feature_vector(
                            wrd.orth_, cur_tag, feature_dict, history_words,
                            history, history_vectors, history_pos_tags,
                            distance))
                    histories.append(
                        (prev_idx, history_words, history_pos_tags, distance))
            # Stored via v2d between passes and restored with d2v below.
            dict_target_feature_vectors = [
                v2d(vectors[0][0]) for vectors in target_feature_vectors
            ]
            pre_pros.append(
                (parsed_tree, dict_target_feature_vectors, histories))
        except Exception:
            # Best effort: note the failing sentence and keep going.
            print("error")
            pipeline.log('train', sentence)
    print("pre_pros %s" % (time() - t1))
    t2 = time()
    print(len(pre_pros))
    for i in range(iters):
        iter_time = time()
        print("at iter %s" % i)
        cum_weights = i * weight_matrix
        for parsed_tree, dict_target_feature_vectors, histories in pre_pros:
            target_feature_vectors = [
                d2v(stored) for stored in dict_target_feature_vectors
            ]
            weight_matrix = train_perceptron_once(
                parsed_tree, target_feature_vectors, feature_dict, history,
                weight_matrix, histories)
        weight_matrix = (cum_weights + weight_matrix) / (i + 1)
        print("one iter:  %s" % (time() - iter_time))
    print("train %s" % (time() - t2))
    return weight_matrix
def viterbi(parsed_tree, feature_dict, history, weight_matrix, histories):
    """Choose the best feature vector for every word of a parsed sentence.

    Forward pass: every word from ``iterloop(parsed_tree)`` is combined with
    every tag in the module-level ``all_tags``.  For each pair,
    ``dp.construct_feature_vector`` yields candidate tuples; the candidate
    whose first element has the highest dot product with ``weight_matrix``
    wins, and its score, vector and second element (its history) are stored.

    Backward pass: the search starts at the best-scoring tag of the last
    word and, for each earlier entry, moves to the tag index found by
    looking up the second-to-last item of the chosen tag's history in
    ``tag_idxes``.

    Args:
        parsed_tree: parsed sentence, iterated with ``iterloop``.
        feature_dict: feature lookup passed to the feature construction.
        history: how far back to look; passed to the feature construction.
        weight_matrix: weights used to score candidate feature vectors.
        histories: per-word tuples; element 0 selects whose forward results
            are used as context, elements 1-3 go to the feature construction.

    Returns:
        One feature vector per word, collected from the last word back to
        the first.
    """
    sentence_dict = {}  # word position -> (scores, vectors, histories) per tag
    no_tags = len(all_tags)
    weights_t = weight_matrix.transpose()  # same for every candidate

    # ------------------------- Viterbi forward path ------------------------ #
    for i, wrd in enumerate(iterloop(parsed_tree)):
        context_key, hist_words, hist_pos_tags, distance = histories[i][:4]
        feature_vector_array = np.zeros((no_tags, SIZE))
        tag_score_array = np.zeros((no_tags))
        history_list = []
        # Fall back to a start placeholder when the context word has no
        # forward results stored (e.g. its key is -1).
        history_vectors = sentence_dict.get(
            context_key, (0, 0, [('-TAGSTART-', )]))[1:3]
        calc_feat = [None]  # one shared list handed to every call for this word
        for j, tag in enumerate(all_tags):
            feature_vectors_tag = dp.construct_feature_vector(
                wrd.orth_, tag, feature_dict, hist_words, history,
                history_vectors, hist_pos_tags, distance, calc_feat)
            best_tag_score = float('-inf')
            best_feature_vector = np.zeros(SIZE)
            history_word = 'Um'  # placeholder kept from the original code
            for tple in feature_vectors_tag:
                tag_score = np.dot(tple[0], weights_t)
                if tag_score > best_tag_score:
                    best_tag_score = tag_score
                    best_feature_vector = tple[0]
                    history_word = tple[1]
            tag_score_array[j] = best_tag_score
            feature_vector_array[j, :] = best_feature_vector
            history_list.append(history_word)
        sentence_dict[i] = (tag_score_array, feature_vector_array,
                            history_list)

    # ------------------------ Viterbi backward path ------------------------ #
    final_feature_vectors = []
    last_entry = len(sentence_dict) - 1
    chosen = None
    for entry in range(last_entry, -1, -1):
        score, vector, history_list = sentence_dict[entry]
        if entry == last_entry:
            chosen = score.argmax()
        final_feature_vectors.append(vector[chosen])
        # Step back through the history of the tag picked for this word; the
        # earlier version always indexed with the last word's argmax.
        chosen = tag_idxes[history_list[chosen][-2]]
    return final_feature_vectors
def flaws(dt, all_sentences, feature_dict, tbank, history, weight_matrix=None,
          with_tags=True):
    """Evaluate ``weight_matrix`` on ``all_sentences`` and return ``E``.

    Each sentence is parsed with ``tbank.parse`` and walked with
    ``dt.linear`` (when ``sp.golinear`` is set) or ``dt.dfirst``.  A history
    tuple ``(prev_idx, history_words, history_pos_tags, distance)`` is built
    per word, and the histories plus the gold tags in traversal order are
    given to ``sp.test_perceptron_once``, whose return value becomes the new
    ``E``.

    Args:
        dt: object providing ``linear``, ``dfirst`` and ``sen_idx``.
        all_sentences: iterable of sentence objects with ``raw_sentence`` and
            ``words_tags`` (the tag is read as ``words_tags[idx][1]``).
        feature_dict: feature lookup handed to the feature construction and
            to ``sp.test_perceptron_once``.
        tbank: object providing ``parse(raw_sentence)``.
        history: number of context words / ancestors to look back.
        weight_matrix: weights to evaluate; ``None`` means a fresh matrix
            from ``sp.init_weights(len(feature_dict))``.
        with_tags: when false, ``None`` replaces the gold tags in the call.

    Returns:
        The accumulator ``E`` (initially ``0.0``).

    Sentences that raise are reported with ``log('flaw', sentence)`` and do
    not change ``E``.
    """
    if weight_matrix is None:
        weight_matrix = sp.init_weights(len(feature_dict))
    E = 0.0
    # Single-string prints behave the same as a Python 2 print statement and
    # as the print function.
    for counter_flaw, sentence in enumerate(all_sentences, 1):
        current_time = time()
        print("at flaws, sentence  %s" % counter_flaw)
        try:
            raw = sentence.raw_sentence
            parsed_tree = tbank.parse(raw)
            histories = []
            # Never read afterwards; the construction calls stay so that a
            # sentence failing feature construction is skipped as before.
            target_feature_vectors = []
            if sp.golinear:
                tokens = list(dt.linear(parsed_tree))
                context_words = [tok.orth_ for tok in tokens]
                context_pos_tags = [tok.tag_ for tok in tokens]
                context_tags = [
                    sentence.words_tags[dt.sen_idx(raw, tok)][1]
                    for tok in tokens
                ]
                for i, wrd in enumerate(context_words):
                    if i < history:
                        # Too close to the sentence start: pad with markers.
                        history_tags = tuple(['-TAGSTART-'] + context_tags[0:i])
                        history_words = ['-START-'] + context_words[0:i]
                        history_pos_tags = (['-POSTAGSTART-'] +
                                            context_pos_tags[0:i])
                    else:
                        history_tags = context_tags[i - history:i]
                        history_words = context_words[i - history:i]
                        history_pos_tags = context_pos_tags[i - history:i]
                    history_vectors = ('ph', [history_tags])
                    prev_idx = i - 1
                    distance = 0
                    if prev_idx >= 0:
                        distance = parsed_tree[i].similarity(
                            parsed_tree[prev_idx])
                    target_feature_vectors.append(
                        dp.construct_feature_vector(
                            wrd, context_tags[i], feature_dict, history_words,
                            history, history_vectors, history_pos_tags,
                            distance))
                    histories.append(
                        (prev_idx, history_words, history_pos_tags, distance))
            else:
                tokens = list(dt.dfirst(parsed_tree))
                context_tags = [
                    sentence.words_tags[dt.sen_idx(raw, tok)][1]
                    for tok in tokens
                ]
                for wrd in tokens:
                    # Walk up the heads, at most ``history`` steps; the root
                    # is its own head and ends the walk with start markers.
                    cur = wrd
                    history_words = []
                    history_tags = []
                    history_pos_tags = []
                    for _ in range(history):
                        par = cur.head
                        if cur == par:
                            history_tags.insert(0, '-TAGSTART-')
                            history_words.insert(0, '-START-')
                            history_pos_tags.insert(0, '-POSTAGSTART-')
                            break
                        par_idx = dt.sen_idx(raw, par)
                        history_tags.insert(0, sentence.words_tags[par_idx][1])
                        history_words.insert(0, par.orth_)
                        history_pos_tags.insert(0, par.tag_)
                        cur = par
                    history_vectors = ('ph', [history_tags])
                    cur_idx = dt.sen_idx(raw, wrd)
                    # Traversal position of the head; -1 marks the root.
                    for prev_idx, w in enumerate(tokens):
                        if w == wrd.head:
                            break
                    if wrd.head == wrd:
                        prev_idx = -1
                    distance = 0
                    if prev_idx >= 0:
                        # NOTE(review): prev_idx counts traversal steps, yet it
                        # indexes parsed_tree directly -- confirm this is the
                        # intended token for the depth-first traversal.
                        distance = parsed_tree[cur_idx].similarity(
                            parsed_tree[prev_idx])
                    # Use the current word's own index for its gold tag; the
                    # old code reused the ancestor loop's leftover index.
                    cur_tag = sentence.words_tags[cur_idx][1]
                    target_feature_vectors.append(
                        dp.construct_feature_vector(
                            wrd.orth_, cur_tag, feature_dict, history_words,
                            history, history_vectors, history_pos_tags,
                            distance))
                    histories.append(
                        (prev_idx, history_words, history_pos_tags, distance))
            if not with_tags:
                context_tags = None
            E = sp.test_perceptron_once(E, parsed_tree, feature_dict, history,
                                        weight_matrix, histories, context_tags)
        except Exception:
            # Best effort: note the failing sentence and keep going.
            log('flaw', sentence)
        print(time() - current_time)
    return E