def get_input_offset_token(self, token, input_sentence, input_offset):
    """Return the token ``input_offset`` positions away from ``token``.

    The target position is ``h.get_id(token) + input_offset``.  Returns
    None when that position is zero or negative, otherwise whatever
    ``self.try_get_token`` returns for the negated position.
    """
    # NOTE(review): the original author flagged this lookup with
    # "CONFIRM THIS!!!" -- the indexing convention has not been verified.
    target_id = h.get_id(token) + input_offset
    if target_id <= 0:
        # A non-positive id would flip the sign of the negative index
        # below and wrap around to the wrong end of the sentence.
        return None
    # No +1 on the index: token ids start from 1 instead of 0.
    return self.try_get_token(input_sentence, -target_id)
def get_all_siblings(self, token, input_sentence, arcs):
    """Collect the words of ``input_sentence`` that share ``token``'s head.

    ``arcs`` maps a dependent id to its head id.  Words with id 0, the
    token itself, and words that have no head in ``arcs`` are never
    returned.  Returns an empty list when ``token`` has no head yet.
    """
    token_id = h.get_id(token)
    head_id = arcs.get(token_id, None)
    if head_id is None:
        return []

    siblings = []
    for word in input_sentence:
        word_id = h.get_id(word)
        if word_id == 0 or word_id == token_id:
            continue
        word_head = arcs.get(word_id, None)
        if word_head is None:
            continue
        if word_head == head_id:
            siblings.append(word)
    return siblings
def output(self, sentence):
    """Print ``sentence`` to stdout with the predicted heads and labels.

    For every token, fields 6 and 7 are overwritten in place with the head
    id from ``self.arcs`` (default ``'0'``) and the label from
    ``self.labels`` (default ``'_'``; a stored None is also printed as
    ``'_'``).  Each token is written as one tab-separated line and the
    sentence is terminated by a blank line.
    """
    for token in sentence:
        token_id = h.get_id(token)
        head = self.arcs.get(token_id, '0')
        label = self.labels.get(token_id, '_')
        if label is None:
            label = '_'
        token[6] = str(head)
        token[7] = str(label)
        # Single-argument parenthesised print behaves the same under the
        # Python 2 print statement and the Python 3 print function.
        print('\t'.join(token))
    # print('') rather than print(): the latter would emit "()" on Python 2.
    print('')
def get_all_children(self, token, input_sentence, arcs):
    """Return ``{child_id: child_token}`` for every dependent of ``token``.

    ``arcs`` maps a dependent id to its head id.  Each dependent whose head
    is ``token`` is looked up with ``self.try_get_token``; a dependent that
    cannot be found is reported on stderr and left out of the result.
    """
    children = {}
    head_id = h.get_id(token)
    for tail, head in arcs.items():
        if head != head_id:
            continue
        child = self.try_get_token(input_sentence, -tail)
        if child is None:
            # sys.stderr.write works on Python 2 and 3, unlike the
            # Python-2-only ``print >>sys.stderr`` form.
            sys.stderr.write('Non-existent child, should NOT happen!!!\n')
        else:
            children[h.get_id(child)] = child
    return children
def get_all_children(self, token, input_sentence, arcs):
    """Map the id of each child of ``token`` to the child token itself.

    A child is any key of ``arcs`` (dependent id -> head id) whose value
    equals ``h.get_id(token)``.  Children are fetched from
    ``input_sentence`` through ``self.try_get_token``; if that lookup
    yields None a warning is written to stderr and the child is skipped.
    """
    parent_id = h.get_id(token)
    found = {}
    for dependent_id in arcs:
        if arcs[dependent_id] == parent_id:
            dependent = self.try_get_token(input_sentence, -dependent_id)
            if dependent is not None:
                found[h.get_id(dependent)] = dependent
            else:
                # Portable replacement for the Python-2-only
                # ``print >>sys.stderr, ...`` statement.
                sys.stderr.write(
                    'Non-existent child, should NOT happen!!!' + '\n')
    return found
def get_head_offset_token(self, token, input_sentence, head_multiplier, arcs):
    """Follow head arcs upwards from ``token`` ``head_multiplier`` times.

    ``arcs`` maps a dependent id to its head id.  Returns ``token``
    unchanged when ``head_multiplier`` is not positive, and None as soon as
    a token on the way has no head in ``arcs`` or cannot be fetched by
    ``self.try_get_token``.
    """
    while token is not None and head_multiplier > 0:
        head_multiplier -= 1
        head_id = arcs.get(h.get_id(token), None)
        if head_id is None:  # identity test instead of ``== None``
            return None
        token = self.try_get_token(input_sentence, -head_id)
    return token
def get_head_offset_token(self, token, input_sentence, head_multiplier, arcs):
    """Return the ancestor of ``token`` that is ``head_multiplier`` arcs up.

    Each step looks up the current token's head id in ``arcs`` (dependent
    id -> head id) and fetches that head with ``self.try_get_token``.  The
    walk stops early, yielding None, when a head is missing.  With a
    non-positive ``head_multiplier`` the input token is returned as is.
    """
    steps_left = head_multiplier
    while steps_left > 0 and token is not None:
        steps_left -= 1
        parent_id = arcs.get(h.get_id(token), None)
        # ``is None`` rather than ``== None``: identity is the correct
        # test for the None singleton.
        if parent_id is None:
            token = None
        else:
            token = self.try_get_token(input_sentence, -parent_id)
    return token
def execute_transition(self, transition):
    """Apply ``transition`` to the current parser state; returns nothing.

    The transition type is always recorded in ``self.transitions``.
    Shift moves the last buffer item onto the stack.  LeftArc attaches the
    second stack item to the top one and removes it from the stack.  Any
    other type is handled as a right arc: the top item is popped and
    attached to the item beneath it.  Arcs and labels are stored keyed by
    the dependent's id.
    """
    kind = transition.transitionType
    self.transitions.append(kind)

    if kind == Transition.Shift:
        self.stack.append(self.buff.pop())
        return

    if kind == Transition.LeftArc:
        head = self.stack.pop()
        head_id = h.get_id(head)
        dependent = self.stack.pop()
        dependent_id = h.get_id(dependent)
        # The head stays on the stack; only the dependent is removed.
        self.stack.append(head)
    else:
        dependent = self.stack.pop()
        dependent_id = h.get_id(dependent)
        head_id = h.get_id(self.stack[-1])

    self.arcs[dependent_id] = head_id
    self.labels[dependent_id] = transition.label
def execute_transition(self, transition):
    """Update the parser state according to ``transition``.

    Appends the transition type to ``self.transitions`` and then mutates
    ``self.stack``, ``self.buff``, ``self.arcs`` and ``self.labels``:
    Shift pushes the last buffer item; LeftArc makes the stack top the head
    of the item below it (which is dropped); every other type makes the
    item below the top the head of the top (which is dropped).
    """
    action = transition.transitionType
    self.transitions.append(action)

    if action == Transition.Shift:
        shifted = self.buff.pop()
        self.stack.append(shifted)
    elif action == Transition.LeftArc:
        governor = self.stack.pop()
        governor_id = h.get_id(governor)
        child_id = h.get_id(self.stack.pop())
        self.stack.append(governor)
        self.arcs[child_id] = governor_id
        self.labels[child_id] = transition.label
    else:
        child_id = h.get_id(self.stack.pop())
        governor_id = h.get_id(self.stack[-1])
        self.arcs[child_id] = governor_id
        self.labels[child_id] = transition.label
def getTransition(self, stack, buff, leftmostChildren, rightmostChildren,
                  arcs, labeled):
    """Return the Transition the oracle considers correct, or None.

    With fewer than two stack items the only option is Shift (None when the
    buffer is empty as well).  Otherwise LeftArc is chosen when the second
    stack item is headed by the top and ``self.is_removable`` accepts it,
    RightArc when the top is headed by the second item and is removable,
    and Shift in every other case.  Arc transitions carry the dependent's
    deprel only when ``labeled`` is true.
    """
    if len(stack) <= 1:
        if len(buff) >= 1:
            return Transition(Transition.Shift, None)
        return None

    top, pre_top = stack[-1], stack[-2]
    top_id = p.get_id(top)
    pre_top_id = p.get_id(pre_top)

    # Defaults of -1 / p.INFINITY are used for tokens with no recorded
    # rightmost / leftmost child.
    rmc_top = rightmostChildren.get(top_id, -1)
    rmc_pre_top = rightmostChildren.get(pre_top_id, -1)
    lmc_top = leftmostChildren.get(top_id, p.INFINITY)
    lmc_pre_top = leftmostChildren.get(pre_top_id, p.INFINITY)

    if (p.get_head(pre_top) == top_id
            and self.is_removable(pre_top, arcs, lmc_pre_top, rmc_pre_top)):
        label = p.get_deprel(pre_top) if labeled else None
        return Transition(Transition.LeftArc, label)

    if (p.get_head(top) == pre_top_id
            and self.is_removable(top, arcs, lmc_top, rmc_top)):
        label = p.get_deprel(top) if labeled else None
        return Transition(Transition.RightArc, label)

    return Transition(Transition.Shift, None)
def getTransition(self, stack, buff, leftmostChildren, rightmostChildren,
                  arcs, labeled):
    """Pick the oracle transition for the given stack and buffer.

    Returns a ``Transition`` (LeftArc, RightArc or Shift) or None when the
    stack holds fewer than two items and the buffer is empty.  An arc is
    only proposed when the gold head matches and ``self.is_removable``
    approves the dependent; its label is the dependent's deprel if
    ``labeled`` is set and None otherwise.
    """
    if len(stack) < 2:
        return Transition(Transition.Shift, None) if len(buff) >= 1 else None

    s0 = stack[-1]
    s1 = stack[-2]
    s0_id = p.get_id(s0)
    s1_id = p.get_id(s1)

    def arc(kind, dependent):
        # Build an arc transition, labelled only in labeled mode.
        if labeled:
            return Transition(kind, p.get_deprel(dependent))
        return Transition(kind, None)

    s0_rightmost = rightmostChildren.get(s0_id, -1)
    s1_rightmost = rightmostChildren.get(s1_id, -1)
    s0_leftmost = leftmostChildren.get(s0_id, p.INFINITY)
    s1_leftmost = leftmostChildren.get(s1_id, p.INFINITY)

    if p.get_head(s1) == s0_id and self.is_removable(
            s1, arcs, s1_leftmost, s1_rightmost):
        return arc(Transition.LeftArc, s1)
    elif p.get_head(s0) == s1_id and self.is_removable(
            s0, arcs, s0_leftmost, s0_rightmost):
        return arc(Transition.RightArc, s0)
    else:
        return Transition(Transition.Shift, None)
def get_all_siblings(self, token, input_sentence, arcs):
    """Return every word in ``input_sentence`` attached to ``token``'s head.

    ``arcs`` maps dependent id -> head id.  The result excludes ``token``
    itself, any word whose id is 0 and any word without an entry in
    ``arcs``.  An empty list is returned when ``token`` has no head.
    """
    own_id = h.get_id(token)
    shared_head = arcs.get(own_id, None)
    if shared_head is None:
        return []

    def is_sibling(word):
        # True when ``word`` is a different, non-zero-id token that hangs
        # off the same head as ``token``.
        word_id = h.get_id(word)
        if word_id == 0 or word_id == own_id:
            return False
        word_head = arcs.get(word_id, None)
        return word_head is not None and word_head == shared_head

    return [word for word in input_sentence if is_sibling(word)]
def get_rightmost_child(self, token, input_sentence, arcs,
                        rightmost_multiplier):
    """Descend ``rightmost_multiplier`` times to the rightmost child.

    At each step the child with the largest id is taken, but only if that
    id is greater than the current token's id.  Returns None when a token
    on the way has no children or no child to its right.
    ``rightmost_multiplier`` must be positive.
    """
    assert rightmost_multiplier > 0, "Invalid rightmost_multiplier passed"
    steps_left = rightmost_multiplier
    while token is not None and steps_left > 0:
        steps_left -= 1
        children = self.get_all_children(token, input_sentence, arcs)
        if not children:
            return None
        rightmost_id = max(children)
        if rightmost_id <= h.get_id(token):
            # Every child sits to the left of the token.
            return None
        token = children[rightmost_id]
    return token
def get_leftmost_child(self, token, input_sentence, arcs,
                       leftmost_multiplier):
    """Descend ``-leftmost_multiplier`` times to the leftmost child.

    ``leftmost_multiplier`` must be negative; it is counted up towards
    zero, one step per level.  At each step the child with the smallest id
    is taken, but only if that id is smaller than the current token's id.
    Returns None when a token on the way has no children or no child to
    its left.
    """
    assert leftmost_multiplier < 0, "Invalid leftmost_multiplier passed"
    steps_left = leftmost_multiplier
    while token is not None and steps_left < 0:
        steps_left += 1
        children = self.get_all_children(token, input_sentence, arcs)
        if not children:
            return None
        leftmost_id = min(children)
        if leftmost_id >= h.get_id(token):
            # Every child sits to the right of the token.
            return None
        token = children[leftmost_id]
    return token
def get_rightmost_child(self, token, input_sentence, arcs,
                        rightmost_multiplier):
    """Return the rightmost child of ``token``, applied repeatedly.

    The lookup is repeated ``rightmost_multiplier`` (> 0) times, each time
    starting from the child found in the previous round.  A round fails,
    and None is returned, when the current token has no children or when
    its highest-id child does not lie to its right.
    """
    assert rightmost_multiplier > 0, "Invalid rightmost_multiplier passed"
    current = token
    remaining = rightmost_multiplier
    while remaining > 0 and current is not None:
        remaining -= 1
        kids = self.get_all_children(current, input_sentence, arcs)
        if len(kids) == 0:
            return None
        highest = max(kids.keys())
        if highest > h.get_id(current):
            current = kids[highest]
            continue
        return None
    return current
def get_leftmost_child(self, token, input_sentence, arcs,
                       leftmost_multiplier):
    """Return the leftmost child of ``token``, applied repeatedly.

    ``leftmost_multiplier`` is negative and its magnitude is the number of
    rounds.  Each round moves to the lowest-id child of the current token;
    None is returned when the token has no children or when that child
    does not lie to its left.
    """
    assert leftmost_multiplier < 0, "Invalid leftmost_multiplier passed"
    current = token
    remaining = leftmost_multiplier
    while remaining < 0 and current is not None:
        remaining += 1
        kids = self.get_all_children(current, input_sentence, arcs)
        if len(kids) == 0:
            return None
        lowest = min(kids.keys())
        if lowest < h.get_id(current):
            current = kids[lowest]
            continue
        return None
    return current
def get_model7_params(self, stack, buff, input_sentence, arcs, labels, tType,
                      feat_type, source_type, source_offset = 0,
                      input_offset = 0, head_multiplier = 0,
                      left_rightmost_multiplier = 0,
                      left_right_sibling_specifier = 0, suffix_len = 0):
    """Build one Model7-style feature string, or None if it is undefined.

    The feature addresses a token by starting at ``source_offset`` inside
    the chosen source (stack, buffer or input) and then applying, in this
    order, the input offset, the head walk, the leftmost/rightmost child
    step and the sibling step.  If any step yields no token the result is
    None.  Otherwise the returned string encodes every addressing
    parameter followed by the requested attribute of the token: its word
    (optionally only the last ``suffix_len`` characters), its dependency
    label from ``labels`` (None if it has none) or its POS tag.

    Described here: http://stp.lingfil.uu.se/~nivre/docs/maltparser.pdf
    """
    assert feat_type in [self.DEP_FEAT, self.POS_FEAT, self.LEX_FEAT], "Invalid feat_type specified"
    assert source_type in [self.BUFF_SOURCE, self.STACK_SOURCE, self.INPUT_SOURCE], "Invalid source type specified"
    assert source_offset >= 0, "Invalid source_offset"
    assert head_multiplier >= 0, "Invalid head multiplier specified"

    # The navigation helpers below are handed the reversed sentence.
    rev_input_sentence = input_sentence[::-1]
    # NOTE(review): the original comment says get_source reverses the input
    # sentence itself when needed -- not verifiable from this block.
    source = self.get_source(stack, buff, input_sentence, source_type)

    token = self.try_get_token(source, -(source_offset + 1))
    if token is not None and input_offset != 0:
        token = self.get_input_offset_token(token, rev_input_sentence,
                                            input_offset)
    if token is not None:
        token = self.get_head_offset_token(token, rev_input_sentence,
                                           head_multiplier, arcs)
    if token is not None:
        token = self.get_left_rightmost_child(token, rev_input_sentence,
                                              arcs, left_rightmost_multiplier)
    if token is not None:
        token = self.get_left_right_sibling(token, rev_input_sentence, arcs,
                                            left_right_sibling_specifier)
    if token is None:
        return None

    # The attribute part is appended directly after the last field with no
    # separator; changing that would change the generated feature names.
    prefix = ('transition=%d,feat_type=%d,source_type=%d,source_offset=%d,'
              'input_offset=%d,head_multiplier=%d,left_rightmost_multiplier=%d,'
              'left_right_sibling_specifier=%d') % (
                  tType, feat_type, source_type, source_offset, input_offset,
                  head_multiplier, left_rightmost_multiplier,
                  left_right_sibling_specifier)

    if feat_type == self.LEX_FEAT:
        word = h.get_word(token)
        # A positive suffix_len keeps only the tail of the word.
        shown = word[-suffix_len:] if suffix_len > 0 else word
        return prefix + 'lex_feat=%s' % shown

    if feat_type == self.DEP_FEAT:
        dep_feat = labels.get(h.get_id(token), None)
        if dep_feat is None:
            return None
        return prefix + 'dep_feat=%s' % dep_feat

    if feat_type == self.POS_FEAT:
        return prefix + 'pos_feat=%s' % h.get_postag(token)

    return None
def get_right_sibling(self, token, input_sentence, arcs,
                      right_sibling_multiplier):
    """Step ``right_sibling_multiplier`` times to the nearest right sibling.

    The sibling list is computed once, for the starting ``token``.  Each
    step picks the sibling with the smallest id that is still larger than
    the current token's id (and closer than ``h.INFINITY``).  Returns None
    when there are no siblings or none to the right.
    ``right_sibling_multiplier`` must be positive.
    """
    assert right_sibling_multiplier > 0, "Invalid right sibling multiplier"
    all_siblings = self.get_all_siblings(token, input_sentence, arcs)
    hops_left = right_sibling_multiplier
    while token is not None and hops_left > 0:
        hops_left -= 1
        if not all_siblings:
            return None
        token_id = h.get_id(token)
        best_gap = h.INFINITY
        closest = None
        for candidate in all_siblings:
            gap = h.get_id(candidate) - token_id
            if 0 < gap < best_gap:
                best_gap = gap
                closest = candidate
        # Each hop moves strictly to the right, so the walk cannot cycle
        # between siblings.
        token = closest
    return token
def get_right_sibling(self, token, input_sentence, arcs,
                      right_sibling_multiplier):
    """Return the sibling reached by repeatedly moving one sibling right.

    Siblings are the result of ``self.get_all_siblings`` for the starting
    ``token``.  Every one of the ``right_sibling_multiplier`` (> 0) moves
    goes to the closest sibling whose id exceeds the current token's id;
    distances of ``h.INFINITY`` or more are ignored.  None is returned as
    soon as no such sibling exists.
    """
    assert right_sibling_multiplier > 0, "Invalid right sibling multiplier"
    all_siblings = self.get_all_siblings(token, input_sentence, arcs)
    remaining = right_sibling_multiplier
    while remaining > 0 and token is not None:
        remaining -= 1
        if len(all_siblings) == 0:
            return None
        current_id = h.get_id(token)
        to_the_right = [
            sibling for sibling in all_siblings
            if 0 < h.get_id(sibling) - current_id < h.INFINITY
        ]
        if to_the_right:
            # min() keeps the first of equally distant candidates.
            token = min(to_the_right,
                        key=lambda sibling: h.get_id(sibling) - current_id)
        else:
            token = None
        # Moves only ever go rightwards, so siblings cannot be revisited.
    return token
def extract_features(self, transition, stack, buff, labels,
                     previous_transitions, arcs, input_sentence):
    """Return the feature dict for applying ``transition`` in this state.

    The result is a ``defaultdict(float)``.  Apart from the two distance
    features, which store the signed id distance between stack top and next
    buffer item, every feature is an indicator set to 1.  Feature groups:

    * Model7 single features and their compositions (added through
      ``self.add_model7_feat`` / ``self.compose_feats``), following
      http://stp.lingfil.uu.se/~nivre/docs/maltparser.pdf
    * POS tags of up to three stack and three buffer items
    * previous transition, bias, and (when ``self.labeled``) label features
    * distance and head-valency features based on
      http://dl.acm.org/citation.cfm?id=2002777
    """
    features = defaultdict(float)
    tType = transition.transitionType

    DEP, POS, LEX = self.DEP_FEAT, self.POS_FEAT, self.LEX_FEAT
    STACK, BUFF = self.STACK_SOURCE, self.BUFF_SOURCE

    def model7(feat_type, source_type, *offsets):
        # Add one Model7 feature to ``features`` and pass back the helper's
        # return value so it can be used in compositions.  ``offsets`` are
        # forwarded positionally; presumably (source_offset, input_offset,
        # head_multiplier, left_rightmost_multiplier) -- confirm against
        # add_model7_feat.
        return self.add_model7_feat(features, stack, buff, input_sentence,
                                    arcs, labels, tType, feat_type,
                                    source_type, *offsets)

    def compose(parts):
        # Add the composition of ``parts`` to ``features``.
        return self.compose_feats(features, parts)

    # --- Model7 single features (call order kept from the original) ---
    model7(DEP, STACK, 1)                               # dep for pre-top
    model7(DEP, STACK, 1, 0, 0, -1)                     # dep for pre-top's lmc
    pre_top_lmc_pos = model7(POS, STACK, 1, 0, 0, -1)   # pos for pre-top's lmc
    model7(DEP, STACK, 1, 0, 0, 1)                      # dep for pre-top's rmc
    pre_top_rmc_pos = model7(POS, STACK, 1, 0, 0, 1)    # pos for pre-top's rmc
    model7(DEP, STACK, 0, 0, 0, -1)                     # dep for top's lmc
    top_lmc_pos = model7(POS, STACK, 0, 0, 0, -1)       # pos for top's lmc
    pre_top_lex = model7(LEX, STACK, 1)                 # lex for pre-top
    top_lex = model7(LEX, STACK, 0)                     # lex for top
    next_lex = model7(LEX, BUFF)                        # lex for next buffer item
    next_pos = model7(POS, BUFF)                        # pos for next buffer item
    next2_lex = model7(LEX, BUFF, 1)                    # lex for next-next buffer item
    next2_pos = model7(POS, BUFF, 1)                    # pos for next-next buffer item

    # lex / pos / dep of the word after (+1) and before (-1) pre-top
    # (source_offset 1) and then top (source_offset 0) in the input.
    for source_offset in (1, 0):
        for feat_type in (LEX, POS, DEP):
            for input_offset in (1, -1):
                model7(feat_type, STACK, source_offset, input_offset)

    # POS strings used only as composition parts (not added on their own).
    pre_top_pos = self.get_model7_params(stack, buff, input_sentence, arcs,
                                         labels, tType, POS, STACK, 1)
    top_pos = self.get_model7_params(stack, buff, input_sentence, arcs,
                                     labels, tType, POS, STACK, 0)

    # --- Composed features ---
    pre_top_lex_pos = compose([pre_top_lex, pre_top_pos])  # lex_pos of pre-top
    top_lex_pos = compose([top_lex, top_pos])              # lex_pos of top
    compose([next_lex, next_pos])                  # lex_pos of next buffer item
    compose([next2_lex, next2_pos])                # lex_pos of next-next buffer item
    compose([pre_top_lex_pos, top_lex_pos])        # lex_pos of both pre-top and top
    compose([pre_top_lex_pos, top_lex])            # lex_pos of pre-top with lex of top
    compose([pre_top_lex, top_lex_pos])            # lex of pre-top with lex_pos of top
    compose([pre_top_lex_pos, top_pos])            # lex_pos of pre-top with pos of top
    compose([pre_top_pos, top_lex_pos])            # pos of pre-top with lex_pos of top
    compose([pre_top_lex, top_lex])                # lex of both pre-top and top
    compose([pre_top_pos, top_pos])                # pos of both pre-top and top
    compose([top_pos, next_pos])                   # pos of top and next buffer item
    compose([top_pos, next_pos, next2_pos])        # pos of top, next and next-next
    compose([pre_top_pos, top_pos, next_pos])      # pos of pre-top, top and next
    compose([pre_top_pos, pre_top_lmc_pos, top_pos])  # pos of pre-top, its lmc and top
    compose([pre_top_pos, pre_top_rmc_pos, top_pos])  # pos of pre-top, its rmc and top
    compose([pre_top_pos, top_pos, top_lmc_pos])      # pos of pre-top, top and top's lmc

    # POS tags (field 3) of up to three items from the top of the stack.
    for i in range(min(3, len(stack))):
        pos = stack[-(i + 1)][3]
        features['transition=%d,s%d.pos=%s' % (tType, i, pos)] = 1

    # POS tags of up to three items from the end of the buffer.
    for i in range(min(3, len(buff))):
        pos = buff[-(i + 1)][3]
        features['transition=%d,b%d.pos=%s' % (tType, i, pos)] = 1

    # Previous transition type
    if len(previous_transitions) > 0:
        prev = previous_transitions[-1]
        features['transition=%d,prev_transition=%d' % (tType, prev)] = 1
    else:
        features['transition=%d,prev_transition=None' % (tType)] = 1

    # Bias feature
    features['transition=%d' % (tType)] = 1

    if self.labeled:
        # Action and label pair
        features['transition=%d,label=%s' % (tType, transition.label)] = 1
        # Label bias
        features['label=%s' % (transition.label)] = 1

    # Distance between stack top and next buffer item (signed id difference).
    if len(stack) > 0 and len(buff) > 0:
        dist = h.get_id(stack[-1]) - h.get_id(buff[-1])
        if dist < 0:
            features['transition=%d,neg_dist=' % (tType)] = dist
        else:
            features['transition=%d,pos_dist=' % (tType)] = dist

    # Valency of the token that would become the head: stack top for a
    # left arc, the item below it for a right arc.
    if len(stack) > 1:
        if tType == Transition.LeftArc:
            head_index = -1
        elif tType == Transition.RightArc:
            head_index = -2
        else:
            head_index = None
        if head_index is not None:
            left_valency, right_valency = self.get_valency(
                arcs, h.get_id(stack[head_index]))
            features['transition=%d,head_left_valency=%d' % (tType, left_valency)] = 1
            features['transition=%d,head_right_valency=%d' % (tType, right_valency)] = 1

    return features
def extract_features(self, transition, stack, buff, labels,
                     previous_transitions, arcs, input_sentence):
    """Compute the features of ``transition`` for the given parser state.

    Returns a ``defaultdict(float)`` whose keys are feature strings.  All
    values are 1 except for the ``neg_dist`` / ``pos_dist`` features, which
    hold the signed id difference between the stack top and the next buffer
    item.  Model7 features follow
    http://stp.lingfil.uu.se/~nivre/docs/maltparser.pdf ; the distance and
    valency features are based on http://dl.acm.org/citation.cfm?id=2002777
    """
    features = defaultdict(float)
    tType = transition.transitionType

    dep, pos, lex = self.DEP_FEAT, self.POS_FEAT, self.LEX_FEAT
    stk, buf = self.STACK_SOURCE, self.BUFF_SOURCE

    # Arguments shared by every add_model7_feat call.
    shared = (features, stack, buff, input_sentence, arcs, labels, tType)

    # (name, feat_type, source_type, *positional offsets), in the order the
    # features are added.  s0 = stack top, s1 = pre-top, b0/b1 = next and
    # next-next buffer item, lmc/rmc = leftmost/rightmost child, +1/-1 =
    # word after/before in the input.  The trailing numbers are forwarded
    # positionally; presumably source_offset, input_offset,
    # head_multiplier, left_rightmost_multiplier -- confirm against
    # add_model7_feat.
    model7_specs = (
        ('s1.dep', dep, stk, 1),
        ('s1.lmc.dep', dep, stk, 1, 0, 0, -1),
        ('s1.lmc.pos', pos, stk, 1, 0, 0, -1),
        ('s1.rmc.dep', dep, stk, 1, 0, 0, 1),
        ('s1.rmc.pos', pos, stk, 1, 0, 0, 1),
        ('s0.lmc.dep', dep, stk, 0, 0, 0, -1),
        ('s0.lmc.pos', pos, stk, 0, 0, 0, -1),
        ('s1.lex', lex, stk, 1),
        ('s0.lex', lex, stk, 0),
        ('b0.lex', lex, buf),
        ('b0.pos', pos, buf),
        ('b1.lex', lex, buf, 1),
        ('b1.pos', pos, buf, 1),
        ('s1+1.lex', lex, stk, 1, 1),
        ('s1-1.lex', lex, stk, 1, -1),
        ('s1+1.pos', pos, stk, 1, 1),
        ('s1-1.pos', pos, stk, 1, -1),
        ('s1+1.dep', dep, stk, 1, 1),
        ('s1-1.dep', dep, stk, 1, -1),
        ('s0+1.lex', lex, stk, 0, 1),
        ('s0-1.lex', lex, stk, 0, -1),
        ('s0+1.pos', pos, stk, 0, 1),
        ('s0-1.pos', pos, stk, 0, -1),
        ('s0+1.dep', dep, stk, 0, 1),
        ('s0-1.dep', dep, stk, 0, -1),
    )
    parts = {}
    for spec in model7_specs:
        parts[spec[0]] = self.add_model7_feat(*(shared + spec[1:]))

    # POS strings of pre-top and top: used in compositions only, they are
    # not added to ``features`` on their own here.
    parts['s1.pos'] = self.get_model7_params(
        stack, buff, input_sentence, arcs, labels, tType, pos, stk, 1)
    parts['s0.pos'] = self.get_model7_params(
        stack, buff, input_sentence, arcs, labels, tType, pos, stk, 0)

    # (name, component names); later entries may reuse earlier results.
    compositions = (
        ('s1.lexpos', ('s1.lex', 's1.pos')),
        ('s0.lexpos', ('s0.lex', 's0.pos')),
        ('b0.lexpos', ('b0.lex', 'b0.pos')),
        ('b1.lexpos', ('b1.lex', 'b1.pos')),
        ('s1.lexpos|s0.lexpos', ('s1.lexpos', 's0.lexpos')),
        ('s1.lexpos|s0.lex', ('s1.lexpos', 's0.lex')),
        ('s1.lex|s0.lexpos', ('s1.lex', 's0.lexpos')),
        ('s1.lexpos|s0.pos', ('s1.lexpos', 's0.pos')),
        ('s1.pos|s0.lexpos', ('s1.pos', 's0.lexpos')),
        ('s1.lex|s0.lex', ('s1.lex', 's0.lex')),
        ('s1.pos|s0.pos', ('s1.pos', 's0.pos')),
        ('s0.pos|b0.pos', ('s0.pos', 'b0.pos')),
        ('s0.pos|b0.pos|b1.pos', ('s0.pos', 'b0.pos', 'b1.pos')),
        ('s1.pos|s0.pos|b0.pos', ('s1.pos', 's0.pos', 'b0.pos')),
        ('s1.pos|s1.lmc.pos|s0.pos', ('s1.pos', 's1.lmc.pos', 's0.pos')),
        ('s1.pos|s1.rmc.pos|s0.pos', ('s1.pos', 's1.rmc.pos', 's0.pos')),
        ('s1.pos|s0.pos|s0.lmc.pos', ('s1.pos', 's0.pos', 's0.lmc.pos')),
    )
    for name, component_names in compositions:
        parts[name] = self.compose_feats(
            features, [parts[component] for component in component_names])

    # POS tags (field 3) of up to three items from the top of the stack and
    # up to three items from the end of the buffer.
    for prefix, items in (('s', stack), ('b', buff)):
        for i in range(3):
            if i >= len(items):
                break
            tag = items[-(i + 1)][3]
            features['transition=%d,%s%d.pos=%s' % (tType, prefix, i, tag)] = 1

    # Previous transition type
    if len(previous_transitions) > 0:
        features['transition=%d,prev_transition=%d'
                 % (tType, previous_transitions[-1])] = 1
    else:
        features['transition=%d,prev_transition=None' % (tType)] = 1

    # Bias feature
    features['transition=%d' % (tType)] = 1

    if self.labeled:
        # Action and label pair
        features['transition=%d,label=%s' % (tType, transition.label)] = 1
        # Label bias
        features['label=%s' % (transition.label)] = 1

    # Distance function: signed id difference, stack top minus next
    # buffer item.
    if len(stack) > 0 and len(buff) > 0:
        dist = h.get_id(stack[-1]) - h.get_id(buff[-1])
        dist_key = 'neg_dist' if dist < 0 else 'pos_dist'
        features['transition=%d,%s=' % (tType, dist_key)] = dist

    # Valency function: valency of the would-be head, i.e. the stack top
    # for a left arc and the item beneath it for a right arc.
    if len(stack) > 1 and tType in (Transition.LeftArc, Transition.RightArc):
        head = stack[-1] if tType == Transition.LeftArc else stack[-2]
        left_valency, right_valency = self.get_valency(arcs, h.get_id(head))
        features['transition=%d,head_left_valency=%d' % (tType, left_valency)] = 1
        features['transition=%d,head_right_valency=%d' % (tType, right_valency)] = 1

    return features
def extract_features(self, transition, stack, buff, labels,
                     previous_transitions, arcs, input_sentence):
    """Build the feature dictionary for the current parser configuration.

    The result combines:
      * "Model 7" address features (http://stp.lingfil.uu.se/~nivre/docs/maltparser.pdf)
        registered through add_model7_feat, plus POS compositions
        registered through compose_feats;
      * POS tags (column 3) of up to three stack items and two buffer items;
      * the previous transition;
      * transition/label features, only when self.labeled is set and a
        transition is supplied;
      * distance and valency features (http://dl.acm.org/citation.cfm?id=2002777).

    Every feature name carries a fixed dummy transition type of -1 because
    the transition is not encoded in the features used for the SVM.

    Returns a defaultdict(float) mapping feature name to value.
    """
    features = defaultdict(float)
    dummy_type = -1  # Transition type is not encoded in the SVM features

    def add(feat_type, source_type, *address):
        # Every Model 7 feature shares the same configuration arguments;
        # only the feature kind and the address specification vary.
        return self.add_model7_feat(features, stack, buff, input_sentence,
                                    arcs, labels, dummy_type, feat_type,
                                    source_type, *address)

    # --- Model 7 features; calls are kept in the original order ---
    add(self.POS_FEAT, self.STACK_SOURCE, 1, 0, 0, -1)  # pos for pre-top's lmc
    add(self.POS_FEAT, self.STACK_SOURCE, 1, 0, 0, 1)  # pos for pre-top's rmc
    top_lmc_pos = add(self.POS_FEAT, self.STACK_SOURCE, 0, 0, 0, -1)  # pos for top's lmc
    top_rmc_pos = add(self.POS_FEAT, self.STACK_SOURCE, 0, 0, 0, 1)  # pos for top's rmc
    add(self.LEX_FEAT, self.STACK_SOURCE, 1)  # lex for pre-top
    add(self.LEX_FEAT, self.STACK_SOURCE, 0)  # lex for top
    next_buff_pos = add(self.POS_FEAT, self.BUFF_SOURCE)  # pos for next buffer item
    next_next_buff_pos = add(self.POS_FEAT, self.BUFF_SOURCE, 1)  # pos for next-next buffer item
    add(self.POS_FEAT, self.STACK_SOURCE, 1, 1)  # pos for word after pre-top in input
    add(self.POS_FEAT, self.STACK_SOURCE, 1, -1)  # pos for word before pre-top in input
    add(self.DEP_FEAT, self.STACK_SOURCE, 1, 1)  # dep for word after pre-top in input
    add(self.DEP_FEAT, self.STACK_SOURCE, 1, -1)  # dep for word before pre-top in input
    add(self.LEX_FEAT, self.STACK_SOURCE, 0, 1)  # lex for word after top in input
    add(self.POS_FEAT, self.STACK_SOURCE, 0, 1)  # pos for word after top in input
    add(self.POS_FEAT, self.STACK_SOURCE, 0, -1)  # pos for word before top in input
    add(self.DEP_FEAT, self.STACK_SOURCE, 0, 1)  # dep for word after top in input
    add(self.DEP_FEAT, self.STACK_SOURCE, 0, -1)  # dep for word before top in input

    # POS feature strings for pre-top and top, obtained without registering
    # them as features of their own; they only feed the compositions below.
    pre_top_pos = self.get_model7_params(stack, buff, input_sentence, arcs,
                                         labels, dummy_type, self.POS_FEAT,
                                         self.STACK_SOURCE, 1)
    top_pos = self.get_model7_params(stack, buff, input_sentence, arcs,
                                     labels, dummy_type, self.POS_FEAT,
                                     self.STACK_SOURCE, 0)

    compositions = (
        [pre_top_pos, top_pos],  # pos of both pre-top and top
        [top_pos, next_buff_pos],  # pos of top and next buff
        [top_pos, next_buff_pos, next_next_buff_pos],  # pos for top, next and next-next
        [pre_top_pos, top_pos, top_lmc_pos],  # pos for pre-top, top and top's lmc
        [pre_top_pos, top_pos, top_rmc_pos],  # pos for pre-top, top and top's rmc
    )
    for parts in compositions:
        self.compose_feats(features, parts)

    # POS tags of up to three items from the top of the stack (was originally 2)
    for depth in range(min(3, len(stack))):
        stack_pos = stack[-(depth + 1)][3]
        features['transition=%d,s%d.pos=%s' % (dummy_type, depth, stack_pos)] = 1

    # POS tags of up to two items from the end of the buffer
    for depth in range(min(2, len(buff))):
        buff_pos = buff[-(depth + 1)][3]
        features['transition=%d,b%d.pos=%s' % (dummy_type, depth, buff_pos)] = 1

    # Previous transition type
    if len(previous_transitions) == 0:
        features['transition=%d,prev_transition=None' % (dummy_type)] = 1
    else:
        features['transition=%d,prev_transition=%d'
                 % (dummy_type, previous_transitions[-1])] = 1

    # We don't care about the labelled case, and no transition should be
    # passed for the SVM, so this section is normally skipped.
    if self.labeled and transition is not None:
        # Action and label pair
        features['transition=%d,label=%s'
                 % (transition.transitionType, transition.label)] = 1
        # Label bias
        features['label=%s' % (transition.label)] = 1

    # Features based on http://dl.acm.org/citation.cfm?id=2002777
    # Distance: id of the stack top minus id of the last buffer item; the
    # sign selects the feature name and the signed value is stored.
    if len(stack) > 0 and len(buff) > 0:
        dist = h.get_id(stack[-1]) - h.get_id(buff[-1])
        dist_key = ('transition=%d,neg_dist=' if dist < 0
                    else 'transition=%d,pos_dist=')
        features[dist_key % (dummy_type)] = dist

    # Valency of the stack top, alone and combined with the top's POS string
    if len(stack) > 1:
        left_valency, right_valency = self.get_valency(arcs, h.get_id(stack[-1]))
        left_key = 'transition=%d,head_left_valency=%d' % (dummy_type, left_valency)
        features[left_key + top_pos] = 1
        features[left_key] = 1
        right_key = 'transition=%d,head_right_valency=%d' % (dummy_type, right_valency)
        features[right_key] = 1
        features[right_key + top_pos] = 1

    return features
def get_model7_params(self, stack, buff, input_sentence, arcs, labels, tType,
                      feat_type, source_type, source_offset=0, input_offset=0,
                      head_multiplier=0, left_rightmost_multiplier=0,
                      left_right_sibling_specifier=0, suffix_len=0):
    """Return the feature string for one "Model 7" address, or None.

    Address functions described here:
    http://stp.lingfil.uu.se/~nivre/docs/maltparser.pdf

    A token is taken from the chosen source at source_offset and then
    passed, in order, through the input-offset (only when input_offset is
    non-zero), head-offset, left/rightmost-child and left/right-sibling
    lookups. If any step yields no token, the result is None.

    The returned string is the address specification followed by the lex,
    dep or pos value of the resolved token. For LEX_FEAT a positive
    suffix_len keeps only the last suffix_len characters of the word; for
    DEP_FEAT the result is None when the token has no entry in labels.

    Raises AssertionError for an unknown feat_type or source_type, or a
    negative source_offset or head_multiplier.
    """
    assert feat_type in [self.DEP_FEAT, self.POS_FEAT,
                         self.LEX_FEAT], "Invalid feat_type specified"
    assert source_type in [self.BUFF_SOURCE, self.STACK_SOURCE,
                           self.INPUT_SOURCE], "Invalid source type specified"
    assert source_offset >= 0, "Invalid source_offset"
    assert head_multiplier >= 0, "Invalid head multiplier specified"

    # The lookup helpers index the sentence from its end, so they are
    # given a reversed copy.
    reversed_sentence = input_sentence[::-1]
    # NOTE(review): get_source presumably returns the input source already
    # reversed when source_type is INPUT_SOURCE -- confirm.
    source = self.get_source(stack, buff, input_sentence, source_type)

    # Resolve the address step by step; once a step produces no token the
    # remaining steps are skipped.
    token = self.try_get_token(source, -(source_offset + 1))
    if token is not None and input_offset != 0:
        token = self.get_input_offset_token(token, reversed_sentence,
                                            input_offset)
    if token is not None:
        token = self.get_head_offset_token(token, reversed_sentence,
                                           head_multiplier, arcs)
    if token is not None:
        token = self.get_left_rightmost_child(token, reversed_sentence, arcs,
                                              left_rightmost_multiplier)
    if token is not None:
        token = self.get_left_right_sibling(token, reversed_sentence, arcs,
                                            left_right_sibling_specifier)
    if token is None:
        return None

    address = ('transition=%d,feat_type=%d,source_type=%d,source_offset=%d,'
               'input_offset=%d,head_multiplier=%d,'
               'left_rightmost_multiplier=%d,'
               'left_right_sibling_specifier=%d') % (
                   tType, feat_type, source_type, source_offset, input_offset,
                   head_multiplier, left_rightmost_multiplier,
                   left_right_sibling_specifier)

    # The value is appended straight onto the address with no separator;
    # this is part of the emitted feature names and must stay as is.
    if feat_type == self.LEX_FEAT:
        word = h.get_word(token)
        if suffix_len > 0:
            # Suffix length can be specified via argument
            word = word[-suffix_len:]
        return address + 'lex_feat=%s' % (word)
    if feat_type == self.DEP_FEAT:
        dep_label = labels.get(h.get_id(token), None)
        if dep_label is None:
            return None
        return address + 'dep_feat=%s' % (dep_label)
    if feat_type == self.POS_FEAT:
        return address + 'pos_feat=%s' % (h.get_postag(token))
    # Only reachable when the assertions above are disabled.
    return None
def get_input_offset_token(self, token, input_sentence, input_offset):
    """Return the token whose id is input_offset away from token's id.

    input_sentence is indexed from its end with the negated target id.
    Returns None when the target id is not positive; otherwise returns
    whatever try_get_token yields for that index.
    """
    target_id = h.get_id(token) + input_offset
    if target_id > 0:
        # No +1 since the token id starts from 1 instead of 0
        return self.try_get_token(input_sentence, -target_id)
    # A non-positive target would negate to a zero or positive index;
    # bail out to prevent wraparound.
    return None