示例#1
0
 def get_input_offset_token(self, token, input_sentence, input_offset):
     """Look up the token whose id is `token`'s id plus `input_offset`.

     Returns None when the target id is zero or negative, so that the
     negative index handed to ``try_get_token`` cannot wrap around.
     Otherwise returns whatever ``self.try_get_token`` yields for index
     ``-target_id`` of `input_sentence`.

     NOTE(review): the original author flagged this method with
     "CONFIRM THIS!!!" - the indexing convention should be double-checked.
     """
     target_id = h.get_id(token) + input_offset
     if target_id <= 0:
         return None
     # No +1 on the index: token ids start from 1 instead of 0.
     return self.try_get_token(input_sentence, -target_id)
示例#2
0
 def get_all_siblings(self, token, input_sentence, arcs):
     """Return the words in `input_sentence` attached to `token`'s head.

     `arcs` is looked up by token id.  When `token` has no (non-None)
     entry in `arcs` the result is an empty list.  Words with id 0 and
     `token` itself are never included.
     """
     token_id = h.get_id(token)
     if arcs.get(token_id, None) is None:
         return []
     shared_head = arcs[token_id]
     return [
         word for word in input_sentence
         if h.get_id(word) != 0
         and h.get_id(word) != token_id
         and arcs.get(h.get_id(word), None) is not None
         and arcs[h.get_id(word)] == shared_head
     ]
示例#3
0
 def output(self, sentence):
     """Write `sentence` to stdout with predicted heads and labels.

     For each token, the head stored in ``self.arcs`` for the token's id
     (default '0') is written into column 6 and the label stored in
     ``self.labels`` (default '_', also used when the stored label is
     None) into column 7.  The token is modified in place and printed as
     one tab-separated line; a blank line is printed after the sentence.
     """
     for token in sentence:
         token_id = h.get_id(token)
         head = self.arcs.get(token_id, '0')
         label = self.labels.get(token_id, '_')
         if label is None:
             label = '_'
         token[6] = str(head)
         token[7] = str(label)
         # A single parenthesised argument prints identically as a
         # Python 2 print statement and as the Python 3 print function.
         print('\t'.join(token))
     # print('') instead of print(): under Python 2, print() would
     # output "()" rather than an empty line.
     print('')
示例#4
0
 def output(self, sentence):
     """Print every token of `sentence` with its head and label columns set.

     Column 6 receives ``str`` of the head found in ``self.arcs`` under
     the token's id ('0' when absent); column 7 receives ``str`` of the
     label found in ``self.labels`` ('_' when absent or None).  Tokens
     are updated in place, emitted tab-separated one per line, and the
     sentence is terminated by an empty line.
     """
     for token in sentence:
         key = h.get_id(token)
         label = self.labels.get(key, '_')
         token[6] = str(self.arcs.get(key, '0'))
         token[7] = str('_' if label is None else label)
         # Written so it is valid (and prints the same text) on both
         # Python 2 and Python 3.
         print('\t'.join(token))
     # Empty-string argument keeps Python 2 from printing "()".
     print('')
示例#5
0
 def get_all_children(self, token, input_sentence, arcs):
     """Collect the tokens whose head, according to `arcs`, is `token`.

     `arcs` is read as a mapping from a dependent's id to its head's id.
     Each matching dependent is fetched with ``self.try_get_token`` at
     index ``-tail`` of `input_sentence`.

     Returns a dict mapping each child's id to the child token.  A
     dependent that cannot be fetched is reported on stderr and skipped.
     """
     head_id = h.get_id(token)
     all_children = {}
     for tail, head in arcs.items():
         if head == head_id:
             child_token = self.try_get_token(input_sentence, -(tail))
             if child_token is None:
                 # sys.stderr.write works on Python 2 and 3, unlike the
                 # "print >>sys.stderr" statement it replaces.
                 sys.stderr.write(
                     'Non-existent child, should NOT happen!!!\n')
             else:
                 all_children[h.get_id(child_token)] = child_token
     return all_children
示例#6
0
 def get_all_children(self, token, input_sentence, arcs):
     """Return ``{child_id: child_token}`` for every dependent of `token`.

     A key ``tail`` of `arcs` counts as a dependent when ``arcs[tail]``
     equals `token`'s id.  The dependent token is obtained through
     ``self.try_get_token(input_sentence, -tail)``; if that yields None a
     diagnostic line is written to stderr and the entry is left out.
     """
     all_children = {}
     head_id = h.get_id(token)
     for tail in arcs:
         if arcs[tail] != head_id:
             continue
         child_token = self.try_get_token(input_sentence, -(tail))
         if child_token is not None:
             all_children[h.get_id(child_token)] = child_token
         else:
             # Portable replacement for the Python 2-only
             # "print >> sys.stderr, ..." statement; same text, same stream.
             sys.stderr.write('Non-existent child, should NOT happen!!!\n')
     return all_children
示例#7
0
 def get_head_offset_token(self, token, input_sentence, head_multiplier, arcs):
     """Follow head links upward from `token`, `head_multiplier` times.

     On every step the current token's id is looked up in `arcs` to find
     its head id, and the head token is fetched with
     ``self.try_get_token(input_sentence, -head_id)``.

     Returns the token reached, `token` itself when `head_multiplier` is
     not positive, or None as soon as a token has no head in `arcs` or a
     head token cannot be fetched.
     """
     while token is not None and head_multiplier > 0:
         head_multiplier -= 1
         head_id = arcs.get(h.get_id(token), None)
         if head_id is None:
             # No head recorded for this token: the chain ends here.
             return None
         token = self.try_get_token(input_sentence, -head_id)
     return token
示例#8
0
 def get_head_offset_token(self, token, input_sentence, head_multiplier,
                           arcs):
     """Return the ancestor of `token` that is `head_multiplier` heads up.

     Each iteration replaces the current token by its head: the head id
     comes from ``arcs`` (keyed by the current token's id) and the head
     token from ``self.try_get_token(input_sentence, -head_id)``.

     The walk stops early, returning None, when a token has no entry in
     `arcs` or the head lookup yields None.  With a non-positive
     `head_multiplier` the input `token` is returned unchanged.
     """
     steps_left = head_multiplier
     while token is not None and steps_left > 0:
         steps_left -= 1
         token_id = h.get_id(token)
         head_id = arcs.get(token_id, None)
         # Identity test: None is a singleton, so "is" is the right check.
         if head_id is None:
             return None
         token = self.try_get_token(input_sentence, -head_id)
     return token
示例#9
0
 def execute_transition(self, transition):
     """Apply `transition` to the current parser state; returns nothing.

     The transition type is first appended to ``self.transitions``.  Then:
       * Shift: pop an item off ``self.buff`` and push it on the stack.
       * LeftArc: remove the item just below the stack top and record
         the top as its head.
       * any other type (handled as a right arc): pop the stack top and
         record the new top as its head.
     For both arc types ``self.arcs[dependent_id] = head_id`` and
     ``self.labels[dependent_id] = transition.label`` are stored.
     """
     kind = transition.transitionType
     self.transitions.append(kind)
     if kind == Transition.Shift:
         self.stack.append(self.buff.pop())
         return
     if kind == Transition.LeftArc:
         head = self.stack.pop()
         head_id = h.get_id(head)
         dependent_id = h.get_id(self.stack.pop())
         # The head stays on the stack; only the dependent is removed.
         self.stack.append(head)
     else:
         dependent_id = h.get_id(self.stack.pop())
         head_id = h.get_id(self.stack[-1])
     self.arcs[dependent_id] = head_id
     self.labels[dependent_id] = transition.label
示例#10
0
 def execute_transition(self, transition):
     """Update the parser state according to `transition` (no return value).

     Records the transition type in ``self.transitions`` and then either
     shifts one item from ``self.buff`` onto ``self.stack`` or creates an
     arc between the two topmost stack items, removing the dependent from
     the stack and storing its head id in ``self.arcs`` and
     ``transition.label`` in ``self.labels``.
     """
     self.transitions.append(transition.transitionType)
     action = transition.transitionType
     if action == Transition.Shift:
         self.stack.append(self.buff.pop())
     elif action == Transition.LeftArc:
         # Second-from-top becomes a dependent of the top.
         stack_top = self.stack.pop()
         head_id = h.get_id(stack_top)
         below_top = self.stack.pop()
         dependent_id = h.get_id(below_top)
         self.stack.append(stack_top)
         self.arcs[dependent_id] = head_id
         self.labels[dependent_id] = transition.label
     else:
         # Every remaining type is treated as a right arc: the top
         # becomes a dependent of the item beneath it.
         stack_top = self.stack.pop()
         dependent_id = h.get_id(stack_top)
         new_top = self.stack[-1]
         head_id = h.get_id(new_top)
         self.arcs[dependent_id] = head_id
         self.labels[dependent_id] = transition.label
示例#11
0
 def getTransition(self, stack, buff, leftmostChildren, rightmostChildren,
                   arcs, labeled):
     """Return the Transition the oracle takes in this configuration.

     With fewer than two stack items the answer is Shift while `buff` is
     non-empty and None otherwise.  With two or more items, a LeftArc is
     returned when the top is the head of the item below it, a RightArc
     when the item below is the head of the top - in both cases only if
     ``self.is_removable`` accepts the dependent - and Shift otherwise.
     Arc transitions carry the dependent's deprel when `labeled` is true,
     else None.
     """
     if len(stack) <= 1:
         if len(buff) >= 1:
             return Transition(Transition.Shift, None)
         return None

     top = stack[-1]
     pre_top = stack[-2]
     top_id = p.get_id(top)
     pre_top_id = p.get_id(pre_top)
     # Defaults for tokens without recorded children: -1 for the
     # rightmost child, p.INFINITY for the leftmost.
     rmc_top = rightmostChildren.get(top_id, -1)
     rmc_pre_top = rightmostChildren.get(pre_top_id, -1)
     lmc_top = leftmostChildren.get(top_id, p.INFINITY)
     lmc_pre_top = leftmostChildren.get(pre_top_id, p.INFINITY)

     if (p.get_head(pre_top) == top_id
             and self.is_removable(pre_top, arcs, lmc_pre_top, rmc_pre_top)):
         label = p.get_deprel(pre_top) if labeled else None
         return Transition(Transition.LeftArc, label)
     if (p.get_head(top) == pre_top_id
             and self.is_removable(top, arcs, lmc_top, rmc_top)):
         label = p.get_deprel(top) if labeled else None
         return Transition(Transition.RightArc, label)
     return Transition(Transition.Shift, None)
示例#12
0
 def getTransition(self, stack, buff, leftmostChildren, rightmostChildren, arcs, labeled):
     """Pick the correct next Transition according to the oracle.

     Returns a LeftArc / RightArc transition when the two topmost stack
     items are in a head-dependent relation and ``self.is_removable``
     accepts the dependent, a Shift transition when no arc applies (or
     the stack holds fewer than two items and `buff` is non-empty), and
     None when the stack holds fewer than two items and `buff` is empty.
     Arc labels are the dependent's deprel if `labeled`, otherwise None.
     """
     if len(stack) > 1:
         top, pre_top = stack[-1], stack[-2]
         rmc_top = rightmostChildren.get(p.get_id(top), -1)
         rmc_pre_top = rightmostChildren.get(p.get_id(pre_top), -1)
         lmc_top = leftmostChildren.get(p.get_id(top), p.INFINITY)
         lmc_pre_top = leftmostChildren.get(p.get_id(pre_top), p.INFINITY)

         # Decide which arc (if any) applies and which token is attached.
         if (p.get_head(pre_top) == p.get_id(top)
                 and self.is_removable(pre_top, arcs, lmc_pre_top, rmc_pre_top)):
             arc_type, dependent = Transition.LeftArc, pre_top
         elif (p.get_head(top) == p.get_id(pre_top)
                 and self.is_removable(top, arcs, lmc_top, rmc_top)):
             arc_type, dependent = Transition.RightArc, top
         else:
             return Transition(Transition.Shift, None)

         if labeled:
             return Transition(arc_type, p.get_deprel(dependent))
         return Transition(arc_type, None)

     if len(buff) >= 1:
         return Transition(Transition.Shift, None)
     return None
示例#13
0
 def get_all_siblings(self, token, input_sentence, arcs):
     """List the words of `input_sentence` that have the same head as `token`.

     Heads are read from `arcs`, keyed by token id.  An empty list is
     returned when `token` has no head recorded there.  The word with
     id 0 and `token` itself are excluded, as is any word without a
     recorded head.
     """
     token_id = h.get_id(token)
     head_of_token = arcs.get(token_id, None)
     if head_of_token is None:
         return []
     siblings = []
     for word in input_sentence:
         word_id = h.get_id(word)
         if word_id == 0 or word_id == token_id:
             continue
         head_of_word = arcs.get(word_id, None)
         if head_of_word is not None and head_of_word == head_of_token:
             siblings.append(word)
     return siblings
示例#14
0
 def get_rightmost_child(self, token, input_sentence, arcs, rightmost_multiplier):
     """Descend `rightmost_multiplier` times to the rightmost child.

     On each step the children of the current token are collected with
     ``self.get_all_children`` and the one with the largest id is taken,
     provided that id is greater than the current token's id.

     Returns the token reached, or None if `token` is None, a token has
     no children, or its largest-id child is not to its right.
     `rightmost_multiplier` must be positive (checked with an assert).
     """
     assert rightmost_multiplier > 0, "Invalid rightmost_multiplier passed"
     remaining = rightmost_multiplier
     while token is not None and remaining > 0:
         remaining -= 1
         children = self.get_all_children(token, input_sentence, arcs)
         if not children:
             return None
         rightmost_id = max(children)
         if rightmost_id <= h.get_id(token):
             # Every child lies to the left of the current token.
             return None
         token = children[rightmost_id]
     return token
示例#15
0
 def get_leftmost_child(self, token, input_sentence, arcs, leftmost_multiplier):
     """Descend ``abs(leftmost_multiplier)`` times to the leftmost child.

     `leftmost_multiplier` must be negative (checked with an assert); it
     is counted up towards zero, one step per descent.  On each step the
     children of the current token come from ``self.get_all_children``
     and the one with the smallest id is taken, provided that id is
     smaller than the current token's id.

     Returns the token reached, or None if `token` is None, a token has
     no children, or its smallest-id child is not to its left.
     """
     assert leftmost_multiplier < 0, "Invalid leftmost_multiplier passed"
     remaining = leftmost_multiplier
     while token is not None and remaining < 0:
         remaining += 1
         children = self.get_all_children(token, input_sentence, arcs)
         if not children:
             return None
         leftmost_id = min(children)
         if leftmost_id >= h.get_id(token):
             # Every child lies to the right of the current token.
             return None
         token = children[leftmost_id]
     return token
示例#16
0
 def get_rightmost_child(self, token, input_sentence, arcs,
                         rightmost_multiplier):
     """Return the rightmost descendant `rightmost_multiplier` levels down.

     Each level replaces the current token by its child with the largest
     id (children come from ``self.get_all_children``), but only when
     that id exceeds the current token's id.  None is returned when
     `token` is None, when a token has no children, or when none of its
     children has a larger id.  A non-positive `rightmost_multiplier`
     trips the assert.
     """
     assert rightmost_multiplier > 0, "Invalid rightmost_multiplier passed"
     levels_left = rightmost_multiplier
     while token is not None and levels_left > 0:
         levels_left -= 1
         kids = self.get_all_children(token, input_sentence, arcs)
         if len(kids) == 0:
             return None
         largest_id = max(kids.keys())
         if largest_id > h.get_id(token):
             token = kids[largest_id]
             continue
         return None
     return token
示例#17
0
 def get_leftmost_child(self, token, input_sentence, arcs,
                        leftmost_multiplier):
     """Return the leftmost descendant ``-leftmost_multiplier`` levels down.

     `leftmost_multiplier` is negative (enforced by an assert) and is
     incremented once per level until it reaches zero.  Each level
     replaces the current token by its child with the smallest id
     (children come from ``self.get_all_children``), but only when that
     id is below the current token's id.  None is returned when `token`
     is None, when a token has no children, or when none of its children
     has a smaller id.
     """
     assert leftmost_multiplier < 0, "Invalid leftmost_multiplier passed"
     levels_left = leftmost_multiplier
     while token is not None and levels_left < 0:
         levels_left += 1
         kids = self.get_all_children(token, input_sentence, arcs)
         if len(kids) == 0:
             return None
         smallest_id = min(kids.keys())
         if smallest_id < h.get_id(token):
             token = kids[smallest_id]
             continue
         return None
     return token
示例#18
0
    def get_model7_params(self, stack, buff, input_sentence, arcs, labels,
                          tType, feat_type, source_type, source_offset=0,
                          input_offset=0, head_multiplier=0,
                          left_rightmost_multiplier=0,
                          left_right_sibling_specifier=0, suffix_len=0):
        """Build the feature string for one "model 7" address, or None.

        Described here: http://stp.lingfil.uu.se/~nivre/docs/maltparser.pdf

        A start token is taken from the source selected by `source_type`
        (via ``self.get_source``) at position ``-(source_offset + 1)``.
        It is then moved, in this order, by `input_offset`,
        `head_multiplier`, `left_rightmost_multiplier` and
        `left_right_sibling_specifier`, each step working on the reversed
        `input_sentence`.  If any step yields None the result is None.

        The returned string is a prefix encoding `tType` and all address
        parameters, followed by ``lex_feat=``, ``dep_feat=`` or
        ``pos_feat=`` and the value selected by `feat_type`.  For lexical
        features a positive `suffix_len` keeps only that many trailing
        characters of the word.  A dependency feature whose token has no
        entry in `labels` yields None.

        NOTE(review): there is no separator between the prefix and the
        feature part; existing feature names depend on that, so it is
        reproduced as is.
        """
        assert feat_type in [self.DEP_FEAT, self.POS_FEAT, self.LEX_FEAT], "Invalid feat_type specified"
        assert source_type in [self.BUFF_SOURCE, self.STACK_SOURCE, self.INPUT_SOURCE], "Invalid source type specified"
        assert source_offset >= 0, "Invalid source_offset"
        assert head_multiplier >= 0, "Invalid head multiplier specified"

        reversed_sentence = input_sentence[::-1]
        # Original note: get_source reverses the input sentence if it is
        # not already reversed - presumably only for the input source;
        # verify against get_source.
        source = self.get_source(stack, buff, input_sentence, source_type)

        token = self.try_get_token(source, -(source_offset + 1))
        if token is None:
            return None
        if input_offset != 0:
            token = self.get_input_offset_token(token, reversed_sentence,
                                                input_offset)

        # Remaining navigation steps, applied in order; a None token at
        # any point aborts the whole lookup.
        navigation = (
            lambda t: self.get_head_offset_token(
                t, reversed_sentence, head_multiplier, arcs),
            lambda t: self.get_left_rightmost_child(
                t, reversed_sentence, arcs, left_rightmost_multiplier),
            lambda t: self.get_left_right_sibling(
                t, reversed_sentence, arcs, left_right_sibling_specifier),
        )
        for move in navigation:
            if token is None:
                return None
            token = move(token)
        if token is None:
            return None

        prefix = ('transition=%d,feat_type=%d,source_type=%d,'
                  'source_offset=%d,input_offset=%d,head_multiplier=%d,'
                  'left_rightmost_multiplier=%d,'
                  'left_right_sibling_specifier=%d') % (
                      tType, feat_type, source_type, source_offset,
                      input_offset, head_multiplier,
                      left_rightmost_multiplier,
                      left_right_sibling_specifier)

        if feat_type == self.LEX_FEAT:
            # suffix len can be specified via argument
            word = h.get_word(token)
            if suffix_len > 0:
                word = word[-suffix_len:]
            return prefix + 'lex_feat=%s' % (word)
        if feat_type == self.DEP_FEAT:
            dep_feat = labels.get(h.get_id(token), None)
            if dep_feat is None:
                return None
            return prefix + 'dep_feat=%s' % (dep_feat)
        if feat_type == self.POS_FEAT:
            return prefix + 'pos_feat=%s' % (h.get_postag(token))
        return None
示例#19
0
 def get_right_sibling(self, token, input_sentence, arcs, right_sibling_multiplier):
     """Step `right_sibling_multiplier` times to the nearest right sibling.

     The sibling list is computed once, for the starting `token`, with
     ``self.get_all_siblings``.  Each step moves to the sibling whose id
     is greater than the current token's id and closest to it (distance
     must be below ``h.INFINITY``).

     Returns the token reached, or None when there are no siblings or
     no sibling lies to the right at some step.
     `right_sibling_multiplier` must be positive (checked with an assert).
     """
     assert right_sibling_multiplier > 0, "Invalid right sibling multiplier"
     siblings = self.get_all_siblings(token, input_sentence, arcs)
     remaining = right_sibling_multiplier
     while token is not None and remaining > 0:
         remaining -= 1
         if not siblings:
             return None
         current_id = h.get_id(token)
         best, best_gap = None, h.INFINITY
         for candidate in siblings:
             candidate_id = h.get_id(candidate)
             if candidate_id > current_id:
                 gap = abs(candidate_id - current_id)
                 if gap < best_gap:
                     best, best_gap = candidate, gap
         # NOTE: cycling between siblings is NOT possible here, since
         # each step only ever moves to a strictly larger id.
         token = best
     return token
示例#20
0
 def get_right_sibling(self, token, input_sentence, arcs,
                       right_sibling_multiplier):
     """Return the sibling `right_sibling_multiplier` places to the right.

     Siblings of the starting `token` are obtained once from
     ``self.get_all_siblings``.  On every iteration the current token is
     replaced by the sibling with the smallest id that is still greater
     than the current token's id (its distance must be less than
     ``h.INFINITY``); if there is none, the result is None.  An empty
     sibling list also gives None.  A non-positive multiplier trips the
     assert.
     """
     assert right_sibling_multiplier > 0, "Invalid right sibling multiplier"
     all_siblings = self.get_all_siblings(token, input_sentence, arcs)
     hops_left = right_sibling_multiplier
     while token is not None and hops_left > 0:
         hops_left -= 1
         if len(all_siblings) == 0:
             return None
         here = h.get_id(token)
         nearest = None
         nearest_dist = h.INFINITY
         for sibling in all_siblings:
             there = h.get_id(sibling)
             if there <= here:
                 continue
             dist = abs(there - here)
             if dist < nearest_dist:
                 nearest_dist = dist
                 nearest = sibling
         # NOTE: we cannot keep cycling between siblings - every hop
         # goes to a strictly larger id.
         token = nearest
     return token
示例#21
0
    def extract_features(self, transition, stack, buff, labels,
                         previous_transitions, arcs, input_sentence):
        """Build the feature dict for applying `transition` in this state.

        Returns a ``defaultdict(float)`` keyed by feature strings.  It is
        filled, in this order, with:
          * "model 7" address features added by ``self.add_model7_feat``
            (http://stp.lingfil.uu.se/~nivre/docs/maltparser.pdf);
          * combinations of those, added by ``self.compose_feats``;
          * POS tags (column 3) of up to three stack and three buffer items;
          * the previous transition type, and a bias per transition type;
          * when ``self.labeled``: transition/label pair and label bias;
          * the id distance between stack top and next buffer item;
          * left/right valency of the would-be head for arc transitions
            (distance and valency follow
            http://dl.acm.org/citation.cfm?id=2002777).
        """
        features = defaultdict(float)
        tType = transition.transitionType
        label = transition.label
        DEP, POS, LEX = self.DEP_FEAT, self.POS_FEAT, self.LEX_FEAT
        STACK, BUFF = self.STACK_SOURCE, self.BUFF_SOURCE

        def model7(feat_type, source_type, *address):
            # Forward to add_model7_feat with the arguments every call shares.
            # The trailing address values are passed positionally, unchanged;
            # presumably they mirror get_model7_params (source_offset,
            # input_offset, head_multiplier, left_rightmost_multiplier).
            return self.add_model7_feat(features, stack, buff, input_sentence,
                                        arcs, labels, tType, feat_type,
                                        source_type, *address)

        def compose(parts):
            # Forward to compose_feats on the shared feature dict.
            return self.compose_feats(features, parts)

        # --- Model7 single features (results kept only when reused) ---
        model7(DEP, STACK, 1)  # dep for pre-top
        model7(DEP, STACK, 1, 0, 0, -1)  # dep for pre-top's lmc
        pre_top_lmc_pos = model7(POS, STACK, 1, 0, 0, -1)  # pos for pre-top's lmc
        model7(DEP, STACK, 1, 0, 0, 1)  # dep for pre-top's rmc
        pre_top_rmc_pos = model7(POS, STACK, 1, 0, 0, 1)  # pos for pre-top's rmc
        model7(DEP, STACK, 0, 0, 0, -1)  # dep for top's lmc
        top_lmc_pos = model7(POS, STACK, 0, 0, 0, -1)  # pos for top's lmc
        pre_top_lex = model7(LEX, STACK, 1)  # lex for pre-top
        top_lex = model7(LEX, STACK, 0)  # lex for top

        next_lex = model7(LEX, BUFF)  # lex for next buffer item
        next_pos = model7(POS, BUFF)  # pos for next buffer item
        next2_lex = model7(LEX, BUFF, 1)  # lex for next-next buffer item
        next2_pos = model7(POS, BUFF, 1)  # pos for next-next buffer item

        # lex, pos and dep of the word after (+1) and before (-1) in the
        # input, first for pre-top (stack offset 1), then for top (offset 0).
        for stack_offset in (1, 0):
            for feat_type in (LEX, POS, DEP):
                for input_offset in (1, -1):
                    model7(feat_type, STACK, stack_offset, input_offset)

        # --- Composed features ---
        pre_top_pos = self.get_model7_params(stack, buff, input_sentence,
                                             arcs, labels, tType, POS, STACK,
                                             1)
        top_pos = self.get_model7_params(stack, buff, input_sentence, arcs,
                                         labels, tType, POS, STACK, 0)
        pre_top_lex_pos = compose([pre_top_lex, pre_top_pos])  # lex_pos of pre-top
        top_lex_pos = compose([top_lex, top_pos])  # lex_pos for top

        compose([next_lex, next_pos])  # lex_pos for next buffer item
        compose([next2_lex, next2_pos])  # lex_pos for next-next buffer item

        compose([pre_top_lex_pos, top_lex_pos])  # lex_pos for both pre-top and top
        compose([pre_top_lex_pos, top_lex])  # lex_pos of pre-top with lex of top
        compose([pre_top_lex, top_lex_pos])  # lex of pre-top with lex_pos of top
        compose([pre_top_lex_pos, top_pos])  # lex_pos of pre-top with pos of top
        compose([pre_top_pos, top_lex_pos])  # pos of pre-top with lex_pos of top
        compose([pre_top_lex, top_lex])  # lex of both pre-top and top
        compose([pre_top_pos, top_pos])  # pos of both pre-top and top
        compose([top_pos, next_pos])  # pos of top and next buff
        compose([top_pos, next_pos, next2_pos])  # pos for top, next and next-next
        compose([pre_top_pos, top_pos, next_pos])  # pos for pre-top, top and next
        compose([pre_top_pos, pre_top_lmc_pos, top_pos])  # pos for pre-top, pre-top's lmc and top
        compose([pre_top_pos, pre_top_rmc_pos, top_pos])  # pos for pre-top, pre-top's rmc and top
        compose([pre_top_pos, top_pos, top_lmc_pos])  # pos for pre-top, top and top's lmc

        # POS tags of up to three items from the top of the stack
        for i in range(min(3, len(stack))):
            pos = stack[-(i + 1)][3]
            features['transition=%d,s%d.pos=%s' % (tType, i, pos)] = 1

        # POS tags of up to three items from the end of the buffer
        for i in range(min(3, len(buff))):
            pos = buff[-(i + 1)][3]
            features['transition=%d,b%d.pos=%s' % (tType, i, pos)] = 1

        # Previous transition type
        if len(previous_transitions) > 0:
            features['transition=%d,prev_transition=%d' %
                     (tType, previous_transitions[-1])] = 1
        else:
            features['transition=%d,prev_transition=None' % (tType)] = 1

        # Bias feature
        features['transition=%d' % (tType)] = 1

        if self.labeled:
            # Action and label pair
            features['transition=%d,label=%s' % (tType, label)] = 1
            # Label bias
            features['label=%s' % (label)] = 1

        # Features based on http://dl.acm.org/citation.cfm?id=2002777
        # Distance function: signed id difference, stored as the value.
        if len(stack) > 0 and len(buff) > 0:
            dist = h.get_id(stack[-1]) - h.get_id(buff[-1])
            if dist < 0:
                features['transition=%d,neg_dist=' % (tType)] = dist
            else:
                features['transition=%d,pos_dist=' % (tType)] = dist

        # Valency function: the head is the stack top for a left arc and
        # the item below it for a right arc; other transitions add nothing.
        if len(stack) > 1:
            if tType == Transition.LeftArc:
                head_index = -1
            elif tType == Transition.RightArc:
                head_index = -2
            else:
                head_index = None
            if head_index is not None:
                left_valency, right_valency = self.get_valency(
                    arcs, h.get_id(stack[head_index]))
                features['transition=%d,head_left_valency=%d' %
                         (tType, left_valency)] = 1
                features['transition=%d,head_right_valency=%d' %
                         (tType, right_valency)] = 1
        return features
示例#22
0
    def extract_features(self, transition, stack, buff, labels, previous_transitions, arcs, input_sentence):
        features = defaultdict(float)
        tType = transition.transitionType
        label = transition.label

        #Model7 features as described in http://stp.lingfil.uu.se/~nivre/docs/maltparser.pdf

        feat1_model7 = self.add_model7_feat(features, stack, buff, input_sentence, arcs, labels, tType,self.DEP_FEAT, self.STACK_SOURCE, 1)#dep for pre-top
        feat2_model7 = self.add_model7_feat(features, stack, buff, input_sentence, arcs, labels, tType,self.DEP_FEAT, self.STACK_SOURCE, 1, 0, 0, -1) #dep for pre-top's lmc
        feat21_model7 = self.add_model7_feat(features, stack, buff, input_sentence, arcs, labels, tType,self.POS_FEAT, self.STACK_SOURCE, 1, 0, 0, -1) #pos for pre-top's lmc
        feat3_model7 = self.add_model7_feat(features, stack, buff, input_sentence, arcs, labels, tType,self.DEP_FEAT, self.STACK_SOURCE, 1, 0, 0, 1)#dep or pre-top's rmc
        feat31_model7 = self.add_model7_feat(features, stack, buff, input_sentence, arcs, labels, tType,self.POS_FEAT, self.STACK_SOURCE, 1, 0, 0, 1)#pos or pre-top's rmc
        feat4_model7 = self.add_model7_feat(features, stack, buff, input_sentence, arcs, labels, tType,self.DEP_FEAT, self.STACK_SOURCE, 0, 0, 0, -1)#dep for top's lmc
        feat41_model7 = self.add_model7_feat(features, stack, buff, input_sentence, arcs, labels, tType,self.POS_FEAT, self.STACK_SOURCE, 0, 0, 0, -1)#pos for top's lmc
        feat5_model7 = self.add_model7_feat(features, stack, buff, input_sentence, arcs, labels, tType,self.LEX_FEAT, self.STACK_SOURCE, 1)#lex for pre-top
        feat6_model7 = self.add_model7_feat(features, stack, buff, input_sentence, arcs, labels, tType,self.LEX_FEAT, self.STACK_SOURCE, 0)#lex for top

        feat7_model7 = self.add_model7_feat(features, stack, buff, input_sentence, arcs, labels, tType,self.LEX_FEAT, self.BUFF_SOURCE)#lex for next buffer item
        feat71_model7 = self.add_model7_feat(features, stack, buff, input_sentence, arcs, labels, tType,self.POS_FEAT, self.BUFF_SOURCE)#pos for next buffer item

        feat75_model7 = self.add_model7_feat(features, stack, buff, input_sentence, arcs, labels, tType,self.LEX_FEAT, self.BUFF_SOURCE, 1)#lex for next-next buffer item
        feat76_model7 = self.add_model7_feat(features, stack, buff, input_sentence, arcs, labels, tType,self.POS_FEAT, self.BUFF_SOURCE, 1)#pos for next-next buffer item

        feat8_model7 = self.add_model7_feat(features, stack, buff, input_sentence, arcs, labels, tType,self.LEX_FEAT, self.STACK_SOURCE, 1, 1)#lex for word after pre-top in input
        feat9_model7 = self.add_model7_feat(features, stack, buff, input_sentence, arcs, labels, tType,self.LEX_FEAT, self.STACK_SOURCE, 1, -1)#lex for word before pre-top in input

        feat10_model7 = self.add_model7_feat(features, stack, buff, input_sentence, arcs, labels, tType,self.POS_FEAT, self.STACK_SOURCE, 1, 1)#pos for word after pre-top in input
        feat11_model7 = self.add_model7_feat(features, stack, buff, input_sentence, arcs, labels, tType,self.POS_FEAT, self.STACK_SOURCE, 1, -1)#pos for word before pre-top in input

        feat12_model7 = self.add_model7_feat(features, stack, buff, input_sentence, arcs, labels, tType,self.DEP_FEAT, self.STACK_SOURCE, 1, 1)#dep for word after pre-top in input
        feat13_model7 = self.add_model7_feat(features, stack, buff, input_sentence, arcs, labels, tType,self.DEP_FEAT, self.STACK_SOURCE, 1, -1)#dep for word before pre-top in input

        feat14_model7 = self.add_model7_feat(features, stack, buff, input_sentence, arcs, labels, tType,self.LEX_FEAT, self.STACK_SOURCE, 0, 1)#lex for word after top in input
        feat15_model7 = self.add_model7_feat(features, stack, buff, input_sentence, arcs, labels, tType,self.LEX_FEAT, self.STACK_SOURCE, 0, -1)#lex for word before top in input

        feat16_model7 = self.add_model7_feat(features, stack, buff, input_sentence, arcs, labels, tType,self.POS_FEAT, self.STACK_SOURCE, 0, 1)#pos for word after top in input
        feat17_model7 = self.add_model7_feat(features, stack, buff, input_sentence, arcs, labels, tType,self.POS_FEAT, self.STACK_SOURCE, 0, -1)#pos for word before top in input

        feat18_model7 = self.add_model7_feat(features, stack, buff, input_sentence, arcs, labels, tType,self.DEP_FEAT, self.STACK_SOURCE, 0, 1)#dep for word after top in input
        feat19_model7 = self.add_model7_feat(features, stack, buff, input_sentence, arcs, labels, tType,self.DEP_FEAT, self.STACK_SOURCE, 0, -1)#dep for word before top in input

        pre_top_pos = self.get_model7_params(stack, buff, input_sentence, arcs, labels, tType, self.POS_FEAT, self.STACK_SOURCE, 1)
        top_pos = self.get_model7_params(stack, buff, input_sentence, arcs, labels, tType, self.POS_FEAT, self.STACK_SOURCE, 0)
        cfeat_1 = self.compose_feats(features, [feat5_model7, pre_top_pos])#lex_pos of pre-top
        cfeat_2 = self.compose_feats(features, [feat6_model7, top_pos])#lex_pos for top

        cfeat_3 = self.compose_feats(features, [feat7_model7, feat71_model7])#lex_pos for next buffer item
        cfeat_4 = self.compose_feats(features, [feat75_model7,feat76_model7])#lex_pos for next-next buffer item

        cfeat_5 = self.compose_feats(features, [cfeat_1, cfeat_2])#lex_pos for both pre-top and top
        cfeat_6 = self.compose_feats(features, [cfeat_1, feat6_model7])#lex_pos of pre-top with lex of top
        cfeat_7 = self.compose_feats(features, [feat5_model7, cfeat_2])#lex of pre-top with lex_pos of top
        cfeat_8 = self.compose_feats(features, [cfeat_1, top_pos])#lex_pos of pre-top with pos of top
        cfeat_9 = self.compose_feats(features, [pre_top_pos, cfeat_2]) #pos of pre-top with lex_pos of top
        cfeat_10 = self.compose_feats(features, [feat5_model7, feat6_model7])#lex of both pre_top and top
        cfeat_11 = self.compose_feats(features, [pre_top_pos, top_pos])#pos of both pre_top and top
        cfeat_12 = self.compose_feats(features, [top_pos, feat71_model7])#pos of top and next buff
        cfeat_13 = self.compose_feats(features, [top_pos, feat71_model7, feat76_model7])#pos for top next and next next
        cfeat_14 = self.compose_feats(features, [pre_top_pos, top_pos, feat71_model7])#pos for pre-top, top and next
        cfeat_15 = self.compose_feats(features, [pre_top_pos, feat21_model7, top_pos])#pos for pre-top pre top lmc and top
        cfeat_16 = self.compose_feats(features, [pre_top_pos, feat31_model7, top_pos])#pos for pre-top, pre-top rmc and top
        cfeat_17 = self.compose_feats(features, [pre_top_pos, top_pos, feat41_model7])#pos for pre-top, top and top's lmc


        # Top two POS tags from the stack
        for i in range(3):#was originally 2
            if i >= len(stack):
                break
            s = stack[-(i+1)]
            pos = s[3]
            features['transition=%d,s%d.pos=%s' % (tType, i, pos)] = 1

        # Next four POS tags from the buffer
        for i in range(3):
            if i >= len(buff):
                break
            b = buff[-(i+1)]
            pos = b[3]
            features['transition=%d,b%d.pos=%s' % (tType, i, pos)] = 1

        # Previous transition type
        if len(previous_transitions) > 0:
            prev = previous_transitions[-1]
            features['transition=%d,prev_transition=%d' % (tType, prev)] = 1
        else:
            features['transition=%d,prev_transition=None' % (tType)] = 1

        # Bias feature
        features['transition=%d' % (transition.transitionType)] = 1

        if self.labeled:
            # Action and label pair
            features['transition=%d,label=%s' % (transition.transitionType, transition.label)] = 1
            # Label bias
            features['label=%s' % (transition.label)] = 1

        #Features based on http://dl.acm.org/citation.cfm?id=2002777
        #Distance function
        if len(stack) > 0 and len(buff) > 0:
            dist = h.get_id(stack[-1]) - h.get_id(buff[-1])
            if dist < 0:
                features['transition=%d,neg_dist=' % (tType)] = dist
            else:
                features['transition=%d,pos_dist=' % (tType)] = dist

        #Valency function
        if len(stack) > 1:
            if tType == Transition.LeftArc: # Left Arc
                [left_valency, right_valency] = self.get_valency(arcs, h.get_id(stack[-1]))
                features['transition=%d,head_left_valency=%d' % (tType, left_valency)] = 1
                features['transition=%d,head_right_valency=%d' % (tType, right_valency)] = 1
            elif tType == Transition.RightArc:#should probably check for right arc here!
                [left_valency, right_valency] = self.get_valency(arcs, h.get_id(stack[-2]))
                features['transition=%d,head_left_valency=%d' % (tType, left_valency)] = 1
                features['transition=%d,head_right_valency=%d' % (tType, right_valency)] = 1
        return features
示例#23
0
    def extract_features(self, transition, stack, buff, labels,
                         previous_transitions, arcs, input_sentence):
        """Build the feature map for one parser configuration (SVM variant).

        The transition type is not encoded in the feature names here: every
        'transition=%d' slot is filled with the placeholder -1.

        Args:
            transition: Candidate transition, or None. Only read when
                self.labeled is set, to emit the label features.
            stack: Parser stack; the top item is stack[-1].
            buff: Parser buffer; the next item is buff[-1].
            labels: Dependency labels assigned so far (forwarded to the
                model7 helpers).
            previous_transitions: Sequence of earlier transition types.
            arcs: Arcs built so far (forwarded to the helpers).
            input_sentence: The full token sequence being parsed.

        Returns:
            A defaultdict(float) mapping feature names to values.
        """
        features = defaultdict(float)
        tType = -1  # placeholder: the SVM features do not encode the transition

        pos, lex, dep = self.POS_FEAT, self.LEX_FEAT, self.DEP_FEAT
        from_stack, from_buff = self.STACK_SOURCE, self.BUFF_SOURCE

        def add(*spec):
            # Shorthand for add_model7_feat with the shared context filled in.
            # NOTE(review): the trailing positional values presumably mean
            # (source_offset, input_offset, head_multiplier,
            # left_rightmost_multiplier), as in get_model7_params -- confirm.
            return self.add_model7_feat(features, stack, buff, input_sentence,
                                        arcs, labels, tType, *spec)

        def compose(parts):
            # Shorthand for compose_feats over the shared feature map.
            return self.compose_feats(features, parts)

        # Model7 features as described in
        # http://stp.lingfil.uu.se/~nivre/docs/maltparser.pdf
        add(pos, from_stack, 1, 0, 0, -1)  # pos of pre-top's leftmost child
        add(pos, from_stack, 1, 0, 0, 1)  # pos of pre-top's rightmost child
        top_lmc_pos = add(pos, from_stack, 0, 0, 0, -1)  # top's leftmost child
        top_rmc_pos = add(pos, from_stack, 0, 0, 0, 1)  # top's rightmost child
        add(lex, from_stack, 1)  # lex of pre-top
        add(lex, from_stack, 0)  # lex of top
        next_pos = add(pos, from_buff)  # pos of next buffer item
        next_next_pos = add(pos, from_buff, 1)  # pos of next-next buffer item

        # Features of the words adjacent (in the input) to pre-top and top.
        # Only the side effect on `features` matters for these.
        neighbour_specs = (
            (pos, from_stack, 1, 1),  # pos of word after pre-top
            (pos, from_stack, 1, -1),  # pos of word before pre-top
            (dep, from_stack, 1, 1),  # dep of word after pre-top
            (dep, from_stack, 1, -1),  # dep of word before pre-top
            (lex, from_stack, 0, 1),  # lex of word after top
            (pos, from_stack, 0, 1),  # pos of word after top
            (pos, from_stack, 0, -1),  # pos of word before top
            (dep, from_stack, 0, 1),  # dep of word after top
            (dep, from_stack, 0, -1),  # dep of word before top
        )
        for spec in neighbour_specs:
            add(*spec)

        pre_top_pos = self.get_model7_params(stack, buff, input_sentence,
                                             arcs, labels, tType, pos,
                                             from_stack, 1)
        top_pos = self.get_model7_params(stack, buff, input_sentence, arcs,
                                         labels, tType, pos, from_stack, 0)

        # Composed POS features.
        compose([pre_top_pos, top_pos])  # pre-top and top
        compose([top_pos, next_pos])  # top and next buffer item
        compose([top_pos, next_pos, next_next_pos])  # top, next, next-next
        compose([pre_top_pos, top_pos, top_lmc_pos])  # ... and top's lmc
        compose([pre_top_pos, top_pos, top_rmc_pos])  # ... and top's rmc

        # POS tag (token field 3) of up to three topmost stack items.
        for depth in range(min(3, len(stack))):
            tag = stack[-(depth + 1)][3]
            features['transition=%d,s%d.pos=%s' % (tType, depth, tag)] = 1

        # POS tag of up to two upcoming buffer items.
        for ahead in range(min(2, len(buff))):
            tag = buff[-(ahead + 1)][3]
            features['transition=%d,b%d.pos=%s' % (tType, ahead, tag)] = 1

        # Most recent transition type, if there is one.
        if len(previous_transitions) == 0:
            features['transition=%d,prev_transition=None' % (tType)] = 1
        else:
            last_transition = previous_transitions[-1]
            features['transition=%d,prev_transition=%d' %
                     (tType, last_transition)] = 1

        # Label features; the SVM caller is expected to pass transition=None.
        if self.labeled and transition is not None:
            action_label_key = 'transition=%d,label=%s' % (
                transition.transitionType, transition.label)
            features[action_label_key] = 1  # action and label pair
            features['label=%s' % (transition.label)] = 1  # label bias

        # Features based on http://dl.acm.org/citation.cfm?id=2002777
        # Distance between stack top and next buffer item (signed id delta).
        if len(stack) > 0 and len(buff) > 0:
            dist = h.get_id(stack[-1]) - h.get_id(buff[-1])
            if dist < 0:
                dist_template = 'transition=%d,neg_dist='
            else:
                dist_template = 'transition=%d,pos_dist='
            features[dist_template % (tType)] = dist

        # Valency of the stack top, both plain and conjoined with its POS.
        if len(stack) > 1:
            valency = self.get_valency(arcs, h.get_id(stack[-1]))
            left_valency, right_valency = valency

            left_key = 'transition=%d,head_left_valency=%d' % (tType,
                                                               left_valency)
            features[left_key + top_pos] = 1
            features[left_key] = 1

            right_key = 'transition=%d,head_right_valency=%d' % (
                tType, right_valency)
            features[right_key] = 1
            features[right_key + top_pos] = 1

        return features
示例#24
0
    def get_model7_params(self, stack, buff, input_sentence, arcs, labels,
                          tType, feat_type, source_type, source_offset=0,
                          input_offset=0, head_multiplier=0,
                          left_rightmost_multiplier=0,
                          left_right_sibling_specifier=0, suffix_len=0):
        """Return the name of one MaltParser-style feature, or None.

        Feature addressing follows the scheme described in
        http://stp.lingfil.uu.se/~nivre/docs/maltparser.pdf : a start token
        is picked from a source, then optionally moved by input position,
        head, leftmost/rightmost child and sibling, and finally one
        attribute of the token reached is rendered into the feature name.

        Args:
            stack, buff, input_sentence: Parser state handed to get_source.
            arcs: Arcs built so far, forwarded to the navigation helpers.
            labels: Mapping from token id to dependency label.
            tType: Transition type written into the feature name.
            feat_type: One of self.DEP_FEAT, self.POS_FEAT, self.LEX_FEAT.
            source_type: One of self.BUFF_SOURCE, self.STACK_SOURCE,
                self.INPUT_SOURCE.
            source_offset: Non-negative offset from the end of the source
                (0 addresses source[-1]).
            input_offset: Non-zero to move relative to the token's position
                in the input; 0 skips that step.
            head_multiplier: Non-negative; forwarded to get_head_offset_token.
            left_rightmost_multiplier: Forwarded to get_left_rightmost_child.
            left_right_sibling_specifier: Forwarded to get_left_right_sibling.
            suffix_len: When positive, a lexical feature uses only the last
                `suffix_len` characters of the word.

        Returns:
            The feature-name string, or None when any navigation step yields
            no token or a dependency label is requested but not yet assigned.
        """
        assert feat_type in (self.DEP_FEAT, self.POS_FEAT,
                             self.LEX_FEAT), "Invalid feat_type specified"
        assert source_type in (self.BUFF_SOURCE, self.STACK_SOURCE,
                               self.INPUT_SOURCE), "Invalid source type specified"
        assert source_offset >= 0, "Invalid source_offset"
        assert head_multiplier >= 0, "Invalid head multiplier specified"

        # The navigation helpers are given the reversed sentence.
        # NOTE(review): get_source presumably reverses the input itself when
        # source_type is INPUT_SOURCE -- confirm against get_source.
        reversed_sentence = input_sentence[::-1]
        source = self.get_source(stack, buff, input_sentence, source_type)

        token = self.try_get_token(source, -(source_offset + 1))
        if token is not None and input_offset != 0:
            token = self.get_input_offset_token(token, reversed_sentence,
                                                input_offset)

        # Remaining navigation steps, applied in order; a None at any point
        # ends the lookup. Lambdas keep each helper lookup lazy.
        hops = (
            lambda t: self.get_head_offset_token(t, reversed_sentence,
                                                 head_multiplier, arcs),
            lambda t: self.get_left_rightmost_child(t, reversed_sentence,
                                                    arcs,
                                                    left_rightmost_multiplier),
            lambda t: self.get_left_right_sibling(t, reversed_sentence, arcs,
                                                  left_right_sibling_specifier),
        )
        for hop in hops:
            if token is None:
                return None
            token = hop(token)
        if token is None:
            return None

        # Address part of the name; built before the attribute is looked up.
        prefix = 'transition=%d,feat_type=%d,source_type=%d,source_offset=%d,input_offset=%d,head_multiplier=%d,left_rightmost_multiplier=%d,left_right_sibling_specifier=%d' % (
            tType, feat_type, source_type, source_offset, input_offset,
            head_multiplier, left_rightmost_multiplier,
            left_right_sibling_specifier)

        if feat_type == self.LEX_FEAT:
            word = h.get_word(token)
            if suffix_len > 0:
                return prefix + 'lex_feat=%s' % (word[-suffix_len:])
            return prefix + 'lex_feat=%s' % (word)

        if feat_type == self.DEP_FEAT:
            dep_label = labels.get(h.get_id(token), None)
            if dep_label is None:
                return None  # no label assigned to this token yet
            return prefix + 'dep_feat=%s' % (dep_label)

        if feat_type == self.POS_FEAT:
            return prefix + 'pos_feat=%s' % (h.get_postag(token))

        # Only reachable when assertions are disabled.
        return None
示例#25
0
 def get_input_offset_token(self, token, input_sentence, input_offset):
     """Return the token `input_offset` ids away from `token`, or None.

     The target position is the id of `token` plus `input_offset`, and the
     lookup is try_get_token(input_sentence, -position). No +1 is applied
     because token ids start from 1 rather than 0. A position of zero or
     below returns None so the negative index cannot wrap around.
     """
     position = h.get_id(token) + input_offset
     if position <= 0:  # guard against wraparound
         return None
     return self.try_get_token(input_sentence, -position)