def clusterfix(self, top, pp, p, s, t):
    '''Repairs argument cluster coordination.

    top: the coordination node dominating the clusters.
    pp, p, s, t: context nodes matched by the caller's pattern; t is the
    verb node which is first shrunk out and then reattached above top.
    '''
    debug("Fixing argument cluster coordination: %s", pprint(top))
    debug('T: %s', t)
    # 1. Shrink the verb (node T)
    self.fix_object_gap(pp, p, t, s)
    # 2. Reattach the verb above the TOP node
    new_node = Node('TAG', top.kids, top.category, head_index=0)
    top.kids = [t, new_node]
    # (Reattaching parent pointers)
    for kid in new_node:
        kid.parent = new_node
    # 3. Find and relabel argument clusters
    for node, ctx in find_all(top, r'/VP/=VP <1 /NP/=NP <2 /(QP|V[PV])/=QP', with_context=True):
        vp, np, qp = ctx.vp, ctx.np, ctx.qp
        # Now, VP should have category ((S[dcl]\NP)/QP)/NP
        # NOTE(review): assumes t.category has the shape (X/QP)/NP so that
        # .left.left yields S[dcl]\NP -- confirm against the category class.
        SbNP = t.category.left.left
        QP, NP = qp.category, np.category
        # NP should have category ((S[dcl]\NP)/QP)\(((S[dcl]\NP)/QP)/NP)
        # NOTE(review): '|' presumably constructs a backward-slash category.
        new_np_category = (SbNP / QP) | ((SbNP / QP) / NP)
        # QP should have category ((S[dcl]\NP)\((S[dcl]\NP)/QP))
        new_qp_category = (SbNP) | ((SbNP) / QP)
        # insert unary nodes
        new_np_node = Node(np.tag, [np], new_np_category, head_index=0)
        np.parent = new_np_node
        new_qp_node = Node(qp.tag, [qp], new_qp_category, head_index=0)
        qp.parent = new_qp_node
        replace_kid(vp, np, new_np_node)
        replace_kid(vp, qp, new_qp_node)
        self.fix_categories_starting_from(new_np_node, top)
def fix_ip_app(self, p, a, s):
    '''Rewrites an IP-APP appositive: wraps a copy of node a in a unary
    NN node bearing the modifier category S/S (built from s.category).'''
    debug("Fixing IP-APP NX: %s", lrp_repr(p))
    appositive = copy(a)
    # Strip the tag so the calling pattern cannot match the new node forever.
    appositive.tag = base_tag(appositive.tag)
    modifier_category = s.category / s.category
    wrapper = Node("NN", [appositive], modifier_category, head_index=0)
    replace_kid(p, a, wrapper)
def fix_nongap_extraction(self, _, n, pred, k):
    '''Repairs non-gap extraction: removes the null element under n, shrinks
    every trace subtree matched under n, and, when no overt relativiser can
    be relabelled on pred, inserts a null-relativiser unary NN node.

    k supplies the trace index via its tag.
    '''
    node = n
    debug("Fixing nongap extraction: %s", pprint(node))
    debug("k %s", pprint(k))
    self.remove_null_element(node)

    index = get_trace_index_from_tag(k.tag)
    expr = (r'*=PP < { *=P < { /[NPQ]P(?:-%(tags)s)?%(index)s/=T << ^/\*T\*/ $ *=S } }' % {
        'tags': ModifierTagsRegex,
        'index': index
    })

    # we use "<<" in the expression, because fix_*_topicalisation comes
    # before fix_nongap_extraction, and this can introduce an extra layer between
    # the phrasal tag and the trace
    for trace_NP, ctx in find_all(node, expr, with_context=True):
        pp, p, t, s = ctx.pp, ctx.p, ctx.t, ctx.s
        # remove T from P
        # replace P with S
        self.fix_object_gap(pp, p, t, s)

    # No overt relativiser found: synthesise the null relativiser category
    # SS/SS from the sibling of the [ICV]P node.
    if not self.relabel_relativiser(pred):
        # NOTE(review): get_first is assumed to always match here; a failed
        # match would make this unpacking raise -- confirm.
        top, context = get_first(node, r'/[ICV]P/=TOP $ *=SS', with_context=True)
        ss = context.ss
        debug("Creating null relativiser unary category: %s", ss.category / ss.category)
        replace_kid(top.parent, top, Node("NN", [top], ss.category / ss.category, head_index=0))
def fix_topicalisation_without_gap(self, node, p, s, t):
    '''Handles topicalisation with no gap: the topic t becomes a sentence
    modifier with category P/S (both features stripped).'''
    debug("Fixing topicalisation without gap: %s", pprint(node))

    # Copy the topic node; re-labelling the copy keeps the original tag intact
    # on t while the wrapper carries it.
    topic = copy(t)
    topic.tag = base_tag(topic.tag, strip_cptb_tag=False)

    modifier_category = featureless(p.category) / featureless(s.category)
    replace_kid(p, t, Node(t.tag, [topic], modifier_category, head_index=0))
def fix_subject_extraction(self, _, n, pred, w=None, reduced=False):
    '''Repairs subject extraction from a relative clause.

    n: the clause node to repair; pred: the NP predicate node.
    w: optional wh-node whose tag supplies the trace index.
    reduced: when True, the null element is not removed (reduced relative).
    '''
    global use_bare_N
    debug("%s", reduced)
    node = n
    debug("Fixing subject extraction: %s", lrp_repr(node))

    # We only want this if we are using the N -> NP unary rule
    # This 'fix' lets us rewrite NP(WHNP CP) as NP(CP) with categories NP(N)
    if use_bare_N and pred.tag.startswith('NP'):
        # Fix for the NP(VP de) case:
        # ---------------------------
        #    NP                 NP
        #   /  \                |
        # WHNP  CP     -->      CP
        #      /  \            /  \
        #     IP  DEC         IP  DEC
        if not pred.is_leaf():
            pred.kids.pop(0)
            pred.head_index = 0
    else:
        if not reduced:
            self.remove_null_element(node)

    if w:
        index = get_trace_index_from_tag(w.tag)
    else:
        index = ''

    expr = r'*=PP < { *=P < { /NP-SBJ/=T << ^/\*T\*%s/ $ *=S } }' % index

    for trace_NP, ctx in find_all(node, expr, with_context=True):
        pp, p, t, s = ctx.pp, ctx.p, ctx.t, ctx.s

        # Shrink the trace subtree, then repair categories up to node.
        self.fix_object_gap(pp, p, t, s)
        self.fix_categories_starting_from(s, until=node)

    if not self.relabel_relativiser(pred):
        # TOP is the shrunk VP
        # after shrinking, we can get VV or VA here
        # left_to_right so that we find the right node (used to match against the CP 已建成的 in 4:45(7))
        result = get_first(node, r'{ /([ICV]P|V[VA]|VRD|VSB|VCD)/=TOP $ *=SS } ! > /([ICV]P|V[VA]|VRD|VSB|VCD)/', with_context=True, left_to_right=True)
        if not result:
            debug('Could not find verbal category; did not create null relativiser.')
            return

        top, context = result
        SS = context.ss.category
        debug("Creating null relativiser unary category: %s", SS / SS)
        replace_kid(top.parent, top, Node("NN", [top], SS / SS, head_index=0))
def fix_object_extraction(self, _, n, pred, w=None, reduced=False):
    '''Repairs object extraction from a relative clause.

    n: the clause node to repair; pred: the NP predicate node.
    w: optional wh-node whose tag supplies the trace index.
    reduced: when True, the null element is not removed (reduced relative).
    '''
    global use_bare_N

    node = n
    debug("Fixing object extraction: %s", lrp_repr(node))

    # We only want this if we are using the N -> NP unary rule
    # This 'fix' lets us rewrite NP(WHNP CP) as NP(CP) with categories NP(N)
    if use_bare_N and pred.tag.startswith('NP'):
        # Fix for the NP(VP de) case:
        # ---------------------------
        #    NP                 NP
        #   /  \                |
        # WHNP  CP     -->      CP
        #      /  \            /  \
        #     IP  DEC         IP  DEC
        if not pred.is_leaf():
            pred.kids.pop(0)
            pred.head_index = 0
    else:
        if not reduced:
            self.remove_null_element(node)

    if w:
        index = get_trace_index_from_tag(w.tag)
    else:
        index = ''

    expr = r'/[IC]P/=TOP << { *=PP < { *=P < { /NP-(OBJ|EXT)/=T << ^/\*T\*%s/ $ *=S } } }' % index

    for trace_NP, ctx in find_all(node, expr, with_context=True):
        top, pp, p, t, s = ctx.top, ctx.pp, ctx.p, ctx.t, ctx.s

        # Shrink the trace subtree, then repair categories up to the IP/CP top.
        self.fix_object_gap(pp, p, t, s)
        self.fix_categories_starting_from(s, until=top)

        # If we couldn't find the DEC node, this is the null relativiser case
        if not self.relabel_relativiser(pred):
            # TOP is the S node
            # null relativiser category comes from sibling of TOP
            # if TOP has no sibling, then we're likely inside a NP-PRD < CP reduced relative (cf 1:2(9))
            result = get_first(top, r'* $ *=SS', with_context=True, nonrecursive=True)
            if result:
                # NOTE(review): this rebinds ctx (and shadows the _ parameter).
                _, ctx = result
                ss = ctx.ss
                debug("Creating null relativiser unary category: %s", ss.category / ss.category)
                replace_kid(top.parent, top, Node("NN", [top], ss.category / ss.category, head_index=0))
def fix_modification(self, node, p, s, t):
    '''Turns the modifier node t into a unary node carrying the modifier
    category P/S (features stripped from both sides).'''
    debug("Fixing modification: %s", lrp_repr(node))
    parent_category = p.category
    sibling_category = s.category

    # If you don't strip the tag :m from the newly created child (new_kid),
    # the fix_modification pattern will match infinitely when tgrep visits new_kid
    modifier = copy(t)
    modifier.tag = base_tag(modifier.tag, strip_cptb_tag=False)

    modifier_category = featureless(parent_category) / featureless(sibling_category)
    debug("Creating category %s", modifier_category)
    replace_kid(p, t, Node(t.tag, [modifier], modifier_category, head_index=0))
def accept_derivation(self, bundle):
    '''Splits matched leaves whose first character is in baixing (a surname
    set) into a two-leaf NR node: surname character + remainder.'''
    for match_node, ctx in find_all(bundle.derivation, expr, with_context=True):
        leaf = ctx.n
        text = leaf.lex.decode('u8')
        if text[0] in baixing:
            surname = Leaf(leaf.tag, text[0].encode('u8'), None)
            remainder = Leaf(leaf.tag, text[1:].encode('u8'), None)
            replace_kid(leaf.parent, leaf, Node('NR', [surname, remainder]))
    self.write_derivation(bundle)
def fix_rnr(self, rnr, g):
    '''Repairs right node raising: shrinks *RNR* traces in every conjunct,
    excises each shared argument once from the last conjunct, and reattaches
    the shared arguments above g.

    NOTE(review): the rnr parameter is shadowed by the inner loop variable
    below and is never read -- confirm whether callers rely on it.
    '''
    # G is the node dominating all the conjuncts
    rnr_tags = []
    for node, ctx in find_all(g, r'/:c/a', with_context=True):
        for rnr in find_all(node, r'^/\*RNR\*/'):
            rnr_tags.append(get_trace_index_from_tag(rnr.lex))

    # Shrink every *RNR* trace subtree in every conjunct.
    for index in rnr_tags:
        for node, ctx in find_all(g, r'*=PP < { *=P < { *=T < ^/\*RNR\*%s/ $ *=S } }' % index, with_context=True):
            inherit_tag(ctx.s, ctx.p)
            self.fix_object_gap(ctx.pp, ctx.p, ctx.t, ctx.s)
            self.fix_categories_starting_from(ctx.s, g)

    # This breaks with the IP (LC CC LC) case in 9:19(11) -- last_conjunct returns None
    # because the last conjunct has been shrunk
    last_conjunct = list(find_first(g, r'/:c/a', left_to_right=False))

    args = []
    # Here, we uniquify the rnr tags so that we excise each shared argument only once
    for index in set(rnr_tags):
        # find_first, because we only want to find one match, the shallowest.
        # cf 7:27(10), if NP-OBJ-2(NN NP-OBJ-2(JJ NN)), then we only want to identify
        # one matching node for index -2 -- the shallowest -- and not two.
        for node, ctx in find_first(last_conjunct[0], r'*=P < { /%s/a=T $ *=S }' % index, with_context=True):
            args.append(ctx.t)

            # Note: last_conjunct may be disconnected from
            # the tree by replace_kid (when ctx.p == last_conjunct)
            replace_kid(ctx.p.parent, ctx.p, ctx.s)
            self.fix_categories_starting_from(ctx.s, g)

    # Because the find_all which retrieved the args is an in-order left-to-right traversal, it will find
    # shallower nodes before deeper nodes. Therefore, if a verb has two args V A1 A2, the _args_ list will
    # contain [A2, A1] because A2 is shallower (further from the head) than A1.
    # We reverse the list of args, so that args are re-attached from the inside out (starting from A1).
    # args.reverse()

    # Reattach the excised shared arguments above g, one level per argument.
    new_g = g
    for arg in args:
        new_g = Node(new_g.tag, [new_g, arg], new_g.category.left, head_index=0)
        arg.parent = new_g

    replace_kid(g.parent, g, new_g)
def fix_whword_topicalisation(self, node, p, s, t):
    '''Handles wh-word topicalisation: type-raises the topic node t into a
    topicalised category over SbNP, then shrinks the corresponding *T* trace
    and repairs categories up to each matched subtree.'''
    debug('Fixing wh-word topicalisation: node: %s', lrp_repr(node))
    # Re-labelling T stops this method from matching the same node again
    # (absorption on the top node can otherwise re-trigger it, cf 2:22(5)).
    t.tag = base_tag(t.tag, strip_cptb_tag=False)

    # Topicalised category is derived from T's tag via type-raising.
    base_category = ptb_to_cat(t)
    topicalised = typeraise(base_category, SbNP, TR_TOPICALISATION)
    wrapper = Node(base_tag(t.tag, strip_cptb_tag=False), [t], topicalised, head_index=0)
    replace_kid(p, t, wrapper)

    trace_index = get_trace_index_from_tag(t.tag)
    pattern = r'*=PP < { /VP/=P < { /NP-(?:SBJ|OBJ)/=T < ^/\*T\*%s/ $ *=S } }' % trace_index
    for top, ctx in find_all(p, pattern, with_context=True):
        # Excise the trace NP, then repair categories up to the match root.
        replace_kid(ctx.pp, ctx.p, ctx.s)
        self.fix_categories_starting_from(ctx.s, until=top)
def fix_topicalisation_with_gap(self, node, p, s, t):
    '''Handles topicalisation with a gap: type-raises the topic node t over S,
    then shrinks the matching *T* trace inside s and repairs categories up to
    the dominating IP.'''
    debug("Fixing topicalisation with gap:\nnode=%s\ns=%s\nt=%s", lrp_repr(node), pprint(s), pprint(t))
    # Re-labelling T stops this method from matching the same node again
    # (absorption on the top node can otherwise re-trigger it, cf 2:22(5)).
    t.tag = base_tag(t.tag, strip_cptb_tag=False)

    # Topicalised category is derived from T's tag via type-raising over S.
    base_category = ptb_to_cat(t)
    raised = typeraise(base_category, S, TR_TOPICALISATION)
    replace_kid(p, t, Node(base_tag(t.tag, strip_cptb_tag=False), [t], raised, head_index=0))

    trace_index = get_trace_index_from_tag(t.tag)
    # attested gaps:
    #   575 IP-TPC:t    134 NP-TPC:t    10 IP-Q-TPC:t     8 CP-TPC:t
    #     4 NP-PN-TPC:t   2 QP-TPC:t     2 NP-TTL-TPC:t   1 PP-TPC:t
    #     1 IP-IJ-TPC:t   1 INTJ-TPC:t   1 CP-Q-TPC:t     1 CP-CND-TPC:t
    pattern = r'/IP/=TOP << { *=PP < { *=P < { /[NICQP]P-(?:SBJ|OBJ)/=T < ^/\*T\*%s/ $ *=S } } }' % trace_index
    for top, ctx in find_all(s, pattern, with_context=True):
        debug('top: %s', pprint(top))
        self.fix_object_gap(ctx.pp, ctx.p, ctx.t, ctx.s)
        self.fix_categories_starting_from(ctx.s, until=top)
def fix_categories_starting_from(self, node, until):
    '''Adjusts category labels from _node_ to _until_ (not inclusive) to obtain the correct CCG analysis.

    Walks up the tree one parent at a time; at each binary node whose
    L R -> P labelling is not a valid CCG rule (per analyse), tries a series
    of repairs: type-raising one daughter, coordination/absorption relabelling,
    and finally the generalised composition rules.
    '''
    while node is not until:
        # Only fix binary rules
        if (not node.parent) or node.parent.count() < 2:
            break

        l, r, p = node.parent[0], node.parent[1], node.parent
        L, R, P = (n.category for n in (l, r, p))
        debug("L: %s R: %s P: %s", L, R, P)

        applied_rule = analyse(L, R, P)
        debug("[ %s'%s' %s'%s' -> %s'%s' ] %s", L, ''.join(l.text()), R, ''.join(r.text()), P, ''.join(p.text()), applied_rule)

        if applied_rule is None:
            debug("invalid rule %s %s -> %s", L, R, P)

            if R.is_complex() and R.left.is_complex() and L == R.left.right:
                # L (X|L)|Y -> X|Y becomes
                # X|(X|L) (X|L)|Y -> X|Y
                T = R.left.left
                new_category = typeraise(L, T, TR_FORWARD)  #T/(T|L)
                node.parent[0] = Node(l.tag, [l], new_category, head_index=0)

                new_parent_category = fcomp(new_category, R)
                if new_parent_category:
                    debug("new parent category: %s", new_parent_category)
                    p.category = new_parent_category

                debug("New category: %s", new_category)

            elif L.is_complex() and L.left.is_complex() and R == L.left.right:
                # (X|R)|Y R -> X|Y becomes
                # (X|R)|Y X|(X|R) -> X|Y
                T = L.left.left
                new_category = typeraise(R, T, TR_BACKWARD)  #T|(T/R)
                node.parent[1] = Node(r.tag, [r], new_category, head_index=0)

                new_parent_category = bxcomp(L, new_category)
                if new_parent_category:
                    debug("new parent category: %s", new_parent_category)
                    p.category = new_parent_category

                debug("New category: %s", new_category)

            # conj R -> P
            # Make P into R[conj]
            # L cannot be the comma category (,), otherwise we get a mis-analysis
            # in 2:22(5)
            if str(L) in ('conj', 'LCM'):
                p.category = R.clone_adding_feature('conj')
                debug("New category: %s", p.category)

            # L R[conj] -> P
            elif R.has_feature('conj'):
                new_L = L.clone()
                r.category = new_L.clone_adding_feature('conj')
                p.category = new_L
                debug("New category: %s", new_L)

            elif L.is_leaf():
                # , R -> P[conj] becomes , R -> R[conj]
                if P.has_feature('conj') and l.tag in ('PU', 'CC'):  # treat as partial coordination
                    debug("Fixing coordination: %s" % P)
                    p.category = r.category.clone_adding_feature('conj')
                    debug("new parent category: %s" % p.category)

                # , R -> P becomes , R -> R
                elif l.tag == "PU" and not P.has_feature('conj'):  # treat as absorption
                    debug("Fixing left absorption: %s" % P)
                    p.category = r.category

                # L (X|L)|Y -> X|Y becomes
                # X|(X|L) (X|L)|Y -> X|Y
                elif R.is_complex() and R.left.is_complex() and L == R.left.right:
                    T = R.left.left
                    new_category = typeraise(L, T, TR_FORWARD)  #T/(T|L)
                    node.parent[0] = Node(l.tag, [l], new_category, head_index=0)

                    new_parent_category = fcomp(new_category, R)
                    if new_parent_category:
                        debug("new parent category: %s", new_parent_category)
                        p.category = new_parent_category

                    debug("New category: %s", new_category)

            elif R.is_leaf():
                # R , -> P becomes R , -> R
                if r.tag == "PU":  # treat as absorption
                    debug("Fixing right absorption: %s" % P)
                    p.category = l.category

                # (X|R)|Y R -> X|Y becomes
                # (X|R)|Y X|(X|R) -> X|Y
                elif L.is_complex() and L.left.is_complex() and R == L.left.right:
                    T = L.left.left
                    new_category = typeraise(R, T, TR_BACKWARD)  #T|(T/R)
                    node.parent[1] = Node(r.tag, [r], new_category, head_index=0)

                    new_parent_category = bxcomp(L, new_category)
                    if new_parent_category:
                        debug("new parent category: %s", new_parent_category)
                        p.category = new_parent_category

                    debug("New category: %s", new_category)

            else:
                new_parent_category = None

                # try typeraising fix
                # T/(T/X) (T\A)/X -> T can be fixed:
                # (T\A)/((T\A)/X) (T\A)/X -> T\A
                if self.is_topicalisation(L) and (L.right.right == R.right and P == L.left and P == R.left.left):
                    T_A = R.left
                    X = R.right

                    l.category = T_A / (T_A / X)
                    new_parent_category = T_A

                # (X|X)|Z Y -> X becomes
                # (X|X)|Z X|(X|X) -> X|Z
                elif L.is_complex() and L.left.is_complex() and R == L.left.right:
                    T = L.left.left
                    new_category = typeraise(R, R, TR_BACKWARD, strip_features=False)  #T/(T|L)
                    node.parent[1] = Node(r.tag, [r], new_category, head_index=0)

                    new_parent_category = bxcomp(L, new_category)
                    if new_parent_category:
                        debug("new parent category: %s", new_parent_category)
                        p.category = new_parent_category

                    debug("New category: %s", new_category)

                # Generalise over right modifiers of verbal categories (S[dcl]\X)$
                elif self.is_verbal_category(L) and L.is_complex() and L.left.is_complex():
                    T = L.left.right
                    new_category = typeraise(R, T, TR_BACKWARD)
                    debug('Trying out %s', new_category)

                    if bxcomp(L, new_category):
                        node.parent[1] = Node(r.tag, [r], new_category, head_index=0)
                        new_parent_category = bxcomp(L, new_category)

                # Last ditch: try all of the composition rules to generalise over L R -> P
                if not new_parent_category:
                    # having fxcomp creates bad categories in NP(IP DEC) construction (1:97(3))
                    # but, we need fxcomp to create the gap NP-TPC NP-SBJ(*T*) VP, so allow it when the rhs doesn't look like the DEC category
                    new_parent_category = (
                        fcomp(L, R) or bcomp(L, R, when=not self.is_relativiser(R))
                        or bxcomp(L, R, when=not self.is_relativiser(R))  #or bxcomp2(L, R, when=self.is_verbal_category(L))
                        or fxcomp(L, R, when=not self.is_relativiser(R)))

                if new_parent_category:
                    debug("new parent category: %s", new_parent_category)
                    p.category = new_parent_category
                else:
                    debug("couldn't fix, skipping")

        node = node.parent
        debug('')