def attach_quotes(self, deriv, span_begin, span_end, quote_type, higher, quotes):
    '''Inserts quotes around a span of _deriv_ and reports where they ended up.

    _span_begin_ is passed to get_leaf with direction "forwards" and _span_end_
    with direction "backwards"; a value of None is replaced by 0 for that lookup.
    The end leaf, when found, is first adjusted by
    self.punct_class.process_punct. Quotes are inserted at the lowest common
    ancestor of the two leaves via self.insert_quotes, with _higher_ passed
    through unchanged.

    _quote_type_ and _quotes_ are accepted for interface compatibility but are
    not consulted by this implementation.

    Returns a tuple (D, [b, e]): D is the (possibly new) derivation root, b is
    the index of the last leaf whose category prints as 'LQU' (or None), and e
    is the index of the last leaf whose category prints as 'RQU', minus two
    (or None).'''
    # A missing boundary falls back to offset 0 from the corresponding end.
    first_index = 0 if span_begin is None else span_begin
    last_index = 0 if span_end is None else span_end

    begin_node = get_leaf(deriv, first_index, "forwards")
    end_node = get_leaf(deriv, last_index, "backwards")
    if end_node:
        # Note that the raw span_end (possibly None) is what gets passed here.
        end_node = self.punct_class.process_punct(deriv, end_node, span_end)

    lca_node = lca(begin_node, end_node)
    if lca_node:
        deriv = self.insert_quotes(deriv, lca_node, higher)

    # Locate the quote leaves in the resulting derivation.
    quote_indices = [None, None]
    for index, leaf in enumerate(leaves(deriv)):
        cat = str(leaf.cat)
        if cat == 'LQU':
            quote_indices[0] = index
        elif cat == 'RQU':
            quote_indices[1] = index - 2

    return deriv, quote_indices
def view_deriv(env, start_response):
    '''WSGI-style handler which renders one derivation of one document as HTML.

    The document and derivation numbers are read from env['selector.vars']
    (keys 'doc' and 'deriv'). Yields a single page: either the laid-out view
    (tree, words, navigation links and hidden POS spans) or the error document
    when the document or the derivation is falsy. Resets the module-level
    node_index counter to 0 before rendering.'''
    global node_index
    node_index = 0

    start_response('200 OK', [('Content-type', 'text/html')])

    params = env['selector.vars']
    doc_id = int(params['doc'])
    deriv_id = int(params['deriv'])

    doc = GuessReader(os.path.join(CORPORA_PATH, 'chtb_%04d.fid' % doc_id))
    if not doc:
        yield error_document()
        return

    bundle = doc[deriv_id]
    if not bundle:
        yield error_document()
        return

    def shown(leaf):
        return not is_ignored(leaf)

    word_template = '''<span class="word"><span id="word%(index)d" onmouseover="$('pos').show();$('pos%(index)s').show();$('tree%(index)s').addClassName('highlighted');" onmouseout="$('tree%(index)s').removeClassName('highlighted');$('pos%(index)s').hide();$('pos').hide();">%(body)s</span></span>'''

    parts = []

    # Pretty-printed tree
    parts.append('<div id="tree">')
    parts.append(pprint(bundle.derivation, sep=' ', newline='<br/>',
                        node_repr=html_node_repr))
    parts.append('</div>')

    # One hoverable span per displayed word, followed by navigation links
    parts.append('<div id="main">')
    for n, leaf in enumerate(leaves(bundle.derivation, shown)):
        parts.append(word_template % {'index': n, 'body': leaf.lex})
    parts.append(prev_next_links(doc, doc_id, deriv_id))
    parts.append('</div>')

    # Hidden POS tag for each displayed word, revealed by the hover handlers
    parts.append('<div id="pos">')
    parts.append('<span id="pos_display">')
    for n, leaf in enumerate(leaves(bundle.derivation, shown)):
        parts.append('<span id="pos%d" style="display:none">%s</span>' % (n, leaf.tag))
    parts.append('</span>')
    parts.append('</div>')

    yield layout(''.join(parts))
def accept_derivation(self, bundle):
    '''Flattens every node recognised by is_np_internal_structure so that its
    children become exactly the leaves it dominates, then writes the
    derivation out.'''
    for node in nodes(bundle.derivation):
        if not is_np_internal_structure(node):
            continue
        # Replace the node's children with the flat list of leaves under it.
        node.kids = list(leaves(node))

    self.write_derivation(bundle)
def accept_derivation(self, bundle):
    '''Appends one (lex, category string) pair to self.words for every leaf of
    the bundle's derivation.'''
    pairs = []
    for leaf in leaves(bundle.derivation):
        # leaf.tag is not recorded
        pairs.append((leaf.lex, str(leaf.cat)))
    self.words += pairs
def find_coindexed_trace(parent, trace_node):
    '''Returns the first leaf under _parent_ whose lexical item matches
    IndexRegex with a first group equal to the trace index of _trace_node_
    (minus that index's first character), or None if no leaf qualifies.'''
    index = get_trace_index_from_tag(trace_node.tag)

    for candidate in leaves(parent):
        found = IndexRegex.match(candidate.lex)
        if not found:
            continue
        # The first character of the trace index is skipped for the comparison.
        if found.group(1) == index[1:]:
            return candidate

    return None
def naive_label_derivation(root):
    '''Relabels the category of every leaf under _root_ using label(), seeds
    the head of each relabelled category's outermost slot with the leaf's
    lexical item, and returns _root_.'''
    for terminal in leaves(root):
        terminal.cat = label(terminal.cat, lex=terminal.lex)
        # pre-populate the outermost slot with the lexical item
        outer_head = terminal.cat.slot.head
        outer_head.lex = terminal.lex

    return root
def spans(ptb_tree):
    '''Finds the quoted spans of a PTB derivation.

    Returns a list of tuples (B, E, P) with P in ("``", "`"): the Bth token
    from the start and the Eth token from the end delimit a P-quoted stretch
    of text. B is None when a close quote has no matching open quote (the span
    is taken to start at the beginning), and E is None when an open quote is
    never closed (the span is taken to run to the end).'''
    open_quotes = ("``", "`")
    close_quotes = ("''", "'")

    visible = [leaf for leaf in leaves(ptb_tree)
               if not is_ignored(leaf, ignoring_quotes=False)]
    # TODO: do this without incurring another full pass through the full nodes list
    non_quotes = [leaf for leaf in visible
                  if not is_ignored(leaf, ignoring_quotes=True)]
    leaf_count = len(non_quotes)  # should be equal to the CCG leaf count

    result = []
    pending = []  # stack of (open quote, index at which it was seen)
    index = 0

    for leaf in visible:
        lex = leaf.lex

        # Push open quote
        if lex in open_quotes:
            pending.append((lex, index))
            continue

        # The check for colon is to maintain derivation 21:61(24), which
        # contains an erroneously tagged single close quote.
        is_close_quote = leaf.tag not in ("POS", ":") and lex in close_quotes
        if not is_close_quote:
            # Only advance the index for a leaf corresponding to a CCGbank leaf
            index += 1
            continue

        if pending:
            # Pop open quote and match with close quote
            open_quote, span_begin = pending.pop()
            mismatched = ((open_quote == "``" and lex != "''") or
                          (open_quote == "`" and lex != "'"))
            if mismatched:
                warn("Unbalanced quotes, abandoning.")
                break

            # The span end index is leaf_count-index, not that minus one:
            # on reaching the close quote we are already one index past the
            # end of the quoted span.
            result.append((span_begin, leaf_count - index, open_quote))
        else:
            # Quote stack is empty, assume quoted span starts from beginning of string
            if lex == "''":
                quote_type = "``"
            elif lex == "'":
                quote_type = "`"
            else:
                err("spans: should not reach")

            result.append((None, leaf_count - index, quote_type))

    # While open quotes are still on the stack, assume quoted span continues to end of string
    while pending:
        remaining_quote, span_begin = pending.pop()
        if remaining_quote in open_quotes:
            result.append((span_begin, None, remaining_quote))
        else:
            warn("Unexpected quote %s after exhausting input.", remaining_quote)

    return result
def ccg2latex(root, glosses=None, abbreviate=False):
    '''Renders the derivation rooted at _root_ as LaTeX source for a deriv block.

    glosses: optional iterable of glosses paired with the leaves in order; when
        given, each word is emitted with the glosN macro, otherwise with cjk.
    abbreviate: False leaves categories untouched. An xrange of leaf indices
        abbreviates (via abbr) only the categories whose leaf index, or whose
        whole leaf span, lies inside that range. Any other value abbreviates
        every category.

    Returns the LaTeX source as a single newline-joined string.'''
    def comb_symbol(comb):
        # Combinators without an entry in _arrows_ are drawn with 'uline'.
        return arrows.get(comb, 'uline')

    def should_abbreviate(i):
        # _i_ is a single leaf index (for lexical categories) or a sequence of
        # consecutive leaf indices (for the span covered by an internal node).
        if abbreviate is False:
            return False
        if not isinstance(abbreviate, xrange):
            return True
        if isinstance(i, int):
            return i in abbreviate
        # Spans arrive as the list built by range(), not as an xrange, and
        # xrange has no start/end attributes, so test containment directly.
        covered = list(i)
        return bool(covered) and all(j in abbreviate for j in covered)

    def cat_repr(cat, i):
        cat_str = str(cat)
        if should_abbreviate(i):
            cat_str = abbr(cat_str)
        return sanitise_category(cat_str)

    # Backslashes are doubled throughout so that no literal depends on Python
    # leaving an unrecognised escape sequence untouched.
    row_end = '\\\\'
    n_leaves = root.leaf_count()

    out = ['\\deriv{%d}{' % n_leaves]

    all_leaves = list(leaves(root))

    # lex line
    if glosses is not None:
        leaf_bits = ("\\glosN{%s}{%s}" % (leaf.lex, gloss)
                     for (leaf, gloss) in izip(all_leaves, glosses))
    else:
        leaf_bits = (("\\cjk{%s}" % leaf.lex) for leaf in all_leaves)
    out.append(' & '.join(leaf_bits) + row_end)

    # underlines line
    out.append(' & '.join(["\\uline{1}"] * n_leaves) + row_end)

    # cats line
    out.append(' & '.join("\\cf{%s}" % cat_repr(leaf.cat, i)
                          for i, leaf in enumerate(all_leaves)) + row_end)

    # One entry per internal node:
    # (leftmost leaf index, category, combinator, number of leaves spanned)
    rows = []
    for l, r, p in pairs_postorder(root):
        rows.append((min_leaf_id(p, root),
                     p.cat,
                     analyse(l.cat, r and r.cat, p.cat),
                     p.leaf_count()))

    for subrows in group(rows):
        subline = []
        subout = []
        last_span = 0  # holds the index of the rightmost span in this row

        for leftmost_leaf_id, cat, comb, span in subrows:
            # Skip the columns between the previous span and this one.
            padding = "&" * (leftmost_leaf_id - last_span)
            covered = range(leftmost_leaf_id, leftmost_leaf_id + span)

            subline.append(padding + ("\\%s{%s}" % (comb_symbol(comb), span)))
            subout.append(padding + ("\\mc{%d}{%s}" % (span, cat_repr(cat, covered))))

            last_span = leftmost_leaf_id + span - 1

        # write out underlines line
        out.append(' '.join(subline) + row_end)
        # write out cats line
        out.append(' '.join(subout) + row_end)

    out.append('}')
    return '\n'.join(out)
def make_derivation(deriv, assigned_id=None, leaf_id=0):
    '''Generates the body of the DOT representation.

    Leaves become shapeless nodes, optionally prefixed with their leaf index
    when write_tree_indices is set. For an internal node, a node declaration
    and an edge are emitted for each child, followed by the child's own DOT
    text; _leaf_id_ is advanced by the number of leaves under each child.'''
    if deriv.is_leaf():
        label = ("%d %s" % (leaf_id, deriv.label_text())) if write_tree_indices \
                else deriv.label_text()
        return '''%s [shape="none",height=0.17,label="%s"]\n''' % (assigned_id, label)

    pieces = []
    root_id = assigned_id or get_id()

    for i, child in enumerate(deriv):
        child_id = get_id()

        if isinstance(deriv, (ccg.Leaf, ccg.Node)):
            # CCG nodes additionally show the abbreviated combinator, if known.
            comb = analyse(deriv.lch.cat, deriv.rch and deriv.rch.cat, deriv.cat)
            comb_name = re.escape(Abbreviations.get(comb, ''))

            if comb_name:
                shape_type = "record"
                label_text = "<o>%s|%s" % (deriv.label_text(), comb_name)
            else:
                shape_type = "box"
                label_text = deriv.label_text()

            pieces.append('''%s [shape="%s",height=0.1,label="%s"]\n''' % (root_id, shape_type, label_text))

            is_head = config.highlight_head_arrows and i == int(deriv.head_index)
            if is_head:
                pieces.append("%s:o -> %s:o [color=red]\n" % (root_id, child_id))
            else:
                pieces.append("%s:o -> %s:o\n" % (root_id, child_id))
        else:
            pieces.append('''%s [shape="box",height=0.1,label="%s"]\n''' % (root_id, deriv.label_text()))
            pieces.append("%s -> %s\n" % (root_id, child_id))

        pieces.append(make_derivation(child, child_id, leaf_id=leaf_id))
        leaf_id += sum(1 for _ in leaves(child))

    return ''.join(pieces)
def accept_derivation(self, bundle):
    '''Updates the running statistics with one derivation: the derivation and
    word counters, the number of leaves for which self.is_trace holds (with a
    per-base-tag breakdown), and the set of lexical items of all other
    leaves.'''
    self.nderivs += 1
    self.nwords += len(bundle.derivation.text())

    for leaf in leaves(bundle.derivation):
        if not self.is_trace(leaf):
            self.tokens.add(leaf.lex)
            continue

        self.ecs += 1
        self.ec_types[base_tag(leaf.lex)] += 1
def run_filters(self, filters, files): # If all given filters were not found or had wrong argument count, do nothing if not filters: return reader_args = {} if self.reader_class_name: try: reader_class = globals()[self.reader_class_name] info("Using reader class %s.", self.reader_class_name) reader_args['reader_class'] = reader_class except KeyError: raise RuntimeError("Reader class %s not found." % self.reader_class_name) for file in self.transform(files): if self.is_pair_spec(file): meta_reader = PairedReader else: meta_reader = DirFileGuessReader try: self.last_exceptions = [] for derivation_bundle in meta_reader(file, verbose=self.verbose, **reader_args): if self.verbose: info("Processing %s...", derivation_bundle.label()) try: for filter in filters: filter.context = derivation_bundle if filter.accept_leaf is not None: for leaf in leaves(derivation_bundle.derivation): for filter in filters: filter.accept_leaf(leaf) if filter.accept_comb_and_slash_index is not None: try: for slash_index, comb in enumerate(applications_per_slash(leaf)): filter.accept_comb_and_slash_index(leaf, comb, slash_index) except AttributeError: # TODO: hacky and inefficient, need this to work for PTB too pass for filter in filters: filter.accept_derivation(derivation_bundle) filter.context = None except IOError, e: # If output is going to a pager, and the user requests an interrupt (^C) # the filter fails with IOError: Broken pipe # In that case, running filters on further derivations will continue to # lead to 'Broken pipe', so just bail out if e.errno == errno.EPIPE: return except Exception, e: self.last_exceptions.append( (derivation_bundle, sys.exc_info()) ) if self._break_on_exception: raise FilterException(e, None)
def is_np_internal_structure(node):
    '''Returns whether _node_ is an NP-tagged node, none of whose children
    carries a 'c' or 'C' tag, and all of whose leaves are acceptable inside
    an NP (see acceptable() below).'''
    # rule out things already tagged explicitly as coordination by tag.py
    for kid in node:
        if has_tags(kid, 'cC'):
            return False

    if not node.tag.startswith('NP'):
        return False

    def acceptable(leaf):
        tag = leaf.tag
        if has_tags(leaf, 'nN'):
            return True
        if any(tag.startswith(prefix) for prefix in NominalCategories):
            return True
        if tag in ('PU', 'CC'):
            return True
        if tag.startswith(('JJ', 'CD', 'OD')):
            return True
        return has_tag(leaf, '&')

    return all(acceptable(leaf) for leaf in leaves(node))
def min_leaf_id(node, root):
    '''(Inefficiently) finds the leaf index of _node_ relative to _root_.

    The index reported is that of the left corner of _node_ (the leaf reached
    by repeatedly taking the first child) among the leaves of _root_. Returns
    None if that leaf is not found under _root_.'''
    # Descend to the left corner of _node_
    left_corner = node
    while not left_corner.is_leaf():
        left_corner = left_corner[0]

    # Identity, not equality, decides the match
    position = 0
    for leaf in leaves(root):
        if leaf is left_corner:
            return position
        position += 1

    return None
def is_np_internal_structure(node):
    '''Returns whether _node_ is an NP-tagged node, none of whose children
    carries a 'c' or 'C' tag, and all of whose leaves are acceptable inside
    an NP (see acceptable() below).'''
    # rule out things already tagged explicitly as coordination by tag.py
    for kid in node:
        if has_tags(kid, 'cC'):
            return False

    if not node.tag.startswith('NP'):
        return False

    def acceptable(leaf):
        tag = leaf.tag
        if has_tags(leaf, 'nN'):
            return True
        if any(tag.startswith(prefix) for prefix in NominalCategories):
            return True
        if tag in ('PU', 'CC'):
            return True
        if tag.startswith(('JJ', 'CD', 'OD')):
            return True
        return has_tag(leaf, '&')

    return all(acceptable(leaf) for leaf in leaves(node))
def Precedes(candidate, node, context):
    '''Returns whether the leaf _node_ precedes some leaf satisfying _candidate_.

    Only leaves can satisfy this relation: a non-leaf _node_ yields False.
    Every leaf strictly after _node_ under the root of its tree is tested with
    candidate.is_satisfied_by(leaf, context).'''
    if not node.is_leaf():
        return False

    root = get_root(node)
    node_index = get_index_of_leaf(root, node)

    # islice(iterable, start, None) skips _node_ and everything before it.
    # The two-argument form islice(iterable, n) would instead stop at n,
    # scanning the leaves up to and including _node_ itself.
    for successor in islice(leaves(root), node_index + 1, None):
        if candidate.is_satisfied_by(successor, context):
            return True

    return False
def accept_derivation(self, bundle): print bundle.label(), error_found = False for i, leaf in enumerate(leaves(bundle.derivation)): if i in self.indices: # check rules starting from this leaf for comb, (l, r, p) in combinators_and_path_from_node(leaf): if comb is None: error_found = True print i, rule_repr(l, r, p), if not error_found: print 'none',
def accept_derivation(self, bundle):
    '''Optionally rewrites the derivation, then writes it out if accepted.

    When merge_verb_compounds is set, every node whose tag is in
    self.MergedTags is replaced by a single leaf carrying the concatenated
    lexical items of the leaves below it. When normalise_foreign_names is set,
    every leaf accepted by self.is_candidate_foreign_name is replaced by an
    'NP-PN' node with one leaf per INTERPUNCT-separated piece. The bundle is
    written only if self.accept approves the resulting derivation.'''
    global merge_verb_compounds

    if merge_verb_compounds:
        for node in nodes(bundle.derivation):
            if node.tag not in self.MergedTags:
                continue
            merged_lex = ''.join(kid.lex for kid in leaves(node))
            replace_kid(node.parent, node,
                        Leaf(node.tag, merged_lex, node.parent))

    if normalise_foreign_names:
        for leaf in leaves(bundle.derivation):
            if not self.is_candidate_foreign_name(leaf.lex):
                continue
            pieces = leaf.lex.split(INTERPUNCT)
            kids = [Leaf(leaf.tag, piece, None) for piece in pieces]
            replace_kid(leaf.parent, leaf, Node('NP-PN', kids))

    if self.accept(bundle.derivation):
        self.write_derivation(bundle)
def attach_quotes(self, deriv, span_begin, span_end, quote_type, higher, quotes):
    '''Inserts quotes around a span of a CCGbank derivation.

    _span_begin_ and _span_end_ delimit the quoted text (the end is counted
    from the end of the derivation), _quote_type_ selects single or double
    quotes, _quotes_ ("both", "left" or "right") selects which quotes to
    insert, and _higher_ ("left" or "right") decides the insertion order: with
    "left" the closing quote is inserted first, with "right" the opening one.

    Returns a tuple (D, [b, e]) where D is the new derivation (the root may
    have been changed through quote attachment) and b, e are the indices at
    which the quotes have been inserted. Either may be None to indicate that
    no opening or closing quote was inserted.'''
    do_left = quotes in ("both", "left")
    do_right = quotes in ("both", "right")

    first_index = span_begin if span_begin is not None else 0
    last_index = span_end if span_end is not None else 0

    leaf_count = len(list(leaves(deriv)))
    quoted_text = list(text_in_span(deriv, first_index, (leaf_count - last_index)))

    def insert(tree, which, position):
        return self.insert_quote(tree, tokens=quoted_text, at=position,
                                 quote=which, quote_type=quote_type)

    # NOTE: both indices were defaulted to 0 above, so this test always holds.
    if (first_index is not None) or (last_index is not None):
        if higher == "left":
            if do_right:
                deriv = insert(deriv, "end", span_end)
            if do_left:
                deriv = insert(deriv, "begin", span_begin)

        elif higher == "right":
            if do_left:
                deriv = insert(deriv, "begin", span_begin)
            if do_right:
                deriv = insert(deriv, "end", span_end)

    begin_index = None
    if (span_begin is not None) and do_left:
        begin_index = span_begin

    end_index = None
    if (span_end is not None) and do_right:
        end_index = leaf_count - span_end - 1

    return deriv, [begin_index, end_index]
def mkdeps(root, postprocessor=identity):
    '''Extracts dependencies from the derivation rooted at _root_.

    The derivation is modified in place: every leaf's lexical item gets an
    index suffix, and categories throughout the tree have their slots and
    variables rewritten as each rule is processed. Combinators which could not
    be handled are added to the module-level _unanalysed_ set.

    Returns a set of tuples (head, argument, head category, slash label), with
    _postprocessor_ applied to the head and the argument.'''
    for i, leaf in enumerate(leaves(root)):
        # Uniquify each leaf with an index
        leaf.lex += IndexSeparatorTemplate % i
        # Apply the left to right slash labelling
        # (we abuse this to refer to slots, not slashes)
        leaf.cat.parg_labelled()
        # Populate the outermost (_) variable of each leaf
        leaf.cat.slot.head.lex = leaf.lex

    for (l, r, p) in pairs_postorder(root):
        _label_result(l, r, p)

    global unanalysed

    # Nodes recorded through register_unary; they are scanned for dependencies
    # together with the leaves below.
    unaries = []

    for l, r, p in pairs_postorder(root):
        # L, R, P are the categories of the nodes l, r, p (R is falsy when r is)
        L, R, P = map(lambda x: x and x.cat, (l, r, p))
        comb = analyse(L, R, P)
        if not comb: debug("Unrecognised rule %s %s -> %s", L, R, P)

        # (dest, src) pairs produced by unify(); consumed after the rule dispatch
        unifier = []

        if config.debug:
            debug("%s %s %s (%s)", L, R, P, str(comb))

        if comb == 'fwd_appl': # [Xx/Yy]l Yy -> Xx
            unifier = unify(L.right, R)
            p.cat = L.left

        elif comb == 'bwd_appl': # Yy [Xx\Yy]r -> Xx
            unifier = unify(L, R.right)
            p.cat = R.left

        # Pro-drops which drop their outer argument
        # [(S_\NPy)_/NPx]_ -> [S_\NPy]_
        elif comb in ('object_prodrop', 'vp_vp_object_prodrop',
            'yi_subject_prodrop', 'vp_modifier_subject_prodrop'):
            p.cat = L.left

        # [Xx/Yy]l [Yy/Zz]r -> [Xx/Zz]r
        elif comb == 'fwd_comp': # X/Y Y/Z -> X/Z
            if is_rooted_in(Sdcl, L, respecting_features=True):
                P.slot = L.slot
            else:
                P.slot = R.slot # lexical head comes from R (Y/Z)

            P.slot.var = fresh_var(prefix='K')

            unifier = unify(L.right, R.left)
            p.cat._left = L.left
            p.cat._right = R.right

        # [Yy\Zz]l [Xx\Yy]r -> [Xx\Zz]l
        elif comb == 'bwd_comp': # Y\Z X\Y -> X\Z
            if is_rooted_in(Sdcl, R, respecting_features=True):
                P.slot = R.slot
            else:
                P.slot = L.slot # lexical head comes from L (Y\Z)

            P.slot.var = fresh_var(prefix='K')

            unifier = unify(R.right, L.left)
            p.cat._left = R.left
            p.cat._right = L.right

        elif comb in ('s_np_apposition', 'vp_np_apposition'): # { S[dcl], S[dcl]\NP } NPy -> NPy
            P.slot = R.slot # = copy_vars
            unifier = unify(P, R)

        # NP NP -> N/N
        elif comb == 'np_np_to_nfn_apposition':
            # do the same as NP NP -> NP, except fill in the vars Ny/Ny
            P.right.slot.var = fresh_var(prefix='N')
            P.left.slot = P.right.slot

            register_unary(unaries, p, L.slot.head.lex)
            make_set_head_from(l, r, p)

        elif comb in ('conjoin', 'np_np_apposition'): # X X[conj] -> X
            make_set_head_from(l, r, p)

        elif comb in ('conj_absorb', 'conj_comma_absorb'): # conj X -> X[conj]
            copy_vars(frm=R, to=P)
            unify(P, R) # R.slot.head = P.slot.head

        elif comb == 'funny_conj': # conj X -> X
            p.cat = R

        elif comb == 'nongap_topicalisation': # {N, NP, S[dcl], QP}x -> [Sy/Sy]x
            P.slot = L.slot
            P.right.slot.var = fresh_var()
            P.left.slot = P.right.slot

            register_unary(unaries, p, L.slot.head.lex)

        elif comb in ('np_gap_topicalisation', 's_gap_topicalisation', 'qp_gap_topicalisation'): # NPx -> [ Sy/(Sy/NPx)y ]y
            P.right.right.slot = L.slot
            P.slot.var = fresh_var()
            P.left.slot = P.right.left.slot = P.right.slot = P.slot

        elif comb == 'subject_prodrop': # (S[dcl]y\NPx)y -> S[dcl]y | [(S[dcl]y\NPx)y/NPz]y -> (S[dcl]y/NPz)y
            if P == parse_category(r'S[dcl]'):
                P.slot = L.slot
            elif P == parse_category(r'S[dcl]/NP'):
                P.slot = P.left.slot = L.slot
                P.right.slot = L.right.slot
            else:
                warn("Invalid parent category %s for subject prodrop.", P)

        elif comb == 'fwd_xcomp': # [Xx/Yy]l [Yy\Zz]r -> [Xx/Zz]r
            if is_rooted_in(Sdcl, L, respecting_features=True):
                P.slot = L.slot
            else:
                P.slot = R.slot # lexical head comes from R (Y/Z)

            P.slot.var = fresh_var(prefix='K')

            unifier = unify(L.right, R.left)
            p.cat._left = L.left
            p.cat._right = R.right

        elif comb == 'bwd_xcomp': # [Yy/Zz]l [Xx\Yy]r -> [Xx/Zz]l
            if is_rooted_in(Sdcl, R, respecting_features=True):
                P.slot = R.slot
            else:
                P.slot = L.slot # lexical head comes from L (Y\Z)

            # P.slot = L.slot
            P.slot.var = fresh_var(prefix='K')

            unifier = unify(R.right, L.left)
            p.cat._left = R.left
            p.cat._right = L.right

        elif comb == 'bwd_r1xcomp': # [(Yy/Zz)k/Ww]l [Xx\Yy]r -> [(Xx\Zz)k/Ww]l
            # TODO: where should P's lexical head come from? L or R?

            unifier = unify(L.left.left, R.right)
            p.cat._left._left = R.left
            p.cat._left._right = L.left.right
            p.cat._right = L.right

        elif comb in ('fwd_raise', 'bwd_raise'): # Xx -> [ Tf|(Tf|Xx)f ]f
            # Three specific raised categories get hand-written slot sharing;
            # anything else takes the generic branch at the end.
            if P == parse_category(r'(S[dcl]\NP)\((S[dcl]\NP)/(S[dcl]\NP))'):
                # (S[dcl]y\NPz)y -> [ (S[dcl]f\NPg)f/((S[dcl]f\NPg)f\(S[dcl]y\NPz)y)f ]f
                P.left.slot.var = P.left.left.slot.var = P.right.slot.var = P.slot.var = fresh_var() # f
                P.left.right.slot.var = fresh_var() # g

                copy_vars(frm=P.left, to=P.right.left)
                copy_vars(frm=L, to=P.right.right)

                unifier = unify(L, P.right.right)
            elif P == parse_category(r'((S[dcl]\NP)/QP)\(((S[dcl]\NP)/QP)/NP)'):
                # NPy -> [ ((S[dcl]v\NPw)v/QPz)v \ ( ((S[dcl]v\NPw)v/QPz)v/NPy )v ]v
                P.slot.var = fresh_var()
                P.left.slot = P.right.slot = \
                    P.left. left.slot = P.left. left.left.slot = \
                    P.right.left.slot = P.right.left.left.slot = \
                    P.right.left.left.left.slot = P.slot # v
                # P.right.right.slot = fresh_var() # y
                P.right.right.slot = L.slot
                P.left.right.slot.var = fresh_var('Z')
                P.right.left.right.slot = P.left.right.slot # z
                P.left.left.right.slot.var = fresh_var('W')
                P.right.left.left.right.slot = P.left.left.right.slot # w

                unifier = unify(L, P.right.right)
            elif P == parse_category(r'(S[dcl]\NP)\((S[dcl]\NP)/QP)'):
                # QPy -> [ (S[dcl]v\NPz)v \ ((S[dcl]v\NPz)v/QPy)v ]v
                P.slot.var = fresh_var()
                P.left.slot = P.left.left.slot = \
                    P.right.slot = P.right.left.slot = P.right.left.left.slot = P.slot # v
                # P.right.right.slot = fresh_var() # y
                P.right.right.slot = L.slot
                P.left.right.slot.var = fresh_var('Z')
                P.right.left.right.slot = P.left.right.slot # z

                unifier = unify(L, P.right.right)
            else:
                P.slot.var = fresh_var()
                P.right.left.slot = P.left.slot = P.right.slot = P.slot
                P.right.right.slot = L.slot

                unifier = unify(L, P.right.right)

        elif comb == 'np_typechange':
            P.slot = L.slot # = copy_vars
            unifier = unify(P, L)

        elif comb == 'lcp_np_typechange':
            P.slot = L.slot
            unifier = unify(P, L)

        elif comb in ('lcp_sfs_typechange', 'lcp_nfn_typechange'):
            P.left.slot.var = fresh_var()
            P.right.slot = P.left.slot

            P.slot = L.slot

            register_unary(unaries, p, L.slot.head.lex)

        elif comb == 'lcp_sbnpfsbnp_typechange':
            # [(Sy\NPz)y/(Sy\NPz)y]_
            P.left.slot.var = fresh_var()
            P.left.left.slot = P.right.left.slot = P.right.slot = P.left.slot

            register_unary(unaries, p, L.slot.head.lex)

        elif comb == 'null_relativiser_typechange': # Xy -> (Nf/Nf)y
            P.slot = L.slot

            if P == _NfN:
                P.left.slot.var = fresh_var()

                P.right.slot = P.left.slot

                register_unary(unaries, p, L.slot.head.lex)

            elif P == _NfNfNfN:
                P.left.slot.var = fresh_var()
                P.left.left.slot.var = fresh_var(prefix="G")

                P.left.right.slot = P.left.left.slot
                P.right.slot = P.left.slot

                register_unary(unaries, p, L.slot.head.lex)
            else:
                warn("Unhandled null relativiser typechange: %s -> %s", L, P)

        # [NP/NP]y -> NPy
        elif comb == 'de_nominalisation':
            P.slot = L.slot

            register_unary(unaries, p, L.slot.head.lex)

        # {M, QP}y -> (Nf/Nf)y
        elif comb == 'measure_word_number_elision':
            P.slot = L.slot

            P.left.slot.var = fresh_var()
            P.right.slot = P.left.slot

            register_unary(unaries, p, L.slot.head.lex)

        elif comb == 'l_punct_absorb': # , X -> X[conj]
            # need to put conj feature back on parent
            p.cat = R.clone_adding_feature('conj')

        elif comb == 'r_punct_absorb':
            p.cat = L

        elif R and L == R and is_rooted_in(parse_category('S'), L): # VCD (stopgap)
            make_set_head_from(l, r, p)

        else:
            # Fallback: record the combinator and take the slot from the right
            # child when there is one, otherwise from the left child.
            debug('Unhandled combinator %s (%s %s -> %s)', comb, L, R, P)
            unanalysed.add(comb)

            P.slot = R.slot if R else L.slot

        for (dest, src) in unifier:
            if isinstance(src, (basestring, list)):
                # Fake bidirectional unification:
                # -------------------------------
                # If variable X has been unified with value v,
                # rewrite all mentions of v in the output category to point to variable X
                # (v is uniquified by concatenating it with an ID, so this should hold)
                for subcat in p.cat.nested_compound_categories():
                    if subcat.slot.head.lex == src:
                        subcat.slot = dest.slot

        if config.debug:
            debug("> %s" % p.cat)
            debug('---')

        if config.fail_on_unassigned_variables:
            assert no_unassigned_variables(p.cat), "Unassigned variables in %s" % p.cat

    if config.debug:
        debug('unaries: %s', unaries)

    # Collect deps from arguments
    deps = []
    for l in chain( leaves(root), unaries ):
        if config.debug: debug("%s %s", l, l.cat)

        # Walk the category from its outermost argument inwards
        C = l.cat
        while not C.is_leaf():
            arg = C.right
            if arg.slot.head.filler:
                #and not l.cat.left.slot == l.cat.right.slot):
        #        print "%s %s %s %s %s %s" % (C.slot.head.lex, C, arg.slot.head.lex, arg, l.cat, C.label)
                if C.label is None:
                    warn("Dependency generated on slash without label: %s %s", C, arg)
                deps.append( (C.slot.head.lex, arg.slot.head.lex, l.cat, C.label) )
            # Do not look at arguments inside a modifier category
            if is_modifier(C): break
            C = C.left

    # Produce dep pairs
    result = set()
    for depl, depr, head_cat, head_label in deps:
        # Each side is expanded with seqify(); one tuple is produced for every
        # distinct (head, argument) combination.
        for sdepl in set(seqify(depl)):
            for sdepr in set(seqify(depr)):
                if not (sdepl and sdepr):
                    debug("Dependency with None: %s %s", sdepl, sdepr)
                    continue

                result.add( (postprocessor(sdepl), postprocessor(sdepr), head_cat, head_label) )

    if config.debug:
        for line in write_deps(result):
            debug(line)

    return result
def is_np_internal_structure(node):
    '''Returns whether _node_ is an NP-tagged node with a count() above one,
    all of whose leaves carry a tag found in ValidNPInternalTags.'''
    if not node.tag.startswith('NP'):
        return False
    if not (node.count() > 1):
        return False

    for kid in leaves(node):
        if kid.tag not in ValidNPInternalTags:
            return False
    return True
def caption_nwords(bundle):
    '''Writes the number of leaves in the bundle's derivation to standard
    output, without a trailing newline.'''
    word_count = sum(1 for _ in leaves(bundle.derivation))
    sys.stdout.write(str(word_count))
def is_np_internal_structure(node):
    '''Returns whether _node_ is an NP-tagged node with a count() above one,
    all of whose leaves carry a tag found in ValidNPInternalTags.'''
    if not node.tag.startswith('NP'):
        return False
    if not (node.count() > 1):
        return False

    for kid in leaves(node):
        if kid.tag not in ValidNPInternalTags:
            return False
    return True
def run_filters(self, filters, files): # If all given filters were not found or had wrong argument count, do nothing if not filters: return reader_args = {} if self.reader_class_name: try: reader_class = globals()[self.reader_class_name] info("Using reader class %s.", self.reader_class_name) reader_args['reader_class'] = reader_class except KeyError: raise RuntimeError("Reader class %s not found." % self.reader_class_name) for file in self.transform(files): if self.is_pair_spec(file): meta_reader = PairedReader else: meta_reader = DirFileGuessReader try: self.last_exceptions = [] for derivation_bundle in meta_reader(file, verbose=self.verbose, **reader_args): if self.verbose: info("Processing %s...", derivation_bundle.label()) try: for filter in filters: filter.context = derivation_bundle if filter.accept_leaf is not None: for leaf in leaves(derivation_bundle.derivation): for filter in filters: filter.accept_leaf(leaf) if filter.accept_comb_and_slash_index is not None: try: for slash_index, comb in enumerate( applications_per_slash( leaf)): filter.accept_comb_and_slash_index( leaf, comb, slash_index) except AttributeError: # TODO: hacky and inefficient, need this to work for PTB too pass for filter in filters: filter.accept_derivation(derivation_bundle) filter.context = None except IOError, e: # If output is going to a pager, and the user requests an interrupt (^C) # the filter fails with IOError: Broken pipe # In that case, running filters on further derivations will continue to # lead to 'Broken pipe', so just bail out if e.errno == errno.EPIPE: return except Exception, e: self.last_exceptions.append( (derivation_bundle, sys.exc_info())) if self._break_on_exception: raise FilterException(e, None)
def transformer(self, bundle):
    '''Returns the leaves of the bundle's derivation, each rendered with
    self.format, joined by single spaces.'''
    rendered = [self.format(leaf) for leaf in leaves(bundle.derivation)]
    return " ".join(rendered)
def get_remapper_for(deriv_id):
    '''Builds a remapper over the leaves of the first derivation read from the
    file that _deriv_id_ maps to under the 'cn' directory.'''
    filespec = deriv_id_to_filespec(deriv_id, with_section_dir=False)
    path = os.path.join('cn', filespec)

    # Only the first bundle produced by the reader is used.
    first_bundle = iter(GuessReader(path)).next()

    return remapper(leaves(first_bundle.derivation))
def leaf_count(self):
    '''Returns the number of leaves under this node.'''
    return sum(1 for _ in leaves(self))
def transformer(self, bundle):
    '''Returns the leaves of the bundle's derivation, each rendered with
    self.format, joined by single spaces.'''
    rendered = [self.format(leaf) for leaf in leaves(bundle.derivation)]
    return " ".join(rendered)
def write_parg(bundle, deps):
    '''Returns the text of one sentence entry: a header line carrying the
    bundle label and its leaf count, the lines produced by write_deps for
    _deps_, and a closing marker line, joined by newlines.'''
    header = '<s id="%s"> %d' % (bundle.label(),
                                 sum(1 for _ in leaves(bundle.derivation)))

    lines = [header]
    lines.extend(write_deps(deps))
    lines.append('<\s>')

    return '\n'.join(lines)
def spans(ptb_tree):
    '''Finds the quoted spans of a PTB derivation.

    Returns a list of tuples (B, E, P) with P in ("``", "`"): the Bth token
    from the start and the Eth token from the end delimit a P-quoted stretch
    of text. B is None when a close quote has no matching open quote (the span
    is taken to start at the beginning), and E is None when an open quote is
    never closed (the span is taken to run to the end).'''
    open_quotes = ("``", "`")
    close_quotes = ("''", "'")

    visible = [leaf for leaf in leaves(ptb_tree)
               if not is_ignored(leaf, ignoring_quotes=False)]
    # TODO: do this without incurring another full pass through the full nodes list
    non_quotes = [leaf for leaf in visible
                  if not is_ignored(leaf, ignoring_quotes=True)]
    leaf_count = len(non_quotes)  # should be equal to the CCG leaf count

    result = []
    pending = []  # stack of (open quote, index at which it was seen)
    index = 0

    for leaf in visible:
        lex = leaf.lex

        # Push open quote
        if lex in open_quotes:
            pending.append((lex, index))
            continue

        # The check for colon is to maintain derivation 21:61(24), which
        # contains an erroneously tagged single close quote.
        is_close_quote = leaf.tag not in ("POS", ":") and lex in close_quotes
        if not is_close_quote:
            # Only advance the index for a leaf corresponding to a CCGbank leaf
            index += 1
            continue

        if pending:
            # Pop open quote and match with close quote
            open_quote, span_begin = pending.pop()
            mismatched = ((open_quote == "``" and lex != "''") or
                          (open_quote == "`" and lex != "'"))
            if mismatched:
                warn("Unbalanced quotes, abandoning.")
                break

            # The span end index is leaf_count-index, not that minus one:
            # on reaching the close quote we are already one index past the
            # end of the quoted span.
            result.append((span_begin, leaf_count - index, open_quote))
        else:
            # Quote stack is empty, assume quoted span starts from beginning of string
            if lex == "''":
                quote_type = "``"
            elif lex == "'":
                quote_type = "`"
            else:
                err("spans: should not reach")

            result.append((None, leaf_count - index, quote_type))

    # While open quotes are still on the stack, assume quoted span continues to end of string
    while pending:
        remaining_quote, span_begin = pending.pop()
        if remaining_quote in open_quotes:
            result.append((span_begin, None, remaining_quote))
        else:
            warn("Unexpected quote %s after exhausting input.", remaining_quote)

    return result
import psyco psyco.full() except ImportError: pass from munge.ccg.parse import * file = "final/%s" % sys.argv[1] t=naive_label_derivation(parse_tree(open(file).readlines()[2*int(sys.argv[2])+1])) print t print "sent:" print "-----" print ' '.join(t.text()) deps = mkdeps(t) print "deps:" print "-----" for l, r in deps: print "%s|%s" % (l, r) print "leaves:" print "-------" for leaf in leaves(t): print leaf.lex, leaf.cat print "unhandled combs:" print "----------------" for comb in unanalysed: print comb print "finished:" print pprint(t)