def process(ptb_file, ccg_file, deps_file, ccg_auto_out, ccg_parg_out, higher, quotes, quoter):
    '''Re-attaches quotes to CCGbank derivations, guided by the corresponding PTB trees.

    The PTB file, CCGbank file and deps file are each read in full. The PTB trees are
    matched against the CCGbank trees, and for every resulting (PTB, CCGbank, deps)
    triple each quote span found in the PTB tree is handed to quoter.attach_quotes
    together with _higher_ and _quotes_. Once all spans of a sentence are processed, its
    dependencies are written to _ccg_parg_out_ and its bundle to _ccg_auto_out_.'''
    with file(ccg_auto_out, 'w') as auto_stream:
        with file(ccg_parg_out, 'w') as parg_stream:
            ptb_bundles = list(PTBReader(ptb_file))
            ccg_bundles = list(CCGbankReader(ccg_file))
            dep_records = list(CCGbankDepsReader(deps_file))

            aligned_ptb_bundles = match_trees(ptb_bundles, ccg_bundles)

            for ptb_bundle, ccg_bundle, dep in zip(aligned_ptb_bundles, ccg_bundles, dep_records):
                ptb_tree = ptb_bundle.derivation
                ccg_tree = ccg_bundle.derivation

                # The list of pending spans is rebuilt after every insertion, so it is
                # consumed from the front rather than iterated over directly.
                pending = spans(ptb_tree)
                while pending:
                    begin, end, kind = pending.pop(0)

                    # A span with neither endpoint set gives nothing to attach.
                    if begin is not None or end is not None:
                        info("Reinstating quotes to %s (%s, %s)", ccg_bundle.label(), begin, end)

                        ccg_tree, inserted_at = quoter.attach_quotes(
                            ccg_tree, begin, end, kind, higher, quotes)
                        # Attaching may have installed a new root; keep the bundle pointing at it.
                        ccg_bundle.derivation = ccg_tree

                        # Re-index the spans still to be handled, and the dependencies,
                        # to account for the quotes which were just inserted.
                        pending = fix_quote_spans(pending, inserted_at)
                        dep = fix_dependencies(dep, inserted_at)

                print >> parg_stream, dep
                print >> auto_stream, ccg_bundle
# Module-level tallies. The defaultdicts yield 0 for any key not seen before, so
# entries can be incremented without initialisation.
unrecognised_rules = defaultdict(lambda: 0)
total, with_unrecognised_rules = 0, 0
ucp_rules = defaultdict(lambda: 0)
with_ucp = 0
unary, binary = defaultdict(lambda: 0), defaultdict(lambda: 0)

def is_ucp(l, r, p):
    '''Returns whether the categories (_l_, _r_, _p_) of a left child, right child and
    parent satisfy all of: _l_ is one of conj, C('LCM') or C(','); _p_ has the feature
    'conj'; and _p_ differs from _r_. Always False when there is no right child
    (_r_ is None).'''
    if r is None: return False
    return l in (conj, C('LCM'), C(',')) and p.has_feature('conj') and p != r

# Visit every bundle of every file matching the glob pattern given as the first
# command-line argument. Note that the loop variable shadows the builtin `file`.
for file in glob(sys.argv[1]):
    for bundle in CCGbankReader(file):
        # Per-derivation flags, reset for each bundle.
        has_unrecognised_rules, has_ucp = False, False

        for node in nodes(bundle.derivation):
            if node.is_leaf(): continue

            # Categories of (left child, right child or None, the node itself); `e and e.cat`
            # passes a missing child through as None instead of dereferencing it.
            # The result is both splatted and unpacked below, which relies on map()
            # returning a list.
            # NOTE(review): node[1] is read whenever node.count() > 0 -- confirm that this
            # guard really means "the node has a right child".
            lrp = map(lambda e: e and e.cat, (node[0], node[1] if node.count() > 0 else None, node))

            comb = analyse(*lrp)
            l, r, p = lrp
            # String form of the rule.
            rule_tuple = (str(l), str(r), str(p))

            if comb:
                # analyse() gave a truthy result: tally it under that result.
                combs[comb] += 1
            elif is_ucp(*lrp):
def load_ccgbank_tree(fn, deriv_no):
    '''Returns the derivation of the bundle at zero-based position _deriv_no_ among those
    yielded by CCGbankReader for _fn_, or None if no bundle is yielded at that position.'''
    found = None
    for position, bundle in enumerate(CCGbankReader(fn)):
        if position == deriv_no:
            found = bundle.derivation
            # Stop reading as soon as the requested bundle has been seen.
            break
    return found