def fix_cat_for(self, leaf, slash_index, mode):
    '''Replaces _leaf_'s category with the least permissive permitted
    alternative which preserves the mode on every slash except the one at
    _slash_index_. _mode_ is only used in the diagnostic message.'''
    # Strip mode/feature punctuation so the category string can be used as a
    # key into the splitdef table.
    key_category = re.sub(r'[-.*@]', '', str(leaf.cat))
    if not (key_category in self.permitted_cats):
        warn("No entry in splitdef file for category %s", leaf.cat)
        return
    alternatives = self.permitted_cats[key_category]
    #print "All alternatives: %s" % alternatives
    old_modes = self.modes_for_cat(leaf.cat)
    def is_invalid_alternative(alt):
        # Rejects _alt_ when it is string-identical to the current category,
        # or when any slash other than slash_index would get a strictly
        # lower mode tier than it currently has.
        alt_modes = self.modes_for_cat(alt)
        if len(alt_modes) != len(old_modes):
            # NOTE(review): this only warns; the zip below silently truncates
            # to the shorter mode list -- confirm that is intended.
            warn("Replacement category %s has different size to original category %s", alt, leaf.cat)
        modes_for_comparison = zip(alt_modes, old_modes)
        # Exclude the slash being changed from the comparison.
        # NOTE(review): `del` on a zip() result relies on Python 2 semantics
        # (zip returns a list); this breaks under Python 3.
        del modes_for_comparison[slash_index]
        return str(leaf.cat) == str(alt) or \
            any((ModeTier[alt] < ModeTier[old]) for (alt, old) in modes_for_comparison)
    valids = list(reject(alternatives, is_invalid_alternative))
    if not valids:
        warn("No valid alternative for %s which preserves mode `%s' on slash %d", leaf.cat, mode, slash_index)
        return
    #print "Alternatives: %s" % valids
    # Of the surviving alternatives, pick the one with the lowest
    # permissiveness at the slash being changed.
    alternative = min(valids, key=lambda e: self.permissiveness(e, slash_index))
    debug("%s `%s' -> %s", leaf.cat, leaf.lex, alternative)
    leaf.cat = alternative
def run(self, filters_to_run, files):
    '''Performs a processing run, given a list of filter names to run, and a list of file specifiers.

    _filters_to_run_ is a sequence of (filter_name, args) pairs; filters whose
    name is unknown or whose argument count is wrong are skipped with a
    diagnostic. File specifiers in short notation are expanded before the
    filters are run.'''
    filters = []
    for filter_name, args in filters_to_run:
        # For a no-args switch, optparse passes in None; we substitute an empty tuple for
        # consistency
        if not args: args = ()
        try:
            filter_class = self.available_filters_dict[filter_name]
            # The filter's constructor arity must match the number of
            # arguments supplied on the command line.
            actual, expected = len(args), get_argcount_for_method(filter_class.__init__)
            if actual != expected:
                warn("Skipping filter %s; %d arguments given, %d expected.", filter_name, actual, expected)
                continue
            filters.append(filter_class(*args))
        except KeyError:
            err("No filter with name `%s' found.", filter_name)
    # convert short notation in file specifiers to proper paths
    def expand_short_notation(fn):
        # short notation is
        # corpus:ss,dd,deriv -> corpus/chtb_ssdd.fid:deriv
        m = re.match(r'([^:]+):(\d+),(\d+),(\d+)', fn)
        if m:
            corpus_dir, sec, doc, deriv = m.groups()
            return os.path.join(corpus_dir, 'chtb_%02d%02d.fid:%d' % (int(sec), int(doc), int(deriv)))
        # Not in short notation: pass the specifier through unchanged.
        return fn
    files = [expand_short_notation(file) for file in files]
    self.run_filters(filters, files)
def determine_sec_and_doc(self, filename):
    '''Determines the section and document number given a filename of the
    form ``wsj_SSDD.mrg".

    Returns a (section, document) tuple of ints, or (0, 0) after a warning
    when the filename does not match self.SecDocRegex.'''
    matches = self.SecDocRegex.match(os.path.basename(filename))
    if matches and len(matches.groups()) == 2:
        # Bug fix: this branch previously returned a generator expression
        # while the failure branch returned a tuple; always return a plain
        # (sec, doc) tuple so callers get a consistent type.
        sec, doc = matches.groups()
        return (int(sec), int(doc))
    else:
        warn("Skipping malformed section/document specifier: `%s'", filename)
        return 0, 0
def is_satisfied_by(self, node, context):
    '''Tests whether the candidate _node_ satisfies this constraint under
    _context_, by applying the stored operator function to the rhs.'''
    try:
        result = self.op_func(self.rhs, node, context)
    except KeyError:
        # An unknown operator surfaces as a KeyError from the operator table.
        warn("Invalid operator %s encountered.", self.operator)
        return False
    return result
def spans(ptb_tree):
    '''Returns a sequence of tuples (B, E, P), P in ("``", "`"), where the Bth token from the start, and the Eth token from the end of the given PTB derivation span a P-quoted portion of the text.

    B is None for a close quote with no matching open quote (span assumed to
    start at the beginning of the string); E is None for an open quote with no
    matching close quote (span assumed to run to the end of the string).'''
    leaf_nodes = [leaf for leaf in leaves(ptb_tree) if not is_ignored(leaf, ignoring_quotes=False)]
    # TODO: do this without incurring another full pass through the full nodes list
    leaf_nodes_without_quotes = [leaf for leaf in leaf_nodes if not is_ignored(leaf, ignoring_quotes=True)]
    leaf_count = len(leaf_nodes_without_quotes) # should be equal to the CCG leaf count
    result = []
    quote_stack = []
    # index counts only non-quote leaves, i.e. CCGbank token positions.
    index = 0
    for leaf in leaf_nodes:
        # Push open quote
        if leaf.lex in ("``", "`"):
            quote_stack.append( (leaf.lex, index) )
        elif (leaf.tag not in ("POS", ":") # The check for colon is to maintain derivation 21:61(24), which contains
              and leaf.lex in ("''", "'")): # an erroneously tagged single close quote.
            # Pop open quote and match with close quote
            if quote_stack:
                open_quote, span_begin = quote_stack.pop()
                if (open_quote == "``" and leaf.lex != "''" or
                    open_quote == "`" and leaf.lex != "'"):
                    warn("Unbalanced quotes, abandoning.")
                    break
                # We treat the span end index as leaf_count-index, not that minus one,
                # because when we encounter the close quote, we are already one index
                # past the end of the quoted span.
                result.append( (span_begin, leaf_count-index, open_quote) )
            # Quote stack is empty, assume quoted span starts from beginning of string
            else:
                if leaf.lex == "''": quote_type = "``"
                elif leaf.lex == "'": quote_type = "`"
                else: err("spans: should not reach")
                result.append( (None, leaf_count-index, quote_type) )
        # Only advance the index for a leaf corresponding to a CCGbank leaf
        else:
            index += 1
    # While open quotes are still on the stack, assume quoted span continues to end of string
    while quote_stack:
        remaining_quote, span_begin = quote_stack.pop()
        if remaining_quote in ("``", "`"):
            result.append( (span_begin, None, remaining_quote) )
        else:
            warn("Unexpected quote %s after exhausting input.", remaining_quote)
    return result
def is_invalid_alternative(alt):
    '''Returns True when _alt_ is not an acceptable replacement for the
    current leaf category: either it is string-identical to it, or some slash
    other than slash_index would receive a strictly lower mode tier.'''
    # NOTE(review): self, old_modes, leaf and slash_index are free variables
    # here -- presumably this is a closure inside a category-fixing method;
    # confirm the enclosing scope.
    alt_modes = self.modes_for_cat(alt)
    if len(alt_modes) != len(old_modes):
        # NOTE(review): this only warns; the zip below silently truncates to
        # the shorter mode list -- confirm that is intended.
        warn("Replacement category %s has different size to original category %s", alt, leaf.cat)
    modes_for_comparison = zip(alt_modes, old_modes)
    # Exclude the slash being changed from the comparison.
    # NOTE(review): `del` on a zip() result relies on Python 2 semantics
    # (zip returns a list); this breaks under Python 3.
    del modes_for_comparison[slash_index]
    return str(leaf.cat) == str(alt) or \
        any((ModeTier[alt] < ModeTier[old]) for (alt, old) in modes_for_comparison)
def load_requested_packages(module_names):
    '''Tries to load each module named in _module_names_, returning an array of the loadable module objects found in that module.

    Modules which fail to import are skipped with a warning.'''
    loaded_modules = []
    for module in module_names:
        try:
            # Suppose we want to import A.B.C. When fromlist is any value but [], it returns A.B.C.
            # Otherwise, it only returns the topmost module, A.
            loaded_modules.append( __import__(module, fromlist=[module]) )
        except ImportError as e:  # `as` form works in Python 2.6+ and 3
            warn("Couldn't import module %s (%s)", module, e)
    # Bug fix: the function previously fell off the end and returned None,
    # despite the docstring promising an array of modules.
    return loaded_modules
def determine_reader(self, preview):
    '''Applies each of the guessers to the document, returning the corresponding reader class if a guesser matches.

    When no guesser matches, warns and falls back to self.default's reader
    class.'''
    for guesser in self.guessers:
        if guesser.identify(preview):
            return guesser.reader_class()
    # for/else: reached only when no guesser identified the preview.
    # Bug fix: the warning previously reported the last guesser's reader
    # class, but the method actually returns the default's -- report the
    # reader we really fall back to.
    warn("determine_reader: No reader could be guessed given context ``%s''; assuming %s",
         preview, self.default.reader_class())
    return self.default.reader_class()
def relabel_relativiser(self, node):
    # Relabel the relativiser category (NP/NP)\S to (NP/NP)\(S|NP).
    # Returns True when a relativiser was found and relabelled.
    match = get_first(node, r'*=S $ /(DEC|SP)/=REL', with_context=True, left_to_right=True)
    if match is None:
        warn("Couldn't find relativiser under %s", node)
        return False
    _, ctx = match
    clause, rel = ctx.s, ctx.rel
    # Rebuild the relativiser's category with the clause category on its right.
    rel.category = rel.category.clone_with(right=clause.category)
    debug("New rel category: %s", rel.category)
    return True
def relabel_relativiser(self, node):
    # Relabel the relativiser category (NP/NP)\S to (NP/NP)\(S|NP)
    # Searches under _node_ for a DEC/SP relativiser with a sibling clause;
    # returns True on success, False (with a warning) when none is found.
    result = get_first(node, r'*=S $ /(DEC|SP)/=REL', with_context=True, left_to_right=True)
    if result is not None:
        _, context = result
        s, relativiser = context.s, context.rel
        # Give the relativiser the matched clause's category as its argument.
        relativiser.category = relativiser.category.clone_with(right=s.category)
        debug("New rel category: %s", relativiser.category)
        return True
    else:
        warn("Couldn't find relativiser under %s", node)
        return False
def get_available_filters_dict(loaded_modules):
    '''Given a list of module objects, returns a dictionary mapping from filter names to valid filter objects found in those modules' namespaces.'''
    filters_found = {}
    for module in loaded_modules:
        for symbol_name in dir(module):
            obj = getattr(module, symbol_name)
            # Only consider classes which are strict subclasses of Filter
            # (concrete ones only: abstract filters are excluded).
            # NOTE(review): TypeType is the Python 2 alias for `type`;
            # old-style classes would not pass this check.
            if (type(obj) is TypeType and
                issubclass(obj, munge.proc.filter.Filter) and
                not obj.is_abstract() and
                obj is not munge.proc.filter.Filter):
                # Later modules shadow earlier ones on a name clash.
                if symbol_name in filters_found:
                    warn("An already loaded filter with the name %s has been overwritten by a filter with the same name.", symbol_name)
                filters_found[symbol_name] = obj
    return filters_found
def match_trees(penn_trees, ccg_trees):
    '''Given two lists, of PTB and CCGbank trees which we believe to belong to the same document file, this removes those PTB trees which do not correspond to any CCGbank tree. We assume that the given CCGbank derivations are a subsequence of the given PTB derivations.'''
    ptb_pos = 0
    matched = []
    for ccg_bundle in ccg_trees:
        # Scan forward through the remaining PTB trees for this derivation.
        while ptb_pos < len(penn_trees):
            candidate = penn_trees[ptb_pos]
            ccg_text = ccg_bundle.derivation.text()
            # We want to compare the CCG text against the PTB text stripped of quotes
            ptb_text = candidate.derivation.text(with_quotes=False)
            ptb_pos += 1
            if ptb_text == ccg_text:
                matched.append(candidate)
                break
            warn("In document %s:", ccg_bundle.label())
            warn("\tCCG tokens: %s", ' '.join(ccg_text))
            warn("\tPTB tokens: %s", ' '.join(ptb_text))
    return matched
def match_trees(penn_trees, ccg_trees):
    '''Given two lists, of PTB and CCGbank trees which we believe to belong to the same document file, this removes those PTB trees which do not correspond to any CCGbank tree. We assume that the given CCGbank derivations are a subsequence of the given PTB derivations.'''
    cur_ptb_index = 0
    result = []
    for ccg_bundle in ccg_trees:
        ccg_tree_matched = False
        # Advance through the PTB trees until one matches the current CCG
        # derivation's token text, warning about each skipped mismatch.
        while not ccg_tree_matched:
            if cur_ptb_index >= len(penn_trees): break
            ccg_text = ccg_bundle.derivation.text()
            # We want to compare the CCG text against the PTB text stripped of quotes
            ptb_text = penn_trees[cur_ptb_index].derivation.text(with_quotes=False)
            if ptb_text != ccg_text:
                warn("In document %s:", ccg_bundle.label())
                warn("\tCCG tokens: %s", ' '.join(ccg_text))
                warn("\tPTB tokens: %s", ' '.join(ptb_text))
            else:
                result.append( penn_trees[cur_ptb_index] )
                ccg_tree_matched = True
            cur_ptb_index += 1
    return result
def __iter__(self):
    '''Yields derivation bundles read from self.path, choosing a reader
    according to whether the path is a directory and whether a reader class
    was configured.'''
    path, index = padded_rsplit(self.path, ':', 1)
    if not os.path.exists(path):
        # TODO: This doesn't skip the current file (can we do that from inside the iterator?)
        warn("%s does not exist, so skipping.", path)
    reader_arg = {'reader': self.reader_class} if self.reader_class else {}
    if os.path.isdir(path):
        source = MultiGuessReader(path, verbose=self.verbose, **reader_arg)
    elif self.reader_class:
        source = self.reader_class(self.path)
    else:
        source = GuessReader(self.path)
    for deriv_bundle in source:
        yield deriv_bundle
def run(self, filters_to_run, files):
    '''Performs a processing run, given a list of filter names to run, and a list of file specifiers.'''
    filters = []
    for filter_name, args in filters_to_run:
        # For a no-args switch, optparse passes in None; we substitute an empty tuple for
        # consistency
        args = args or ()
        try:
            filter_class = self.available_filters_dict[filter_name]
            actual, expected = len(args), get_argcount_for_method(filter_class.__init__)
            if actual == expected:
                filters.append(filter_class(*args))
            else:
                warn("Skipping filter %s; %d arguments given, %d expected.", filter_name, actual, expected)
        except KeyError:
            err("No filter with name `%s' found.", filter_name)

    # convert short notation in file specifiers to proper paths
    def expand_short_notation(fn):
        # short notation is
        # corpus:ss,dd,deriv -> corpus/chtb_ssdd.fid:deriv
        m = re.match(r'([^:]+):(\d+),(\d+),(\d+)', fn)
        if not m:
            return fn
        corpus_dir, sec, doc, deriv = m.groups()
        return os.path.join(corpus_dir, 'chtb_%02d%02d.fid:%d' % (int(sec), int(doc), int(deriv)))

    self.run_filters(filters, [expand_short_notation(spec) for spec in files])
def main(argv):
    '''Entry point: for each requested PTB derivation file, locates the
    corresponding CCGbank AUTO and PARG files and processes them into
    opts.outdir, creating the output directory layout as needed.'''
    parser = OptionParser()
    register_builtin_switches(parser)
    opts, args = parser.parse_args(argv)
    if not all_required_args_present(opts):
        parser.print_help()
        sys.exit(1)
    # Choose the quoting strategy and optional punctuation treatment from
    # the command-line switches.
    quoter_class = { 'span': SpanQuoter, 'lca' : LCAQuoter }[opts.quote_method]
    punct_class = { 'swap' : SwapComma, 'shift': ShiftComma }.get(opts.punct_method, None)
    quoter = quoter_class(punct_class)
    remaining_args = args[1:]
    if not remaining_args:
        # If no sec/doc specifiers are given, assume 'all sections all documents'
        remaining_args.append(':')
    ptb_files_spec = parse_requested_derivs(remaining_args)
    for sec_glob, doc_glob in ptb_files_spec:
        for ptb_file in glob(os.path.join(opts.penn_in, sec_glob, "wsj_%s%s.mrg" % (sec_glob, doc_glob))):
            info("Processing %s", ptb_file)
            matches = PTBFileRegex.search(ptb_file)
            if matches and len(matches.groups()) == 2:
                sec, doc = matches.groups()
                # Corresponding CCGbank derivation and dependency files.
                ccg_file = os.path.join(opts.ccg_in, 'AUTO', sec, "wsj_%s%s.auto" % (sec, doc))
                deps_file = os.path.join(opts.ccg_in, 'PARG', sec, "wsj_%s%s.parg" % (sec, doc))
                if not opts.quiet:
                    if not os.path.exists(ccg_file):
                        warn("No corresponding CCGbank file %s for Penn file %s", ccg_file, ptb_file)
                    if not os.path.exists(deps_file):
                        warn("No corresponding CCGbank dependency file %s for CCG file %s", deps_file, ccg_file)
                # Mirror the AUTO/PARG layout under the output directory.
                ccg_auto_dir, ccg_parg_dir = [os.path.join(opts.outdir, part, sec) for part in ('AUTO', 'PARG')]
                if not os.path.exists(ccg_auto_dir): os.makedirs(ccg_auto_dir)
                if not os.path.exists(ccg_parg_dir): os.makedirs(ccg_parg_dir)
                ccg_auto_out, ccg_parg_out = (os.path.join(ccg_auto_dir, 'wsj_%s%s.auto' % (sec, doc)),
                                              os.path.join(ccg_parg_dir, 'wsj_%s%s.parg' % (sec, doc)))
                process(ptb_file, ccg_file, deps_file, ccg_auto_out, ccg_parg_out,
                        opts.higher, opts.quotes, quoter)
            else:
                warn("Could not find, so ignoring %s", ptb_file)
def _label_node(node, inside_np_internal_structure=False, do_shrink=True):
    '''Dispatches on _node_'s tag and configuration, applying shrinking and
    promotion rules to unary projections and delegating other nodes to the
    appropriate label_* routine. Returns the (possibly replaced) node.
    Derivation IDs in comments (e.g. 11:29(4)) cite treebank examples.'''
    # NP<NN shrinking happens unconditionally if use_bare_N is false
    do_np_shrink = (not use_bare_N) or inside_np_internal_structure
    if node.is_leaf(): return node
    elif node.count() == 1:
        node.head_index = 0
        # shrinkage rules (NP < NN shrinks to NN)
        if (do_shrink and
            ((do_np_shrink and
              ((node.tag.startswith('NP') and not has_tag(node, 'A') and has_noun_tag(node[0])) or
               node[0].tag == 'AD')) or
             (node.tag.startswith('VP') or is_verb_compound(node)) and
             # a handful of VRDs project a single child (11:29(4))
             (has_verbal_tag(node[0]) or
              matches(node[0], 'VPT', 'VSB', 'VRD', 'VCD', 'VNV', 'AD', 'PP', 'QP', 'LCP', 'NP')) or
             (node.tag.startswith('ADJP') and
              matches(node[0], 'JJ', 'AD', 'NN', 'OD')) # bad tagging 25:40(5), 31:37(6)
             ) or
            (node.tag.startswith('ADVP') and exactly_matches(node[0], 'AD', 'CS', 'NN')) or
            (matches(node, 'NP-MNR', 'NP-PRP') and has_noun_tag(node[0])) or
            # 8:1(5)
            (node.tag.startswith('NP-PN') and node.tag.endswith(':a') and exactly_matches(node[0], 'NR')) or
            (node.tag.startswith('CLP') and exactly_matches(node[0], 'M')) or
            (node.tag.startswith('LCP') and exactly_matches(node[0], 'LC')) or
            # DT < OD found in 6:25(11)
            (node.tag.startswith('DP') and exactly_matches(node[0], 'DT', 'OD')) or
            # QP < AD in 24:68(8)
            (node.tag.startswith('QP') and matches(node[0], 'QP', 'M')) or
            # see head-initial case in tag.py (hack for unary PP < P)
            (node.tag.startswith('PP') and exactly_matches(node[0], 'P')) or
            # see bad tagging (WHNP CP DEC) in tag.py head-final case
            (node.tag.startswith('CP') and matches(node[0], 'IP')) or
            (node.tag.startswith('INTJ') and exactly_matches(node[0], 'IJ')) or
            (node.tag.startswith('LST') and exactly_matches(node[0], 'OD', 'CD')) or
            # the below is to fix a tagging error in 10:49(69)
            (node.tag.startswith('PRN') and exactly_matches(node[0], 'PU')) or
            # 0:15(5) LST < PU
            (node.tag.startswith('LST') and exactly_matches(node[0], 'PU')) or
            # unary DNP < QP in e.g. NP(DNP(QP(sanshi sui)) gongren) (5:51(6)) is meant to
            # suggest implicit 'de' but this causes the spurious QP -> N/N rule
            (node.tag.startswith('DNP') and matches(node[0], 'QP')) or
            # includes any tags of the form NP-X-PRD (see 10:67(32))
            # but excludes VP(VC NP-PRD), which we want to analyse with VC |- (S[dcl]\NP)/NP
            ( node.tag.startswith('NP') and node.tag.find('-PRD') != -1 and
              has_noun_tag(node[0]) and
              not node.parent.kids[0].tag.startswith('VC') ) or
            matches(node, 'FLR') or
            matches(node, 'FW')):
            # Shrink: splice the single child into the parent in place of
            # this node, carrying over this node's tag.
            replacement = node[0]
            inherit_tag(replacement, node, strip_marker=True)
            replace_kid(node.parent, node, node[0])
            return label_node(replacement)
        # NN for 25:61(7)
        elif (node.tag.startswith("QP") and exactly_matches(node[0], "OD", "CD", 'NN')):
            replacement = node[0]
            inherit_tag(replacement, node)
            replace_kid(node.parent, node, node[0])
            #replacement.tag = node.tag
            return label_node(replacement)
        # promotion rules (NP < PN shrinks to NP (with PN's lexical item and pos tag))
        # shrink NP-TMP < NT so that the NT lexical item gets the adjunct category
        elif ((node.tag.startswith('NP') and (exactly_matches(node[0], "PN") or matches(node[0], 'NT', 'DT'))) or
              # 21:2(6)
              (node.tag.startswith('ADVP') and exactly_matches(node[0], 'CC', 'PN')) or
              (node.tag.startswith('ADJP') and exactly_matches(node[0], 'PN', 'DT')) or
              # 28:82(8)
              (node.tag.startswith('DP') and matches(node[0], 'NN', 'PN')) or
              (matches(node, #'NP-PRD',
                       'NP-TTL-PRD', 'NP-PN-PRD', 'NP-LOC', 'NP-ADV',
                       'NP-PN-TMP', 'NP-PN-LOC', 'NP-TMP', 'NP-DIR', 'NP-PN-DIR') and
               has_noun_tag(node[0]))):
            replacement = node[0]
            inherit_tag(replacement, node)
            replace_kid(node.parent, node, node[0])
            #replacement.tag = node.tag
            return label_node(replacement)
        # one child nodes
        else:
            node.kids[0] = label_node(node.kids[0])
            return node
    elif is_S_NP_apposition(node):
        # When NP(IP-APP NP), shrinks the NP<NN so we can get a head-final
        # analysis.
        # Without the following check, fails on 5:95(17) where NP(IP-APP NN)
        # instead of the usual NP(IP-APP NP)
        # However, we don't want to shrink unless node[1] is actually a unary
        # projection (otherwise we'd delete leaves like in 0:89(16))
        if not node[1].is_leaf() and node[1].count() == 1 and node[1][0].is_leaf():
            inherit_tag(node[1][0], node[1])
            node.kids[1] = node[1][0]
        return label_head_final(node)
    elif is_predication(node):
        return label_predication(node)
    elif is_prn(node):
        # although we want a head-initial analysis, we want a right-branching structure
        return label_adjunction(node, inside_np_internal_structure=True)
    elif is_apposition(node):
        return label_apposition(node, inside_np_internal_structure=True)
    elif is_np_structure(node):# and not node[0].tag.startswith('IP-APP'):
        return label_adjunction(node, inside_np_internal_structure=True) # TODO: misnomer
    elif is_np_internal_structure(node):
        return label_np_internal_structure(node)
    # 0:68(4) has both cases. If there are NP modifiers of a QP or an ADJP, we want them shrunk.
    elif node.kids[-1].tag in ('QP:h', 'ADJP:h'):
        return label_adjunction(node, inside_np_internal_structure=True)
    elif node.tag.startswith('VRD'):
        return label_head_initial(node)
    elif (is_adjunction(node) or is_verb_compound(node) or is_modification(node)):
        return label_adjunction(node)
    elif is_head_final(node):
        return label_head_final(node)
    elif is_head_initial(node):
        return label_head_initial(node)
    elif is_coordination(node) or is_ucp(node):
        return label_coordination(node, inside_np_internal_structure=True)
    else:
        warn("binarise: No known configuration for %s", node)
        return label_adjunction(node)
def t_error(t):
    '''PLY lexer error handler: reports the offending character and skips it
    so scanning can resume.'''
    bad_char = t.value[0]
    warn("Illegal character `%s' encountered.", bad_char)
    t.lexer.skip(1)
def label(node, inside_np=False):
    '''
    Labels the descendants of _node_ and returns _node_.
    '''
    global BareN
    if node.category is None:
        node.category = ptb_to_cat(node)
    # if this matches the IP root with a *PRO* trace under it, then
    # we shouldn't map IP -> S, but rather IP -> S\NP
    if node.tag.startswith('NT'):
        # map NT -> NP, not N
        node.category = NP
    elif has_noun_tag(node):
        node.category = BareN
    else:
        node.category = ptb_to_cat(node)
    if node.is_leaf():
        if not node.category:
            node.category = ptb_to_cat(node)
        return node
    # NP/NP (CP) -> NP
    elif is_cp_to_np_nominalisation(node):
        node[0].category = NPfNP
        node.kids[0] = label(node[0])
        return node
    # VSB is analysed as head-final
    elif node.tag.startswith('VSB'):
        node[1].category = node.category
        node.kids[1] = label(node[1])
        # The first kid becomes a forward modifier of the (featureless) head.
        node[0].category = featureless(node.category) / featureless(node[1].category)
        node.kids[0] = label(node.kids[0])
        return node
    # VCD is treated like apposition
    elif node.tag.startswith('VCD'):
        # Each verbal kid shares the parent category; others get their own.
        if has_verbal_tag(node[0]):
            node[0].category = node.category
        else:
            node[0].category = ptb_to_cat(node[0])
        node.kids[0] = label(node[0])
        if has_verbal_tag(node[1]):
            node[1].category = node.category
        else:
            node[1].category = ptb_to_cat(node[1])
        node.kids[1] = label(node[1])
        return node
    elif node.tag.startswith('VRD'):
        return label_right_adjunction(node)
    # must be above is_apposition, because there exist NP-APP:a ETC:& cases
    elif is_etc(node):
        return label_etc_head_final(node)
    elif is_S_NP_apposition(node):
        return rename_category_while_labelling_with(label_head_final, node,
            BareN if node.category == NP else node.category)
    elif (node.count() == 1 or
          is_topicalisation(node) or
          is_topicalisation_without_gap(node) or
          is_apposition(node) or
          is_argument_cluster(node) or
          is_modification(node)):
        node.kids[0] = label(node[0])
        if node.count() > 1:
            node.kids[1] = label(node[1])
        return node
    elif is_partial_ucp(node):
        return label_partial_coordination(node, ucp=True)
    elif is_ucp(node):
        return label_coordination(node, ucp=True)
    elif is_predication(node):
        return label_predication(node)
    elif is_left_absorption(node):
        return label_left_absorption(node)
    elif is_right_absorption(node):
        return label_right_absorption(node)
    elif is_right_adjunction(node): # (:h :a), for aspect particles
        return label_right_adjunction(node)
    elif is_partial_coordination(node):
        return label_partial_coordination(node)
    elif is_coordination(node):
        return label_coordination(node)
    elif is_np_structure(node): # and not node[0].tag.startswith('IP-APP'):
        return rename_category_while_labelling_with(label_np_structure, node, BareN,
            when=lambda category: category == NP)
    elif is_np_internal_structure(node):
        return label_np_internal_structure(node)
    elif is_punctuation_headed(node):
        return label_head_final(node)
    elif is_adjunction(node):
        return label_adjunction(node)
    elif is_head_final(node):
        return label_head_final(node)
    elif is_head_initial(node):
        return label_head_initial(node)
    else:
        warn("Node did not match any known patterns -- assuming adjunction: %s",
             node.__repr__(suppress_lex=True))
        return label_adjunction(node)
def label(node, inside_np=False):
    '''
    Labels the descendants of _node_ and returns _node_.
    '''
    global BareN
    if node.category is None:
        node.category = ptb_to_cat(node)
    # if this matches the IP root with a *PRO* trace under it, then
    # we shouldn't map IP -> S, but rather IP -> S\NP
    if node.tag.startswith('NT'):
        # map NT -> NP, not N
        node.category = NP
    elif has_noun_tag(node):
        node.category = BareN
    else:
        node.category = ptb_to_cat(node)
    if node.is_leaf():
        if not node.category:
            node.category = ptb_to_cat(node)
        return node
    # NP/NP (CP) -> NP
    elif is_cp_to_np_nominalisation(node):
        node[0].category = NPfNP
        node.kids[0] = label(node[0])
        return node
    # VSB is analysed as head-final
    elif node.tag.startswith('VSB'):
        node[1].category = node.category
        node.kids[1] = label(node[1])
        # The first kid becomes a forward modifier of the (featureless) head.
        node[0].category = featureless(node.category) / featureless(node[1].category)
        node.kids[0] = label(node.kids[0])
        return node
    # VCD is treated like apposition
    elif node.tag.startswith('VCD'):
        # Each verbal kid shares the parent category; others get their own.
        if has_verbal_tag(node[0]):
            node[0].category = node.category
        else:
            node[0].category = ptb_to_cat(node[0])
        node.kids[0] = label(node[0])
        if has_verbal_tag(node[1]):
            node[1].category = node.category
        else:
            node[1].category = ptb_to_cat(node[1])
        node.kids[1] = label(node[1])
        return node
    elif node.tag.startswith('VRD'):
        return label_right_adjunction(node)
    # must be above is_apposition, because there exist NP-APP:a ETC:& cases
    elif is_etc(node):
        return label_etc_head_final(node)
    elif is_S_NP_apposition(node):
        return rename_category_while_labelling_with(label_head_final, node,
            BareN if node.category == NP else node.category)
    elif (node.count() == 1 or
          is_topicalisation(node) or
          is_topicalisation_without_gap(node) or
          is_apposition(node) or
          is_argument_cluster(node) or
          is_modification(node)):
        node.kids[0] = label(node[0])
        if node.count() > 1:
            node.kids[1] = label(node[1])
        return node
    elif is_partial_ucp(node):
        return label_partial_coordination(node, ucp=True)
    elif is_ucp(node):
        return label_coordination(node, ucp=True)
    elif is_predication(node):
        return label_predication(node)
    elif is_left_absorption(node):
        return label_left_absorption(node)
    elif is_right_absorption(node):
        return label_right_absorption(node)
    elif is_right_adjunction(node): # (:h :a), for aspect particles
        return label_right_adjunction(node)
    elif is_partial_coordination(node):
        return label_partial_coordination(node)
    elif is_coordination(node):
        return label_coordination(node)
    elif is_np_structure(node):# and not node[0].tag.startswith('IP-APP'):
        return rename_category_while_labelling_with(
            label_np_structure, node, BareN,
            when=lambda category: category == NP)
    elif is_np_internal_structure(node):
        return label_np_internal_structure(node)
    elif is_punctuation_headed(node):
        return label_head_final(node)
    elif is_adjunction(node):
        return label_adjunction(node)
    elif is_head_final(node):
        return label_head_final(node)
    elif is_head_initial(node):
        return label_head_initial(node)
    else:
        warn("Node did not match any known patterns -- assuming adjunction: %s",
             node.__repr__(suppress_lex=True))
        return label_adjunction(node)
def spans(ptb_tree):
    '''Returns a sequence of tuples (B, E, P), P in ("``", "`"), where the Bth token from the start, and the Eth token from the end of the given PTB derivation span a P-quoted portion of the text.

    B is None for a close quote with no matching open quote (span assumed to
    start at the beginning of the string); E is None for an open quote with no
    matching close quote (span assumed to run to the end of the string).'''
    leaf_nodes = [ leaf for leaf in leaves(ptb_tree) if not is_ignored(leaf, ignoring_quotes=False) ]
    # TODO: do this without incurring another full pass through the full nodes list
    leaf_nodes_without_quotes = [ leaf for leaf in leaf_nodes if not is_ignored(leaf, ignoring_quotes=True) ]
    leaf_count = len( leaf_nodes_without_quotes) # should be equal to the CCG leaf count
    result = []
    quote_stack = []
    # index counts only non-quote leaves, i.e. CCGbank token positions.
    index = 0
    for leaf in leaf_nodes:
        # Push open quote
        if leaf.lex in ("``", "`"):
            quote_stack.append((leaf.lex, index))
        elif (leaf.tag not in ( "POS", ":" ) # The check for colon is to maintain derivation 21:61(24), which contains
              and leaf.lex in ("''", "'")): # an erroneously tagged single close quote.
            # Pop open quote and match with close quote
            if quote_stack:
                open_quote, span_begin = quote_stack.pop()
                if (open_quote == "``" and leaf.lex != "''" or
                    open_quote == "`" and leaf.lex != "'"):
                    warn("Unbalanced quotes, abandoning.")
                    break
                # We treat the span end index as leaf_count-index, not that minus one,
                # because when we encounter the close quote, we are already one index
                # past the end of the quoted span.
                result.append((span_begin, leaf_count - index, open_quote))
            # Quote stack is empty, assume quoted span starts from beginning of string
            else:
                if leaf.lex == "''": quote_type = "``"
                elif leaf.lex == "'": quote_type = "`"
                else: err("spans: should not reach")
                result.append((None, leaf_count - index, quote_type))
        # Only advance the index for a leaf corresponding to a CCGbank leaf
        else:
            index += 1
    # While open quotes are still on the stack, assume quoted span continues to end of string
    while quote_stack:
        remaining_quote, span_begin = quote_stack.pop()
        if remaining_quote in ("``", "`"):
            result.append((span_begin, None, remaining_quote))
        else:
            warn("Unexpected quote %s after exhausting input.", remaining_quote)
    return result
def main(argv):
    '''Entry point: for each requested PTB derivation file, locates the
    corresponding CCGbank AUTO and PARG files and processes them into
    opts.outdir, creating the output directory layout as needed.'''
    parser = OptionParser()
    register_builtin_switches(parser)
    opts, args = parser.parse_args(argv)
    if not all_required_args_present(opts):
        parser.print_help()
        sys.exit(1)
    # Choose the quoting strategy and optional punctuation treatment from
    # the command-line switches.
    quoter_class = {'span': SpanQuoter, 'lca': LCAQuoter}[opts.quote_method]
    punct_class = {
        'swap': SwapComma,
        'shift': ShiftComma
    }.get(opts.punct_method, None)
    quoter = quoter_class(punct_class)
    remaining_args = args[1:]
    if not remaining_args:
        # If no sec/doc specifiers are given, assume 'all sections all documents'
        remaining_args.append(':')
    ptb_files_spec = parse_requested_derivs(remaining_args)
    for sec_glob, doc_glob in ptb_files_spec:
        for ptb_file in glob(
                os.path.join(opts.penn_in, sec_glob,
                             "wsj_%s%s.mrg" % (sec_glob, doc_glob))):
            info("Processing %s", ptb_file)
            matches = PTBFileRegex.search(ptb_file)
            if matches and len(matches.groups()) == 2:
                sec, doc = matches.groups()
                # Corresponding CCGbank derivation and dependency files.
                ccg_file = os.path.join(opts.ccg_in, 'AUTO', sec,
                                        "wsj_%s%s.auto" % (sec, doc))
                deps_file = os.path.join(opts.ccg_in, 'PARG', sec,
                                         "wsj_%s%s.parg" % (sec, doc))
                if not opts.quiet:
                    if not os.path.exists(ccg_file):
                        warn(
                            "No corresponding CCGbank file %s for Penn file %s",
                            ccg_file, ptb_file)
                    if not os.path.exists(deps_file):
                        warn(
                            "No corresponding CCGbank dependency file %s for CCG file %s",
                            deps_file, ccg_file)
                # Mirror the AUTO/PARG layout under the output directory.
                ccg_auto_dir, ccg_parg_dir = [
                    os.path.join(opts.outdir, part, sec)
                    for part in ('AUTO', 'PARG')
                ]
                if not os.path.exists(ccg_auto_dir): os.makedirs(ccg_auto_dir)
                if not os.path.exists(ccg_parg_dir): os.makedirs(ccg_parg_dir)
                ccg_auto_out, ccg_parg_out = (os.path.join(
                    ccg_auto_dir, 'wsj_%s%s.auto' % (sec, doc)),
                                              os.path.join(
                                                  ccg_parg_dir,
                                                  'wsj_%s%s.parg' % (sec, doc)))
                process(ptb_file, ccg_file, deps_file, ccg_auto_out,
                        ccg_parg_out, opts.higher, opts.quotes, quoter)
            else:
                warn("Could not find, so ignoring %s", ptb_file)
def mkdeps(root, postprocessor=identity):
    """Extract word-word dependencies from a single CCG derivation.

    Walks the derivation tree *root* bottom-up, assigning head variables
    (slots) to category nodes according to the combinator that licensed each
    internal node, then reads dependencies off the argument slots of every
    leaf (and of categories produced by unary rules).

    :param root: the derivation tree; leaves carry .lex and .cat
        (assumed project parse-node types -- not constructible here).
    :param postprocessor: applied to each dependency endpoint before it is
        added to the result (default: identity).
    :return: a set of (left lex, right lex, head category, slash label)
        tuples.
    Side effects: mutates leaf.lex and node categories in place, and adds
    any unrecognised combinator names to the module-global `unanalysed`.
    """
    for i, leaf in enumerate(leaves(root)):
        # Uniquify each leaf with an index
        leaf.lex += IndexSeparatorTemplate % i
        # Apply the left to right slash labelling
        # (we abuse this to refer to slots, not slashes)
        leaf.cat.parg_labelled()
        # Populate the outermost (_) variable of each leaf
        leaf.cat.slot.head.lex = leaf.lex

    # First pass: label each parent from its children.
    for (l, r, p) in pairs_postorder(root):
        _label_result(l, r, p)

    global unanalysed

    # Categories introduced by unary rules; they also contribute dependencies.
    unaries = []

    # Second pass: for each local tree, identify the combinator and propagate
    # slot variables from children (L, R) to parent (P) accordingly.
    for l, r, p in pairs_postorder(root):
        # L/R/P are the categories of the nodes (None-safe via `x and x.cat`).
        L, R, P = map(lambda x: x and x.cat, (l, r, p))
        comb = analyse(L, R, P)
        if not comb:
            debug("Unrecognised rule %s %s -> %s", L, R, P)

        # Pairs (dest, src) produced by unify(); consumed after the dispatch.
        unifier = []

        if config.debug:
            debug("%s %s %s (%s)", L, R, P, str(comb))

        if comb == 'fwd_appl':  # [Xx/Yy]l Yy -> Xx
            unifier = unify(L.right, R)
            p.cat = L.left

        elif comb == 'bwd_appl':  # Yy [Xx\Yy]r -> Xx
            unifier = unify(L, R.right)
            p.cat = R.left

        # Pro-drops which drop their outer argument
        # [(S_\NPy)_/NPx]_ -> [S_\NPy]_
        elif comb in ('object_prodrop', 'vp_vp_object_prodrop',
                      'yi_subject_prodrop', 'vp_modifier_subject_prodrop'):
            p.cat = L.left

        # [Xx/Yy]l [Yy/Zz]r -> [Xx/Zz]r
        elif comb == 'fwd_comp':  # X/Y Y/Z -> X/Z
            if is_rooted_in(Sdcl, L, respecting_features=True):
                P.slot = L.slot
            else:
                P.slot = R.slot  # lexical head comes from R (Y/Z)
            P.slot.var = fresh_var(prefix='K')
            unifier = unify(L.right, R.left)
            p.cat._left = L.left
            p.cat._right = R.right

        # [Yy\Zz]l [Xx\Yy]r -> [Xx\Zz]l
        elif comb == 'bwd_comp':  # Y\Z X\Y -> X\Z
            if is_rooted_in(Sdcl, R, respecting_features=True):
                P.slot = R.slot
            else:
                P.slot = L.slot  # lexical head comes from L (Y\Z)
            P.slot.var = fresh_var(prefix='K')
            unifier = unify(R.right, L.left)
            p.cat._left = R.left
            p.cat._right = L.right

        elif comb in ('s_np_apposition', 'vp_np_apposition'):
            # { S[dcl], S[dcl]\NP } NPy -> NPy
            P.slot = R.slot  # = copy_vars
            unifier = unify(P, R)

        # NP NP -> N/N
        elif comb == 'np_np_to_nfn_apposition':
            # do the same as NP NP -> NP, except fill in the vars Ny/Ny
            P.right.slot.var = fresh_var(prefix='N')
            P.left.slot = P.right.slot
            register_unary(unaries, p, L.slot.head.lex)
            make_set_head_from(l, r, p)

        elif comb in ('conjoin', 'np_np_apposition'):  # X X[conj] -> X
            make_set_head_from(l, r, p)

        elif comb in ('conj_absorb', 'conj_comma_absorb'):  # conj X -> X[conj]
            copy_vars(frm=R, to=P)
            unify(P, R)  # R.slot.head = P.slot.head

        elif comb == 'funny_conj':  # conj X -> X
            p.cat = R

        elif comb == 'nongap_topicalisation':  # {N, NP, S[dcl], QP}x -> [Sy/Sy]x
            P.slot = L.slot
            P.right.slot.var = fresh_var()
            P.left.slot = P.right.slot
            register_unary(unaries, p, L.slot.head.lex)

        elif comb in ('np_gap_topicalisation', 's_gap_topicalisation',
                      'qp_gap_topicalisation'):  # NPx -> [ Sy/(Sy/NPx)y ]y
            P.right.right.slot = L.slot
            P.slot.var = fresh_var()
            P.left.slot = P.right.left.slot = P.right.slot = P.slot

        elif comb == 'subject_prodrop':
            # (S[dcl]y\NPx)y -> S[dcl]y | [(S[dcl]y\NPx)y/NPz]y -> (S[dcl]y/NPz)y
            if P == parse_category(r'S[dcl]'):
                P.slot = L.slot
            elif P == parse_category(r'S[dcl]/NP'):
                P.slot = P.left.slot = L.slot
                P.right.slot = L.right.slot
            else:
                warn("Invalid parent category %s for subject prodrop.", P)

        elif comb == 'fwd_xcomp':  # [Xx/Yy]l [Yy\Zz]r -> [Xx/Zz]r
            if is_rooted_in(Sdcl, L, respecting_features=True):
                P.slot = L.slot
            else:
                P.slot = R.slot  # lexical head comes from R (Y/Z)
            P.slot.var = fresh_var(prefix='K')
            unifier = unify(L.right, R.left)
            p.cat._left = L.left
            p.cat._right = R.right

        elif comb == 'bwd_xcomp':  # [Yy/Zz]l [Xx\Yy]r -> [Xx/Zz]l
            if is_rooted_in(Sdcl, R, respecting_features=True):
                P.slot = R.slot
            else:
                P.slot = L.slot  # lexical head comes from L (Y\Z)
            # P.slot = L.slot
            P.slot.var = fresh_var(prefix='K')
            unifier = unify(R.right, L.left)
            p.cat._left = R.left
            p.cat._right = L.right

        elif comb == 'bwd_r1xcomp':
            # [(Yy/Zz)k/Ww]l [Xx\Yy]r -> [(Xx\Zz)k/Ww]l
            # TODO: where should P's lexical head come from? L or R?
            unifier = unify(L.left.left, R.right)
            p.cat._left._left = R.left
            p.cat._left._right = L.left.right
            p.cat._right = L.right

        elif comb in ('fwd_raise', 'bwd_raise'):  # Xx -> [ Tf|(Tf|Xx)f ]f
            # Three special-cased raise targets, then the generic case.
            if P == parse_category(r'(S[dcl]\NP)\((S[dcl]\NP)/(S[dcl]\NP))'):
                # (S[dcl]y\NPz)y -> [ (S[dcl]f\NPg)f/((S[dcl]f\NPg)f\(S[dcl]y\NPz)y)f ]f
                P.left.slot.var = P.left.left.slot.var = P.right.slot.var = P.slot.var = fresh_var()  # f
                P.left.right.slot.var = fresh_var()  # g
                copy_vars(frm=P.left, to=P.right.left)
                copy_vars(frm=L, to=P.right.right)
                unifier = unify(L, P.right.right)
            elif P == parse_category(r'((S[dcl]\NP)/QP)\(((S[dcl]\NP)/QP)/NP)'):
                # NPy -> [ ((S[dcl]v\NPw)v/QPz)v \ ( ((S[dcl]v\NPw)v/QPz)v/NPy )v ]v
                P.slot.var = fresh_var()
                P.left.slot = P.right.slot = \
                    P.left.left.slot = P.left.left.left.slot = \
                    P.right.left.slot = P.right.left.left.slot = \
                    P.right.left.left.left.slot = P.slot  # v
                # P.right.right.slot = fresh_var() # y
                P.right.right.slot = L.slot
                P.left.right.slot.var = fresh_var('Z')
                P.right.left.right.slot = P.left.right.slot  # z
                P.left.left.right.slot.var = fresh_var('W')
                P.right.left.left.right.slot = P.left.left.right.slot  # w
                unifier = unify(L, P.right.right)
            elif P == parse_category(r'(S[dcl]\NP)\((S[dcl]\NP)/QP)'):
                # QPy -> [ (S[dcl]v\NPz)v \ ((S[dcl]v\NPz)v/QPy)v ]v
                P.slot.var = fresh_var()
                P.left.slot = P.left.left.slot = \
                    P.right.slot = P.right.left.slot = P.right.left.left.slot = P.slot  # v
                # P.right.right.slot = fresh_var() # y
                P.right.right.slot = L.slot
                P.left.right.slot.var = fresh_var('Z')
                P.right.left.right.slot = P.left.right.slot  # z
                unifier = unify(L, P.right.right)
            else:
                # Generic type-raising: T and both slashes share one fresh var.
                P.slot.var = fresh_var()
                P.right.left.slot = P.left.slot = P.right.slot = P.slot
                P.right.right.slot = L.slot
                unifier = unify(L, P.right.right)

        elif comb == 'np_typechange':
            P.slot = L.slot  # = copy_vars
            unifier = unify(P, L)

        elif comb == 'lcp_np_typechange':
            P.slot = L.slot
            unifier = unify(P, L)

        elif comb in ('lcp_sfs_typechange', 'lcp_nfn_typechange'):
            P.left.slot.var = fresh_var()
            P.right.slot = P.left.slot
            P.slot = L.slot
            register_unary(unaries, p, L.slot.head.lex)

        elif comb == 'lcp_sbnpfsbnp_typechange':  # [(Sy\NPz)y/(Sy\NPz)y]_
            P.left.slot.var = fresh_var()
            P.left.left.slot = P.right.left.slot = P.right.slot = P.left.slot
            register_unary(unaries, p, L.slot.head.lex)

        elif comb == 'null_relativiser_typechange':  # Xy -> (Nf/Nf)y
            P.slot = L.slot
            if P == _NfN:
                P.left.slot.var = fresh_var()
                P.right.slot = P.left.slot
                register_unary(unaries, p, L.slot.head.lex)
            elif P == _NfNfNfN:
                P.left.slot.var = fresh_var()
                P.left.left.slot.var = fresh_var(prefix="G")
                P.left.right.slot = P.left.left.slot
                P.right.slot = P.left.slot
                register_unary(unaries, p, L.slot.head.lex)
            else:
                warn("Unhandled null relativiser typechange: %s -> %s", L, P)

        # [NP/NP]y -> NPy
        elif comb == 'de_nominalisation':
            P.slot = L.slot
            register_unary(unaries, p, L.slot.head.lex)

        # {M, QP}y -> (Nf/Nf)y
        elif comb == 'measure_word_number_elision':
            P.slot = L.slot
            P.left.slot.var = fresh_var()
            P.right.slot = P.left.slot
            register_unary(unaries, p, L.slot.head.lex)

        elif comb == 'l_punct_absorb':  # , X -> X[conj]
            # need to put conj feature back on parent
            p.cat = R.clone_adding_feature('conj')

        elif comb == 'r_punct_absorb':
            p.cat = L

        elif R and L == R and is_rooted_in(parse_category('S'), L):  # VCD (stopgap)
            make_set_head_from(l, r, p)

        else:
            debug('Unhandled combinator %s (%s %s -> %s)', comb, L, R, P)
            unanalysed.add(comb)
            # Fall back to taking the head from whichever child exists.
            P.slot = R.slot if R else L.slot

        for (dest, src) in unifier:
            if isinstance(src, (basestring, list)):
                # Fake bidirectional unification:
                # -------------------------------
                # If variable X has been unified with value v,
                # rewrite all mentions of v in the output category to point to variable X
                # (v is uniquified by concatenating it with an ID, so this should hold)
                for subcat in p.cat.nested_compound_categories():
                    if subcat.slot.head.lex == src:
                        subcat.slot = dest.slot

        if config.debug:
            debug("> %s" % p.cat)
            debug('---')

        if config.fail_on_unassigned_variables:
            assert no_unassigned_variables(p.cat), "Unassigned variables in %s" % p.cat

    if config.debug:
        debug('unaries: %s', unaries)

    # Collect deps from arguments: peel argument slots off each (leaf or
    # unary-rule) category from the outside in.
    deps = []
    for l in chain(leaves(root), unaries):
        if config.debug:
            debug("%s %s", l, l.cat)
        C = l.cat
        while not C.is_leaf():
            arg = C.right
            if arg.slot.head.filler:  # and not l.cat.left.slot == l.cat.right.slot):
                # print "%s %s %s %s %s %s" % (C.slot.head.lex, C, arg.slot.head.lex, arg, l.cat, C.label)
                if C.label is None:
                    warn("Dependency generated on slash without label: %s %s", C, arg)
                deps.append((C.slot.head.lex, arg.slot.head.lex, l.cat, C.label))
            # Modifier categories stop contributing after their outermost slash.
            if is_modifier(C):
                break
            C = C.left

    # Produce dep pairs: heads may be sets (coordination), so expand every
    # left/right combination. seqify presumably wraps a lone head in a
    # sequence -- TODO confirm.
    result = set()
    for depl, depr, head_cat, head_label in deps:
        for sdepl in set(seqify(depl)):
            for sdepr in set(seqify(depr)):
                if not (sdepl and sdepr):
                    debug("Dependency with None: %s %s", sdepl, sdepr)
                    continue
                result.add((postprocessor(sdepl), postprocessor(sdepr), head_cat, head_label))

    if config.debug:
        for line in write_deps(result):
            debug(line)

    return result