def testEquality(self):
    '''Checks that == and != on categories ignore features but respect slash direction.

    cat2 has the same shape as cat1 with feature annotations added, so the two
    must compare equal under == while equal_respecting_features tells them
    apart. cat3 carries different slash directions and must compare unequal.
    '''
    cat1 = parse_category("((A\\B)/(C/D))/((A/B)/C)")
    cat2 = parse_category("((A[f]\\B[g])/(C[h]/D[i])[j])/((A[k]/B[l])/C[m])[n]")
    cat3 = parse_category("((A[f]/B[g])\\(C[h]/D[i])[j])\\((A[k]/B[l])/C[m])[n]")

    # assertFalse/assertTrue replace the deprecated failIf/assert_ aliases.
    # The comparison operators are spelled out (rather than using
    # assertEqual/assertNotEqual) so that exactly __eq__ and __ne__ are exercised.
    self.assertFalse(cat1.equal_respecting_features(cat2))
    self.assertTrue(cat1 == cat2)
    self.assertTrue(cat1 != cat3)
def process_annotator_into_substs(self, fn):
    '''Reads the annotator file _fn_ and returns the category substitutions it describes.

    Every line of the file must hold exactly three whitespace-separated fields:
    a category string, a replacement mode string and a slash index. The
    characters - . * @ are removed from the category string before it is used
    as a key.

    Returns a dict mapping each stripped category string to the category parsed
    from it, in which every slash named in the file has had its mode set to the
    index given by self.mode_string_to_index.

    Raises FilterException if a line does not have exactly three fields.
    '''
    substs = {}
    slashes = defaultdict(set)

    with open(fn, 'r') as f:
        # Count from 1 so the error message reports a human-readable line number.
        for lineno, line in enumerate(f, 1):
            fields = line.split()
            if len(fields) != 3:
                # Name the file actually being read.
                raise FilterException(
                    "Missing field at line %d of annotator file %s." % (lineno, fn))

            category_string, replacement_mode_string, slash_index = fields
            bare_category = re.sub(r'[-.*@]', '', category_string)
            mode_index = self.mode_string_to_index(replacement_mode_string)

            debug("Slash %s of %s goes to %s=%d",
                slash_index, bare_category, replacement_mode_string, mode_index)
            slashes[bare_category].add((int(slash_index), mode_index))

    for category_string, replacements in slashes.items():
        moded_category = parse_category(category_string)
        moded_category.labelled()

        for subcategory, slash_index in moded_category.slashes():
            # Each replacement is an (index, mode) pair; pick the one for this slash.
            result = find(lambda pair: pair[0] == slash_index, replacements)
            if result:
                _, replacement_mode = result
                debug("Setting mode of slash %s of %s to %s",
                    slash_index, moded_category, replacement_mode)
                subcategory.mode = replacement_mode

        substs[category_string] = moded_category

    return substs
def test_nested_compounds(self):
    '''Checks the order and content of nested_compound_categories() on a nested category.'''
    category = parse_category('((A\\.B)/.(C/.D))/.((A/.B)/.C)')

    # The whole category comes first, followed by its compound sub-categories.
    expected = [
        "((A\\.B)/.(C/.D))/.((A/.B)/.C)",
        "(A\\.B)/.(C/.D)",
        "A\\.B",
        "C/.D",
        "(A/.B)/.C",
        "A/.B",
    ]
    actual = [str(compound) for compound in category.nested_compound_categories()]

    self.assertEqual(actual, expected)
def load_splitdef_file(self, splitdef_file):
    '''Loads a split definition file.

    The file has two sections separated by a line consisting of a single "%":
      - before the separator, each line is "<category> <slash index>";
      - after it, each line is "<old category> <new category>".
    Blank lines are ignored in both sections.

    Returns a pair (cats_to_split, permitted_cats):
      - cats_to_split is a list of (parsed category, int slash index) tuples;
      - permitted_cats maps each old category string, with the characters
        - . * @ removed, to the list of parsed new categories given for it.

    Raises ValueError if a non-blank line does not have exactly two fields.
    '''
    cats_to_split = []
    permitted_cats = defaultdict(list)

    reading_splits = True
    with open(splitdef_file, 'r') as def_file:
        for line in def_file:
            line = line.rstrip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue

            if line == "%":
                reading_splits = False
                continue

            if reading_splits:
                cat, slash = line.split()
                cats_to_split.append((parse_category(cat), int(slash)))
            else:
                old, new = line.split()
                old = re.sub(r'[-.*@]', '', old)
                permitted_cats[old].append(parse_category(new))

    return cats_to_split, permitted_cats
def process(deriv, locator, instr):
    '''Processes one script instruction, given the derivation on which it is to operate,
    a locator identifying a node as the focus of the operation, and the instruction itself.

    _locator_ is a sequence of child indices leading from _deriv_ to the focus;
    its last element may be 'e' (used by the insert instruction to mean "append").
    The caller's locator is not modified.

    Recognised instructions (anything else leaves the derivation untouched):
      d                                   delete the focus
      l=LEX                               set the focus's lexical item
      t=TAG                               set the focus's POS tag (PTB node)
      c=CAT                               set the focus's category (CCGbank node)
      C=cat|pos1|pos2|lex|catfix          set several attributes; empty fields are skipped
      i=tag|lex                           insert a PTB leaf at the focus position
      P=cat|pos1|lex|catfix|parent_cat    prepend a CCGbank absorption leaf
      A=cat|pos1|lex|catfix|parent_cat    append a CCGbank absorption leaf
      S                                   shrink an absorption node

    Returns the derivation, which is a new root if the instruction installed one.

    Raises ValueError if an instruction's payload has the wrong number of fields,
    and SurgeryException if 'S' is applied to a node which is not an absorption.
    '''
    locator = locator[:] # make a copy so that pop() does not consume the caller's list
    last_locator = locator.pop()

    # Walk down to the parent of the focus.
    cur_node = deriv
    for kid_index in locator:
        check_index(cur_node.kids, kid_index)
        cur_node = cur_node.kids[kid_index]

    check_index(cur_node.kids, last_locator)
    if last_locator != 'e':
        print("Locator names leaf %s" % cur_node.kids[last_locator])

    # Payloads are split on the first '=' only, so that a value which itself
    # contains '=' (e.g. a lexical item) is accepted.
    if instr == "d":
        # TODO: handle deleting the last kid, have to recursively delete. but PTB nodes have no parent ptr
        # otherwise assume this won't be done.
        # or we can have a node with empty kids yield an empty string representation
        del cur_node.kids[last_locator]

    elif instr.startswith("l"):
        # Sets the lexical item of a PTB node or CCGbank node.
        _, new_lex = instr.split('=', 1)
        cur_node.kids[last_locator].lex = new_lex

    elif instr.startswith("t"):
        # Sets the POS tag of a PTB node.
        _, new_tag = instr.split('=', 1)
        cur_node.kids[last_locator].tag = new_tag

    elif instr.startswith("c"):
        # Sets the category of a CCGbank node.
        _, new_cat = instr.split('=', 1)
        cur_node.kids[last_locator].cat = parse_category(new_cat)

    elif instr.startswith("C"):
        # Sets any of the five leaf attributes at once.
        # NOTE(review): unlike 'c', the cat field is stored as the raw string - confirm this is intended.
        _, new_bits = instr.split('=', 1)
        cat, pos1, pos2, lex, catfix = new_bits.split('|')

        focus = cur_node.kids[last_locator]
        for attr, value in zip(('cat', 'pos1', 'pos2', 'lex', 'catfix'),
                               (cat, pos1, pos2, lex, catfix)):
            if value: # empty value for a field means do not change the field's value
                setattr(focus, attr, value)

    elif instr.startswith("i"):
        # Insert PTB leaf node.
        _, tag_and_lex = instr.split('=', 1)
        tag, lex = tag_and_lex.split('|')

        new_leaf = penn.Leaf(tag, lex)
        if last_locator == 'e':
            cur_node.kids.append(new_leaf)
        else:
            cur_node.kids.insert(last_locator, new_leaf)

    elif instr.startswith(('P', 'A')):
        # Prepend or append CCGbank absorption leaf node. Instruction is of the form
        # P=leaf_cat|leaf_pos1|leaf_lex|leaf_catfix|parent_cat   (or A=... to append)
        # If you leave parent_cat empty then absorption is assumed (the parent takes the focus's category).
        prepend = instr.startswith('P')

        _, commalist = instr.split('=', 1)
        cat, pos1, lex, catfix, parent_cat = commalist.split('|')
        cat = parse_category(cat)

        # The single POS tag given is used for both POS fields of the leaf.
        new_leaf = ccg.Leaf(cat, pos1, pos1, lex, catfix) # No parent

        focus = cur_node.kids[last_locator]
        # node_prepend and append return None if no new root was installed, or the new root otherwise
        # NOTE(review): a given parent_cat is passed on as an unparsed string - confirm the helpers accept that.
        attach = node_prepend if prepend else node_append
        maybe_new_root = attach(focus, new_leaf, parent_cat or focus.cat)

        # Install a new root if one was created
        if maybe_new_root:
            deriv = maybe_new_root

    elif instr.startswith('S'):
        # Shrink absorption
        focus = cur_node.kids[last_locator]

        if focus.lch.is_leaf() and focus.rch.cat == focus.cat:
            maybe_new_root = shrink(focus, left_is_leaf=True)
        elif focus.rch.is_leaf() and focus.lch.cat == focus.cat:
            maybe_new_root = shrink(focus, left_is_leaf=False)
        else:
            raise SurgeryException("The focused node must be an instance of absorption (X T -> T or T X -> T).")

        if maybe_new_root:
            deriv = maybe_new_root

    return deriv
# NOTE(review): everything down to __mod__ belongs to a class whose header lies
# outside this excerpt (its methods build ComplexCategory instances); the nesting
# under that class could not be recovered from the collapsed line.
def label_text(self):
    '''Returns this category's slash, escaped for use in a regular expression.'''
    return re.escape(self.slash)

# Fixed answers built with const_ (defined elsewhere).
is_leaf = const_(False)
is_complex = const_(True)

def __or__(self, right):
    r'''Constructs the complex category (self \ right).'''
    # The docstring is a raw string: "\ " is not a valid escape sequence.
    return ComplexCategory(self, BACKWARD, right)

def __div__(self, right):
    '''Constructs the complex category (self / right).'''
    # NOTE: __div__ is only consulted by Python 2's classic division operator.
    return ComplexCategory(self, FORWARD, right)

def __mod__(self, right):
    '''Constructs the complex category joining self and right with the BAR slash.'''
    return ComplexCategory(self, BAR, right)

if __name__ == '__main__':
    # Self-test: labelling the slashes of each category must give the expected rendering.
    from munge.cats.parse import parse_category

    expected_labellings = {
        '(A/B)/C': '(A/1B)/2C',
        '(A/(B/D))/C': '(A/1(B/D))/2C',
        '((A/B)/D)/C': '((A/1B)/2D)/3C',
        'A/(B/C)': 'A/1(B/C)'
    }
    for cat, lab in expected_labellings.items():
        c = parse_category(cat)
        c.parg_labelled()

        labelled = c.__repr__(show_label=True)
        print(labelled)
        assert labelled == lab
# Tail of a method whose ``def`` line lies before this excerpt: it returns the
# slash escaped for use in a regular expression.
return re.escape(self.slash)

# Fixed answers built with const_ (defined elsewhere).
# NOTE(review): presumably const_(v) yields a callable that always returns v - confirm.
is_leaf = const_(False)
is_complex = const_(True)

def __or__(self, right):
    r"""Constructs the complex category (self \ right)."""
    return ComplexCategory(self, BACKWARD, right)

def __div__(self, right):
    """Constructs the complex category (self / right)."""
    return ComplexCategory(self, FORWARD, right)

def __mod__(self, right):
    """Constructs the complex category joining self and right with the BAR slash."""
    return ComplexCategory(self, BAR, right)

if __name__ == "__main__":
    # Self-test: each key is a category string and each value is the expected
    # rendering once parg_labelled() has been applied to the parsed category.
    from munge.cats.parse import parse_category
    for cat, lab in {
        "(A/B)/C": "(A/1B)/2C",
        "(A/(B/D))/C": "(A/1(B/D))/2C",
        "((A/B)/D)/C": "((A/1B)/2D)/3C",
        "A/(B/C)": "A/1(B/C)",
    }.iteritems():
        c = parse_category(cat)
        c.parg_labelled()
        # Show the labelled form, then check it against the expectation.
        print c.__repr__(show_label=True)
        assert c.__repr__(show_label=True) == lab
# The four basic atomic categories.
S, N, NP, PP = (AtomicCategory(name) for name in ("S", "N", "NP", "PP"))

# Punctuation tags, grouped by how they may combine. The left and right lists
# differ only in the quote tag they contain (`` versus '').
LeftAbsorbedPunctuationCats = [",", ".", "``", ":", ";", "LRB", "RRB"]
RightAbsorbedPunctuationCats = [",", ".", "''", ":", ";", "LRB", "RRB"]
ConjPunctuationCats = [",", ";", ":"]

if config.cn_puncts:
    # Additional punctuation tags, enabled through the cn_puncts configuration flag.
    LeftAbsorbedPunctuationCats.extend(
        "LCM LPA RPA LQU RQU LSQ RSQ LTL RTL LCD RCD LCS RCS DSH SLS ? !".split())
    RightAbsorbedPunctuationCats.extend(
        "LCM LPA RPA LQU RQU LSQ RSQ LTL RTL LCD RCD LCS RCS DSH SLS ? !".split())
    ConjPunctuationCats.append("LCM")

# Frequently used categories. In the names, 'b' stands for a backslash and
# 'f' for a forward slash.
(SbNP, SfNP, NPbNP, NPfNP, NbN, NfN, SbNPbSbNP,
 SbS, SfS, SbNPfSbNP, conj) = [parse_category(cat) for cat in (
    r'S\NP', r'S/NP', r'NP\NP', r'NP/NP', r'N\N', r'N/N', r'(S\NP)\(S\NP)',
    r'S\S', r'S/S', r'(S\NP)/(S\NP)', 'conj',
)]

# Chinese topicalised cats
SfSfNP = parse_category(r'S/(S/NP)')
SfSfS = parse_category(r'S/(S/S)')

QP = parse_category('QP')
SbNPfNP = parse_category(r'(S\NP)/NP')
SdclbNPfNP = parse_category(r'(S[dcl]\NP)/NP')

# Categories carrying features.
Sq = parse_category('S[q]')
Sdcl = parse_category('S[dcl]')
Swq = parse_category('S[wq]')
SadjbNP = parse_category(r'S[adj]\NP')
SdclbNP = parse_category(r'S[dcl]\NP')
Sfrg = parse_category(r'S[frg]')
Nnum = parse_category(r'N[num]')

# Defines a short name for converting a category string to a category representation.
def process(deriv, locator, instr):
    '''Processes one script instruction, given the derivation on which it is to operate,
    a locator identifying a node as the focus of the operation, and the instruction itself.

    _locator_ is a sequence of child indices leading from _deriv_ to the focus;
    its last element may be 'e' (used by the insert instruction to mean "append").
    The caller's locator is not modified.

    Recognised instructions (anything else leaves the derivation untouched):
      d                                   delete the focus
      l=LEX                               set the focus's lexical item
      t=TAG                               set the focus's POS tag (PTB node)
      c=CAT                               set the focus's category (CCGbank node)
      C=cat|pos1|pos2|lex|catfix          set several attributes; empty fields are skipped
      i=tag|lex                           insert a PTB leaf at the focus position
      P=cat|pos1|lex|catfix|parent_cat    prepend a CCGbank absorption leaf
      A=cat|pos1|lex|catfix|parent_cat    append a CCGbank absorption leaf
      S                                   shrink an absorption node

    Returns the derivation, which is a new root if the instruction installed one.

    Raises ValueError if an instruction's payload has the wrong number of fields,
    and SurgeryException if 'S' is applied to a node which is not an absorption.
    '''
    locator = locator[:] # make a copy so that pop() does not consume the caller's list
    last_locator = locator.pop()

    # Walk down to the parent of the focus.
    cur_node = deriv
    for kid_index in locator:
        check_index(cur_node.kids, kid_index)
        cur_node = cur_node.kids[kid_index]

    check_index(cur_node.kids, last_locator)
    if last_locator != 'e':
        print("Locator names leaf %s" % cur_node.kids[last_locator])

    # Payloads are split on the first '=' only, so that a value which itself
    # contains '=' (e.g. a lexical item) is accepted.
    if instr == "d":
        # TODO: handle deleting the last kid, have to recursively delete. but PTB nodes have no parent ptr
        # otherwise assume this won't be done.
        # or we can have a node with empty kids yield an empty string representation
        del cur_node.kids[last_locator]

    elif instr.startswith("l"):
        # Sets the lexical item of a PTB node or CCGbank node.
        _, new_lex = instr.split('=', 1)
        cur_node.kids[last_locator].lex = new_lex

    elif instr.startswith("t"):
        # Sets the POS tag of a PTB node.
        _, new_tag = instr.split('=', 1)
        cur_node.kids[last_locator].tag = new_tag

    elif instr.startswith("c"):
        # Sets the category of a CCGbank node.
        _, new_cat = instr.split('=', 1)
        cur_node.kids[last_locator].cat = parse_category(new_cat)

    elif instr.startswith("C"):
        # Sets any of the five leaf attributes at once.
        # NOTE(review): unlike 'c', the cat field is stored as the raw string - confirm this is intended.
        _, new_bits = instr.split('=', 1)
        cat, pos1, pos2, lex, catfix = new_bits.split('|')

        focus = cur_node.kids[last_locator]
        for attr, value in zip(('cat', 'pos1', 'pos2', 'lex', 'catfix'),
                               (cat, pos1, pos2, lex, catfix)):
            if value: # empty value for a field means do not change the field's value
                setattr(focus, attr, value)

    elif instr.startswith("i"):
        # Insert PTB leaf node.
        _, tag_and_lex = instr.split('=', 1)
        tag, lex = tag_and_lex.split('|')

        new_leaf = penn.Leaf(tag, lex)
        if last_locator == 'e':
            cur_node.kids.append(new_leaf)
        else:
            cur_node.kids.insert(last_locator, new_leaf)

    elif instr.startswith(('P', 'A')):
        # Prepend or append CCGbank absorption leaf node. Instruction is of the form
        # P=leaf_cat|leaf_pos1|leaf_lex|leaf_catfix|parent_cat   (or A=... to append)
        # If you leave parent_cat empty then absorption is assumed (the parent takes the focus's category).
        prepend = instr.startswith('P')

        _, commalist = instr.split('=', 1)
        cat, pos1, lex, catfix, parent_cat = commalist.split('|')
        cat = parse_category(cat)

        # The single POS tag given is used for both POS fields of the leaf.
        new_leaf = ccg.Leaf(cat, pos1, pos1, lex, catfix) # No parent

        focus = cur_node.kids[last_locator]
        # node_prepend and append return None if no new root was installed, or the new root otherwise
        # NOTE(review): a given parent_cat is passed on as an unparsed string - confirm the helpers accept that.
        attach = node_prepend if prepend else node_append
        maybe_new_root = attach(focus, new_leaf, parent_cat or focus.cat)

        # Install a new root if one was created
        if maybe_new_root:
            deriv = maybe_new_root

    elif instr.startswith('S'):
        # Shrink absorption
        focus = cur_node.kids[last_locator]

        if focus.lch.is_leaf() and focus.rch.cat == focus.cat:
            maybe_new_root = shrink(focus, left_is_leaf=True)
        elif focus.rch.is_leaf() and focus.lch.cat == focus.cat:
            maybe_new_root = shrink(focus, left_is_leaf=False)
        else:
            raise SurgeryException("The focused node must be an instance of absorption (X T -> T or T X -> T).")

        if maybe_new_root:
            deriv = maybe_new_root

    return deriv
# Tail of a function whose ``def`` line lies before this excerpt; the original
# nesting of these two statements could not be recovered from the collapsed line.
subcat.features = []
return ret

# The four basic atomic categories.
S, N, NP, PP = (AtomicCategory(atom) for atom in "S N NP PP".split())

# Punctuation tags, grouped by how they may combine. The left and right lists
# differ only in the quote tag they contain (`` versus '').
LeftAbsorbedPunctuationCats = ", . `` : ; LRB RRB".split()
RightAbsorbedPunctuationCats = ", . '' : ; LRB RRB".split()
ConjPunctuationCats = ", ; :".split()
if config.cn_puncts:
    # Additional punctuation tags, enabled through the cn_puncts configuration flag.
    LeftAbsorbedPunctuationCats += "LCM LPA RPA LQU RQU LSQ RSQ LTL RTL LCD RCD LCS RCS DSH SLS ? !".split()
    RightAbsorbedPunctuationCats += "LCM LPA RPA LQU RQU LSQ RSQ LTL RTL LCD RCD LCS RCS DSH SLS ? !".split()
    ConjPunctuationCats.append("LCM")

# Frequently used categories. In the names, 'b' stands for a backslash and
# 'f' for a forward slash; the doubled backslashes are escapes in a non-raw string.
SbNP, SfNP, NPbNP, NPfNP, NbN, NfN, SbNPbSbNP, \
    SbS, SfS, SbNPfSbNP, conj = [parse_category(cat) for cat in '''S\\NP S/NP NP\\NP NP/NP N\\N N/N (S\\NP)\\(S\\NP) S\\S S/S (S\\NP)/(S\\NP) conj'''.split()]

# Chinese topicalised cats
SfSfNP, SfSfS = parse_category(r'S/(S/NP)'), parse_category(r'S/(S/S)')
QP = parse_category('QP')
SbNPfNP = parse_category(r'(S\NP)/NP')
SdclbNPfNP = parse_category(r'(S[dcl]\NP)/NP')

# Categories carrying features.
Sq, Sdcl = parse_category('S[q]'), parse_category('S[dcl]')
Swq = parse_category('S[wq]')
SadjbNP = parse_category(r'S[adj]\NP')
SdclbNP, Sfrg = parse_category(r'S[dcl]\NP'), parse_category(r'S[frg]')
Nnum = parse_category(r'N[num]')

# Defines a short name for converting a category string to a category representation.
def build_seq(self, iterable):
    '''Lazily converts (left, right, was_flipped) triples of category strings into categories.

    The left element is always parsed. A false right element (such as None or
    the empty string) is passed through unchanged instead of being parsed.
    The flag is yielded as given.
    '''
    for left, right, was_flipped in iterable:
        left_cat = parse_category(left)
        right_cat = parse_category(right) if right else right
        yield (left_cat, right_cat, was_flipped)