예제 #1
0
    def testEquality(self):
        """Check feature-insensitive and feature-sensitive category comparison.

        The same category with and without features must compare equal under
        ``==`` but not under ``equal_respecting_features``; a category whose
        slash directions differ must compare unequal.
        """
        plain = parse_category("((A\\B)/(C/D))/((A/B)/C)")
        featured = parse_category("((A[f]\\B[g])/(C[h]/D[i])[j])/((A[k]/B[l])/C[m])[n]")
        flipped = parse_category("((A[f]/B[g])\\(C[h]/D[i])[j])\\((A[k]/B[l])/C[m])[n]")

        # assertFalse/assertTrue replace the deprecated failIf/assert_ aliases.
        self.assertFalse(plain.equal_respecting_features(featured))
        # The operators are spelled out so that __eq__ and __ne__ are each
        # exercised explicitly.
        self.assertTrue(plain == featured)
        self.assertTrue(plain != flipped)
예제 #2
0
 def process_annotator_into_substs(self, fn):
     """Read the annotator file `fn` and build the category substitution table.

     Every line of the file must consist of exactly three whitespace-separated
     fields: a category string, a replacement mode string and a slash index.
     The characters ``-``, ``.``, ``*`` and ``@`` are stripped from the
     category string before it is used as a key.

     Returns a dict mapping each stripped category string to its parsed
     category, in which every slash named in the file has had its ``mode``
     set to the index of the requested replacement mode.

     Raises FilterException if a line does not have exactly three fields.
     """
     # stripped category string -> set of (slash index, mode index) pairs
     slashes = defaultdict(set)
     with open(fn, 'r') as f:
         # Count from 1 so the reported line number matches what an editor shows.
         for lineno, line in enumerate(f, 1):
             fields = line.split()
             if len(fields) != 3:
                 raise FilterException("Missing field at line %d of annotator file %s."
                                       % (lineno, self.anno_filename))

             category_string, replacement_mode_string, slash_index = fields
             bare_category = re.sub(r'[-.*@]', '', category_string)
             mode_index = self.mode_string_to_index(replacement_mode_string)

             debug("Slash %s of %s goes to %s=%d", slash_index, bare_category, replacement_mode_string, mode_index)
             slashes[bare_category].add((int(slash_index), mode_index))

     substs = {}
     for category_string, replacements in slashes.items():
         moded_category = parse_category(category_string)
         moded_category.labelled()

         for subcategory, slash_index in moded_category.slashes():
             # Look for a requested replacement whose slash index matches this slash.
             result = find(lambda pair: pair[0] == slash_index, replacements)
             if result:
                 _, replacement_mode = result
                 debug("Setting mode of slash %s of %s to %s", slash_index, moded_category, replacement_mode)
                 subcategory.mode = replacement_mode

         substs[category_string] = moded_category

     return substs
예제 #3
0
 def test_nested_compounds(self):
     """nested_compound_categories must yield the categories listed below, in this order."""
     expected = ["((A\\.B)/.(C/.D))/.((A/.B)/.C)",
                 "(A\\.B)/.(C/.D)", "A\\.B", "C/.D",
                 "(A/.B)/.C", "A/.B"]

     category = parse_category('((A\\.B)/.(C/.D))/.((A/.B)/.C)')
     rendered = []
     for nested in category.nested_compound_categories():
         rendered.append(str(nested))

     self.assertEqual(rendered, expected)
예제 #4
0
 def load_splitdef_file(self, splitdef_file):
     """Load a split definition file.

     The file has two sections separated by a line containing only ``%``.
     Lines before the separator are ``category slash_index`` pairs; lines
     after it are ``old_category new_category`` pairs.  Blank lines are
     ignored in both sections.

     Returns a tuple ``(cats_to_split, permitted_cats)``:
       * cats_to_split is a list of (parsed category, int slash index) pairs;
       * permitted_cats maps each old category string, with the characters
         ``-``, ``.``, ``*`` and ``@`` removed, to the list of parsed new
         categories given for it.

     Raises ValueError if a non-blank line does not have exactly two fields.
     """
     cats_to_split = []
     permitted_cats = defaultdict(list)

     reading_splits = True

     with open(splitdef_file, 'r') as def_file:
         for line in def_file:
             line = line.rstrip()

             # A blank line carries no definition; skip it instead of failing
             # on the two-field unpacking below.
             if not line:
                 continue

             if line == "%":
                 reading_splits = False
                 continue

             if reading_splits:
                 cat, slash = line.split()
                 cats_to_split.append((parse_category(cat), int(slash)))
             else:
                 old, new = line.split()
                 old = re.sub(r'[-.*@]', '', old)

                 permitted_cats[old].append(parse_category(new))

     return cats_to_split, permitted_cats
예제 #5
0
def process(deriv, locator, instr):
    '''Processes one script instruction and returns the resulting derivation root.

    deriv is the derivation to operate on. locator is a list of kid indices leading
    from the root to the focus node; its last element may be 'e', which the insert
    instruction treats as "append after the last kid". The list is copied, so the
    caller's locator is left untouched.

    instr is dispatched on its prefix:
      d                                 delete the focus node
      l=LEX                             set the focus node's lexical item
      t=TAG                             set the focus node's tag
      c=CAT                             set the focus node's category (parsed)
      C=cat|pos1|pos2|lex|catfix        set several fields; empty fields are left alone
      i=tag|lex                         insert a new PTB leaf at the focus position
      P=cat|pos1|lex|catfix|parent_cat  prepend a CCGbank leaf to the focus node
      A=cat|pos1|lex|catfix|parent_cat  append a CCGbank leaf to the focus node
      S                                 shrink the absorption at the focus node
    Only the first '=' separates the instruction from its payload, so the payload
    itself may contain '='. An instruction matching none of the above changes nothing.

    Returns deriv, or the new root if the P, A or S instruction installed one.
    Raises ValueError if a payload is missing or has the wrong number of fields, and
    SurgeryException if S is applied to a node which is not an absorption.'''
    locator = locator[:]  # make a copy
    last_locator = locator.pop()

    # Walk down to the parent of the focus node.
    cur_node = deriv
    for kid_index in locator:
        check_index(cur_node.kids, kid_index)
        cur_node = cur_node.kids[kid_index]

    check_index(cur_node.kids, last_locator)
    if last_locator != 'e':
        print("Locator names leaf %s" % cur_node.kids[last_locator])

    if instr == "d":
        # TODO: handle deleting the last kid, have to recursively delete. but PTB nodes have no parent ptr
        # otherwise assume this won't be done.
        # or we can have a node with empty kids yield an empty string representation
        del cur_node.kids[last_locator]
    elif instr.startswith("l"):
        # Sets the lexical item of a PTB node or CCGbank node.
        _, new_lex = instr.split('=', 1)
        cur_node.kids[last_locator].lex = new_lex
    elif instr.startswith("t"):
        # Sets the POS tag of a PTB node.
        _, new_tag = instr.split('=', 1)
        cur_node.kids[last_locator].tag = new_tag
    elif instr.startswith("c"):
        # Sets the category of a CCGbank node.
        _, new_cat = instr.split('=', 1)
        cur_node.kids[last_locator].cat = parse_category(new_cat)
    elif instr.startswith("C"):
        # Sets any subset of the fields of a CCGbank leaf.
        _, new_bits = instr.split('=', 1)
        cat, pos1, pos2, lex, catfix = new_bits.split('|')

        # NOTE(review): unlike the 'c' instruction, cat is stored here as the raw
        # string rather than a parsed category -- confirm this is intended.
        focus = cur_node.kids[last_locator]
        for attr, value in (('cat', cat), ('pos1', pos1), ('pos2', pos2),
                            ('lex', lex), ('catfix', catfix)):
            if value:  # empty value for a field means do not change the field's value
                setattr(focus, attr, value)
    elif instr.startswith("i"):
        # Insert PTB leaf node.
        _, tag_and_lex = instr.split('=', 1)
        tag, lex = tag_and_lex.split('|')

        new_leaf = penn.Leaf(tag, lex)
        if last_locator == 'e':
            cur_node.kids.append(new_leaf)
        else:
            cur_node.kids.insert(last_locator, new_leaf)
    elif instr.startswith('P') or instr.startswith('A'):
        # Prepend or append CCGbank absorption leaf node. Instruction is of the form
        # P=leaf_cat|leaf_pos1|leaf_lex|leaf_catfix|parent_cat (or A=... to append).
        # If you leave parent_cat empty then absorption is assumed.
        prepend = instr.startswith('P')

        _, commalist = instr.split('=', 1)
        cat, pos1, lex, catfix, parent_cat = commalist.split('|')
        cat = parse_category(cat)

        # pos1 is passed for both the second and third arguments.
        new_leaf = ccg.Leaf(cat, pos1, pos1, lex, catfix)  # No parent

        focus = cur_node.kids[last_locator]
        # NOTE(review): parent_cat is handed over as a string, whereas the fallback
        # is the focus node's existing cat attribute -- confirm both are accepted.
        attach = node_prepend if prepend else node_append
        # node_prepend and append return None if no new root was installed, or the new root otherwise
        maybe_new_root = attach(focus, new_leaf, parent_cat or focus.cat)

        # Install a new root if one was created
        if maybe_new_root:
            deriv = maybe_new_root

    elif instr.startswith('S'):  # Shrink absorption
        focus = cur_node.kids[last_locator]
        if focus.lch.is_leaf() and focus.rch.cat == focus.cat:
            maybe_new_root = shrink(focus, left_is_leaf=True)
        elif focus.rch.is_leaf() and focus.lch.cat == focus.cat:
            maybe_new_root = shrink(focus, left_is_leaf=False)
        else:
            raise SurgeryException(
                "The focused node must be an instance of absorption (X T -> T or T X -> T)."
            )

        if maybe_new_root:
            deriv = maybe_new_root

    return deriv
예제 #6
0
    def label_text(self):
        """Return this category's slash with regular-expression metacharacters escaped."""
        escaped_slash = re.escape(self.slash)
        return escaped_slash

    # Fixed answers to the is_leaf / is_complex queries for this class.
    # NOTE(review): const_ is defined elsewhere; presumably it builds a callable
    # that ignores its arguments and returns the given value -- confirm.
    is_leaf = const_(False)
    is_complex = const_(True)

    def __or__(self, right):
        r'''Constructs the complex category (self \ right).

        Invoked by the ``|`` operator: the result has self as its first
        argument, the BACKWARD slash, and right as its last argument.'''
        # The docstring is a raw string so that the lone backslash is not read
        # as an (invalid) escape sequence.
        return ComplexCategory(self, BACKWARD, right)

    def __div__(self, right):
        '''Constructs the complex category (self / right).

        Invoked by the ``/`` operator: the result has self as its first
        argument, the FORWARD slash, and right as its last argument.'''
        return ComplexCategory(self, FORWARD, right)

    # The / operator dispatches to __truediv__ rather than __div__ when true
    # division is in effect (Python 3, or `from __future__ import division`).
    __truediv__ = __div__

    def __mod__(self, right):
        '''Constructs the complex category joining self and right with the BAR slash.'''
        barred = ComplexCategory(self, BAR, right)
        return barred


if __name__ == '__main__':
    # Self-test: parse each category, apply parg_labelled(), and check that the
    # labelled representation matches the expected string.
    from munge.cats.parse import parse_category

    expected_labels = {
        '(A/B)/C': '(A/1B)/2C',
        '(A/(B/D))/C': '(A/1(B/D))/2C',
        '((A/B)/D)/C': '((A/1B)/2D)/3C',
        'A/(B/C)': 'A/1(B/C)',
    }
    # items() and the parenthesised print parse on both Python 2 and Python 3.
    for cat, lab in expected_labels.items():
        c = parse_category(cat)
        c.parg_labelled()
        # Build the labelled representation once and reuse it for both the
        # output and the check.
        labelled = c.__repr__(show_label=True)
        print(labelled)
        assert labelled == lab
예제 #7
0
        return re.escape(self.slash)

    # Fixed answers to the is_leaf / is_complex queries for this class.
    # NOTE(review): const_ is defined elsewhere; presumably it builds a callable
    # that ignores its arguments and returns the given value -- confirm.
    is_leaf = const_(False)
    is_complex = const_(True)

    def __or__(self, right):
        r"""Constructs the complex category (self \ right).

        Invoked by the ``|`` operator: the result has self as its first
        argument, the BACKWARD slash, and right as its last argument."""
        # The docstring is a raw string so that the lone backslash is not read
        # as an (invalid) escape sequence.
        return ComplexCategory(self, BACKWARD, right)

    def __div__(self, right):
        """Constructs the complex category (self / right).

        Invoked by the ``/`` operator: the result has self as its first
        argument, the FORWARD slash, and right as its last argument."""
        return ComplexCategory(self, FORWARD, right)

    # The / operator dispatches to __truediv__ rather than __div__ when true
    # division is in effect (Python 3, or `from __future__ import division`).
    __truediv__ = __div__

    def __mod__(self, right):
        """Constructs the complex category joining self and right with the BAR slash."""
        barred = ComplexCategory(self, BAR, right)
        return barred


if __name__ == "__main__":
    # Self-test: parse each category, apply parg_labelled(), and check that the
    # labelled representation matches the expected string.
    from munge.cats.parse import parse_category

    expected_labels = {
        "(A/B)/C": "(A/1B)/2C",
        "(A/(B/D))/C": "(A/1(B/D))/2C",
        "((A/B)/D)/C": "((A/1B)/2D)/3C",
        "A/(B/C)": "A/1(B/C)",
    }
    # items() and the parenthesised print parse on both Python 2 and Python 3.
    for cat, lab in expected_labels.items():
        c = parse_category(cat)
        c.parg_labelled()
        # Build the labelled representation once and reuse it for both the
        # output and the check.
        labelled = c.__repr__(show_label=True)
        print(labelled)
        assert labelled == lab
예제 #8
0
# Ready-made category constants.
# Naming convention visible in the definitions below: 'b' stands for a backslash
# and 'f' for a forward slash, so SbNP is S\NP and SfNP is S/NP; a feature name
# follows its atom directly, so Sdcl is S[dcl].
S, N, NP, PP = map(AtomicCategory, "S N NP PP".split())

# Category labels of punctuation tokens, grouped by the list names below.
LeftAbsorbedPunctuationCats = ", . `` : ; LRB RRB".split()
RightAbsorbedPunctuationCats = ", . '' : ; LRB RRB".split()
ConjPunctuationCats = ", ; :".split()

# Extra punctuation labels, enabled by the cn_puncts configuration switch.
if config.cn_puncts:
    LeftAbsorbedPunctuationCats.extend(
        "LCM LPA RPA LQU RQU LSQ RSQ LTL RTL LCD RCD LCS RCS DSH SLS ? !".split())
    RightAbsorbedPunctuationCats.extend(
        "LCM LPA RPA LQU RQU LSQ RSQ LTL RTL LCD RCD LCS RCS DSH SLS ? !".split())
    ConjPunctuationCats += ["LCM"]

(SbNP, SfNP, NPbNP, NPfNP, NbN, NfN, SbNPbSbNP,
 SbS, SfS, SbNPfSbNP, conj) = [parse_category(cat) for cat in
                        '''S\\NP S/NP NP\\NP NP/NP N\\N N/N (S\\NP)\\(S\\NP)
                           S\\S S/S (S\\NP)/(S\\NP) conj'''.split()]

# Chinese topicalised cats
SfSfNP = parse_category(r'S/(S/NP)')
SfSfS = parse_category(r'S/(S/S)')
QP = parse_category('QP')

SbNPfNP = parse_category(r'(S\NP)/NP')
SdclbNPfNP = parse_category(r'(S[dcl]\NP)/NP')
Sq = parse_category('S[q]')
Sdcl = parse_category('S[dcl]')
Swq = parse_category('S[wq]')
SadjbNP = parse_category(r'S[adj]\NP')
SdclbNP = parse_category(r'S[dcl]\NP')
Sfrg = parse_category(r'S[frg]')
Nnum = parse_category(r'N[num]')
# Defines a short name for converting a category string to a category representation.
예제 #9
0
def process(deriv, locator, instr):
    '''Processes one script instruction and returns the resulting derivation root.

    deriv is the derivation to operate on. locator is a list of kid indices leading
    from the root to the focus node; its last element may be 'e', which the insert
    instruction treats as "append after the last kid". The list is copied, so the
    caller's locator is left untouched.

    instr is dispatched on its prefix:
      d                                 delete the focus node
      l=LEX                             set the focus node's lexical item
      t=TAG                             set the focus node's tag
      c=CAT                             set the focus node's category (parsed)
      C=cat|pos1|pos2|lex|catfix        set several fields; empty fields are left alone
      i=tag|lex                         insert a new PTB leaf at the focus position
      P=cat|pos1|lex|catfix|parent_cat  prepend a CCGbank leaf to the focus node
      A=cat|pos1|lex|catfix|parent_cat  append a CCGbank leaf to the focus node
      S                                 shrink the absorption at the focus node
    Only the first '=' separates the instruction from its payload, so the payload
    itself may contain '='. An instruction matching none of the above changes nothing.

    Returns deriv, or the new root if the P, A or S instruction installed one.
    Raises ValueError if a payload is missing or has the wrong number of fields, and
    SurgeryException if S is applied to a node which is not an absorption.'''
    locator = locator[:] # make a copy
    last_locator = locator.pop()

    # Walk down to the parent of the focus node.
    cur_node = deriv
    for kid_index in locator:
        check_index(cur_node.kids, kid_index)
        cur_node = cur_node.kids[kid_index]

    check_index(cur_node.kids, last_locator)
    if last_locator != 'e':
        print("Locator names leaf %s" % cur_node.kids[last_locator])

    if instr == "d":
        # TODO: handle deleting the last kid, have to recursively delete. but PTB nodes have no parent ptr
        # otherwise assume this won't be done.
        # or we can have a node with empty kids yield an empty string representation
        del cur_node.kids[last_locator]
    elif instr.startswith("l"):
        # Sets the lexical item of a PTB node or CCGbank node.
        _, new_lex = instr.split('=', 1)
        cur_node.kids[last_locator].lex = new_lex
    elif instr.startswith("t"):
        # Sets the POS tag of a PTB node.
        _, new_tag = instr.split('=', 1)
        cur_node.kids[last_locator].tag = new_tag
    elif instr.startswith("c"):
        # Sets the category of a CCGbank node.
        _, new_cat = instr.split('=', 1)
        cur_node.kids[last_locator].cat = parse_category(new_cat)
    elif instr.startswith("C"):
        # Sets any subset of the fields of a CCGbank leaf.
        _, new_bits = instr.split('=', 1)
        cat, pos1, pos2, lex, catfix = new_bits.split('|')

        # NOTE(review): unlike the 'c' instruction, cat is stored here as the raw
        # string rather than a parsed category -- confirm this is intended.
        focus = cur_node.kids[last_locator]
        for attr, value in (('cat', cat), ('pos1', pos1), ('pos2', pos2),
                            ('lex', lex), ('catfix', catfix)):
            if value: # empty value for a field means do not change the field's value
                setattr(focus, attr, value)
    elif instr.startswith("i"):
        # Insert PTB leaf node.
        _, tag_and_lex = instr.split('=', 1)
        tag, lex = tag_and_lex.split('|')

        new_leaf = penn.Leaf(tag, lex)
        if last_locator == 'e':
            cur_node.kids.append(new_leaf)
        else:
            cur_node.kids.insert(last_locator, new_leaf)
    elif instr.startswith('P') or instr.startswith('A'):
        # Prepend or append CCGbank absorption leaf node. Instruction is of the form
        # P=leaf_cat|leaf_pos1|leaf_lex|leaf_catfix|parent_cat (or A=... to append).
        # If you leave parent_cat empty then absorption is assumed.
        prepend = instr.startswith('P')

        _, commalist = instr.split('=', 1)
        cat, pos1, lex, catfix, parent_cat = commalist.split('|')
        cat = parse_category(cat)

        # pos1 is passed for both the second and third arguments.
        new_leaf = ccg.Leaf(cat, pos1, pos1, lex, catfix) # No parent

        focus = cur_node.kids[last_locator]
        # NOTE(review): parent_cat is handed over as a string, whereas the fallback
        # is the focus node's existing cat attribute -- confirm both are accepted.
        attach = node_prepend if prepend else node_append
        # node_prepend and append return None if no new root was installed, or the new root otherwise
        maybe_new_root = attach(focus, new_leaf, parent_cat or focus.cat)

        # Install a new root if one was created
        if maybe_new_root:
            deriv = maybe_new_root

    elif instr.startswith('S'): # Shrink absorption
        focus = cur_node.kids[last_locator]
        if focus.lch.is_leaf() and focus.rch.cat == focus.cat:
            maybe_new_root = shrink(focus, left_is_leaf=True)
        elif focus.rch.is_leaf() and focus.lch.cat == focus.cat:
            maybe_new_root = shrink(focus, left_is_leaf=False)
        else:
            raise SurgeryException("The focused node must be an instance of absorption (X T -> T or T X -> T).")

        if maybe_new_root:
            deriv = maybe_new_root

    return deriv
예제 #10
0
        subcat.features = []
    return ret

# Ready-made category constants.
# Naming convention visible in the definitions below: 'b' stands for a backslash
# and 'f' for a forward slash, so SbNP is S\NP and SfNP is S/NP; a feature name
# follows its atom directly, so Sdcl is S[dcl].
S, N, NP, PP = map(AtomicCategory, "S N NP PP".split())

# Category labels of punctuation tokens, grouped by the list names below.
LeftAbsorbedPunctuationCats = ", . `` : ; LRB RRB".split()
RightAbsorbedPunctuationCats = ", . '' : ; LRB RRB".split()
ConjPunctuationCats = ", ; :".split()

# Extra punctuation labels, enabled by the cn_puncts configuration switch.
if config.cn_puncts:
    LeftAbsorbedPunctuationCats.extend(
        "LCM LPA RPA LQU RQU LSQ RSQ LTL RTL LCD RCD LCS RCS DSH SLS ? !".split())
    RightAbsorbedPunctuationCats.extend(
        "LCM LPA RPA LQU RQU LSQ RSQ LTL RTL LCD RCD LCS RCS DSH SLS ? !".split())
    ConjPunctuationCats += ["LCM"]

(SbNP, SfNP, NPbNP, NPfNP, NbN, NfN, SbNPbSbNP,
 SbS, SfS, SbNPfSbNP, conj) = [parse_category(cat) for cat in
                        '''S\\NP S/NP NP\\NP NP/NP N\\N N/N (S\\NP)\\(S\\NP)
                           S\\S S/S (S\\NP)/(S\\NP) conj'''.split()]

# Chinese topicalised cats
SfSfNP = parse_category(r'S/(S/NP)')
SfSfS = parse_category(r'S/(S/S)')
QP = parse_category('QP')

SbNPfNP = parse_category(r'(S\NP)/NP')
SdclbNPfNP = parse_category(r'(S[dcl]\NP)/NP')
Sq = parse_category('S[q]')
Sdcl = parse_category('S[dcl]')
Swq = parse_category('S[wq]')
SadjbNP = parse_category(r'S[adj]\NP')
SdclbNP = parse_category(r'S[dcl]\NP')
Sfrg = parse_category(r'S[frg]')
Nnum = parse_category(r'N[num]')
# Defines a short name for converting a category string to a category representation.         
예제 #11
0
 def build_seq(self, iterable):
     """Lazily convert (left, right, was_flipped) triples of category strings.

     For each triple, yields (parsed left, parsed right, was_flipped).  A
     falsy right element is passed through unchanged instead of being parsed.
     """
     for left, right, was_flipped in iterable:
         parsed_left = parse_category(left)
         if right:
             parsed_right = parse_category(right)
         else:
             parsed_right = right
         yield (parsed_left, parsed_right, was_flipped)