示例#1
0
def process(ptb_file, ccg_file, deps_file, ccg_auto_out, ccg_parg_out, higher,
            quotes, quoter):
    '''Reinstates quotes given a PTB file and its corresponding CCGbank file and deps file.

    The PTB trees (ptb_file), CCGbank derivations (ccg_file) and dependencies
    (deps_file) are read in full. For every quote span reported by spans() on
    a matched PTB tree, quoter.attach_quotes() is applied to the corresponding
    CCGbank derivation, and the remaining spans and the dependencies are
    adjusted with fix_quote_spans() / fix_dependencies().

    Arguments:
        ptb_file, ccg_file, deps_file: inputs handed to PTBReader,
            CCGbankReader and CCGbankDepsReader respectively.
        ccg_auto_out: path that receives one updated CCGbank bundle per entry.
        ccg_parg_out: path that receives one updated dependency per entry.
        higher, quotes: passed through unchanged to quoter.attach_quotes().
        quoter: object providing attach_quotes().

    Returns None; the results are written to the two output files.
    '''
    # open() is used rather than the deprecated file() constructor.
    with open(ccg_auto_out, 'w') as ccg_out:
        with open(ccg_parg_out, 'w') as parg_out:
            penn_trees = list(PTBReader(ptb_file))
            ccg_trees = list(CCGbankReader(ccg_file))
            deps = list(CCGbankDepsReader(deps_file))

            matched_penn_trees = match_trees(penn_trees, ccg_trees)

            for (ptb_bundle, ccg_bundle, dep) in zip(matched_penn_trees,
                                                     ccg_trees, deps):
                ptb_tree, ccg_tree = ptb_bundle.derivation, ccg_bundle.derivation

                quote_spans = spans(ptb_tree)
                while quote_spans:
                    span_start, span_end, quote_type = quote_spans.pop(0)
                    # A span with neither endpoint set gives nothing to attach.
                    if span_start is None and span_end is None:
                        continue

                    info("Reinstating quotes to %s (%s, %s)",
                         ccg_bundle.label(), span_start, span_end)

                    ccg_tree, quote_indices = quoter.attach_quotes(
                        ccg_tree, span_start, span_end, quote_type, higher,
                        quotes)
                    # In case a new root has been installed, re-assign the new root to the CCGbank bundle
                    ccg_bundle.derivation = ccg_tree

                    # Shift remaining quote span indices by the number of quotes that have been inserted
                    quote_spans = fix_quote_spans(quote_spans, quote_indices)
                    dep = fix_dependencies(dep, quote_indices)

                print >> parg_out, dep
                print >> ccg_out, ccg_bundle
示例#2
0
 def setUp(self):
     '''Load the fixture derivation into self.tree before each test.

     Fails straight away if the sample CCGbank file is missing, then stores
     the derivation of the bundle at index 4 and calls initialise().
     '''
     fixture_path = 'munge/tests/wsj_0003.auto'

     # assertTrue replaces the deprecated assert_ alias.
     self.assertTrue(os.path.exists(fixture_path))
     self.tree = CCGbankReader(fixture_path)[4].derivation

     initialise()
示例#3
0
# Tallies keyed on demand: a key that has not been seen yet reads as 0.
unrecognised_rules = defaultdict(int)
ucp_rules = defaultdict(int)
unary = defaultdict(int)
binary = defaultdict(int)

# Scalar running totals, all starting from zero.
total = 0
with_unrecognised_rules = 0
with_ucp = 0


def is_ucp(l, r, p):
    '''Test whether the categories (l, r, p) form what this script counts as a UCP rule.

    Always False when there is no right category (r is None). Otherwise the
    left category must be one of conj, C('LCM') or C(','), the parent must
    carry the 'conj' feature, and the parent must differ from the right
    category.
    '''
    if r is None:
        return False

    left_candidates = (conj, C('LCM'), C(','))
    return l in left_candidates and p.has_feature('conj') and p != r


for file in glob(sys.argv[1]):
    for bundle in CCGbankReader(file):
        has_unrecognised_rules, has_ucp = False, False

        for node in nodes(bundle.derivation):
            if node.is_leaf(): continue

            lrp = map(lambda e: e and e.cat,
                      (node[0], node[1] if node.count() > 0 else None, node))

            comb = analyse(*lrp)
            l, r, p = lrp
            rule_tuple = (str(l), str(r), str(p))

            if comb:
                combs[comb] += 1
            elif is_ucp(*lrp):
示例#4
0
class TgrepTests(unittest.TestCase):
    '''Checks matches() against one fixed derivation from munge/tests/wsj_0003.auto.

    Each test feeds pattern strings to matches() together with the fixture
    tree and asserts whether a match is expected. The tests are grouped by
    the pattern construct they exercise (atoms, regexes, and the relations
    named in the test method names).
    '''

    def setUp(self):
        # Fail early if the fixture is missing, then load the derivation of
        # the bundle at index 4 of the sample file.
        fixture_path = 'munge/tests/wsj_0003.auto'

        # assertTrue replaces the deprecated assert_ alias.
        self.assertTrue(os.path.exists(fixture_path))
        self.tree = CCGbankReader(fixture_path)[4].derivation

        initialise()

    def testCorrectTreeLoaded(self):
        # Sanity check: the fixture must be the sentence the other tests assume.
        expected_words = ("Although preliminary findings were reported more than a "
                          "year ago , the latest results appear in today 's New England "
                          "Journal of Medicine , a forum likely to bring new attention "
                          "to the problem .").split()
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(self.tree.text(), expected_words)

    def testAtom(self):
        # A bare category matches if it occurs anywhere in the tree.
        self.assertTrue(matches(self.tree, 'S[dcl]')) # root
        self.assertTrue(matches(self.tree, 'NP[conj]')) # internal node
        self.assertTrue(matches(self.tree, r'(S[to]\NP)/(S[b]\NP)')) # leaf

        self.assertFalse(matches(self.tree, 'A'))
        self.assertFalse(matches(self.tree, r'(S[to]\NP)/(S[to]\NP)'))
        self.assertFalse(matches(self.tree, 'BAR'))

    def testRegex(self):
        # Patterns delimited by slashes are treated as regular expressions.
        self.assertTrue(matches(self.tree, r'/\(S/S\)/+/')) # try to match (S/S)/S[dcl] 'Although'
        self.assertTrue(matches(self.tree, r'/NP\[.+\]/'))
        self.assertTrue(matches(self.tree, r'/\(S[/\\]NP\)[/\\]\(S[/\\]NP\)/'))

        self.assertFalse(matches(self.tree, r'/\(.+\)/\(NP/NP\)/'))
        self.assertFalse(matches(self.tree, r'/\[em\][\/].+/'))

    def testParent(self):
        # '<' : the left operand is the immediate parent of the right operand.
        self.assertTrue(matches(self.tree, r'S[dcl]\NP < (S[dcl]\NP)/(S[pss]\NP)'))
        self.assertTrue(matches(self.tree, r'NP[nb]/N < NP[nb]/N'))
        self.assertTrue(matches(self.tree, r'NP[nb]/N < (NP[nb]/N)\NP'))
        self.assertTrue(matches(self.tree, r'S[dcl] < "."')) # literal notation

        # dominates but not immediately dominates (leaf 'appear', 'in')
        self.assertFalse(matches(self.tree, r'S[dcl]\NP < ((S\NP)\(S\NP))/NP'))
        # 'NP/NP' is the parent of 'S[adj]\NP' but not vice versa
        self.assertFalse(matches(self.tree, r'S[adj]\NP < NP/NP'))
        self.assertFalse(matches(self.tree, r'A < S[dcl]')) # Nowhere in tree

    def testDominates(self):
        # '<<' : the left operand dominates the right one, not necessarily immediately.
        self.assertTrue(matches(self.tree, r'S[dcl]\NP << ((S\NP)\(S\NP))/NP'))
        self.assertTrue(matches(self.tree, r'S[dcl] << "."'))
        self.assertTrue(matches(self.tree, r'S[dcl] << (NP/NP)\(S[adj]\NP)'))

        self.assertFalse(matches(self.tree, r'(NP[nb]/N)\NP << S[dcl]'))
        self.assertFalse(matches(self.tree, r'S[adj]\NP << NP/NP'))
        self.assertFalse(matches(self.tree, r'A << S[dcl]'))

    def testIsSiblingOf(self):
        # '$' : sibling relation, expected to hold in both directions.
        self.assertTrue(matches(self.tree, r'((S\NP)\(S\NP))/NP $ NP'))
        self.assertFalse(matches(self.tree, r'((S\NP)\(S\NP))/NP $ (S[b]\NP)/NP'))

        self.assertTrue(matches(self.tree, r'NP $ NP[conj]'))
        self.assertTrue(matches(self.tree, r'NP[conj] $ NP'))
        self.assertFalse(matches(self.tree, r'S[dcl] $ NP'))

        self.assertFalse(matches(self.tree, r'A $ A'))
        self.assertFalse(matches(self.tree, r'S[dcl] $ S[dcl]'))

    def testIsLeftChildOf(self):
        # '<1' : the right operand is the first (left) child of the left operand.
        self.assertTrue(matches(self.tree, r'NP <1 NP[nb]/N'))
        self.assertFalse(matches(self.tree, r'NP <2 NP[nb]/N'))

        self.assertFalse(matches(self.tree, r'A <1 B'))

    def testIsRightChildOf(self):
        # '<2' : the right operand is the second (right) child of the left operand.
        self.assertTrue(matches(self.tree, r'S[to]\NP <2 S[b]\NP'))
        self.assertFalse(matches(self.tree, r'S[to]\NP <1 S[b]\NP'))

        # NP -> N is a unary conversion, so has no right child
        # I am trying to address subtree 'preliminary findings'
        self.assertFalse(matches(self.tree, r'{NP $ { (S/S)/S[dcl] $ S[dcl]\NP } } <2 {N <1 N/N <2 N}'))

        self.assertFalse(matches(self.tree, r'A <2 B'))

    def testAlternation(self):
        # '|' : alternatives; the pattern matches when any one alternative
        # holds, and fails only when none of them does.
        self.assertTrue(matches(self.tree, r'{((S\NP)\(S\NP))\NP $ NP} > (S\NP)\(S\NP) | < T'))
        self.assertTrue(matches(self.tree, r'{((S\NP)\(S\NP))\NP $ NP} < T | > (S\NP)\(S\NP)'))
        self.assertFalse(matches(self.tree, r'{((S\NP)\(S\NP))\NP $ NP} < A | > B | $ C'))
        self.assertTrue(matches(self.tree, r'{((S\NP)\(S\NP))\NP $ NP} < A | > (S\NP)\(S\NP) | $ C'))
示例#5
0
def load_ccgbank_tree(fn, deriv_no):
    '''Return the derivation of the deriv_no-th (0-based) entry read from fn.

    Entries are read through CCGbankReader in order; None is returned when
    the reader yields no entry at that position.
    '''
    wanted = (bundle.derivation
              for position, bundle in enumerate(CCGbankReader(fn))
              if position == deriv_no)
    return next(wanted, None)