def process(ptb_file, ccg_file, deps_file, ccg_auto_out, ccg_parg_out, higher, quotes, quoter):
    '''Reinstates quotes given a PTB file and its corresponding CCGbank file and deps file.

    All three inputs are read completely up front. Each CCGbank bundle is then
    paired with its matched PTB bundle and its dependency record; every quote
    span found in the PTB derivation is handed to `quoter.attach_quotes`, and
    the remaining spans and the dependency record are re-indexed after each
    insertion.

    For every bundle the (possibly adjusted) dependency record is printed to
    `ccg_parg_out` and the updated CCGbank bundle to `ccg_auto_out`.

    `higher` and `quotes` are passed through unchanged to `quoter.attach_quotes`.
    '''
    # open() rather than the file() constructor: the Python 2 documentation
    # recommends open() for opening files, and file() no longer exists in Python 3.
    with open(ccg_auto_out, 'w') as ccg_out:
        with open(ccg_parg_out, 'w') as parg_out:
            penn_trees = list(PTBReader(ptb_file))
            ccg_trees = list(CCGbankReader(ccg_file))
            deps = list(CCGbankDepsReader(deps_file))

            matched_penn_trees = match_trees(penn_trees, ccg_trees)

            for (ptb_bundle, ccg_bundle, dep) in zip(matched_penn_trees, ccg_trees, deps):
                ptb_tree, ccg_tree = ptb_bundle.derivation, ccg_bundle.derivation

                # The list is consumed from the front and rebuilt after every
                # insertion, so a plain `for` loop over it would not be equivalent.
                quote_spans = spans(ptb_tree)
                while quote_spans:
                    span_start, span_end, quote_type = quote_spans.pop(0)
                    # A span with neither end set has nothing to reinstate.
                    if span_start is None and span_end is None:
                        continue

                    info("Reinstating quotes to %s (%s, %s)", ccg_bundle.label(), span_start, span_end)

                    ccg_tree, quote_indices = quoter.attach_quotes(
                        ccg_tree, span_start, span_end, quote_type, higher, quotes)
                    # In case a new root has been installed, re-assign the new root to the CCGbank bundle
                    ccg_bundle.derivation = ccg_tree

                    # Shift remaining quote span indices by the number of quotes that have been inserted
                    quote_spans = fix_quote_spans(quote_spans, quote_indices)
                    dep = fix_dependencies(dep, quote_indices)

                print >> parg_out, dep
                print >> ccg_out, ccg_bundle
def setUp(self):
    '''Loads the derivation at index 4 of the wsj_0003 fixture into self.tree.

    Fails with an explicit message when the fixture file is missing, then
    calls initialise() (defined elsewhere; presumably prepares the pattern
    matcher -- confirm against its definition).
    '''
    fixture = 'munge/tests/wsj_0003.auto'
    # assertTrue replaces the deprecated alias assert_, which was removed
    # from unittest in Python 3.12.
    self.assertTrue(os.path.exists(fixture), 'missing test fixture: %s' % fixture)
    self.tree = CCGbankReader(fixture)[4].derivation
    initialise()
# Counters for a scan over CCGbank derivations. Every defaultdict starts
# missing keys at 0 so entries can be incremented without initialisation.
unrecognised_rules = defaultdict(lambda: 0)
total, with_unrecognised_rules = 0, 0
ucp_rules = defaultdict(lambda: 0)
with_ucp = 0
unary, binary = defaultdict(lambda: 0), defaultdict(lambda: 0)

def is_ucp(l, r, p):
    '''Tests a (left, right, parent) category triple for the pattern counted as UCP below.

    Returns False when there is no right category. Otherwise the result is
    true when the left category is one of `conj`, C('LCM') or C(','), the
    parent carries the 'conj' feature, and the parent differs from the right
    category.
    '''
    if r is None: return False
    return l in (conj, C('LCM'), C(',')) and p.has_feature('conj') and p != r

# The first command-line argument is a glob pattern selecting the files to scan.
# NOTE(review): the loop variable `file` shadows the Python 2 builtin of the same name.
for file in glob(sys.argv[1]):
    for bundle in CCGbankReader(file):
        has_unrecognised_rules, has_ucp = False, False

        for node in nodes(bundle.derivation):
            # Leaves have no children, so there is no rule to classify.
            if node.is_leaf(): continue

            # Categories of (left child, right child or None, node itself);
            # `e and e.cat` lets a None child pass through as None.
            lrp = map(lambda e: e and e.cat, (node[0], node[1] if node.count() > 0 else None, node))

            comb = analyse(*lrp)
            l, r, p = lrp
            rule_tuple = (str(l), str(r), str(p))

            # `combs` is not defined in this excerpt; presumably a counter
            # keyed by combinator that is created elsewhere in the file.
            if comb:
                combs[comb] += 1
            elif is_ucp(*lrp):
class TgrepTests(unittest.TestCase):
    '''Exercises `matches` with tgrep-style patterns against one fixture derivation.

    Every test runs against the derivation at index 4 of
    munge/tests/wsj_0003.auto, which setUp loads into self.tree.
    '''

    def setUp(self):
        '''Loads the fixture derivation and calls initialise() before each test.'''
        fixture = 'munge/tests/wsj_0003.auto'
        # assertTrue replaces the deprecated alias assert_ (removed in Python 3.12)
        # and matches the assertion style used by the rest of this class.
        self.assertTrue(os.path.exists(fixture), 'missing test fixture: %s' % fixture)
        self.tree = CCGbankReader(fixture)[4].derivation
        initialise()

    def testCorrectTreeLoaded(self):
        '''The loaded derivation has the expected token sequence.'''
        # assertEqual replaces the deprecated alias assertEquals.
        self.assertEqual(self.tree.text(),
            "Although preliminary findings were reported more than a "
            "year ago , the latest results appear in today 's New England "
            "Journal of Medicine , a forum likely to bring new attention "
            "to the problem .".split())

    def testAtom(self):
        '''A bare category pattern matches root, internal and leaf nodes.'''
        self.assertTrue(matches(self.tree, 'S[dcl]')) # root
        self.assertTrue(matches(self.tree, 'NP[conj]')) # internal node
        self.assertTrue(matches(self.tree, r'(S[to]\NP)/(S[b]\NP)')) # leaf
        self.assertFalse(matches(self.tree, 'A'))
        self.assertFalse(matches(self.tree, r'(S[to]\NP)/(S[to]\NP)'))
        self.assertFalse(matches(self.tree, 'BAR'))

    def testRegex(self):
        '''Patterns delimited by slashes are matched as regular expressions.'''
        self.assertTrue(matches(self.tree, r'/\(S/S\)/+/')) # try to match (S/S)/S[dcl] 'Although'
        self.assertTrue(matches(self.tree, r'/NP\[.+\]/'))
        self.assertTrue(matches(self.tree, r'/\(S[/\\]NP\)[/\\]\(S[/\\]NP\)/'))
        self.assertFalse(matches(self.tree, r'/\(.+\)/\(NP/NP\)/'))
        self.assertFalse(matches(self.tree, r'/\[em\][\/].+/'))

    def testParent(self):
        '''`<` requires immediate dominance.'''
        self.assertTrue(matches(self.tree, r'S[dcl]\NP < (S[dcl]\NP)/(S[pss]\NP)'))
        self.assertTrue(matches(self.tree, r'NP[nb]/N < NP[nb]/N'))
        self.assertTrue(matches(self.tree, r'NP[nb]/N < (NP[nb]/N)\NP'))
        self.assertTrue(matches(self.tree, r'S[dcl] < "."')) # literal notation
        # dominates but not immediately dominates (leaf 'appear', 'in')
        self.assertFalse(matches(self.tree, r'S[dcl]\NP < ((S\NP)\(S\NP))/NP'))
        # 'NP/NP' is the parent of 'S[adj]\NP' but not vice versa
        self.assertFalse(matches(self.tree, r'S[adj]\NP < NP/NP'))
        self.assertFalse(matches(self.tree, r'A < S[dcl]')) # Nowhere in tree

    def testDominates(self):
        '''`<<` accepts dominance at any depth.'''
        self.assertTrue(matches(self.tree, r'S[dcl]\NP << ((S\NP)\(S\NP))/NP'))
        self.assertTrue(matches(self.tree, r'S[dcl] << "."'))
        self.assertTrue(matches(self.tree, r'S[dcl] << (NP/NP)\(S[adj]\NP)'))
        self.assertFalse(matches(self.tree, r'(NP[nb]/N)\NP << S[dcl]'))
        self.assertFalse(matches(self.tree, r'S[adj]\NP << NP/NP'))
        self.assertFalse(matches(self.tree, r'A << S[dcl]'))

    def testIsSiblingOf(self):
        '''`$` relates two nodes that share a parent.'''
        self.assertTrue(matches(self.tree, r'((S\NP)\(S\NP))/NP $ NP'))
        self.assertFalse(matches(self.tree, r'((S\NP)\(S\NP))/NP $ (S[b]\NP)/NP'))
        self.assertTrue(matches(self.tree, r'NP $ NP[conj]'))
        self.assertTrue(matches(self.tree, r'NP[conj] $ NP'))
        self.assertFalse(matches(self.tree, r'S[dcl] $ NP'))
        self.assertFalse(matches(self.tree, r'A $ A'))
        self.assertFalse(matches(self.tree, r'S[dcl] $ S[dcl]'))

    def testIsLeftChildOf(self):
        '''`<1` constrains the first child.'''
        self.assertTrue(matches(self.tree, r'NP <1 NP[nb]/N'))
        self.assertFalse(matches(self.tree, r'NP <2 NP[nb]/N'))
        self.assertFalse(matches(self.tree, r'A <1 B'))

    def testIsRightChildOf(self):
        '''`<2` constrains the second child.'''
        self.assertTrue(matches(self.tree, r'S[to]\NP <2 S[b]\NP'))
        self.assertFalse(matches(self.tree, r'S[to]\NP <1 S[b]\NP'))
        # NP -> N is a unary conversion, so has no right child
        # I am trying to address subtree 'preliminary findings'
        self.assertFalse(matches(self.tree, r'{NP $ { (S/S)/S[dcl] $ S[dcl]\NP } } <2 {N <1 N/N <2 N}'))
        self.assertFalse(matches(self.tree, r'A <2 B'))

    def testAlternation(self):
        '''`|` separates alternative constraints; one satisfied alternative suffices.'''
        self.assertTrue(matches(self.tree, r'{((S\NP)\(S\NP))\NP $ NP} > (S\NP)\(S\NP) | < T'))
        self.assertTrue(matches(self.tree, r'{((S\NP)\(S\NP))\NP $ NP} < T | > (S\NP)\(S\NP)'))
        self.assertFalse(matches(self.tree, r'{((S\NP)\(S\NP))\NP $ NP} < A | > B | $ C'))
        self.assertTrue(matches(self.tree, r'{((S\NP)\(S\NP))\NP $ NP} < A | > (S\NP)\(S\NP) | $ C'))
def load_ccgbank_tree(fn, deriv_no):
    '''Returns the derivation at zero-based position `deriv_no` in the CCGbank file `fn`.

    Returns None when the reader yields no document at that position.
    '''
    # The generator is lazy, so reading stops at the first matching position.
    wanted = (
        bundle.derivation
        for position, bundle in enumerate(CCGbankReader(fn))
        if position == deriv_no
    )
    return next(wanted, None)