def test_rst_to_dt(self):
    """
    Round-trip simple RST trees through the dependency representation.

    Two groups of inputs are exercised:

    * small hand-written lightweight trees, which must come back equal
      to the tree that went in;
    * the fixture trees from ``self._test_trees()``, for which only the
      ``span`` and ``edu_span`` of the root label (as returned by
      ``treenode``) are compared.
    """
    lightweight_specs = [
        "(R:rel (S x) (N y))",
        """ (R:rel (S x) (N:rel (N h) (S t))) """,
        """ (R:r (S x) (N:r (N:r (S t1) (N h)) (S t2))) """,
    ]
    for spec in lightweight_specs:
        before = parse_lightweight_tree(spec)
        as_deps = RstDepTree.from_simple_rst_tree(before)
        after = deptree_to_simple_rst_tree(as_deps)
        self.assertEqual(before, after, "round-trip on " + spec)

    # (attribute of the root label, prefix of the failure message);
    # checked in this order for every fixture tree
    root_checks = (
        ("span", "span equality on "),
        ("edu_span", "edu span equality on "),
    )
    for doc_name, rst_tree in self._test_trees().items():
        before = SimpleRSTTree.from_rst_tree(rst_tree)
        as_deps = RstDepTree.from_simple_rst_tree(before)
        after = deptree_to_simple_rst_tree(as_deps)
        for attr, prefix in root_checks:
            self.assertEqual(getattr(treenode(before), attr),
                             getattr(treenode(after), attr),
                             prefix + doc_name)
def test_rst_to_dt(self):
    """
    Check that simple RST tree -> dependency tree -> simple RST tree
    gives back what we started with.

    Hand-written lightweight trees are compared for full equality;
    trees from ``self._test_trees()`` are compared only on the ``span``
    and ``edu_span`` of their root label.
    """
    def there_and_back(simple_tree):
        """Convert to a dependency tree and straight back again."""
        dep_tree = RstDepTree.from_simple_rst_tree(simple_tree)
        return deptree_to_simple_rst_tree(dep_tree)

    handwritten = [
        "(R:rel (S x) (N y))",
        """ (R:rel (S x) (N:rel (N h) (S t))) """,
        """ (R:r (S x) (N:r (N:r (S t1) (N h)) (S t2))) """,
    ]
    for text in handwritten:
        source_tree = parse_lightweight_tree(text)
        result_tree = there_and_back(source_tree)
        self.assertEqual(source_tree, result_tree,
                         "round-trip on " + text)

    for label, fixture in self._test_trees().items():
        source_tree = SimpleRSTTree.from_rst_tree(fixture)
        result_tree = there_and_back(source_tree)
        # only the root label is inspected for the fixture trees
        self.assertEqual(treenode(source_tree).span,
                         treenode(result_tree).span,
                         "span equality on " + label)
        self.assertEqual(treenode(source_tree).edu_span,
                         treenode(result_tree).edu_span,
                         "edu span equality on " + label)
def convert(corpus, multinuclear, odir):
    """
    Convert every RST tree in the corpus to a dependency tree
    (and back, but simplified using a set of relation types
    that will be systematically treated as multinuclear)

    For each corpus key ``k``, three text files named after ``k.doc``
    (minus its extension) are written, one in each of these
    subdirectories of ``odir``:

    * ``rst-binarised``: ``str()`` of the ``SimpleRSTTree``
    * ``rst-to-dt``: ``str()`` of the ``RstDepTree``
    * ``dt-to-rst``: ``str()`` of the tree converted back

    :param corpus: container iterated for keys and indexed by key; each
                   key must have a ``doc`` attribute
    :param multinuclear: forwarded as the second argument of
                         ``deptree_to_simple_rst_tree``
    :param odir: output directory; missing subdirectories are created
    """
    def dump(directory, basename, tree):
        """Write the string form of `tree` to directory/basename."""
        with open(os.path.join(directory, basename), 'w') as stream:
            stream.write(str(tree))

    bin_dir, dt_dir, rst2_dir = [
        os.path.join(odir, leaf)
        for leaf in ("rst-binarised", "rst-to-dt", "dt-to-rst")
    ]
    for subdir in (bin_dir, dt_dir, rst2_dir):
        if os.path.exists(subdir):
            continue
        os.makedirs(subdir)

    for key in corpus:
        basename = os.path.splitext(key.doc)[0]

        # each stage is written out before the next one is computed
        simple = educe.rst_dt.SimpleRSTTree.from_rst_tree(corpus[key])
        dump(bin_dir, basename, simple)

        dependencies = RstDepTree.from_simple_rst_tree(simple)
        dump(dt_dir, basename, dependencies)

        restored = deptree_to_simple_rst_tree(dependencies, multinuclear)
        dump(rst2_dir, basename, restored)
def convert(corpus, multinuclear, odir):
    """
    Convert every RST tree in the corpus to a dependency tree, and
    convert that dependency tree back to a simple RST tree.

    Output goes to three subdirectories of ``odir`` (created when they
    do not exist yet): ``rst-binarised`` for the simple RST tree,
    ``rst-to-dt`` for the dependency tree and ``dt-to-rst`` for the
    tree converted back.  Each file is named after the ``doc``
    attribute of the corpus key, with its extension removed, and holds
    the ``str()`` of the corresponding tree.

    NB: ``multinuclear`` is accepted but is not read anywhere in this
    function.
    """
    stage_dirs = {
        'simple': os.path.join(odir, "rst-binarised"),
        'dep': os.path.join(odir, "rst-to-dt"),
        'back': os.path.join(odir, "dt-to-rst"),
    }
    for stage in ('simple', 'dep', 'back'):
        target = stage_dirs[stage]
        if not os.path.exists(target):
            os.makedirs(target)

    for doc_key in corpus:
        stem = os.path.splitext(doc_key.doc)[0]

        # every tree is saved before the next conversion is attempted
        simple_tree = SimpleRSTTree.from_rst_tree(corpus[doc_key])
        with open(os.path.join(stage_dirs['simple'], stem), 'w') as out:
            out.write(str(simple_tree))

        dep_tree = RstDepTree.from_simple_rst_tree(simple_tree)
        with open(os.path.join(stage_dirs['dep'], stem), 'w') as out:
            out.write(str(dep_tree))

        back_tree = deptree_to_simple_rst_tree(dep_tree)
        with open(os.path.join(stage_dirs['back'], stem), 'w') as out:
            out.write(str(back_tree))
def test_rst_to_dt_nuclearity_loss(self):
    """
    Test that we still get sane tree structure with nuclearity loss

    ``nuked`` is ``tricky`` with its satellite relabelled as a nucleus.
    The ``nuked`` tree must survive a round trip unchanged; for the
    ``tricky`` tree the round trip is only run, its result is not
    asserted on.
    """
    tricky = """ (R:r (S t) (N h)) """
    nuked = """ (R:r (N t) (N h)) """

    # A larger pair of trees that was left disabled, kept for reference:
    #
    # tricky = """
    # (R:r
    #   (S x)
    #   (N:r (N:r (S t1) (N h))
    #        (S t2)))
    # """
    #
    # nuked = """
    # (R:r
    #   (N x)
    #   (N:r (N:r (N t1) (N h))
    #        (N t2)))
    # """

    nuked_tree = parse_lightweight_tree(nuked)
    tricky_tree = parse_lightweight_tree(tricky)

    # a little sanity check first: the all-nuclei tree round-trips
    # (the conversion back used to be called with an extra ['r'] argument)
    nuked_dep = RstDepTree.from_simple_rst_tree(nuked_tree)
    nuked_back = deptree_to_simple_rst_tree(nuked_dep)
    self.assertEqual(nuked_tree, nuked_back,
                     "same structure " + nuked)

    # now the real test: converting the tricky tree there and back
    # (same note: an extra ['r'] argument used to be passed here);
    # nothing is asserted about the outcome
    tricky_dep = RstDepTree.from_simple_rst_tree(tricky_tree)
    deptree_to_simple_rst_tree(tricky_dep)
def test_dt_to_rst_order(self):
    """
    Convert dependency trees back to simple RST trees with the
    dependents of node 0 in their original, reversed and shuffled order.

    Only the original order is asserted on (it must reproduce the input
    tree); the reversed and shuffled variants are merely run through
    the conversion.
    """
    specs = [
        "(R:r (N:r (N h) (S r1)) (S r2))",
        "(R:r (S:r (S l2) (N l1)) (N h))",
        "(R:r (N:r (S l1) (N h)) (S r1))",
        # ((l2 <- l1 <- h) -> r1 -> r2)
        """ (R:r (N:r (N:r (S l2) (N:r (S l1) (N h))) (S r1)) (S r2)) """,
        # (l2 <- ((l1 <- h) -> r1)) -> r2
        """ (R:r (N:r (S l2) (N:r (N:r (S l1) (N h)) (S r1))) (S r2)) """,
    ]
    for spec in specs:
        expected = parse_lightweight_tree(spec)
        pristine = RstDepTree.from_simple_rst_tree(expected)

        # (a) dependents left as they are: must round-trip exactly
        self.assertEqual(expected,
                         deptree_to_simple_rst_tree(pristine),
                         "round-trip on " + spec)

        # (b) dependents of node 0 reversed, on a private copy
        # NOTE(review): presumably deps(0) hands back the tree's own
        # list, so the in-place reverse affects the copy -- confirm
        flipped = copy.deepcopy(pristine)
        flipped.deps(0).reverse()
        deptree_to_simple_rst_tree(flipped)
        # TODO assertion on the tree rebuilt from the reversed order?

        # (c) dependents of node 0 in random order, on a private copy
        scrambled = copy.deepcopy(pristine)
        random.shuffle(scrambled.deps(0))
        deptree_to_simple_rst_tree(scrambled)
'corpus': os.path.relpath(rst_corpus_dir, start=DATA_DIR), 'strip_accents': strip_accents, 'lowercase': lowercase, 'stop_words': stop_words, 'n_jobs': n_jobs, 'verbose': verbose, } print('# parameters: ({})'.format(params), file=outfile) # do the real job corpus_items = sorted(rst_corpus.items()) doc_keys = [key.doc for key, doc in corpus_items] doc_key_dtrees = [ (doc_key.doc, RstDepTree.from_simple_rst_tree(SimpleRSTTree.from_rst_tree(doc))) for doc_key, doc in corpus_items ] edu_txts = list(e.text().replace('\n', ' ') for doc_key, dtree in doc_key_dtrees for e in dtree.edus) # vectorize each EDU using its text edu_vecs = vect.transform(edu_txts) # normalize each row of the count matrix using the l1 norm # (copy=False to perform in place) edu_vecs = normalize(edu_vecs, norm='l1', copy=False) # get all pairs of EDUs of interest, here as triples # (gov_idx, dep_idx, lbl) # TODO maybe sort edu pairs so that dependents with # the same governor are grouped (potential speed up?) edu_pairs = [