def test_optimalbinarize():
    """Check optimal binarization against plain right-to-left binarization.

    For every tree in the sample corpus that has at least one node whose
    fan-out is not 1, the worst parsing complexity of the optimal
    binarization must not exceed that of the regular binarization, both
    without head-driven markovization and with it (h=1, v=1).  Offending
    trees are printed, a summary line is printed at the end, and the test
    fails if any violation was found.
    """
    from discodop.treebank import NegraCorpusReader
    from discodop.treetransforms import optimalbinarize, complexityfanout

    def worst(binarized):
        # Highest complexity over all nodes of a binarized tree.
        return max(map(complexityfanout, binarized.subtrees()))

    corpus = NegraCorpusReader('alpinosample.export', punct='move')
    # NOTE(review): the [:-2000] slice drops the last 2000 trees, so a
    # corpus with 2000 trees or fewer yields nothing to check -- confirm
    # that this is intended for the sample file.
    trees = list(corpus.trees().values())[:-2000]
    sents = corpus.sents().values()

    total = 0
    violations = 0
    violationshd = 0
    for n, (tree, sent) in enumerate(zip(trees, sents)):
        # Trees in which every node has fan-out 1 are not interesting here.
        if not any(fanout(node) != 1
                for node in addbitsets(tree).subtrees()):
            continue
        print(n, tree, '\n', ' '.join(sent))
        total += 1

        # Pass 1: no head-driven markovization.
        optbin = optimalbinarize(
                tree.copy(True), headdriven=False, h=None, v=1)
        # undo head-ordering to get a normal right-to-left binarization
        normbin = addbitsets(binarize(canonicalize(Tree.convert(tree))))
        optcost = worst(optbin)
        normcost = worst(normbin)
        if optcost > normcost:
            print('non-hd\n', tree)
            print(optcost, optbin)
            print(normcost, normbin, '\n')
            violations += 1

        # Pass 2: head-driven, horizontal markovization of 1.
        optbin = optimalbinarize(
                tree.copy(True), headdriven=True, h=1, v=1)
        normbin = addbitsets(binarize(Tree.convert(tree), horzmarkov=1))
        optcost = worst(optbin)
        normcost = worst(normbin)
        if optcost > normcost:
            print('hd\n', tree)
            print(optcost, optbin)
            print(normcost, normbin, '\n')
            violationshd += 1

    print('opt. bin. violations normal: %d / %d; hd: %d / %d' % (
            violations, total, violationshd, total))
    assert violations == violationshd == 0
def treebankfanout(trees):
    """Get maximal fan-out of a list of trees.

    Only nodes with more than one child are inspected.

    :param trees: an iterable of trees; each one is passed to
        ``addbitsets`` before its nodes are examined.
    :returns: a tuple ``(fanout, index)`` holding the highest fan-out found
        and the position in ``trees`` of the tree it was found in (on a tie
        the highest position wins, as with ``max`` over the tuples).  When
        no node with more than one child exists -- e.g. the 'treebank' only
        has unary productions, or is empty -- ``(1, 0)`` is returned.
    :raises: whatever iterating ``trees``, ``addbitsets`` or ``fanout``
        raise; such errors are no longer mistaken for an empty treebank.
    """
    best = None
    for n, tree in enumerate(trees):
        for node in addbitsets(tree).subtrees(lambda x: len(x) > 1):
            candidate = (fanout(node), n)
            if best is None or candidate > best:
                best = candidate
    # An explicit emptiness check replaces catching ValueError from max(),
    # which also hid ValueErrors raised while processing the trees.
    if best is None:
        return 1, 0
    return best
def treebankfanout(trees):
    """Get maximal fan-out of a list of trees.

    Only nodes with more than one child are inspected.

    :param trees: an iterable of trees; each one is passed to
        ``addbitsets`` before its nodes are examined.
    :returns: a tuple ``(fanout, index)`` holding the highest fan-out found
        and the position in ``trees`` of the tree it was found in (on a tie
        the highest position wins, as with ``max`` over the tuples).  When
        no node with more than one child exists -- e.g. the 'treebank' only
        has unary prods, or is empty -- ``(1, 0)`` is returned.
    :raises: whatever iterating ``trees``, ``addbitsets`` or ``fanout``
        raise; such errors are no longer mistaken for an empty treebank.
    """
    from discodop.treetransforms import addbitsets, fanout
    best = None
    for n, tree in enumerate(trees):
        for node in addbitsets(tree).subtrees(lambda x: len(x) > 1):
            candidate = (fanout(node), n)
            if best is None or candidate > best:
                best = candidate
    # An explicit emptiness check replaces catching ValueError from max(),
    # which also hid ValueErrors raised while processing the trees.
    if best is None:
        return 1, 0
    return best
def test_punct():
    """Check that moving punctuation does not change fan-out.

    The sample corpus is read three times: with punctuation moved, with
    punctuation removed, and unprocessed (only printed for diagnosis).  For
    each sentence the phrasal nodes of the 'moved' and 'removed' trees are
    ordered by their lowest leaf and compared pairwise; every pair must
    have the same fan-out.
    """
    from discodop.treebank import NegraCorpusReader

    def phrasal(node):
        # Non-empty node whose first child is itself a tree.
        return node and isinstance(node[0], Tree)

    def firstleaf(node):
        return min(node.leaves())

    filename = 'alpinosample.export'
    mangledtrees = NegraCorpusReader(filename, punct='move')
    strippedtrees = list(
            NegraCorpusReader(filename, punct='remove').trees().values())
    originals = list(NegraCorpusReader(
            filename, headrules=None,
            encoding='iso-8859-1').trees().values())

    rows = zip(
            mangledtrees.trees().values(),
            mangledtrees.sents().values(),
            strippedtrees,
            originals)
    for n, (mangled, sent, nopunct, original) in enumerate(rows):
        print(n, end='. ')
        movednodes = sorted(
                addbitsets(mangled).subtrees(phrasal), key=firstleaf)
        strippednodes = sorted(
                addbitsets(nopunct).subtrees(phrasal), key=firstleaf)
        for a, b in zip(movednodes, strippednodes):
            fana = fanout(a)
            fanb = fanout(b)
            if fana != fanb:
                # Dump the whole sentence before the assertion fires.
                print(' '.join(sent))
                print(mangled)
                print(nopunct)
                print(original)
            assert fana == fanb, '%d %d\n%s\n%s' % (fana, fanb, a, b)
    print()
def testpunct():
    """Check that moving punctuation does not change fan-out.

    The sample corpus is read three times: with punctuation moved, with
    punctuation removed, and unprocessed (only printed for diagnosis).  For
    each sentence the phrasal nodes of the 'moved' and 'removed' trees are
    ordered by their lowest leaf and compared pairwise; every pair must
    have the same fan-out.
    """
    from discodop.treebank import NegraCorpusReader
    from discodop.treetransforms import addbitsets, fanout

    def phrasal(node):
        # Non-empty node whose first child is itself a tree.
        return len(node) and isinstance(node[0], Tree)

    def firstleaf(node):
        return min(node.leaves())

    filename = 'alpinosample.export'
    mangledtrees = NegraCorpusReader('.', filename, punct='move')
    strippedtrees = list(NegraCorpusReader(
            '.', filename, punct='remove').parsed_sents().values())
    originals = list(NegraCorpusReader(
            '.', filename, headrules=None,
            encoding='iso-8859-1').parsed_sents().values())

    rows = zip(
            mangledtrees.parsed_sents().values(),
            mangledtrees.sents().values(),
            strippedtrees,
            originals)
    for n, (mangled, sent, nopunct, original) in enumerate(rows):
        print(n, end='')
        movednodes = sorted(
                addbitsets(mangled).subtrees(phrasal), key=firstleaf)
        strippednodes = sorted(
                addbitsets(nopunct).subtrees(phrasal), key=firstleaf)
        for a, b in zip(movednodes, strippednodes):
            fana = fanout(a)
            fanb = fanout(b)
            if fana != fanb:
                # Dump the whole sentence before the assertion fires.
                print(' '.join(sent))
                print(mangled)
                print(nopunct)
                print(original)
            assert fana == fanb, '%d %d\n%s\n%s' % (fana, fanb, a, b)
    print()
def __init__(self, trees, sentences):
    """Induce an instance of a grammar by using tree-sentence pairs.

    Parameters
    ----------
    trees: list(Tree)
        The list of gold trees.
    sentences: list(list(str))
        The list of sentences.

    Notes
    -----
    The rules in the grammar have the following form:
        rule pos:   (((lhs, 'Epsilon' ), (word, )), probability)
        rule other: (((lhs, rhs_1, ...), ((1, 2, 0), (0, ))), probability)

    Attributes set here: ``max_fanout``, ``max_length``, ``min_length``,
    ``terminal_rules``, ``continuous_rules`` and ``discontinuous_rules``.
    If there are no tree-sentence pairs, ``min_length`` keeps its initial
    value ``sys.maxsize``.
    """
    # Extract the basic rules from the treebank.
    extracted = treebankgrammar(trees, sentences)

    # General statistics over the training pairs.
    self.max_fanout = 1
    self.max_length = 0
    self.min_length = sys.maxsize
    for tree, sent in zip(trees, sentences):
        tree_fanout = fanout(tree)
        sent_length = len(sent)
        # Only ImmutableTree instances contribute to the maximal fan-out.
        if isinstance(tree, ImmutableTree):
            self.max_fanout = max(self.max_fanout, tree_fanout)
        self.max_length = max(self.max_length, sent_length)
        self.min_length = min(self.min_length, sent_length)

    # Sort the rules into three kinds: terminal rules (right-hand side
    # starts with "Epsilon"), continuous rules (yield function of length
    # one) and discontinuous rules (everything else).
    terminal, continuous, discontinuous = [], [], []
    for raw in extracted:
        rule = Rule(raw)
        if rule.rhs[0] == "Epsilon":
            bucket = terminal
        elif len(rule.yf) == 1:
            bucket = continuous
        else:
            bucket = discontinuous
        bucket.append(rule)

    self.terminal_rules = terminal
    self.continuous_rules = continuous
    self.discontinuous_rules = discontinuous