Пример #1
0
def test_optimalbinarize():
	"""Check that optimal binarization is never worse than the baseline.

	Every tree of the sample corpus in which not all nodes have fan-out 1
	is binarized twice with ``optimalbinarize`` (once with
	``headdriven=False``, once with ``headdriven=True, h=1``). For each
	setting the maximum ``complexityfanout`` value over all subtrees is
	compared with the maximum obtained from ``binarize``. A case where the
	optimal binarization scores higher counts as a violation, and the test
	fails when any violation was recorded."""
	from discodop.treetransforms import optimalbinarize, complexityfanout
	from discodop.treebank import NegraCorpusReader

	def worst(binarized):
		"""Return the largest complexityfanout value among all subtrees."""
		return max(map(complexityfanout, binarized.subtrees()))

	corpus = NegraCorpusReader('alpinosample.export', punct='move')
	total = violations = violationshd = 0
	# NOTE(review): this slice discards the last 2000 trees, so a corpus with
	# at most 2000 trees gives zero iterations -- confirm this is intended.
	trees = list(corpus.trees().values())[:-2000]
	for n, (tree, sent) in enumerate(zip(trees, corpus.sents().values())):
		# Trees in which every node has fan-out 1 are not considered.
		if all(fanout(node) == 1 for node in addbitsets(tree).subtrees()):
			continue
		print(n, tree, '\n', ' '.join(sent))
		total += 1

		# First setting: no head-driven markovization.
		optbin = optimalbinarize(
				tree.copy(True), headdriven=False, h=None, v=1)
		# undo head-ordering to get a normal right-to-left binarization
		normbin = addbitsets(binarize(canonicalize(Tree.convert(tree))))
		if worst(optbin) > worst(normbin):
			print('non-hd\n', tree)
			print(worst(optbin), optbin)
			print(worst(normbin), normbin, '\n')
			violations += 1

		# Second setting: head-driven, compared against horzmarkov=1.
		optbin = optimalbinarize(tree.copy(True), headdriven=True, h=1, v=1)
		normbin = addbitsets(binarize(Tree.convert(tree), horzmarkov=1))
		if worst(optbin) > worst(normbin):
			print('hd\n', tree)
			print(worst(optbin), optbin)
			print(worst(normbin), normbin, '\n')
			violationshd += 1
	print('opt. bin. violations normal: %d / %d;  hd: %d / %d' % (
			violations, total, violationshd, total))
	assert violations == violationshd == 0
Пример #2
0
def test_optimalbinarize():
	"""Compare optimal binarizations against ordinary ones on a sample.

	For each corpus tree that contains a node whose fan-out is not 1, the
	result of ``optimalbinarize`` is compared with the result of
	``binarize`` under two settings (non-head-driven, and head-driven with
	horizontal markovization 1). The measure is the maximum of
	``complexityfanout`` over all subtrees; the optimal binarization must
	never exceed the ordinary one."""
	from discodop.treetransforms import optimalbinarize, complexityfanout
	from discodop.treebank import NegraCorpusReader

	def isworse(label, tree, optbin, normbin):
		"""Report and return True when optbin scores higher than normbin."""
		optmax = max(map(complexityfanout, optbin.subtrees()))
		normmax = max(map(complexityfanout, normbin.subtrees()))
		if optmax > normmax:
			print(label, tree)
			print(optmax, optbin)
			print(normmax, normbin, '\n')
			return True
		return False

	corpus = NegraCorpusReader('alpinosample.export', punct='move')
	total = violations = violationshd = 0
	# NOTE(review): ``[:-2000]`` is empty for a corpus of 2000 trees or
	# fewer, in which case nothing below is exercised -- verify the intent.
	pairs = zip(
			list(corpus.trees().values())[:-2000],
			corpus.sents().values())
	for n, (tree, sent) in enumerate(pairs):
		bitsettree = addbitsets(tree)
		if all(fanout(node) == 1 for node in bitsettree.subtrees()):
			continue  # every node has fan-out 1; skip this tree
		print(n, tree, '\n', ' '.join(sent))
		total += 1

		optbin = optimalbinarize(
				tree.copy(True), headdriven=False, h=None, v=1)
		# undo head-ordering to get a normal right-to-left binarization
		normbin = addbitsets(binarize(canonicalize(Tree.convert(tree))))
		# A bool adds as 0 or 1, so the counters remain integers.
		violations += isworse('non-hd\n', tree, optbin, normbin)

		optbin = optimalbinarize(tree.copy(True), headdriven=True, h=1, v=1)
		normbin = addbitsets(binarize(Tree.convert(tree), horzmarkov=1))
		violationshd += isworse('hd\n', tree, optbin, normbin)
	print('opt. bin. violations normal: %d / %d;  hd: %d / %d' % (
			violations, total, violationshd, total))
	assert violations == violationshd == 0
Пример #3
0
def treebankfanout(trees):
	"""Return the maximal fan-out found in a list of trees.

	The result is a tuple ``(fanout, n)`` where ``n`` is the position in
	``trees`` of a tree containing a node with that fan-out. Only nodes with
	more than one child are inspected. When there is no such node at all
	(e.g. the 'treebank' only has unary productions), ``(1, 0)`` is
	returned."""
	def candidates():
		"""Yield (fan-out, tree index) for each node with several children."""
		for index, tree in enumerate(trees):
			for node in addbitsets(tree).subtrees(lambda x: len(x) > 1):
				yield fanout(node), index

	# max() raises ValueError on an empty sequence; that is the signal that
	# no node with more than one child exists.
	try:
		return max(candidates())
	except ValueError:
		return 1, 0
Пример #4
0
def treebankfanout(trees):
	"""Return the maximal fan-out of ``trees`` and the index where it occurs.

	Each tree is passed through ``addbitsets`` and every node with more
	than one child contributes a ``(fanout, tree index)`` pair; the largest
	pair is returned. If no pair is produced, the result is ``(1, 0)``."""
	from discodop.treetransforms import addbitsets, fanout

	def branching(node):
		"""Tell whether a node has more than one child."""
		return len(node) > 1

	try:
		# The generator is consumed inside the try block: an empty sequence
		# makes max() raise ValueError ('treebank' may only have unary prods).
		pairs = (
				(fanout(node), index)
				for index, tree in enumerate(trees)
				for node in addbitsets(tree).subtrees(branching))
		return max(pairs)
	except ValueError:
		return 1, 0
Пример #5
0
def test_punct():
    """Verify that punctuation movement does not increase fan-out.

    The sample corpus is read with ``punct='move'`` and with
    ``punct='remove'``. For every sentence, the phrasal nodes of both
    versions are sorted by their smallest leaf and compared pairwise; each
    pair must have the same fan-out. On a mismatch the sentence, both
    trees and the tree read with ``headrules=None`` are printed before the
    assertion fails.
    """
    from discodop.treebank import NegraCorpusReader

    def phrasal(node):
        """Select non-empty nodes whose first child is a Tree."""
        return node and isinstance(node[0], Tree)

    def firstleaf(node):
        """Sort key: the smallest leaf below the node."""
        return min(node.leaves())

    filename = 'alpinosample.export'
    mangledtrees = NegraCorpusReader(filename, punct='move')
    strippedtrees = list(
        NegraCorpusReader(filename, punct='remove').trees().values())
    originals = list(
        NegraCorpusReader(
            filename, headrules=None,
            encoding='iso-8859-1').trees().values())

    rows = zip(
        count(),
        mangledtrees.trees().values(),
        mangledtrees.sents().values(),
        strippedtrees,
        originals)
    for n, mangled, sent, stripped, original in rows:
        print(n, end='. ')
        movednodes = sorted(
            addbitsets(mangled).subtrees(phrasal), key=firstleaf)
        strippednodes = sorted(
            addbitsets(stripped).subtrees(phrasal), key=firstleaf)
        for a, b in zip(movednodes, strippednodes):
            fanout_a, fanout_b = fanout(a), fanout(b)
            if fanout_a != fanout_b:
                # Dump the context before the assertion below fires.
                print(' '.join(sent))
                print(mangled)
                print(stripped)
                print(original)
            assert fanout_a == fanout_b, '%d %d\n%s\n%s' % (
                fanout_a, fanout_b, a, b)
    print()
Пример #6
0
def testpunct():
	"""Verify that punctuation movement does not increase fan-out.

	Reads the sample corpus once with ``punct='move'`` and once with
	``punct='remove'``, then checks sentence by sentence that phrasal
	nodes, ordered by their smallest leaf, have equal fan-out in both
	versions. The trees read with ``headrules=None`` are only used for
	the diagnostic output printed on a mismatch."""
	from discodop.treetransforms import addbitsets, fanout
	from discodop.treebank import NegraCorpusReader

	def phrasal(node):
		"""Select non-empty nodes whose first child is a Tree."""
		return len(node) and isinstance(node[0], Tree)

	def firstleaf(node):
		"""Sort key: the smallest leaf below the node."""
		return min(node.leaves())

	filename = 'alpinosample.export'
	mangledtrees = NegraCorpusReader('.', filename, punct='move')
	strippedtrees = list(NegraCorpusReader(
			'.', filename, punct='remove').parsed_sents().values())
	originals = list(NegraCorpusReader(
			'.', filename, headrules=None,
			encoding='iso-8859-1').parsed_sents().values())

	rows = zip(
			count(),
			mangledtrees.parsed_sents().values(),
			mangledtrees.sents().values(),
			strippedtrees,
			originals)
	for n, mangled, sent, stripped, original in rows:
		print(n, end='')
		movednodes = sorted(
				addbitsets(mangled).subtrees(phrasal), key=firstleaf)
		strippednodes = sorted(
				addbitsets(stripped).subtrees(phrasal), key=firstleaf)
		for a, b in zip(movednodes, strippednodes):
			fanout_a, fanout_b = fanout(a), fanout(b)
			if fanout_a != fanout_b:
				# Dump the context before the assertion below fires.
				print(' '.join(sent))
				print(mangled)
				print(stripped)
				print(original)
			assert fanout_a == fanout_b, '%d %d\n%s\n%s' % (
				fanout_a, fanout_b, a, b)
	print()
Пример #7
0
    def __init__(self, trees, sentences):
        """Induce an instance of a grammar by using tree-sentence pairs.

        Parameters
        ----------
        trees: list(Tree)
            The list of gold trees.
        sentences: list(list(str))
            The list of sentences, paired with ``trees`` by position.

        The rules in the grammar have the following form:
        rule pos:   (((lhs, 'Epsilon' ), (word,           )), probability)
        rule other: (((lhs, rhs_1, ...), ((1, 2, 0), (0, ))), probability)

        Attributes set
        --------------
        max_fanout
            Largest ``fanout(tree)`` over the ``ImmutableTree`` instances
            in ``trees``; 1 if none is larger.
        max_length, min_length
            Longest and shortest sentence length; they stay at ``0`` and
            ``sys.maxsize`` when no pairs are given.
        terminal_rules, continuous_rules, discontinuous_rules
            The induced rules, split by kind (see below).

        """
        # Induce the raw rules from the treebank.
        rules = treebankgrammar(trees, sentences)

        # Collect corpus-wide statistics in a single pass over the pairs.
        self.max_fanout = 1
        self.max_length = 0
        self.min_length = sys.maxsize
        for tree, sent in zip(trees, sentences):
            tree_fanout = fanout(tree)
            sent_length = len(sent)
            # Only ImmutableTree instances may raise the recorded fan-out.
            if (tree_fanout > self.max_fanout
                    and isinstance(tree, ImmutableTree)):
                self.max_fanout = tree_fanout
            self.max_length = max(self.max_length, sent_length)
            self.min_length = min(self.min_length, sent_length)

        # Wrap every raw rule in a Rule and sort it into one of three
        # groups: 'Epsilon' right-hand side, single-element ``yf``, or
        # anything else.
        terminal, continuous, discontinuous = [], [], []
        for raw in rules:
            rule = Rule(raw)
            if rule.rhs[0] == "Epsilon":
                bucket = terminal
            elif len(rule.yf) == 1:
                bucket = continuous
            else:
                bucket = discontinuous
            bucket.append(rule)

        # Publish the three groups on the instance.
        self.discontinuous_rules = discontinuous
        self.continuous_rules = continuous
        self.terminal_rules = terminal