示例#1
0
def test_fragments():
    """Extract fragments from a seven-sentence toy treebank and check counts.

    Each treebank line is a bracketed tree whose terminals are word indices,
    followed by a tab and the sentence itself. The test asserts the number of
    extracted fragments and the sum of the counts returned by exactcounts.
    """
    from discodop._fragments import getctrees, extractfragments, exactcounts
    treebank = """\
(S (NP (DT 0) (NN 1)) (VP (VBP 2) (NP (DT 3) (JJ 4) (NN 5))))\
	The cat saw the hungry dog
(S (NP (DT 0) (NN 1)) (VP (VBP 2) (NP (DT 3) (NN 4))))\
	The cat saw the dog
(S (NP (DT 0) (NN 1)) (VP (VBP 2) (NP (DT 3) (NN 4))))\
	The mouse saw the cat
(S (NP (DT 0) (NN 1)) (VP (VBP 2) (NP (DT 3) (JJ 4) (NN 5))))\
	The mouse saw the yellow cat
(S (NP (DT 0) (JJ 1) (NN 2)) (VP (VBP 3) (NP (DT 4) (NN 5))))\
	The little mouse saw the cat
(S (NP (DT 0) (NN 1)) (VP (VBP 2) (NP (DT 3) (NN 4))))\
	The cat ate the dog
(S (NP (DT 0) (NN 1)) (VP (VBP 2) (NP (DT 3) (NN 4))))\
	The mouse ate the cat""".splitlines()
    # The backslash continuations above join every tree with its sentence,
    # so each resulting line has the form "<tree>\t<sentence>".
    trees = []
    sents = []
    for line in treebank:
        fields = line.split('\t')
        trees.append(binarize(Tree(fields[0])))
        sents.append(fields[1].split())
    # Overwrite every leaf with its left-to-right position in the tree.
    for tree in trees:
        leafpositions = tree.treepositions('leaves')
        for index, position in enumerate(leafpositions):
            tree[position] = index
    params = getctrees(zip(trees, sents))
    fragments = extractfragments(
        params['trees1'], 0, 0, params['vocab'], disc=True, approx=False)
    fragmentvalues = list(fragments.values())
    # The same treebank is passed for both tree arguments.
    corpus = params['trees1']
    counts = exactcounts(fragmentvalues, corpus, corpus)
    assert len(fragments) == 25
    assert sum(counts) == 100
示例#2
0
def test_fragments():
	""" Extract fragments from a seven-sentence toy treebank and check counts.

	Each treebank line is a bracketed tree whose terminals are word indices,
	followed by a tab and the sentence itself. The test asserts the number of
	extracted fragments and the sum of the counts returned by exactcounts. """
	from discodop._fragments import getctrees, extractfragments, exactcounts
	treebank = """\
(S (NP (DT 0) (NN 1)) (VP (VBP 2) (NP (DT 3) (JJ 4) (NN 5))))\
	The cat saw the hungry dog
(S (NP (DT 0) (NN 1)) (VP (VBP 2) (NP (DT 3) (NN 4))))\
	The cat saw the dog
(S (NP (DT 0) (NN 1)) (VP (VBP 2) (NP (DT 3) (NN 4))))\
	The mouse saw the cat
(S (NP (DT 0) (NN 1)) (VP (VBP 2) (NP (DT 3) (JJ 4) (NN 5))))\
	The mouse saw the yellow cat
(S (NP (DT 0) (JJ 1) (NN 2)) (VP (VBP 3) (NP (DT 4) (NN 5))))\
	The little mouse saw the cat
(S (NP (DT 0) (NN 1)) (VP (VBP 2) (NP (DT 3) (NN 4))))\
	The cat ate the dog
(S (NP (DT 0) (NN 1)) (VP (VBP 2) (NP (DT 3) (NN 4))))\
	The mouse ate the cat""".splitlines()
	# The backslash continuations above join every tree with its sentence,
	# so each resulting line has the form "<tree>\t<sentence>".
	trees = []
	sents = []
	for line in treebank:
		fields = line.split('\t')
		trees.append(binarize(Tree(fields[0])))
		sents.append(fields[1].split())
	# Overwrite every leaf with its left-to-right position in the tree.
	for tree in trees:
		leafpositions = tree.treepositions('leaves')
		for index, position in enumerate(leafpositions):
			tree[position] = index
	params = getctrees(zip(trees, sents))
	# The same treebank is passed for both tree arguments of exactcounts.
	corpus = params['trees1']
	fragments = extractfragments(corpus,
			0, 0, params['vocab'], disc=True, approx=False)
	fragmentvalues = list(fragments.values())
	counts = exactcounts(corpus, corpus, fragmentvalues)
	assert len(fragments) == 25
	assert sum(counts) == 100
示例#3
0
def test_fragments():
	""" Extract fragments from a seven-sentence toy treebank and check counts.

	The trees are given with words as terminals; the sentences are read off
	the leaves before the leaves are replaced by their indices. After the
	assertions, every fragment is printed together with its count. """
	from discodop._fragments import getctrees, extractfragments, exactcounts
	bracketings = """\
(S (NP (DT The) (NN cat)) (VP (VBP saw) (NP (DT the) (JJ hungry) (NN dog))))
(S (NP (DT The) (NN cat)) (VP (VBP saw) (NP (DT the) (NN dog))))
(S (NP (DT The) (NN mouse)) (VP (VBP saw) (NP (DT the) (NN cat))))
(S (NP (DT The) (NN mouse)) (VP (VBP saw) (NP (DT the) (JJ yellow) (NN cat))))
(S (NP (DT The) (JJ little) (NN mouse)) (VP (VBP saw) (NP (DT the) (NN cat))))
(S (NP (DT The) (NN cat)) (VP (VBP ate) (NP (DT the) (NN dog))))
(S (NP (DT The) (NN mouse)) (VP (VBP ate) (NP (DT the) (NN cat))))\
		""".splitlines()
	treebank = [binarize(Tree(line)) for line in bracketings]
	# Collect the words first; the loop below overwrites the leaves.
	sents = [tree.leaves() for tree in treebank]
	for tree in treebank:
		leafpositions = tree.treepositions('leaves')
		for index, position in enumerate(leafpositions):
			tree[position] = index
	params = getctrees(treebank, sents)
	# The same treebank is passed for both tree arguments of exactcounts.
	corpus = params['trees1']
	fragments = extractfragments(corpus, params['sents1'],
			0, 0, params['labels'], discontinuous=True, approx=False)
	fragmentvalues = list(fragments.values())
	counts = exactcounts(corpus, corpus, fragmentvalues)
	assert len(fragments) == 25
	assert sum(counts) == 100
	for (frag, sent), count in sorted(zip(fragments, counts), key=repr):
		# Substitute each number in the fragment by the sentence item
		# at that index.
		readable = re.sub(
				"[0-9]+", lambda match: sent[int(match.group())], frag)
		print("%s\t%d" % (readable, count))
示例#4
0
def initworkersimple(trees, sents, trees2=None, sents2=None):
	""" Initialize a worker from a treebank that is already in memory.

	Passes the arguments on to getctrees and merges the resulting mapping
	into the module-level PARAMS; afterwards PARAMS['trees1'] is asserted
	to be truthy. """
	ctrees = getctrees(trees, sents, trees2, sents2)
	PARAMS.update(ctrees)
	assert PARAMS['trees1']