def test_eq(self):
    """Equality compares structure/entities/tokens but not ids or scores."""
    tree = D.from_string('(1 some-type -1 -1 -1 ("token"))')
    # identity
    other = D.from_string('(1 some-type -1 -1 -1 ("token"))')
    assert tree == other
    # ids and scores don't matter
    other = D.from_string('(100 some-type 0.114 -1 -1 ("token"))')
    assert tree == other
    # tokens matter
    other = D.from_string('(1 some-type -1 -1 -1 ("nekot"))')
    assert tree != other
    # and type of rhs
    assert tree != '(1 some-type -1 -1 -1 ("token"))'
    # and tokenization
    other = D.from_string('(1 some-type -1 2 7 ("token"))')
    assert tree != other
    # and of course entities
    other = D.from_string('(1 epyt-emos -1 -1 -1 ("token"))')
    assert tree != other
    # and number of children
    tree = D.from_string('(1 x -1 -1 -1 (2 y -1 -1 -1 ("y")))')
    other = D.from_string(
        '(1 x -1 -1 -1 (2 y -1 -1 -1 ("y")) (3 z -1 -1 -1 ("z")))')
    assert tree != other
    # and order of children
    tree = D.from_string(
        '(1 x -1 -1 -1 (2 y -1 -1 -1 ("y")) (3 z -1 -1 -1 ("z")))')
    other = D.from_string(
        '(1 x -1 -1 -1 (3 z -1 -1 -1 ("z")) (2 y -1 -1 -1 ("y")))')
    assert tree != other
def test_str(self):
    """str() round-trips the UDF serialization exactly."""
    src = '(1 some-type -1 -1 -1 ("token"))'
    assert str(D.from_string(src)) == src
    src = (r'(root (1 some-type 0.4 0 5 (2 a-lex 0.8 0 1 '
           r'("a" 1 "token [ +FORM \"a\" ]")) '
           r'(3 bcd-lex 0.5 2 5 ("bcd" 2 "token [ +FORM \"bcd\" ]"))))')
    assert str(D.from_string(src)) == src
def test_str(self):
    """str() round-trips the UDF serialization exactly."""
    layout = '(1 some-thing -1 -1 -1 ("token"))'
    assert str(D.from_string(layout)) == layout
    layout = (r'(root (1 some-thing 0.4 0 5 (2 a-lex 0.8 0 1 '
              r'("a" 1 "token [ +FORM \"a\" ]")) '
              r'(3 bcd-lex 0.5 2 5 ("bcd" 2 "token [ +FORM \"bcd\" ]"))))')
    assert str(D.from_string(layout)) == layout
def test_from_dict(self):
    """from_dict() builds trees equal to their from_string counterparts."""
    src = '(root (1 some-type -1 -1 -1 ("a")))'
    payload = {
        'entity': 'root',
        'daughters': [
            {'id': 1, 'entity': 'some-type', 'form': 'a'},
        ],
    }
    assert D.from_dict(payload) == D.from_string(src)
    # terminals may carry token ids and TFS strings
    src = (r'(root (1 some-type -1 -1 -1 ("a b"'
           r' 2 "token [ +FORM \"a\" ]"'
           r' 3 "token [ +FORM \"b\" ]")))')
    payload = {
        'entity': 'root',
        'daughters': [
            {
                'id': 1,
                'entity': 'some-type',
                'form': 'a b',
                'tokens': [
                    {'id': 2, 'tfs': r'token [ +FORM \"a\" ]'},
                    {'id': 3, 'tfs': r'token [ +FORM \"b\" ]'},
                ],
            },
        ],
    }
    assert D.from_dict(payload) == D.from_string(src)
def test_from_dict(self):
    """from_dict() builds trees equal to their from_string counterparts,
    including UDX decorations (`^` head marker, `@type` suffix)."""
    src = '(root (1 some-thing -1 -1 -1 ("a")))'
    payload = {
        'entity': 'root',
        'daughters': [
            {'id': 1, 'entity': 'some-thing', 'form': 'a'},
        ],
    }
    assert D.from_dict(payload) == D.from_string(src)
    # head + lexical type map to the `^` and `@` decorations
    src = (r'(root (1 ^some-thing@some-type -1 -1 -1 ("a b"'
           r' 2 "token [ +FORM \"a\" ]"'
           r' 3 "token [ +FORM \"b\" ]")))')
    payload = {
        'entity': 'root',
        'daughters': [
            {
                'id': 1,
                'entity': 'some-thing',
                'type': 'some-type',
                'head': True,
                'form': 'a b',
                'tokens': [
                    {'id': 2, 'tfs': r'token [ +FORM \"a\" ]'},
                    {'id': 3, 'tfs': r'token [ +FORM \"b\" ]'},
                ],
            },
        ],
    }
    assert D.from_dict(payload) == D.from_string(src)
def test_to_udf(self):
    """to_udf() serializes back to UDF, flat or indented."""
    src = '(1 some-type -1 -1 -1 ("token"))'
    assert D.from_string(src).to_udf(indent=None) == src
    assert D.from_string(src).to_udf(indent=1) == (
        '(1 some-type -1 -1 -1\n'
        ' ("token"))'
    )
    src = (r'(root (1 some-type 0.4 0 5 (2 a-lex 0.8 0 1 '
           r'("a" 3 "token [ +FORM \"a\" ]")) '
           r'(4 bcd-lex 0.5 2 5 ("bcd" 5 "token [ +FORM \"bcd\" ]"))))')
    assert D.from_string(src).to_udf(indent=1) == (
        '(root\n'
        ' (1 some-type 0.4 0 5\n'
        ' (2 a-lex 0.8 0 1\n'
        ' ("a"\n'
        ' 3 "token [ +FORM \\"a\\" ]"))\n'
        ' (4 bcd-lex 0.5 2 5\n'
        ' ("bcd"\n'
        ' 5 "token [ +FORM \\"bcd\\" ]"))))'
    )
    src = (r'(root (1 some-type 0.4 0 5 (2 a-lex 0.8 0 1 '
           r'("a b" 3 "token [ +FORM \"a\" ]" 4 "token [ +FORM \"b\" ]"))))')
    assert D.from_string(src).to_udf(indent=1) == (
        '(root\n'
        ' (1 some-type 0.4 0 5\n'
        ' (2 a-lex 0.8 0 1\n'
        ' ("a b"\n'
        ' 3 "token [ +FORM \\"a\\" ]"\n'
        ' 4 "token [ +FORM \\"b\\" ]"))))'
    )
def test_eq(self):
    """Equality ignores ids/scores; entities, tokens, spans, and child
    count/order all matter."""
    left = D.from_string('(1 some-type -1 -1 -1 ("token"))')
    # identity
    right = D.from_string('(1 some-type -1 -1 -1 ("token"))')
    assert left == right
    # ids and scores don't matter
    right = D.from_string('(100 some-type 0.114 -1 -1 ("token"))')
    assert left == right
    # tokens matter
    right = D.from_string('(1 some-type -1 -1 -1 ("nekot"))')
    assert left != right
    # and type of rhs
    assert left != '(1 some-type -1 -1 -1 ("token"))'
    # and tokenization
    right = D.from_string('(1 some-type -1 2 7 ("token"))')
    assert left != right
    # and of course entities
    right = D.from_string('(1 epyt-emos -1 -1 -1 ("token"))')
    assert left != right
    # and number of children
    left = D.from_string('(1 x -1 -1 -1 (2 y -1 -1 -1 ("y")))')
    right = D.from_string(
        '(1 x -1 -1 -1 (2 y -1 -1 -1 ("y")) (3 z -1 -1 -1 ("z")))')
    assert left != right
    # and order of children
    left = D.from_string(
        '(1 x -1 -1 -1 (2 y -1 -1 -1 ("y")) (3 z -1 -1 -1 ("z")))')
    right = D.from_string(
        '(1 x -1 -1 -1 (3 z -1 -1 -1 ("z")) (2 y -1 -1 -1 ("y")))')
    assert left != right
def test_lexical_type(self):
    # NOTE: this returns None for standard UDF or non-preterminals
    tree = D.from_string('(root (1 a-type -1 -1 -1 ("a"))'
                         ' (2 b-type -1 -1 -1 ("b")))')
    assert tree.lexical_type() is None
    assert tree.daughters[0].lexical_type() is None
    assert tree.daughters[1].lexical_type() is None
    # UDX `@` suffix carries the lexical type for preterminals
    tree = D.from_string('(root (1 a-type@a-type_le -1 -1 -1 ("a"))'
                         ' (2 b-type@b-type_le -1 -1 -1 ("b")))')
    assert tree.lexical_type() is None
    assert tree.daughters[0].lexical_type() == 'a-type_le'
    assert tree.daughters[1].lexical_type() == 'b-type_le'
def test_basic_entity(self):
    # this works for both UDX and standard UDF
    tree = D.from_string('(root (1 a-type -1 -1 -1 ("a"))'
                         ' (2 b-type -1 -1 -1 ("b")))')
    assert tree.basic_entity() == 'root'
    assert tree.daughters[0].basic_entity() == 'a-type'
    assert tree.daughters[1].basic_entity() == 'b-type'
    # with UDX, basic_entity() strips the `@type` suffix that `entity` keeps
    tree = D.from_string('(root (1 a-type@a-type_le -1 -1 -1 ("a"))'
                         ' (2 b-type@b-type_le -1 -1 -1 ("b")))')
    assert tree.basic_entity() == 'root'
    assert tree.daughters[0].entity == 'a-type@a-type_le'
    assert tree.daughters[0].basic_entity() == 'a-type'
    assert tree.daughters[1].entity == 'b-type@b-type_le'
    assert tree.daughters[1].basic_entity() == 'b-type'
def test_terminals(self):
    """terminals() yields terminal nodes in order; form may span tokens."""
    tree = D.from_string('(root (1 some-thing -1 -1 -1'
                         ' (2 a-thing -1 -1 -1 ("a"))'
                         ' (3 b-thing -1 -1 -1 ("b"))))')
    assert [term.form for term in tree.terminals()] == ['a', 'b']
    tree = D.from_string('(root'
                         ' (1 some-thing@some-type 0.4 0 5'
                         ' (2 a-lex@a-type 0.8 0 1'
                         ' ("a b"'
                         ' 3 "token [ +FORM \\"a\\" ]"'
                         ' 4 "token [ +FORM \\"b\\" ]"))'
                         ' (5 b-lex@b-type 0.9 1 2'
                         ' ("b"'
                         ' 6 "token [ +FORM \\"b\\" ]"))))')
    assert [term.form for term in tree.terminals()] == ['a b', 'b']
def test_is_head(self):
    # NOTE: is_head() is undefined for standard UDF without the
    # head marker ^
    tree = D.from_string('(root (1 some-type -1 -1 -1 ("a"))'
                         ' (2 ^some-type -1 -1 -1 ("b")))')
    assert tree.daughters[0].is_head() == False
    assert tree.daughters[1].is_head() == True
def test_type(self):
    """`type` is None for plain UDF; for UDX it is the `@` suffix."""
    tree = D.from_string('(root (1 some-thing -1 -1 -1'
                         ' (2 a-thing -1 -1 -1 ("a"))'
                         ' (3 b-thing -1 -1 -1 ("b"))))')
    assert tree.type is None
    child = tree.daughters[0]
    assert child.type is None
    assert child.daughters[0].type is None
    assert child.daughters[1].type is None
    tree = D.from_string('(root (1 some-thing@some-type -1 -1 -1'
                         ' (2 a-thing@a-type -1 -1 -1 ("a"))'
                         ' (3 b-thing@b-type -1 -1 -1 ("b"))))')
    assert tree.type is None
    child = tree.daughters[0]
    assert child.type == 'some-type'
    assert child.daughters[0].type == 'a-type'
    assert child.daughters[1].type == 'b-type'
def test_entity(self):
    """`entity` is the node label without any UDX `@type` decoration."""
    tree = D.from_string('(root (1 some-thing -1 -1 -1'
                         ' (2 a-thing -1 -1 -1 ("a"))'
                         ' (3 b-thing -1 -1 -1 ("b"))))')
    assert tree.entity == 'root'
    child = tree.daughters[0]
    assert child.entity == 'some-thing'
    assert child.daughters[0].entity == 'a-thing'
    assert child.daughters[1].entity == 'b-thing'
    # the @type suffix does not leak into `entity`
    tree = D.from_string('(root (1 some-thing@some-type -1 -1 -1'
                         ' (2 a-thing@a-type -1 -1 -1 ("a"))'
                         ' (3 b-thing@b-type -1 -1 -1 ("b"))))')
    assert tree.entity == 'root'
    child = tree.daughters[0]
    assert child.entity == 'some-thing'
    assert child.daughters[0].entity == 'a-thing'
    assert child.daughters[1].entity == 'b-thing'
def preprocess(inp, derivation):
    """Rebuild a surface sentence from input text and a derivation string.

    For each leaf token of the derivation: native lexical entries recover
    their form from the +FROM/+TO character offsets in the token TFS and
    slice it out of ``inp``; generic entries fall back to the trace lemma.
    Punctuation implied by the trace name is then re-attached, and the
    joined forms are returned as a space-separated string.
    """
    derivation = Derivation.from_string(derivation)
    tokens = get_tokens(derivation)
    # NOTE(review): `traces` is computed but never used below
    traces = [i[1] for i in tokens]
    sent = []
    for token, trace in tokens:
        # third-from-last path component of the trace — presumably the
        # lexical-entry name; TODO confirm trace format
        lemma = trace.split('/')[-3]
        #native entry
        if not lemma.startswith('generic'):
            # character span is embedded in the token TFS as escaped
            # +FROM/+TO values, e.g. +FROM \"0\" +TO \"3\"
            to = int(re.search(r'\+TO .*?\\"(\d+)\\"', token.tfs).group(1))
            fro = int(re.search(r'\+FROM .*?\\"(\d+)\\"', token.tfs).group(1))
            form = inp[fro:to]
        else:
            form = lemma
        #add punctuation
        # NOTE(review): these are substring tests and they compound —
        # a trace containing 'qmark-bang' also matches 'bang' and 'qmark'
        # above, and 'comma-rp' also matches 'comma' and 'rparen', so the
        # form gets wrapped multiple times; confirm this is intended.
        if 'comma' in trace: form = '%s,' % form
        if 'asterisk_' in trace: form = '%s*' % form
        if 'asterisk-pre' in trace: form = '*%s' % form
        if 'threedot' in trace: form = '%s...' % form
        if 'hyphen' in trace: form = '%s-' % form
        if 'sqright' in trace: form = '%s\'' % form
        if 'sqleft' in trace: form = '\'%s' % form
        # NOTE(review): dqright/dqleft attach single quotes, same as
        # sqright/sqleft — double-quote characters may have been intended
        if 'dqright' in trace: form = '%s\'' % form
        if 'dqleft' in trace: form = '\'%s' % form
        if 'rparen' in trace: form = '%s)' % form
        if 'lparen' in trace: form = '(%s' % form
        if 'comma-rp' in trace: form = '%s,)' % form
        if 'bang' in trace: form = '%s!' % form
        if 'qmark' in trace: form = '%s?' % form
        if 'qmark-bang' in trace: form = '%s?!' % form
        if 'period' in trace: form = '%s.' % form
        #fix compounds
        # keep only the part after the first hyphen unless the form
        # ends with a hyphen (e.g. a trailing hyphen added above)
        if '-' in form and form[-1] != '-':
            form = form.split('-')[1]
        sent.append(form)
    return ' '.join(sent)
def test_lexical_type(self):
    # NOTE: this returns None for standard UDF or non-preterminals
    tree = D.from_string('(root (1 some-thing -1 -1 -1'
                         ' (2 a-thing -1 -1 -1 ("a"))'
                         ' (3 b-thing -1 -1 -1 ("b"))))')
    with pytest.warns(DeprecationWarning):
        assert tree.lexical_type() is None
    child = tree.daughters[0]
    assert child.daughters[0].lexical_type() is None
    assert child.daughters[1].lexical_type() is None
    # preterminals decorated with @type report it as the lexical type
    tree = D.from_string('(root (1 some-thing -1 -1 -1'
                         ' (2 a-thing@a-type_le -1 -1 -1 ("a"))'
                         ' (3 b-thing@b-type_le -1 -1 -1 ("b"))))')
    with pytest.warns(DeprecationWarning):
        assert tree.lexical_type() is None
    child = tree.daughters[0]
    assert child.daughters[0].lexical_type() == 'a-type_le'
    assert child.daughters[1].lexical_type() == 'b-type_le'
def test_basic_entity(self):
    # this works for both UDX and standard UDF
    tree = D.from_string('(root (1 some-thing -1 -1 -1'
                         ' (2 a-thing -1 -1 -1 ("a"))'
                         ' (3 b-thing -1 -1 -1 ("b"))))')
    with pytest.warns(DeprecationWarning):
        assert tree.basic_entity() == 'root'
    child = tree.daughters[0]
    assert child.daughters[0].basic_entity() == 'a-thing'
    assert child.daughters[1].basic_entity() == 'b-thing'
    # the @type decoration is stripped from the basic entity
    tree = D.from_string('(root (1 some-thing -1 -1 -1'
                         ' (2 a-thing@a-type_le -1 -1 -1 ("a"))'
                         ' (3 b-thing@b-type_le -1 -1 -1 ("b"))))')
    with pytest.warns(DeprecationWarning):
        assert tree.basic_entity() == 'root'
    child = tree.daughters[0]
    assert child.basic_entity() == 'some-thing'
    assert child.daughters[0].basic_entity() == 'a-thing'
    assert child.daughters[1].basic_entity() == 'b-thing'
def test_to_udx(self):
    """to_udx() keeps UDX decorations (@type) in the serialization."""
    src = '(1 some-thing -1 -1 -1 ("token"))'
    assert D.from_string(src).to_udx(indent=None) == src
    src = (r'(root (1 some-thing@some-type 0.4 0 5 '
           r'(2 a-lex@a-type 0.8 0 1 '
           r'("a b" 3 "token [ +FORM \"a\" ]" 4 "token [ +FORM \"b\" ]")) '
           r'(5 b-lex@b-type 0.9 1 2 '
           r'("b" 6 "token [ +FORM \"b\" ]"))))')
    assert D.from_string(src).to_udx(indent=1) == (
        '(root\n'
        ' (1 some-thing@some-type 0.4 0 5\n'
        ' (2 a-lex@a-type 0.8 0 1\n'
        ' ("a b"\n'
        ' 3 "token [ +FORM \\"a\\" ]"\n'
        ' 4 "token [ +FORM \\"b\\" ]"))\n'
        ' (5 b-lex@b-type 0.9 1 2\n'
        ' ("b"\n'
        ' 6 "token [ +FORM \\"b\\" ]"))))'
    )
def test_is_head(self):
    # NOTE: is_head() is undefined for nodes with multiple
    # siblings, none of which are marked head (e.g. in plain UDF)
    tree = D.from_string('(root (1 some-thing -1 -1 -1'
                         ' (2 some-thing -1 -1 -1 ("a"))'
                         ' (3 some-thing -1 -1 -1 ("b"))))')
    assert tree.is_head() == True
    child = tree.daughters[0]
    assert child.is_head() == True
    assert child.daughters[0].is_head() == None
    assert child.daughters[1].is_head() == None
    # if one sibling is marked, all become decidable
    tree = D.from_string('(root (1 some-thing -1 -1 -1'
                         ' (2 some-thing -1 -1 -1 ("a"))'
                         ' (3 ^some-thing -1 -1 -1 ("b"))))')
    assert tree.is_head() == True
    child = tree.daughters[0]
    assert child.is_head() == True
    assert child.daughters[0].is_head() == False
    assert child.daughters[1].is_head() == True
def test_to_udf(self):
    """to_udf() serializes to plain UDF, dropping any UDX decorations."""
    src = '(1 some-thing -1 -1 -1 ("token"))'
    assert D.from_string(src).to_udf(indent=None) == src
    assert D.from_string(src).to_udf(indent=1) == (
        '(1 some-thing -1 -1 -1\n'
        ' ("token"))'
    )
    src = (r'(root (1 some-thing 0.4 0 5 (2 a-lex 0.8 0 1 '
           r'("a" 3 "token [ +FORM \"a\" ]")) '
           r'(4 bcd-lex 0.5 2 5 ("bcd" 5 "token [ +FORM \"bcd\" ]"))))')
    assert D.from_string(src).to_udf(indent=1) == (
        '(root\n'
        ' (1 some-thing 0.4 0 5\n'
        ' (2 a-lex 0.8 0 1\n'
        ' ("a"\n'
        ' 3 "token [ +FORM \\"a\\" ]"))\n'
        ' (4 bcd-lex 0.5 2 5\n'
        ' ("bcd"\n'
        ' 5 "token [ +FORM \\"bcd\\" ]"))))'
    )
    src = (r'(root (1 some-thing 0.4 0 5 (2 a-lex 0.8 0 1 '
           r'("a b" 3 "token [ +FORM \"a\" ]" 4 "token [ +FORM \"b\" ]"))))')
    assert D.from_string(src).to_udf(indent=1) == (
        '(root\n'
        ' (1 some-thing 0.4 0 5\n'
        ' (2 a-lex 0.8 0 1\n'
        ' ("a b"\n'
        ' 3 "token [ +FORM \\"a\\" ]"\n'
        ' 4 "token [ +FORM \\"b\\" ]"))))'
    )
    # UDX input serializes to the same plain-UDF output
    src = (r'(root (1 some-thing@some-type 0.4 0 5 (2 a-lex@a-type 0.8 0 1 '
           r'("a b" 3 "token [ +FORM \"a\" ]" 4 "token [ +FORM \"b\" ]"))))')
    assert D.from_string(src).to_udf(indent=1) == (
        '(root\n'
        ' (1 some-thing 0.4 0 5\n'
        ' (2 a-lex 0.8 0 1\n'
        ' ("a b"\n'
        ' 3 "token [ +FORM \\"a\\" ]"\n'
        ' 4 "token [ +FORM \\"b\\" ]"))))'
    )
def derivation(self):
    """
    Deserialize and return a Derivation object for UDF- or
    JSON-formatted derivation data; otherwise return the
    original value (including None when absent).
    """
    drv = self.get('derivation')
    if isinstance(drv, dict):
        return Derivation.from_dict(drv)
    if isinstance(drv, stringtypes):
        return Derivation.from_string(drv)
    return drv
def test_to_dict(self):
    """to_dict() emits node attributes, honoring a `fields` filter."""
    src = '(1 some-type -1 -1 -1 ("token"))'
    assert D.from_string(src).to_dict() == {
        'id': 1,
        'entity': 'some-type',
        'score': -1.0,
        'start': -1,
        'end': -1,
        'form': 'token',
    }
    fields = ('id', 'entity', 'score')
    # daughters and form are always shown
    assert D.from_string(src).to_dict(fields=fields) == {
        'id': 1,
        'entity': 'some-type',
        'score': -1.0,
        'form': 'token',
    }
    src = (r'(1 a-lex -1 -1 -1 ("a b" 2 "token [ +FORM \"a\" ]"'
           r' 3 "token [ +FORM \"b\" ]"))')
    assert D.from_string(src).to_dict() == {
        'id': 1,
        'entity': 'a-lex',
        'score': -1.0,
        'start': -1,
        'end': -1,
        'form': 'a b',
        'tokens': [
            {'id': 2, 'tfs': r'token [ +FORM \"a\" ]'},
            {'id': 3, 'tfs': r'token [ +FORM \"b\" ]'},
        ],
    }
    # the filter also drops the token details
    assert D.from_string(src).to_dict(fields=fields) == {
        'id': 1,
        'entity': 'a-lex',
        'score': -1.0,
        'form': 'a b',
    }
def parse_spans(span_lines, derivation_str):
    """Return (start, end) character spans for each terminal of the
    derivation, looked up by the ids of its first and last tokens."""
    pattern = re.compile(r"\((\d+), \d+, \d+, <(\d+):(\d+)>")
    spans_by_id = {}
    for line in span_lines.split("\n"):
        match = pattern.search(line)
        if match:
            tok_id, start, end = match.groups()
            spans_by_id[int(tok_id)] = (int(start), int(end))
    derivation = Derivation.from_string(derivation_str)  # type: Derivation
    return [
        (spans_by_id[term.tokens[0].id][0], spans_by_id[term.tokens[-1].id][1])
        for term in derivation.terminals()
    ]
def prof_entries(prof, typemap, lexmap, table='result',
                 cols=('derivation', 'mrs')):
    """Yield lexical-entry tuples derived from a [incr tsdb()] profile.

    For each derivation in *table*, every lexical entry found by
    ``_derivation_les`` is resolved to a type (via *lexmap* when the
    derivation carries none) and, if the type appears in *typemap* and the
    (type, orthography) pair has not been seen yet, a tuple
    ``(lename, supertype, orth, pred, None)`` is yielded.

    Args:
        prof: path to the profile directory.
        typemap: mapping of lexical type to a sequence of supertypes
            (only the first supertype is used).
        lexmap: fallback mapping of entity name to lexical type.
        table: profile table to read (default ``'result'``).
        cols: columns to select from *table*.
    """
    p = itsdb.ItsdbProfile(prof)
    seen = set()
    for derivation, mrs in p.select(table, cols):
        d = Derivation.from_string(derivation)
        for entity, typ, form in _derivation_les(d):
            if typ is None:
                typ = lexmap.get(entity)
            orth = ', '.join('"{}"'.format(part) for part in form)
            if (typ, orth) not in seen and typ in typemap:
                supertype = typemap[typ][0]  # more than 1?
                lename = '+'.join(form) + '-' + supertype
                pred = None
                # (previously also printed the tuple — removed debug output)
                yield (lename, supertype, orth, pred, None)
                seen.add((typ, orth))
def test_fromstring(self):
    """from_string() rejects malformed UDF and parses valid trees."""
    with pytest.raises(ValueError):
        D.from_string('')
    # root with no children
    with pytest.raises(ValueError):
        D.from_string('(some-root)')
    # does not start with `(` or end with `)`
    with pytest.raises(ValueError):
        D.from_string(' (1 some-type -1 -1 -1 ("token"))')
    with pytest.raises(ValueError):
        D.from_string(' (1 some-type -1 -1 -1 ("token")) ')
    # uneven parens
    with pytest.raises(ValueError):
        D.from_string('(1 some-type -1 -1 -1 ("token")')
    # ok
    tree = D.from_string('(1 some-type -1 -1 -1 ("token"))')
    assert tree.id == 1
    assert tree.entity == 'some-type'
    assert tree.score == -1.0
    assert tree.start == -1
    assert tree.end == -1
    assert tree.daughters == [T('token')]
    # newlines in tree
    tree = D.from_string('''(1 some-type -1 -1 -1
        ("token"))''')
    assert tree.id == 1
    assert tree.entity == 'some-type'
    assert tree.score == -1.0
    assert tree.start == -1
    assert tree.end == -1
    assert tree.daughters == [T('token')]
    # LKB-style terminals
    tree = D.from_string('''(1 some-type -1 -1 -1 ("to ken" 1 2))''')
    assert tree.id == 1
    assert tree.entity == 'some-type'
    assert tree.score == -1.0
    assert tree.start == -1
    assert tree.end == -1
    assert tree.daughters == [T('to ken')]  # start/end ignored
    # TFS-style terminals
    tree = D.from_string(r'''(1 some-type -1 -1 -1
        ("to ken" 2 "token [ +FORM \"to\" ]" 3 "token [ +FORM \"ken\" ]"))''')
    assert tree.id == 1
    assert tree.entity == 'some-type'
    assert tree.score == -1.0
    assert tree.start == -1
    assert tree.end == -1
    assert tree.daughters == [
        T('to ken', [Tk(2, r'token [ +FORM \"to\" ]'),
                     Tk(3, r'token [ +FORM \"ken\" ]')])
    ]
    # longer example
    tree = D.from_string(r'''(root (1 some-type 0.4 0 5
        (2 a-lex 0.8 0 1 ("a" 1 "token [ +FORM \"a\" ]"))
        (3 bcd-lex 0.5 2 5 ("bcd" 2 "token [ +FORM \"bcd\" ]"))) )''')
    assert tree.entity == 'root'
    assert len(tree.daughters) == 1
    top = tree.daughters[0]
    assert top.id == 1
    assert top.entity == 'some-type'
    assert top.score == 0.4
    assert top.start == 0
    assert top.end == 5
    assert len(top.daughters) == 2
    lex = top.daughters[0]
    assert lex.id == 2
    assert lex.entity == 'a-lex'
    assert lex.score == 0.8
    assert lex.start == 0
    assert lex.end == 1
    assert lex.daughters == [T('a', [Tk(1, r'token [ +FORM \"a\" ]')])]
    lex = top.daughters[1]
    assert lex.id == 3
    assert lex.entity == 'bcd-lex'
    assert lex.score == 0.5
    assert lex.start == 2
    assert lex.end == 5
    assert lex.daughters == [T('bcd', [Tk(2, r'token [ +FORM \"bcd\" ]')])]
def test_fromstring(self):
    """from_string() rejects malformed UDF; terminals parse as tuples."""
    with pytest.raises(ValueError):
        D.from_string('')
    # root with no children
    with pytest.raises(ValueError):
        D.from_string('(some-root)')
    # does not start with `(` or end with `)`
    with pytest.raises(ValueError):
        D.from_string(' (1 some-type -1 -1 -1 ("token"))')
    with pytest.raises(ValueError):
        D.from_string(' (1 some-type -1 -1 -1 ("token")) ')
    # uneven parens
    with pytest.raises(ValueError):
        D.from_string('(1 some-type -1 -1 -1 ("token")')
    # ok
    tree = D.from_string('(1 some-type -1 -1 -1 ("token"))')
    assert tree.id == 1
    assert tree.entity == 'some-type'
    assert tree.score == -1.0
    assert tree.start == -1
    assert tree.end == -1
    assert tree.daughters == [('"token"',)]
    # newlines in tree
    tree = D.from_string('''(1 some-type -1 -1 -1
        ("token"))''')
    assert tree.id == 1
    assert tree.entity == 'some-type'
    assert tree.score == -1.0
    assert tree.start == -1
    assert tree.end == -1
    assert tree.daughters == [('"token"',)]
    # longer example
    tree = D.from_string(r'''(root (1 some-type 0.4 0 5
        (2 a-lex 0.8 0 1 ("a" 1 "token [ +FORM \"a\" ]"))
        (3 bcd-lex 0.5 2 5 ("bcd" 2 "token [ +FORM \"bcd\" ]"))) )''')
    assert tree.entity == 'root'
    assert len(tree.daughters) == 1
    top = tree.daughters[0]
    assert top.id == 1
    assert top.entity == 'some-type'
    assert top.score == 0.4
    assert top.start == 0
    assert top.end == 5
    assert len(top.daughters) == 2
    lex = top.daughters[0]
    assert lex.id == 2
    assert lex.entity == 'a-lex'
    assert lex.score == 0.8
    assert lex.start == 0
    assert lex.end == 1
    assert lex.daughters == [('"a"', '1', r'"token [ +FORM \"a\" ]"')]
    lex = top.daughters[1]
    assert lex.id == 3
    assert lex.entity == 'bcd-lex'
    assert lex.score == 0.5
    assert lex.start == 2
    assert lex.end == 5
    assert lex.daughters == [('"bcd"', '2', r'"token [ +FORM \"bcd\" ]"')]
def test_fromstring(self):
    """from_string() rejects malformed UDF; terminals parse as tuples."""
    with pytest.raises(ValueError):
        D.from_string('')
    # root with no children
    with pytest.raises(ValueError):
        D.from_string('(some-root)')
    # does not start with `(` or end with `)`
    with pytest.raises(ValueError):
        D.from_string(' (1 some-type -1 -1 -1 ("token"))')
    with pytest.raises(ValueError):
        D.from_string(' (1 some-type -1 -1 -1 ("token")) ')
    # uneven parens
    with pytest.raises(ValueError):
        D.from_string('(1 some-type -1 -1 -1 ("token")')
    # ok
    node = D.from_string('(1 some-type -1 -1 -1 ("token"))')
    assert node.id == 1
    assert node.entity == 'some-type'
    assert node.score == -1.0
    assert node.start == -1
    assert node.end == -1
    assert node.daughters == [('"token"',)]
    # newlines in tree
    node = D.from_string('''(1 some-type -1 -1 -1
        ("token"))''')
    assert node.id == 1
    assert node.entity == 'some-type'
    assert node.score == -1.0
    assert node.start == -1
    assert node.end == -1
    assert node.daughters == [('"token"',)]
    # longer example
    node = D.from_string(r'''(root (1 some-type 0.4 0 5
        (2 a-lex 0.8 0 1 ("a" 1 "token [ +FORM \"a\" ]"))
        (3 bcd-lex 0.5 2 5 ("bcd" 2 "token [ +FORM \"bcd\" ]"))) )''')
    assert node.entity == 'root'
    assert len(node.daughters) == 1
    top = node.daughters[0]
    assert top.id == 1
    assert top.entity == 'some-type'
    assert top.score == 0.4
    assert top.start == 0
    assert top.end == 5
    assert len(top.daughters) == 2
    preterm = top.daughters[0]
    assert preterm.id == 2
    assert preterm.entity == 'a-lex'
    assert preterm.score == 0.8
    assert preterm.start == 0
    assert preterm.end == 1
    assert preterm.daughters == [('"a"', '1', r'"token [ +FORM \"a\" ]"')]
    preterm = top.daughters[1]
    assert preterm.id == 3
    assert preterm.entity == 'bcd-lex'
    assert preterm.score == 0.5
    assert preterm.start == 2
    assert preterm.end == 5
    assert preterm.daughters == [('"bcd"', '2', r'"token [ +FORM \"bcd\" ]"')]
def test_is_root(self):
    """Only the synthetic root node reports is_root() == True."""
    tree = D.from_string('(1 some-type -1 -1 -1 ("token"))')
    assert tree.is_root() == False
    tree = D.from_string('(root (1 some-type -1 -1 -1 ("token")))')
    assert tree.is_root() == True
    assert tree.daughters[0].is_root() == False
def test_is_root(self):
    """Only the synthetic root node reports is_root() == True."""
    node = D.from_string('(1 some-thing -1 -1 -1 ("token"))')
    assert node.is_root() == False
    node = D.from_string('(root (1 some-thing -1 -1 -1 ("token")))')
    assert node.is_root() == True
    assert node.daughters[0].is_root() == False
def test_fromstring(self):
    """from_string() rejects malformed UDF and parses valid trees."""
    with pytest.raises(ValueError):
        D.from_string('')
    # root with no children
    with pytest.raises(ValueError):
        D.from_string('(some-root)')
    # does not start with `(` or end with `)`
    with pytest.raises(ValueError):
        D.from_string(' (1 some-thing -1 -1 -1 ("token"))')
    with pytest.raises(ValueError):
        D.from_string(' (1 some-thing -1 -1 -1 ("token")) ')
    # uneven parens
    with pytest.raises(ValueError):
        D.from_string('(1 some-thing -1 -1 -1 ("token")')
    # ok
    tree = D.from_string('(1 some-thing -1 -1 -1 ("token"))')
    assert tree.id == 1
    assert tree.entity == 'some-thing'
    assert tree.score == -1.0
    assert tree.start == -1
    assert tree.end == -1
    assert tree.daughters == [T('token')]
    # newlines in tree
    tree = D.from_string('''(1 some-thing -1 -1 -1
        ("token"))''')
    assert tree.id == 1
    assert tree.entity == 'some-thing'
    assert tree.score == -1.0
    assert tree.start == -1
    assert tree.end == -1
    assert tree.daughters == [T('token')]
    # LKB-style terminals
    tree = D.from_string('''(1 some-thing -1 -1 -1 ("to ken" 1 2))''')
    assert tree.id == 1
    assert tree.entity == 'some-thing'
    assert tree.score == -1.0
    assert tree.start == -1
    assert tree.end == -1
    assert tree.daughters == [T('to ken')]  # start/end ignored
    # TFS-style terminals
    tree = D.from_string(r'''(1 some-thing -1 -1 -1
        ("to ken" 2 "token [ +FORM \"to\" ]" 3 "token [ +FORM \"ken\" ]"))''')
    assert tree.id == 1
    assert tree.entity == 'some-thing'
    assert tree.score == -1.0
    assert tree.start == -1
    assert tree.end == -1
    assert tree.daughters == [
        T('to ken', [Tk(2, r'token [ +FORM \"to\" ]'),
                     Tk(3, r'token [ +FORM \"ken\" ]')])
    ]
    # longer example
    tree = D.from_string(r'''(root (1 some-thing 0.4 0 5
        (2 a-lex 0.8 0 1 ("a" 1 "token [ +FORM \"a\" ]"))
        (3 bcd-lex 0.5 2 5 ("bcd" 2 "token [ +FORM \"bcd\" ]"))) )''')
    assert tree.entity == 'root'
    assert len(tree.daughters) == 1
    top = tree.daughters[0]
    assert top.id == 1
    assert top.entity == 'some-thing'
    assert top.score == 0.4
    assert top.start == 0
    assert top.end == 5
    assert len(top.daughters) == 2
    lex = top.daughters[0]
    assert lex.id == 2
    assert lex.entity == 'a-lex'
    assert lex.score == 0.8
    assert lex.start == 0
    assert lex.end == 1
    assert lex.daughters == [T('a', [Tk(1, r'token [ +FORM \"a\" ]')])]
    lex = top.daughters[1]
    assert lex.id == 3
    assert lex.entity == 'bcd-lex'
    assert lex.score == 0.5
    assert lex.start == 2
    assert lex.end == 5
    assert lex.daughters == [T('bcd', [Tk(2, r'token [ +FORM \"bcd\" ]')])]
def test_to_dict(self):
    """to_dict() includes UDX `type`/`head` keys and honors `fields`."""
    src = '(1 some-thing -1 -1 -1 ("token"))'
    assert D.from_string(src).to_dict() == {
        'id': 1,
        'entity': 'some-thing',
        'score': -1.0,
        'start': -1,
        'end': -1,
        'form': 'token',
    }
    fields = ('id', 'entity', 'score')
    # daughters and form are always shown
    assert D.from_string(src).to_dict(fields=fields) == {
        'id': 1,
        'entity': 'some-thing',
        'score': -1.0,
        'form': 'token',
    }
    src = (r'(root (0 top@top-rule -1 -1 -1'
           r' (1 a-lex@a-type -1 -1 -1 ("a b" 2 "token [ +FORM \"a\" ]"'
           r' 3 "token [ +FORM \"b\" ]"))'
           r' (4 ^c-lex@c-type -1 -1 -1 ("c" 5 "token [ +FORM \"c\" ]"))))')
    assert D.from_string(src).to_dict() == {
        'entity': 'root',
        'daughters': [
            {
                'id': 0,
                'entity': 'top',
                'type': 'top-rule',
                'score': -1.0,
                'start': -1,
                'end': -1,
                'daughters': [
                    {
                        'id': 1,
                        'entity': 'a-lex',
                        'type': 'a-type',
                        'score': -1.0,
                        'start': -1,
                        'end': -1,
                        'form': 'a b',
                        'tokens': [
                            {'id': 2, 'tfs': r'token [ +FORM \"a\" ]'},
                            {'id': 3, 'tfs': r'token [ +FORM \"b\" ]'},
                        ],
                    },
                    {
                        'id': 4,
                        'entity': 'c-lex',
                        'type': 'c-type',
                        'head': True,
                        'score': -1.0,
                        'start': -1,
                        'end': -1,
                        'form': 'c',
                        'tokens': [
                            {'id': 5, 'tfs': r'token [ +FORM \"c\" ]'},
                        ],
                    },
                ],
            },
        ],
    }
    # the filter drops type/head/span/token details as well
    assert D.from_string(src).to_dict(fields=fields) == {
        'entity': 'root',
        'daughters': [
            {
                'id': 0,
                'entity': 'top',
                'score': -1.0,
                'daughters': [
                    {
                        'id': 1,
                        'entity': 'a-lex',
                        'score': -1.0,
                        'form': 'a b',
                    },
                    {
                        'id': 4,
                        'entity': 'c-lex',
                        'score': -1.0,
                        'form': 'c',
                    },
                ],
            },
        ],
    }