def test_to_udf(self):
    """Serialize derivations back to UDF strings with to_udf()."""
    # indent=None round-trips the original single-line string
    s = '(1 some-thing -1 -1 -1 ("token"))'
    assert from_string(s).to_udf(indent=None) == s
    # indent=1 places terminals on their own lines
    assert from_string(s).to_udf(indent=1) == ('(1 some-thing -1 -1 -1\n'
                                               ' ("token"))')
    # nested derivation with token TFSs
    s = (r'(root (1 some-thing 0.4 0 5 (2 a-lex 0.8 0 1 '
         r'("a" 3 "token [ +FORM \"a\" ]")) '
         r'(4 bcd-lex 0.5 2 5 ("bcd" 5 "token [ +FORM \"bcd\" ]"))))')
    assert from_string(s).to_udf(
        indent=1) == ('(root\n'
                      ' (1 some-thing 0.4 0 5\n'
                      ' (2 a-lex 0.8 0 1\n'
                      ' ("a"\n'
                      ' 3 "token [ +FORM \\"a\\" ]"))\n'
                      ' (4 bcd-lex 0.5 2 5\n'
                      ' ("bcd"\n'
                      ' 5 "token [ +FORM \\"bcd\\" ]"))))')
    # multi-token terminal: "a b" covers two token TFSs
    s = (r'(root (1 some-thing 0.4 0 5 (2 a-lex 0.8 0 1 '
         r'("a b" 3 "token [ +FORM \"a\" ]" 4 "token [ +FORM \"b\" ]"))))')
    assert from_string(s).to_udf(
        indent=1) == ('(root\n'
                      ' (1 some-thing 0.4 0 5\n'
                      ' (2 a-lex 0.8 0 1\n'
                      ' ("a b"\n'
                      ' 3 "token [ +FORM \\"a\\" ]"\n'
                      ' 4 "token [ +FORM \\"b\\" ]"))))')
    # UDX input: to_udf() drops the @type annotations from entities
    s = (r'(root (1 some-thing@some-type 0.4 0 5 (2 a-lex@a-type 0.8 0 1 '
         r'("a b" 3 "token [ +FORM \"a\" ]" 4 "token [ +FORM \"b\" ]"))))')
    assert from_string(s).to_udf(
        indent=1) == ('(root\n'
                      ' (1 some-thing 0.4 0 5\n'
                      ' (2 a-lex 0.8 0 1\n'
                      ' ("a b"\n'
                      ' 3 "token [ +FORM \\"a\\" ]"\n'
                      ' 4 "token [ +FORM \\"b\\" ]"))))')
def test_str(self):
    """str() of a parsed derivation reproduces the original UDF string."""
    simple = '(1 some-thing -1 -1 -1 ("token"))'
    assert str(from_string(simple)) == simple
    nested = (r'(root (1 some-thing 0.4 0 5 (2 a-lex 0.8 0 1 '
              r'("a" 1 "token [ +FORM \"a\" ]")) '
              r'(3 bcd-lex 0.5 2 5 ("bcd" 2 "token [ +FORM \"bcd\" ]"))))')
    assert str(from_string(nested)) == nested
def test_internals(self):
    """internals() yields only non-terminal nodes; the root's id is None."""
    tree = from_string('(root (1 some-thing -1 -1 -1'
                       ' (2 a-thing -1 -1 -1 ("a"))'
                       ' (3 b-thing -1 -1 -1 ("b"))))')
    ids = [node.id for node in tree.internals()]
    assert ids == [None, 1]
    # UDX derivation with types, head marks, and token TFSs
    tree = from_string('(root'
                       ' (1 some-thing@some-type 0.4 0 5'
                       ' (2 a-lex@a-type 0.8 0 1'
                       ' ("a b"'
                       ' 3 "token [ +FORM \\"a\\" ]"'
                       ' 4 "token [ +FORM \\"b\\" ]"))'
                       ' (5 b-lex@b-type 0.9 1 2'
                       ' ("b"'
                       ' 6 "token [ +FORM \\"b\\" ]"))))')
    ids = [node.id for node in tree.internals()]
    assert ids == [None, 1]
def test_terminals(self):
    """terminals() yields the leaf nodes; each exposes its surface form."""
    tree = from_string('(root (1 some-thing -1 -1 -1'
                       ' (2 a-thing -1 -1 -1 ("a"))'
                       ' (3 b-thing -1 -1 -1 ("b"))))')
    forms = [leaf.form for leaf in tree.terminals()]
    assert forms == ['a', 'b']
    # a multi-token terminal keeps its full form ("a b")
    tree = from_string('(root'
                       ' (1 some-thing@some-type 0.4 0 5'
                       ' (2 a-lex@a-type 0.8 0 1'
                       ' ("a b"'
                       ' 3 "token [ +FORM \\"a\\" ]"'
                       ' 4 "token [ +FORM \\"b\\" ]"))'
                       ' (5 b-lex@b-type 0.9 1 2'
                       ' ("b"'
                       ' 6 "token [ +FORM \\"b\\" ]"))))')
    forms = [leaf.form for leaf in tree.terminals()]
    assert forms == ['a b', 'b']
def derivation(self):
    """
    Interpret and return a Derivation object.

    If :mod:`delphin.derivation` is available and the value of the
    `derivation` key in the result dictionary is a valid UDF string
    or a dictionary, return the interpreted Derivation object. If
    there is no 'derivation' key in the result, return `None`.

    Raises:
        InterfaceError: when the value is an unsupported type or
            :mod:`delphin.derivation` is unavailable
    """
    value = self.get('derivation')
    try:
        from delphin import derivation
        if isinstance(value, str):
            value = derivation.from_string(value)
        elif isinstance(value, dict):
            value = derivation.from_dict(value)
        elif value is not None:
            # anything other than str/dict/None is unsupported
            raise TypeError(value.__class__.__name__)
    except (ImportError, TypeError) as exc:
        raise InterfaceError('can not get Derivation object') from exc
    return value
def test_type(self):
    """type is None for plain UDF nodes and the @-suffix value in UDX."""
    tree = from_string('(root (1 some-thing -1 -1 -1'
                       ' (2 a-thing -1 -1 -1 ("a"))'
                       ' (3 b-thing -1 -1 -1 ("b"))))')
    assert tree.type is None
    parent = tree.daughters[0]
    assert parent.type is None
    assert parent.daughters[0].type is None
    assert parent.daughters[1].type is None
    # UDX: every entity carries an @type except the root
    tree = from_string('(root (1 some-thing@some-type -1 -1 -1'
                       ' (2 a-thing@a-type -1 -1 -1 ("a"))'
                       ' (3 b-thing@b-type -1 -1 -1 ("b"))))')
    assert tree.type is None
    parent = tree.daughters[0]
    assert parent.type == 'some-type'
    assert parent.daughters[0].type == 'a-type'
    assert parent.daughters[1].type == 'b-type'
def test_entity(self):
    """entity is the rule/lexeme name, with any @type suffix stripped."""
    tree = from_string('(root (1 some-thing -1 -1 -1'
                       ' (2 a-thing -1 -1 -1 ("a"))'
                       ' (3 b-thing -1 -1 -1 ("b"))))')
    assert tree.entity == 'root'
    parent = tree.daughters[0]
    assert parent.entity == 'some-thing'
    assert parent.daughters[0].entity == 'a-thing'
    assert parent.daughters[1].entity == 'b-thing'
    # UDX @types do not leak into the entity name
    tree = from_string('(root (1 some-thing@some-type -1 -1 -1'
                       ' (2 a-thing@a-type -1 -1 -1 ("a"))'
                       ' (3 b-thing@b-type -1 -1 -1 ("b"))))')
    assert tree.entity == 'root'
    parent = tree.daughters[0]
    assert parent.entity == 'some-thing'
    assert parent.daughters[0].entity == 'a-thing'
    assert parent.daughters[1].entity == 'b-thing'
def test_to_udx(self):
    """Serialize derivations to UDX (preserving @type annotations)."""
    # a plain UDF derivation serializes unchanged
    s = '(1 some-thing -1 -1 -1 ("token"))'
    assert from_string(s).to_udx(indent=None) == s
    # UDX input keeps its @types when pretty-printed
    s = (r'(root (1 some-thing@some-type 0.4 0 5 '
         r'(2 a-lex@a-type 0.8 0 1 '
         r'("a b" 3 "token [ +FORM \"a\" ]" 4 "token [ +FORM \"b\" ]")) '
         r'(5 b-lex@b-type 0.9 1 2 '
         r'("b" 6 "token [ +FORM \"b\" ]"))))')
    assert from_string(s).to_udx(
        indent=1) == ('(root\n'
                      ' (1 some-thing@some-type 0.4 0 5\n'
                      ' (2 a-lex@a-type 0.8 0 1\n'
                      ' ("a b"\n'
                      ' 3 "token [ +FORM \\"a\\" ]"\n'
                      ' 4 "token [ +FORM \\"b\\" ]"))\n'
                      ' (5 b-lex@b-type 0.9 1 2\n'
                      ' ("b"\n'
                      ' 6 "token [ +FORM \\"b\\" ]"))))')
def test_is_head(self):
    """Head status: None when undecidable, boolean once a sibling is marked."""
    # NOTE: is_head() is undefined for nodes with multiple
    # siblings, none of which are marked head (e.g. in plain UDF)
    tree = from_string('(root (1 some-thing -1 -1 -1'
                       ' (2 some-thing -1 -1 -1 ("a"))'
                       ' (3 some-thing -1 -1 -1 ("b"))))')
    assert tree.is_head()
    parent = tree.daughters[0]
    assert parent.is_head()
    assert parent.daughters[0].is_head() is None
    assert parent.daughters[1].is_head() is None
    # if one sibling is marked, all become decidable
    tree = from_string('(root (1 some-thing -1 -1 -1'
                       ' (2 some-thing -1 -1 -1 ("a"))'
                       ' (3 ^some-thing -1 -1 -1 ("b"))))')
    assert tree.is_head()
    parent = tree.daughters[0]
    assert parent.is_head()
    assert not parent.daughters[0].is_head()
    assert parent.daughters[1].is_head()
def test_from_dict(self):
    """Build derivations from dicts; results equal the parsed UDF/UDX."""
    # minimal case: root over one preterminal with a bare form
    s = '(root (1 some-thing -1 -1 -1 ("a")))'
    d = {
        'entity': 'root',
        'daughters': [{
            'id': 1,
            'entity': 'some-thing',
            'form': 'a'
        }]
    }
    assert from_dict(d) == from_string(s)
    # UDX case: ^head marking, @type, and a token TFS list
    s = (r'(root (1 ^some-thing@some-type -1 -1 -1 ("a b"'
         r' 2 "token [ +FORM \"a\" ]"'
         r' 3 "token [ +FORM \"b\" ]")))')
    d = {
        'entity': 'root',
        'daughters': [{
            'id': 1,
            'entity': 'some-thing',
            'type': 'some-type',
            'head': True,
            'form': 'a b',
            'tokens': [{
                'id': 2,
                'tfs': r'token [ +FORM \"a\" ]'
            }, {
                'id': 3,
                'tfs': r'token [ +FORM \"b\" ]'
            }]
        }]
    }
    assert from_dict(d) == from_string(s)
def _transform_derivation(s):
    """Parse a UDF/UDX derivation string and return its dict form."""
    tree = derivation.from_string(s)
    return tree.to_dict()
def test_fromstring(self):
    """Parse UDF strings with from_string(), including error cases."""
    # empty input is a syntax error
    with pytest.raises(DerivationSyntaxError):
        from_string('')
    # root with no children
    # TODO: this should be a DerivationSyntaxError but the current
    # UDF parser doesn't make that straightforward. Revisit this
    # when/if the UDF parsing changes
    with pytest.raises(ValueError):
        from_string('(some-root)')
    # does not start with `(` or end with `)`
    with pytest.raises(DerivationSyntaxError):
        from_string(' (1 some-thing -1 -1 -1 ("token"))')
    with pytest.raises(DerivationSyntaxError):
        from_string(' (1 some-thing -1 -1 -1 ("token")) ')
    # uneven parens
    with pytest.raises(DerivationSyntaxError):
        from_string('(1 some-thing -1 -1 -1 ("token")')
    # ok
    t = from_string('(1 some-thing -1 -1 -1 ("token"))')
    assert t.id == 1
    assert t.entity == 'some-thing'
    assert t.score == -1.0
    assert t.start == -1
    assert t.end == -1
    assert t.daughters == [T('token')]
    # newlines in tree
    t = from_string('''(1 some-thing -1 -1 -1 ("token"))''')
    assert t.id == 1
    assert t.entity == 'some-thing'
    assert t.score == -1.0
    assert t.start == -1
    assert t.end == -1
    assert t.daughters == [T('token')]
    # LKB-style terminals
    t = from_string('''(1 some-thing -1 -1 -1 ("to ken" 1 2))''')
    assert t.id == 1
    assert t.entity == 'some-thing'
    assert t.score == -1.0
    assert t.start == -1
    assert t.end == -1
    assert t.daughters == [T('to ken')]  # start/end ignored
    # TFS-style terminals
    t = from_string(r'''(1 some-thing -1 -1 -1 ("to ken" 2 "token [ +FORM \"to\" ]" 3 "token [ +FORM \"ken\" ]"))''')
    assert t.id == 1
    assert t.entity == 'some-thing'
    assert t.score == -1.0
    assert t.start == -1
    assert t.end == -1
    assert t.daughters == [
        T('to ken', [
            Tk(2, r'token [ +FORM \"to\" ]'),
            Tk(3, r'token [ +FORM \"ken\" ]')
        ])
    ]
    # longer example
    t = from_string(r'''(root (1 some-thing 0.4 0 5 (2 a-lex 0.8 0 1 ("a" 1 "token [ +FORM \"a\" ]")) (3 bcd-lex 0.5 2 5 ("bcd" 2 "token [ +FORM \"bcd\" ]"))) )''')
    assert t.entity == 'root'
    assert len(t.daughters) == 1
    top = t.daughters[0]
    assert top.id == 1
    assert top.entity == 'some-thing'
    assert top.score == 0.4
    assert top.start == 0
    assert top.end == 5
    assert len(top.daughters) == 2
    lex = top.daughters[0]
    assert lex.id == 2
    assert lex.entity == 'a-lex'
    assert lex.score == 0.8
    assert lex.start == 0
    assert lex.end == 1
    assert lex.daughters == [T('a', [Tk(1, r'token [ +FORM \"a\" ]')])]
    lex = top.daughters[1]
    assert lex.id == 3
    assert lex.entity == 'bcd-lex'
    assert lex.score == 0.5
    assert lex.start == 2
    assert lex.end == 5
    assert lex.daughters == [T('bcd', [Tk(2, r'token [ +FORM \"bcd\" ]')])]
def test_to_dict(self):
    """Export derivations as dicts, optionally limited to given fields."""
    s = '(1 some-thing -1 -1 -1 ("token"))'
    assert from_string(s).to_dict() == {
        'id': 1,
        'entity': 'some-thing',
        'score': -1.0,
        'start': -1,
        'end': -1,
        'form': 'token'
    }
    fields = ('id', 'entity', 'score')
    # daughters and form are always shown
    assert from_string(s).to_dict(fields=fields) == {
        'id': 1,
        'entity': 'some-thing',
        'score': -1.0,
        'form': 'token'
    }
    # UDX derivation: type, head, and token TFSs appear in the dict
    s = (r'(root (0 top@top-rule -1 -1 -1'
         r' (1 a-lex@a-type -1 -1 -1 ("a b" 2 "token [ +FORM \"a\" ]"'
         r' 3 "token [ +FORM \"b\" ]"))'
         r' (4 ^c-lex@c-type -1 -1 -1 ("c" 5 "token [ +FORM \"c\" ]"))))')
    assert from_string(s).to_dict() == {
        'entity': 'root',
        'daughters': [{
            'id': 0,
            'entity': 'top',
            'type': 'top-rule',
            'score': -1.0,
            'start': -1,
            'end': -1,
            'daughters': [{
                'id': 1,
                'entity': 'a-lex',
                'type': 'a-type',
                'score': -1.0,
                'start': -1,
                'end': -1,
                'form': 'a b',
                'tokens': [{
                    'id': 2,
                    'tfs': r'token [ +FORM \"a\" ]'
                }, {
                    'id': 3,
                    'tfs': r'token [ +FORM \"b\" ]'
                }]
            }, {
                'id': 4,
                'entity': 'c-lex',
                'type': 'c-type',
                'head': True,
                'score': -1.0,
                'start': -1,
                'end': -1,
                'form': 'c',
                'tokens': [{
                    'id': 5,
                    'tfs': r'token [ +FORM \"c\" ]'
                }]
            }]
        }]
    }
    # fields= filters node attributes but keeps the tree structure
    assert from_string(s).to_dict(fields=fields) == {
        'entity': 'root',
        'daughters': [{
            'id': 0,
            'entity': 'top',
            'score': -1.0,
            'daughters': [{
                'id': 1,
                'entity': 'a-lex',
                'score': -1.0,
                'form': 'a b'
            }, {
                'id': 4,
                'entity': 'c-lex',
                'score': -1.0,
                'form': 'c'
            }]
        }]
    }
def test_is_root(self):
    """Only a root node (entity-only, wrapping the tree) is a root."""
    bare = from_string('(1 some-thing -1 -1 -1 ("token"))')
    assert not bare.is_root()
    rooted = from_string('(root (1 some-thing -1 -1 -1 ("token")))')
    assert rooted.is_root()
    assert not rooted.daughters[0].is_root()
def test_eq(self): a = from_string('(1 some-thing -1 -1 -1 ("token"))') # identity b = from_string('(1 some-thing -1 -1 -1 ("token"))') assert a == b # ids and scores don't matter b = from_string('(100 some-thing 0.114 -1 -1 ("token"))') assert a == b # tokens matter b = from_string('(1 some-thing -1 -1 -1 ("nekot"))') assert a != b # and type of rhs assert a != '(1 some-thing -1 -1 -1 ("token"))' # and tokenization b = from_string('(1 some-thing -1 2 7 ("token"))') assert a != b # and of course entities b = from_string('(1 epyt-emos -1 -1 -1 ("token"))') assert a != b # and number of children a = from_string('(1 x -1 -1 -1 (2 y -1 -1 -1 ("y")))') b = from_string( '(1 x -1 -1 -1 (2 y -1 -1 -1 ("y")) (3 z -1 -1 -1 ("z")))') assert a != b # and order of children a = from_string( '(1 x -1 -1 -1 (2 y -1 -1 -1 ("y")) (3 z -1 -1 -1 ("z")))') b = from_string( '(1 x -1 -1 -1 (3 z -1 -1 -1 ("z")) (2 y -1 -1 -1 ("y")))') assert a != b # and UDX properties when specified a = from_string( '(1 x -1 -1 -1 (2 ^y -1 -1 -1 ("y")) (3 z -1 -1 -1 ("z")))') b = from_string( '(1 x -1 -1 -1 (2 ^y -1 -1 -1 ("y")) (3 z -1 -1 -1 ("z")))') assert a == b b = from_string( '(1 x -1 -1 -1 (2 y -1 -1 -1 ("y")) (3 ^z -1 -1 -1 ("z")))') assert a != b b = from_string( '(1 x -1 -1 -1 (2 y -1 -1 -1 ("y")) (3 z -1 -1 -1 ("z")))') assert a != b a = from_string('(1 some-thing@some-type -1 -1 -1 ("token"))') b = from_string('(1 some-thing@some-type -1 -1 -1 ("token"))') assert a == b b = from_string('(1 some-thing@another-type -1 -1 -1 ("token"))') assert a != b b = from_string('(1 some-thing -1 -1 -1 ("token"))') assert a != b
def read_profile(input_dir, output_dir, profile_name, mrp_eds, lexicon, args):
    """Read a [incr tsdb()] profile and extract syntax/semantics outputs.

    For each parsed item in the profile, builds a SemanticRepresentation
    from the derivation tree and token lattice, optionally maps in a DMRS
    (from MRP EDS data when `mrp_eds` is given, else from the stored MRS),
    and writes derivation-tree, supertag, and DMRS-JSON files next to
    `output_dir` (used as a path prefix, not a directory).

    Args:
        input_dir: path to the [incr tsdb()] test suite.
        output_dir: output path prefix for ".tree"/".tags"/".dmrs" files.
        profile_name: profile label used to qualify item ids.
        mrp_eds: mapping of item id -> EDS string, or falsy to use MRS.
        lexicon: lexicon passed through to SemanticRepresentation.
        args: options object with convert_semantics / extract_syntax /
            extract_semantics flags.
    """
    ts = d_itsdb.TestSuite(input_dir)
    derivation_strs = []
    supertag_strs = []
    dmrs_json_strs = []
    for iid, sentence, parse_tokens, result_derivation, result_mrs in d_tsql.select('i-id i-input p-tokens derivation mrs', ts):
        tokens_rep = d_tokens.YYTokenLattice.from_string(parse_tokens)
        token_dict = {tok.id : tok for tok in tokens_rep.tokens}
        derivation_rep = d_derivation.from_string(result_derivation)
        # the UDF root wraps exactly one tree; work with that tree directly
        assert len(derivation_rep.daughters) == 1
        derivation_rep = derivation_rep.daughters[0]
        if mrp_eds:
            # semantics come from externally supplied MRP EDS data
            if iid in mrp_eds:
                try:
                    eds_rep = dc_eds.decode(mrp_eds[iid])
                    dmrs_rep = eds_to_dmrs(eds_rep)
                except d_eds._exceptions.EDSSyntaxError:
                    #print("Skipping: EDS syntax error", mrp_eds[iid])
                    continue
            else:
                #print("Unmatched:", iid)
                continue
        else:
            # semantics come from the profile's stored MRS
            try:
                mrs_rep = dc_simplemrs.decode(result_mrs)
            except d_mrs._exceptions.MRSSyntaxError:
                #print("Skipping: MRS syntax error", result_mrs)
                continue
            dmrs_rep = d_dmrs.from_mrs(mrs_rep)
        mr = semantics.SemanticRepresentation(profile_name + ":" + iid, sentence, token_dict, derivation_rep, lexicon)  # read derivation tree
        if args.convert_semantics:
            mr.map_dmrs(dmrs_rep)
            mr.process_semantic_tree(mr.root_node_id, dmrs_rep)
        # NOTE(review): nesting reconstructed from flattened source — this
        # prints for every item, not only when convert_semantics is set;
        # confirm against the original script
        mr.print_mrs()
        if args.extract_syntax:
            derivation_strs.append(mr.derivation_tree_str(mr.root_node_id, newline=False).lstrip())
            supertag_strs.append(mr.supertag_str(mr.root_node_id).strip())
        if args.extract_semantics:
            dmrs_json_strs.append(mr.dmrs_json_str(dmrs_rep))
    if args.extract_syntax:
        with open(output_dir + ".tree", 'w') as dt_out:
            for s in derivation_strs:
                dt_out.write(s + "\n")
        with open(output_dir + ".tags", 'w') as st_out:
            for s in supertag_strs:
                st_out.write(s + "\n")
    if args.extract_semantics:
        with open(output_dir + ".dmrs", 'w') as d_out:
            for s in dmrs_json_strs:
                # skip items whose DMRS serialization came back empty
                if s != "":
                    d_out.write(s + "\n")