def _read_ace_parse(s):
    """
    Yield the MRS objects found in the ACE output *s*.

    *s* may be text or an object with a ``decode`` method (e.g. bytes),
    in which case it is decoded as UTF-8 first. Lines are handled by
    their prefix:

    * ``SENT: `` -- remember the rest of the line as the surface string
    * ``[`` -- regular ACE output; the text before the first ``' ; '``
      is read as a SimpleMRS and tagged with the remembered surface
    * ``(`` -- S-expression output (``--tsdb-stdout``); every ``:mrs``
      value inside a ``:results`` pair is read as a SimpleMRS
    * empty line -- every second empty line clears the remembered
      surface string

    Any other line is ignored.
    """
    from delphin.mrs import simplemrs
    if hasattr(s, 'decode'):
        s = s.decode('utf-8')
    surface = None
    newline = False
    for line in s.splitlines():
        if line.startswith('SENT: '):
            surface = line[6:]
        # regular ACE output
        elif line.startswith('['):
            m = line.partition(' ; ')[0].strip()
            m = simplemrs.loads(m, single=True)
            m.surface = surface
            yield m
        # with --tsdb-stdout
        elif line.startswith('('):
            while line:
                expr = SExpr.parse(line)
                line = expr.remainder.lstrip()
                if len(expr.data) == 2 and expr.data[0] == ':results':
                    for result in expr.data[1]:
                        for key, val in result:
                            if key == ':mrs':
                                yield simplemrs.loads(val, single=True)
        # splitlines() removes the line terminators, so a blank line
        # shows up as an empty string and never as '\n'
        elif line == '':
            if newline:
                surface = None
                newline = False
            else:
                newline = True
def _tsdb_response(response, line): while line: expr = SExpr.parse(line) line = expr.remainder if len(expr.data) != 2: logging.error('Malformed output from ACE: {}'.format(line)) break key, val = expr.data if key == ':p-input': response.setdefault('tokens', {})['initial'] = val.strip() elif key == ':p-tokens': response.setdefault('tokens', {})['internal'] = val.strip() elif key == ':results': for result in val: res = {} for reskey, resval in result: if reskey == ':derivation': res['DERIV'] = resval.strip() elif reskey == ':mrs': res['MRS'] = resval.strip() elif reskey == ':surface': res['SENT'] = resval.strip() elif isinstance(resval, stringtypes): res[reskey[1:]] = resval.strip() else: res[reskey[1:]] = resval response['RESULTS'].append(res) elif isinstance(val, stringtypes): response[key[1:]] = val.strip() else: response[key[1:]] = val return response
def _decode(lines): surface = None newline = False for line in lines: if line.startswith('SENT: '): surface = line[6:].rstrip() # regular ACE output elif line.startswith('['): m = line.partition(' ; ')[0].strip() m = simplemrs.decode(m) m.surface = surface yield m # with --tsdb-stdout elif line.startswith('('): while line: data, remainder = SExpr.parse(line) line = remainder.lstrip() if len(data) == 2 and data[0] == ':results': for result in data[1]: for key, val in result: if key == ':mrs': yield simplemrs.decode(val) elif line == '\n': if newline: surface = None newline = False else: newline = True else: pass
def _sexpr_data(line): while line: expr = SExpr.parse(line) if len(expr.data) != 2: logging.error('Malformed output from ACE: {}'.format(line)) break line = expr.remainder.lstrip() yield expr.data
def map(self, response):
    """
    Process *response* and return a list of (table, rowdata) tuples.

    The first tuple is always the ``'parse'`` row; it is followed by
    one ``'result'`` row per entry of ``response['results']``. As side
    effects the parse-id counter advances, missing ``ninputs``,
    ``ntokens`` and ``readings`` values are filled in on *response*,
    and any ``'run'`` data is recorded in ``self._runs``.
    """
    # custom remapping, cleanup, and filling in holes
    item_id = response.get('keys', {}).get('i-id', -1)
    self._parse_id = max(self._parse_id + 1, item_id)
    parse_row = {
        'i-id': item_id,
        'parse-id': self._parse_id,
        'run-id': response.get('run', {}).get('run-id', -1),
    }

    if 'tokens' in response:
        parse_row['p-input'] = response['tokens'].get('initial')
        parse_row['p-tokens'] = response['tokens'].get('internal')
        # derive missing token counts from the token lattices
        for count_key, layer in (('ninputs', 'initial'),
                                 ('ntokens', 'internal')):
            if count_key in response:
                continue
            lattice = response.tokens(layer)
            if lattice is not None:
                response[count_key] = len(lattice.tokens)

    if 'readings' not in response and 'results' in response:
        response['readings'] = len(response['results'])

    # basic mapping
    parse_row.update(
        (key, response[key]) for key in self._parse_keys if key in response
    )
    inserts = [('parse', parse_row)]

    for result in response.get('results', []):
        result_row = {'parse-id': self._parse_id}
        if 'flags' in result:
            result_row['flags'] = SExpr.format(result['flags'])
        result_row.update(
            (key, result[key]) for key in self._result_keys if key in result
        )
        inserts.append(('result', result_row))

    if 'run' in response:
        run = response['run']
        run_id = run.get('run-id', -1)
        previous_id = self._last_run_id
        # check if last run was not closed properly
        if run_id not in self._runs and previous_id in self._runs:
            previous_run = self._runs[previous_id]
            if 'end' not in previous_run:
                previous_run['end'] = datetime.now()
        self._runs[run_id] = run
        self._last_run_id = run_id

    return inserts
def _tsdb_stdout(stdout):
    """
    Read one response from *stdout* and return it as an _AceResponse.

    Leading lines that start with ``NOTE:``, ``WARNING`` or ``ERROR``
    are split at the first ``': '`` and their messages collected under
    ``NOTES``, ``WARNINGS`` or ``ERRORS``. The first line after those
    is parsed as a series of two-item S-expressions:

    * ``:p-input`` and ``:p-tokens`` are stored (stripped) under
      ``response['tokens']['initial']`` and ``['internal']``
    * ``:results`` appends one dict per result to
      ``response['RESULTS']``; ``:derivation``, ``:mrs`` and
      ``:surface`` become ``DERIV``, ``MRS`` and ``SENT``, other keys
      lose their first character
    * any other key is stored on the response without its first
      character

    An expression that does not hold exactly two items is logged as an
    error and ends processing; whatever was collected up to that point
    is still returned.

    Args:
        stdout: object with a ``readline()`` method returning text
    Returns:
        The populated _AceResponse.
    """
    import logging

    response = _AceResponse({
        'INPUT': None,
        'NOTES': [],
        'WARNINGS': [],
        'ERRORS': [],
        'SENT': None,
        'RESULTS': []
    })
    line = stdout.readline().rstrip()
    while line.startswith(('NOTE:', 'WARNING', 'ERROR')):
        level, message = line.split(': ', 1)
        response['%sS' % level].append(message)
        line = stdout.readline().rstrip()
    while line:
        expr = SExpr.parse(line)
        # validate explicitly: an assert would vanish under `python -O`
        if len(expr.data) != 2:
            logging.error('Malformed output from ACE: {}'.format(line))
            break
        # drop separating whitespace so a whitespace-only tail ends the loop
        line = expr.remainder.lstrip()
        key, val = expr.data
        if key == ':p-input':
            response.setdefault('tokens', {})['initial'] = val.strip()
        elif key == ':p-tokens':
            response.setdefault('tokens', {})['internal'] = val.strip()
        elif key == ':results':
            for result in val:
                res = {}
                for reskey, resval in result:
                    if reskey == ':derivation':
                        res['DERIV'] = resval.strip()
                    elif reskey == ':mrs':
                        res['MRS'] = resval.strip()
                    elif reskey == ':surface':
                        res['SENT'] = resval.strip()
                    elif isinstance(resval, stringtypes):
                        res[reskey[1:]] = resval.strip()
                    else:
                        res[reskey[1:]] = resval
                response['RESULTS'].append(res)
        elif isinstance(val, stringtypes):
            response[key[1:]] = val.strip()
        else:
            response[key[1:]] = val
    return response
def tree(self):
    """
    Deserialize and return a labeled syntax tree.

    The tree data may be a standalone datum, or embedded in the
    derivation. A string ``'tree'`` value is parsed as an S-expression;
    any other non-None value is returned unchanged. Without a
    ``'tree'`` value, the tree is built from the ``'derivation'`` dict
    if that has a ``'label'``; otherwise the result is None.
    """
    def _subtree(node):
        # [label, [form]] for token-bearing nodes,
        # [label, subtree, ...] for the rest
        label = node.get('label', '')
        if 'tokens' in node:
            return [label, [node.get('form', '')]]
        children = [_subtree(dtr) for dtr in node.get('daughters', [])]
        return [label] + children

    tree = self.get('tree')
    if isinstance(tree, stringtypes):
        return SExpr.parse(tree).data
    if tree is not None:
        return tree

    drv = self.get('derivation')
    if isinstance(drv, dict) and 'label' in drv:
        return _subtree(drv)
    return None
def test_SExpr():
    """
    Check the data SExpr.parse() produces for parenthesized input:
    lists, dotted pairs, nesting, quoted strings, escapes and mixed
    whitespace.
    """
    # Atoms outside of parens ('a', '1', '1.0', '"a"') are not checked;
    # those assertions were left disabled.
    cases = [
        ('()', []),
        ('(a)', ['a']),
        ('(1)', [1]),
        ('(1.0)', [1.0]),
        ('("a")', ['a']),  # same as symbol?
        ('( a . b )', ('a', 'b')),
        ('( :a (b) )', [':a', ['b']]),
        ('(a-a (b 1 2))', ['a-a', ['b', 1, 2]]),
        ('("(a b)")', ['(a b)']),
        ('(a\\ b c)', ['a b', 'c']),
        ('(\\(a\\) \\[a\\] \\{a\\} \\; \\\\)',
         ['(a)', '[a]', '{a}', ';', '\\']),
        ('(:key . "\\"\\\\\\"a\\\\\\"\\"")', (":key", '"\\"a\\""')),
        ('("\\"a\\"" \\" "\\(\\)\\;\\[\\]")',
         ['"a"', '"', '\\(\\)\\;\\[\\]']),
        # other kinds of whitespace
        ('(\ta\n.\n\n b)', ('a', 'b')),
    ]
    for text, expected in cases:
        assert SExpr.parse(text).data == expected
def test_SExpr_format():
    """
    Check the strings SExpr.format() produces for lists (plain
    S-expressions) and 2-tuples (dotted pairs).
    """
    cases = [
        ([], '()'),
        ([1], '(1)'),
        ([1.0], '(1.0)'),
        ((1, 2), '(1 . 2)'),
        (['a-a', ('b', 'c')], '(a-a (b . c))'),
    ]
    for data, expected in cases:
        assert SExpr.format(data) == expected
def test_SExpr():
    """
    Check the data SExpr.parse() produces for parenthesized input:
    lists, dotted pairs, nesting, quoted strings and escapes.
    """
    # Atoms outside of parens ('a', '1', '1.0', '"a"') are not checked;
    # those assertions were left disabled.
    cases = [
        ('()', []),
        ('(a)', ['a']),
        ('(1)', [1]),
        ('(1.0)', [1.0]),
        ('("a")', ['a']),  # same as symbol?
        ('( a . b )', ('a', 'b')),
        ('( :a (b) )', [':a', ['b']]),
        ('(a-a (b 1 2))', ['a-a', ['b', 1, 2]]),
        ('("(a b)")', ['(a b)']),
        ('(a\\ b c)', ['a b', 'c']),
        ('(\\(a\\) \\[a\\] \\{a\\} \\; \\\\)',
         ['(a)', '[a]', '{a}', ';', '\\']),
        ('(:key . "\\"\\\\\\"a\\\\\\"\\"")', (":key", '"\\"a\\""')),
        ('("\\"a\\"" \\" "\\(\\)\\;\\[\\]")',
         ['"a"', '"', '\\(\\)\\;\\[\\]']),
    ]
    for text, expected in cases:
        assert SExpr.parse(text).data == expected