示例#1
0
def _read_ace_parse(s):
    """
    Yield the MRS objects found in ACE output *s*.

    *s* may be text or any object with a ``decode`` method (such as
    bytes), in which case it is decoded as UTF-8 before processing.
    The output is handled line by line:

    * ``SENT: ...`` lines set the surface string attached to the MRSs
      that follow.
    * Lines starting with ``[`` (regular ACE output) are cut at the
      ' ;  ' separator; the part before it is read with
      ``simplemrs.loads`` and yielded with its ``surface`` set.
    * Lines starting with ``(`` (ACE run with --tsdb-stdout) are read
      as a sequence of S-expressions; every ':mrs' value found under a
      ':results' pair is read with ``simplemrs.loads`` and yielded
      (no surface is attached in this mode).
    * Every second blank line clears the remembered surface string.

    All other lines are ignored.
    """
    from delphin.mrs import simplemrs
    if hasattr(s, 'decode'):
        s = s.decode('utf-8')
    surface = None
    newline = False
    for line in s.splitlines():
        if line.startswith('SENT: '):
            surface = line[6:]
        # regular ACE output
        elif line.startswith('['):
            m = line.partition(' ;  ')[0].strip()
            m = simplemrs.loads(m, single=True)
            m.surface = surface
            yield m
        # with --tsdb-stdout
        elif line.startswith('('):
            while line:
                expr = SExpr.parse(line)
                line = expr.remainder.lstrip()
                if len(expr.data) == 2 and expr.data[0] == ':results':
                    for result in expr.data[1]:
                        for key, val in result:
                            if key == ':mrs':
                                yield simplemrs.loads(val, single=True)
        # splitlines() strips line terminators, so a blank line shows up
        # here as the empty string and never as '\n'
        elif line == '':
            if newline:
                surface = None
                newline = False
            else:
                newline = True
示例#2
0
def _tsdb_response(response, line):
    """
    Fill *response* from the S-expressions in *line* and return it.

    *line* is consumed one S-expression at a time with ``SExpr.parse``.
    Each expression must hold exactly two items, a key and a value;
    otherwise an error is logged and processing stops, leaving
    *response* with whatever was stored so far.

    * ':p-input' and ':p-tokens' values are stripped and stored as
      ``response['tokens']['initial']`` and
      ``response['tokens']['internal']``.
    * ':results' values are iterated; each result becomes a dict that
      is appended to ``response['RESULTS']`` (which must already
      exist). Within a result, ':derivation', ':mrs' and ':surface'
      are stored as 'DERIV', 'MRS' and 'SENT'; any other key is stored
      without its first character.
    * Any other key is stored on *response* without its first
      character.

    String values are stripped of surrounding whitespace; other values
    are stored unchanged.
    """
    while line:
        expr = SExpr.parse(line)
        if len(expr.data) != 2:
            # log before advancing so the message shows the text that
            # failed, not whatever follows it
            logging.error('Malformed output from ACE: {}'.format(line))
            break
        line = expr.remainder
        key, val = expr.data
        if key == ':p-input':
            response.setdefault('tokens', {})['initial'] = val.strip()
        elif key == ':p-tokens':
            response.setdefault('tokens', {})['internal'] = val.strip()
        elif key == ':results':
            for result in val:
                res = {}
                for reskey, resval in result:
                    if reskey == ':derivation':
                        res['DERIV'] = resval.strip()
                    elif reskey == ':mrs':
                        res['MRS'] = resval.strip()
                    elif reskey == ':surface':
                        res['SENT'] = resval.strip()
                    elif isinstance(resval, stringtypes):
                        res[reskey[1:]] = resval.strip()
                    else:
                        res[reskey[1:]] = resval
                response['RESULTS'].append(res)
        elif isinstance(val, stringtypes):
            response[key[1:]] = val.strip()
        else:
            response[key[1:]] = val
    return response
示例#3
0
def _decode(lines):
    surface = None
    newline = False
    for line in lines:
        if line.startswith('SENT: '):
            surface = line[6:].rstrip()
        # regular ACE output
        elif line.startswith('['):
            m = line.partition(' ;  ')[0].strip()
            m = simplemrs.decode(m)
            m.surface = surface
            yield m
        # with --tsdb-stdout
        elif line.startswith('('):
            while line:
                data, remainder = SExpr.parse(line)
                line = remainder.lstrip()
                if len(data) == 2 and data[0] == ':results':
                    for result in data[1]:
                        for key, val in result:
                            if key == ':mrs':
                                yield simplemrs.decode(val)
        elif line == '\n':
            if newline:
                surface = None
                newline = False
            else:
                newline = True
        else:
            pass
示例#4
0
def _sexpr_data(line):
    while line:
        expr = SExpr.parse(line)
        if len(expr.data) != 2:
            logging.error('Malformed output from ACE: {}'.format(line))
            break
        line = expr.remainder.lstrip()
        yield expr.data
示例#5
0
    def map(self, response):
        """
        Convert *response* into a list of (table, rowdata) tuples.

        The list starts with one ('parse', row) entry and continues
        with one ('result', row) entry per item in
        ``response['results']``. Missing 'ninputs', 'ntokens' and
        'readings' values are filled in on *response* itself, and the
        run information in ``response['run']``, if any, is recorded on
        this object.
        """
        # custom remapping, cleanup, and filling in holes
        parse_row = {}
        parse_row['i-id'] = response.get('keys', {}).get('i-id', -1)
        self._parse_id = max(self._parse_id + 1, parse_row['i-id'])
        parse_row['parse-id'] = self._parse_id
        parse_row['run-id'] = response.get('run', {}).get('run-id', -1)

        if 'tokens' in response:
            token_info = response['tokens']
            parse_row['p-input'] = token_info.get('initial')
            parse_row['p-tokens'] = token_info.get('internal')
            # derive the token counts when the response lacks them
            for count_key, token_kind in (('ninputs', 'initial'),
                                          ('ntokens', 'internal')):
                if count_key in response:
                    continue
                lattice = response.tokens(token_kind)
                if lattice is not None:
                    response[count_key] = len(lattice.tokens)

        if 'readings' not in response and 'results' in response:
            response['readings'] = len(response['results'])

        # basic mapping
        parse_row.update(
            (key, response[key])
            for key in self._parse_keys
            if key in response
        )
        inserts = [('parse', parse_row)]

        for result in response.get('results', []):
            result_row = {'parse-id': self._parse_id}
            if 'flags' in result:
                result_row['flags'] = SExpr.format(result['flags'])
            result_row.update(
                (key, result[key])
                for key in self._result_keys
                if key in result
            )
            inserts.append(('result', result_row))

        if 'run' in response:
            run = response['run']
            run_id = run.get('run-id', -1)
            # check if last run was not closed properly
            if run_id not in self._runs and self._last_run_id in self._runs:
                previous = self._runs[self._last_run_id]
                if 'end' not in previous:
                    previous['end'] = datetime.now()
            self._runs[run_id] = run
            self._last_run_id = run_id

        return inserts
示例#6
0
文件: ace.py 项目: moreymat/pydelphin
def _tsdb_stdout(stdout):
    """
    Read one ACE response in --tsdb-stdout form from *stdout*.

    Leading lines that start with 'NOTE:', 'WARNING' or 'ERROR' are
    split at the first ': ' and their messages collected under
    'NOTES', 'WARNINGS' and 'ERRORS'. The first line after those is
    consumed as a sequence of S-expressions, each a key/value pair:

    * ':p-input' and ':p-tokens' are stored as
      ``response['tokens']['initial']`` and
      ``response['tokens']['internal']``.
    * ':results' items each become a dict appended to 'RESULTS', with
      ':derivation', ':mrs' and ':surface' stored as 'DERIV', 'MRS'
      and 'SENT' and other keys stored without their first character.
    * Other keys are stored on the response without their first
      character.

    String values are stripped of surrounding whitespace.

    Returns:
        The populated ``_AceResponse``.

    Raises:
        AssertionError: if an S-expression does not hold exactly two
            items.
    """
    response = _AceResponse({
        'INPUT': None,
        'NOTES': [],
        'WARNINGS': [],
        'ERRORS': [],
        'SENT': None,
        'RESULTS': []
    })

    line = stdout.readline().rstrip()
    while line.startswith(('NOTE:', 'WARNING', 'ERROR')):
        level, message = line.split(': ', 1)
        response['%sS' % level].append(message)
        line = stdout.readline().rstrip()
    while line:
        expr = SExpr.parse(line)
        line = expr.remainder
        # An explicit raise (rather than an assert statement) keeps the
        # check active when Python runs with -O.
        if len(expr.data) != 2:
            raise AssertionError(
                'Malformed output from ACE: expected a key/value pair')
        key, val = expr.data
        if key == ':p-input':
            response.setdefault('tokens', {})['initial'] = val.strip()
        elif key == ':p-tokens':
            response.setdefault('tokens', {})['internal'] = val.strip()
        elif key == ':results':
            for result in val:
                res = {}
                for reskey, resval in result:
                    if reskey == ':derivation':
                        res['DERIV'] = resval.strip()
                    elif reskey == ':mrs':
                        res['MRS'] = resval.strip()
                    elif reskey == ':surface':
                        res['SENT'] = resval.strip()
                    elif isinstance(resval, stringtypes):
                        res[reskey[1:]] = resval.strip()
                    else:
                        res[reskey[1:]] = resval
                response['RESULTS'].append(res)
        elif isinstance(val, stringtypes):
            response[key[1:]] = val.strip()
        else:
            response[key[1:]] = val
    return response
示例#7
0
def _tsdb_stdout(stdout):
    """
    Build an ``_AceResponse`` from ACE's --tsdb-stdout output.

    Lines are read from *stdout* with ``readline``. Initial lines
    beginning with 'NOTE:', 'WARNING' or 'ERROR' are split at the
    first ': ' and the message part is appended to the 'NOTES',
    'WARNINGS' or 'ERRORS' list. The line that follows is parsed as
    consecutive S-expressions, each of which must be a key/value
    pair:

    * ':p-input' / ':p-tokens' fill ``response['tokens']`` under
      'initial' / 'internal'.
    * ':results' yields one dict per result, appended to 'RESULTS';
      ':derivation', ':mrs' and ':surface' are renamed to 'DERIV',
      'MRS' and 'SENT', and other keys lose their first character.
    * Any other key is stored on the response minus its first
      character.

    String values have surrounding whitespace removed.

    Returns:
        The populated ``_AceResponse``.

    Raises:
        AssertionError: if a parsed S-expression does not contain
            exactly two items.
    """
    response = _AceResponse({
        'INPUT': None,
        'NOTES': [],
        'WARNINGS': [],
        'ERRORS': [],
        'SENT': None,
        'RESULTS': []
    })

    message_prefixes = ('NOTE:', 'WARNING', 'ERROR')
    line = stdout.readline().rstrip()
    while line.startswith(message_prefixes):
        level, message = line.split(': ', 1)
        response['%sS' % level].append(message)
        line = stdout.readline().rstrip()
    while line:
        expr = SExpr.parse(line)
        line = expr.remainder
        if len(expr.data) != 2:
            # raised explicitly because an assert statement would be
            # skipped when Python runs with -O
            raise AssertionError(
                'Malformed output from ACE: expected a key/value pair')
        key, val = expr.data
        if key == ':p-input':
            response.setdefault('tokens', {})['initial'] = val.strip()
        elif key == ':p-tokens':
            response.setdefault('tokens', {})['internal'] = val.strip()
        elif key == ':results':
            for result in val:
                res = {}
                for reskey, resval in result:
                    if reskey == ':derivation':
                        res['DERIV'] = resval.strip()
                    elif reskey == ':mrs':
                        res['MRS'] = resval.strip()
                    elif reskey == ':surface':
                        res['SENT'] = resval.strip()
                    elif isinstance(resval, stringtypes):
                        res[reskey[1:]] = resval.strip()
                    else:
                        res[reskey[1:]] = resval
                response['RESULTS'].append(res)
        elif isinstance(val, stringtypes):
            response[key[1:]] = val.strip()
        else:
            response[key[1:]] = val
    return response
示例#8
0
 def tree(self):
     """
     Return the labeled syntax tree for this object, if available.

     A 'tree' value that is a string is parsed as an S-expression;
     any other non-None 'tree' value is returned unchanged. When
     there is no 'tree' value, the tree is built from the
     'derivation' value, provided it is a dict with a 'label' key.
     Otherwise None is returned.
     """
     def _labeled(node):
         # [label, [form]] for nodes with tokens,
         # [label, subtree, ...] for the others
         branch = [node.get('label', '')]
         if 'tokens' in node:
             branch.append([node.get('form', '')])
             return branch
         branch.extend(
             _labeled(child) for child in node.get('daughters', []))
         return branch

     stored = self.get('tree')
     if isinstance(stored, stringtypes):
         return SExpr.parse(stored).data
     if stored is not None:
         return stored

     derivation = self.get('derivation')
     if isinstance(derivation, dict) and 'label' in derivation:
         return _labeled(derivation)
     return None
示例#9
0
    def tree(self):
        """
        Deserialize and return a labeled syntax tree.

        The tree comes from the 'tree' value when one is present (a
        string is parsed as an S-expression, anything else is returned
        unchanged). Without a 'tree' value it is extracted from the
        'derivation' value when that is a dict with a 'label' key.
        None is returned when neither source is usable.
        """
        result = self.get('tree')
        if isinstance(result, stringtypes):
            result = SExpr.parse(result).data
        elif result is None:
            derivation = self.get('derivation')
            has_label = isinstance(derivation, dict) and 'label' in derivation
            if has_label:

                def _subtree(node):
                    label = node.get('label', '')
                    if 'tokens' in node:
                        # nodes with tokens carry their form as a leaf
                        return [label, [node.get('form', '')]]
                    daughters = node.get('daughters', [])
                    return [label] + [_subtree(dtr) for dtr in daughters]

                result = _subtree(derivation)
        return result
示例#10
0
def test_SExpr():
    """Check the data that SExpr.parse returns for various inputs."""
    # Atoms outside of parens ('a', '1', '1.0', '"a"') are not
    # asserted here.
    cases = [
        ('()', []),
        ('(a)', ['a']),
        ('(1)', [1]),
        ('(1.0)', [1.0]),
        ('("a")', ['a']),  # same as symbol?
        ('( a . b )', ('a', 'b')),
        ('( :a (b) )', [':a', ['b']]),
        ('(a-a (b 1 2))', ['a-a', ['b', 1, 2]]),
        ('("(a b)")', ['(a b)']),
        # escaped characters
        ('(a\\ b c)', ['a b', 'c']),
        ('(\\(a\\) \\[a\\] \\{a\\} \\; \\\\)',
         ['(a)', '[a]', '{a}', ';', '\\']),
        ('(:key . "\\"\\\\\\"a\\\\\\"\\"")', (":key", '"\\"a\\""')),
        ('("\\"a\\"" \\" "\\(\\)\\;\\[\\]")',
         ['"a"', '"', '\\(\\)\\;\\[\\]']),
        # other kinds of whitespace
        ('(\ta\n.\n\n  b)', ('a', 'b')),
    ]
    for text, expected in cases:
        assert SExpr.parse(text).data == expected
示例#11
0
def test_SExpr_format():
    """Check the text that SExpr.format produces for lists and pairs."""
    expectations = (
        ([], '()'),
        ([1], '(1)'),
        ([1.0], '(1.0)'),
        ((1, 2), '(1 . 2)'),
        (['a-a', ('b', 'c')], '(a-a (b . c))'),
    )
    for data, text in expectations:
        assert SExpr.format(data) == text
示例#12
0
def test_SExpr():
    """Check the data that SExpr.parse returns for various inputs."""
    def parsed(text):
        return SExpr.parse(text).data

    # Atoms outside of parens ('a', '1', '1.0', '"a"') are not
    # asserted here.
    assert parsed('()') == []
    assert parsed('(a)') == ['a']
    assert parsed('(1)') == [1]
    assert parsed('(1.0)') == [1.0]
    assert parsed('("a")') == ['a']  # same as symbol?
    assert parsed('( a . b )') == ('a', 'b')
    assert parsed('( :a (b) )') == [':a', ['b']]
    assert parsed('(a-a (b 1 2))') == ['a-a', ['b', 1, 2]]
    assert parsed('("(a b)")') == ['(a b)']

    # escaped characters
    assert parsed('(a\\ b c)') == ['a b', 'c']
    escaped_symbols = parsed('(\\(a\\) \\[a\\] \\{a\\} \\; \\\\)')
    assert escaped_symbols == ['(a)', '[a]', '{a}', ';', '\\']
    escaped_pair = parsed('(:key . "\\"\\\\\\"a\\\\\\"\\"")')
    assert escaped_pair == (":key", '"\\"a\\""')
    escaped_strings = parsed('("\\"a\\"" \\" "\\(\\)\\;\\[\\]")')
    assert escaped_strings == ['"a"', '"', '\\(\\)\\;\\[\\]']