예제 #1
0
    def clean_parens(tag):
        """Rewrite the parenthesised groups of a CCG tag string.

        The tag is depth-marked, then every group produced by
        ``paren_utils.paren_iter`` (bottom-up, over the tag wrapped in one
        extra pair of parentheses) is inspected:

        * if it matches ``CCG_Tag._Modifier_RE`` and both halves are equal
          once features are removed, each half is written with explicit
          parentheses;
        * otherwise, if ``CCG_Tag._Left_RE`` is found, the markers around its
          ``a`` group are dropped;
        * otherwise, if ``CCG_Tag._Right_RE`` is found, its ``a`` group is
          written with explicit parentheses.

        Args:
            tag: the CCG category string to rewrite.

        Returns:
            The result of ``paren_utils.unmark_depth`` on the rewritten,
            depth-marked tag.
        """
        marked = paren_utils.mark_depth(tag)
        for group in paren_utils.paren_iter(f'({tag})', bottom_up=True):
            # The level is computed from the raw group text, before the group
            # itself is re-marked below.
            level = paren_utils.depth_at(tag, tag.index(group)) + 1
            group = paren_utils.mark_depth(group)

            # Turn every marker from level 2 upward back into a literal
            # parenthesis, stopping at the first level that is absent.
            inner = 2
            while True:
                opener = f'<{inner}>'
                if opener not in group:
                    break
                group = group.replace(opener, '(').replace(f'</{inner}>', ')')
                inner += 1

            lead = f'<{level}>'
            trail = f'</{level}>'

            modifier = CCG_Tag._Modifier_RE.match(group)
            if modifier and (CCG_Tag.remove_features(modifier.group('a'))
                             == CCG_Tag.remove_features(modifier.group('b'))):
                left = modifier.group('a')
                right = modifier.group('b')
                slash = modifier.group('slash')
                marked = marked.replace(
                    f'{lead}{left}{trail}{slash}{lead}{right}{trail}',
                    f'({left}){slash}({right})', 1)
                continue

            if CCG_Tag._Left_RE.search(group):
                # NOTE(review): the branch is selected with search() but the
                # group is extracted with match(); kept as-is. Confirm that
                # _Left_RE is anchored, otherwise match() can return None.
                left = CCG_Tag._Left_RE.match(group).group('a')
                marked = marked.replace(f'{lead}{left}{trail}', left, 1)
                continue

            right_hit = CCG_Tag._Right_RE.search(group)
            if right_hit:
                right = right_hit.group('a')
                marked = marked.replace(f'{lead}{right}{trail}',
                                        f'({right})', 1)

        return paren_utils.unmark_depth(marked)
예제 #2
0
파일: ccg.py 프로젝트: ablodge/ccg-utils
    def __str__(self):
        """Return an indented, multi-line rendering of the CCGBank text.

        When ``CCGBank.test(self.text)`` is false the text is returned
        unchanged. Otherwise literal braces are protected as ``-LBR-`` /
        ``-RBR-``, parentheses are turned into braces, every phrase match is
        replaced by its tag and every word match by ``tag word`` (with braces
        inside those pieces turned back into parentheses). Each opening brace
        is then placed on its own line, indented four spaces per nesting level
        above the first, and the protected braces are restored.
        """
        if not CCGBank.test(self.text):
            return self.text

        def to_parens(piece):
            # Undo the paren->brace swap inside a tag or word.
            return piece.replace('{', '(').replace('}', ')')

        text = self.text.replace('{', '-LBR-').replace('}', '-RBR-')
        text = text.replace('(', '{').replace(')', '}')

        # Both iterators scan the string as it was when the loop started;
        # the replacements are applied to the running copy.
        for phrase in CCGBank.Phrase_RE.finditer(text):
            text = text.replace(phrase.group(), to_parens(phrase.group('tag')))
        for token in CCGBank.Word_RE.finditer(text):
            label = to_parens(token.group('tag'))
            word = to_parens(token.group('word'))
            text = text.replace(token.group(), label + ' ' + word)

        deepest = paren_utils.max_depth(text, lparen='{', rparen='}')
        text = paren_utils.mark_depth(text, lparen='{', rparen='}')
        for level in range(1, deepest + 1):
            indent = '    ' * (level - 1)
            text = text.replace(f'<{level}>', '\n' + indent + '{')

        text = re.sub(r'</[0-9]+>', '}', text)
        return text.replace('-LBR-', '{').replace('-RBR-', '}')
예제 #3
0
    def latex(text):
        """Render an AMR string as a TikZ picture.

        The text is parsed with ``AMR``, every ``xN / concept`` span is
        wrapped in parentheses, and one TikZ node is emitted per group
        yielded by ``paren_utils.paren_iter``. Node coordinates and colours
        come from ``AMR_Latex.get_x`` / ``get_y`` / ``get_color``. One TikZ
        edge is then emitted per ``(edge, edge_id)`` pair of the AMR, with
        the anchors chosen from the relative depth of source and target.

        Args:
            text: the AMR source text.

        Returns:
            A string containing a complete ``tikzpicture`` environment.

        Raises:
            KeyError: if an edge id names a node for which no depth was
                recorded.
        """
        amr = AMR(text)
        text = str(amr)
        # Raw string: '\s' inside a plain literal is an invalid escape
        # sequence (a warning today, an error in a future Python).
        for x in re.findall(r'x[0-9]+ ?/ ?[^()\s]+', text):
            text = text.replace(x, '(' + x + ')')
        edges = list(zip(amr.edges(), amr.edge_ids()))
        elems = []
        max_depth = paren_utils.max_depth(text)
        # `text` is not modified below, so mark it once instead of once per
        # node.
        marked_text = paren_utils.mark_depth(text)

        depth = 0
        i = 0  # position of the node among those seen since the last descent
        node_depth = {}
        for t in paren_utils.paren_iter(text):
            node = amr.NODE_RE.match(t).group()
            # The node id is whatever precedes the first '/'.
            node_id = node.split('/')[0].strip()
            # clean node
            if re.match('x[0-9]+/', node):
                node = node.split('/')[1]
            # First pair of straight double quotes becomes LaTeX quotes.
            node = node.replace('"', '``', 1).replace('"', "''", 1)
            prev_depth = depth
            depth = paren_utils.depth_at(text, text.index(t))
            if depth > prev_depth:
                # Descending to a deeper level restarts the position counter.
                i = 0
            node_depth[node_id] = depth
            num_nodes = marked_text.count(f'<{depth}>')
            x = AMR_Latex.get_x(i, num_nodes)
            y = AMR_Latex.get_y(depth, max_depth)
            color = AMR_Latex.get_color(i)
            elems.append(
                f'\t\\node[{color}]({node_id}) at ({x},{y}) {{{node}}};')
            i += 1

        for edge, edge_id in edges:
            # Edge ids are split on '_'; field 0 is the source node id and
            # field 2 the target node id.
            parts = edge_id.split('_')
            source = parts[0]
            target = parts[2]
            if node_depth[source] > node_depth[target]:
                dir1, dir2 = 'north', 'south'
            elif node_depth[source] == node_depth[target]:
                dir1, dir2 = 'north', 'north'
            else:
                dir1, dir2 = 'south', 'north'
            # '\\draw' spells the backslash explicitly ('\d' is an invalid
            # escape); the emitted text is unchanged.
            elems.append(
                f'\t\\draw[->, thick] ({source}.{dir1}) -- ({target}.{dir2}) node[midway, above, sloped] {{{edge}}};'
            )

        latex = '\n\\begin{tikzpicture}[\n'
        latex += 'red/.style={rectangle, draw=red!60, fill=red!5, very thick, minimum size=7mm},\n'
        latex += 'blue/.style={rectangle, draw=blue!60, fill=blue!5, very thick, minimum size=7mm},\n'
        latex += 'green/.style={rectangle, draw=green!60, fill=green!5, very thick, minimum size=7mm},\n'
        latex += 'purple/.style={rectangle, draw=purple!60, fill=purple!5, very thick, minimum size=7mm},\n'
        latex += 'orange/.style={rectangle, draw=orange!60, fill=orange!5, very thick, minimum size=7mm},\n'
        latex += ']\n'
        latex += '\n'.join(elems)
        latex += '\n\\end{tikzpicture}\n'

        return latex
예제 #4
0
 def named_entities(self):
     """Yield an ``AMR`` for every group that carries a ``:name`` relation.

     Each group yielded by ``paren_utils.paren_iter(str(self))`` is
     depth-marked and matched against a pattern made of ``self.NODE_RE``
     (the root), any text, ``:name``, and a depth-1 group (the name).

     Yields:
         ``AMR('(<root> :name (<name>) )')`` for every group that matches.
     """
     # Raw f-string: '\s' in a non-raw literal is an invalid escape sequence.
     # The compiled pattern is identical to the original one.
     ne_re = re.compile(
         fr'(?P<root>{self.NODE_RE.pattern}).*:name\s+<1>(?P<name>.*?)</1>',
         re.DOTALL)
     for t in paren_utils.paren_iter(str(self)):
         marked = paren_utils.mark_depth(t)
         x = ne_re.match(marked)
         if x:
             root = x.group('root')
             name = x.group('name')
             yield AMR(f'({root} :name ({name}) )')
예제 #5
0
    def to_html(tag):
        """Return an HTML rendering of a CCG tag with its argument count.

        The tag is first passed through ``CCG_Tag.add_indices``. Slashes are
        then counted on a copy in which every depth-1 group has been
        collapsed to ``X``, so only slashes outside parentheses count. A
        positive count is appended as ``<args> : N</args>``; the bare tag
        ``conj`` gets a fixed count of 2. Finally square brackets become
        ``<sub>`` / ``</sub>``.

        Args:
            tag: the CCG category string.

        Returns:
            The tag as an HTML fragment.
        """
        indexed = CCG_Tag.add_indices(tag)

        # Collapse each depth-1 group so its inner slashes are not counted.
        flattened = paren_utils.mark_depth(indexed)
        top_level_group = re.compile('<1>.*?</1>')
        while top_level_group.search(flattened):
            flattened = top_level_group.sub('X', flattened)

        slash_total = flattened.count('/') + flattened.count('\\')

        if slash_total > 0:
            indexed = indexed + f'<args> : {slash_total}</args>'
        elif indexed == 'conj':
            indexed = 'conj<args> : 2</args>'

        return indexed.replace('[', '<sub>').replace(']', '</sub>')
예제 #6
0
 def children(self):
     """Return the child constituents of this phrase as a list.

     The number of children is read from the ``children`` group of
     ``self._match``. The child texts come from the ``a`` and ``b`` groups
     of ``self.Children_RE`` searched in the depth-marked phrase. A child is
     built as ``CCGBank.Phrase`` when ``CCGBank.Phrase_RE`` matches its text
     and as ``CCGBank.Word`` otherwise.

     Returns:
         A list of one or two children, or an empty list when the declared
         count is neither 1 nor 2.
     """

     def _wrap(text):
         # Phrase if the text looks like one, Word otherwise.
         if CCGBank.Phrase_RE.match(text):
             return CCGBank.Phrase(text)
         return CCGBank.Word(text)

     declared = int(self._match.group('children'))
     found = self.Children_RE.search(paren_utils.mark_depth(self.phrase))
     first = paren_utils.unmark_depth(found.group('a'))

     if declared not in (1, 2):
         return []
     if declared == 1:
         return [_wrap(first)]

     second = found.group('b')
     if not second:
         # Diagnostic output when the second child is missing.
         print(self.phrase, found.group())
     second = paren_utils.unmark_depth(second)
     return [_wrap(first), _wrap(second)]
예제 #7
0
 def children(self):
     """Return the child constituents of this brace-delimited phrase.

     The phrase is depth-marked using ``{`` / ``}`` as delimiters and the
     number of children is the number of depth-1 openers. The child texts
     come from the ``a`` and ``b`` groups of ``self.Children_RE``. A child
     whose text contains ``{`` becomes a ``Readible.Phrase``; any other child
     is wrapped in braces and becomes a ``Readible.Word``.

     Returns:
         A list of one or two children, or an empty list when the count is
         neither 1 nor 2.
     """

     def _wrap(text):
         # Nested braces mean a phrase; a bare leaf is re-braced as a word.
         if '{' in text:
             return Readible.Phrase(text)
         return Readible.Word('{' + text + '}')

     marked = paren_utils.mark_depth(self.phrase, lparen='{', rparen='}')
     child_total = marked.count('<1>')
     found = self.Children_RE.search(marked)
     first = paren_utils.unmark_depth(found.group('a'),
                                      lparen='{',
                                      rparen='}')

     if child_total == 1:
         return [_wrap(first)]
     if child_total != 2:
         return []

     first = _wrap(first)
     second = found.group('b')
     if not second:
         # Diagnostic output when the second child is missing.
         print(self.phrase, found.group())
     second = paren_utils.unmark_depth(second, lparen='{', rparen='}')
     return [first, _wrap(second)]
예제 #8
0
    def add_indices(tag):
        """Annotate the NP categories of a CCG tag with numeric indices.

        Steps, in order:

        1. Depth-mark the tag and mask ``NP[expl]`` / ``NP[thr]`` so they are
           not treated as NPs.
        2. Record the spans of modifier patterns ``<j>a</j>/<j>b</j>`` (or with
           a backslash) whose halves are equal after feature removal and
           contain ``NP``.
        3. Split the tag into tokens and suffix every token starting with
           ``NP`` with ``.1``, ``.2``, ... from left to right.
        4. Inside each recorded modifier span, give the NPs of the ``b`` half
           the indices collected from the ``a`` half.
        5. Renumber indices that run ahead of the running counter.
        6. Fall back to the original tag when the result contains fewer than
           two occurrences of ``NP``.
        7. For modifier patterns whose halves differ, drop the markers around
           the first half; then unmark the tag and adjust two specific
           index layouts (named ``obj_ctrl`` and ``obj_raise`` below).
        8. Restore the masked ``NP[expl]`` / ``NP[thr]``.

        Args:
            tag: the CCG category string.

        Returns:
            The tag string with indices added.
        """
        old_tag = tag
        tag = paren_utils.mark_depth(tag)
        # NOTE(review): max_depth() receives the already depth-marked string,
        # not the original tag. If mark_depth() replaces every parenthesis
        # with a <n> marker this may evaluate to 0, in which case both
        # `while j <= max` loops below never run -- confirm against
        # paren_utils. (`max` also shadows the builtin within this function.)
        max = paren_utils.max_depth(tag)
        # Mask expletive / "there" NPs so they do not start with 'NP' and are
        # therefore skipped by the indexing passes; restored before returning.
        tag = tag.replace('NP[expl]', '*EXPL*')
        tag = tag.replace('NP[thr]', '*THR*')

        # get spans for each modifier pattern
        modifier_spans = []
        j = 1
        while j <= max:
            Modifier_RE = re.compile(
                fr'<{j}>(?P<a>.*?)</{j}>(?P<slash>[/\\])<{j}>(?P<b>.*?)</{j}>')
            for mod in Modifier_RE.finditer(tag):
                a = CCG_Tag.remove_features(mod.group('a'))
                b = CCG_Tag.remove_features(mod.group('b'))
                if a == b and 'NP' in a:
                    modifier_spans.append((mod.start('a'), mod.end('a'),
                                           mod.start('b'), mod.end('b')))
            j += 1

        # Tokenise: a run of category characters, a depth marker, or any
        # single remaining character. `cat_indices` holds each token's start
        # offset in `tag`; `CATS` keeps the untouched tokens while `cats` is
        # edited in place.
        Cat_RE = re.compile(r'([^<>()/\\]+|</?[0-9]+>|.)')
        cats = [c.group() for c in Cat_RE.finditer(tag)]
        cat_indices = [c.start() for c in Cat_RE.finditer(tag)]
        CATS = cats.copy()

        # First pass: number every NP token from left to right.
        i = 1
        for j, c in enumerate(CATS):
            if c.startswith('NP'):
                cats[j] = f'{c}.{i}'
                i += 1

        # A tag that begins NP/NP or NP\NP gets the same index on both NPs
        # (tokens 0 and 2; token 1 is the slash).
        if re.match(r'^NP[/\\]NP[/\\]?', tag):
            cats[0] = 'NP.1'
            cats[2] = 'NP.1'

        # handle matching indices within a modifier
        modifier_memo = []
        for a_start, a_end, b_start, b_end in modifier_spans:
            for j, c in enumerate(CATS):
                if c.startswith('NP'):
                    me = cat_indices[j]
                    if a_start <= me < a_end:
                        # Remember the index assigned to an NP in the 'a' half.
                        x = re.match('.*[.](?P<n>[0-9]+)$', cats[j])
                        if x:
                            modifier_memo.append(int(x.group('n')))
                    elif b_start <= me < b_end:
                        # Reuse the remembered indices, in order, in the 'b'
                        # half.
                        # NOTE(review): pop(0) raises IndexError if the 'b'
                        # half has more NPs than were remembered -- confirm
                        # this cannot happen.
                        m = modifier_memo.pop(0)
                        cats[j] = f'{c}.{m}'
        # Renumbering pass: an index above the running counter is lowered to
        # the counter; an index below it is left alone; only an index equal
        # to the counter (or a missing one) advances the counter.
        i = 1
        for j, c in enumerate(CATS):
            if c.startswith('NP'):
                x = re.match('.*[.](?P<n>[0-9]+)$', cats[j])
                if x and int(x.group('n')) > i:
                    cats[j] = f'{c}.{i}'
                    continue
                elif x and int(x.group('n')) < i:
                    continue
                i += 1

        tag = ''.join(cats)

        # With fewer than two 'NP' occurrences the indices are discarded and
        # the original, unmarked tag is used from here on.
        if tag.count('NP') < 2:
            tag = old_tag
        # fix parens for "want", "should", etc.
        # If nodes are the same but features are different,
        # remove parentheses around first half of expression.
        # This is important for getting number of args!
        j = 1
        while j <= max:
            Modifier_RE = re.compile(
                fr'<{j}>(?P<a>.*?)</{j}>(?P<slash>[/\\])<{j}>(?P<b>.*?)</{j}>')
            # The iterator scans the string as it was when the loop started;
            # replacements are applied to the running copy.
            for mod in Modifier_RE.finditer(tag):
                a = mod.group('a')
                b = mod.group('b')
                slash = mod.group('slash')
                if a != b:
                    tag = tag.replace(mod.group(), f'{a}{slash}({b})')
            j += 1
        tag = paren_utils.unmark_depth(tag)
        # Both patterns are evaluated on the tag before either fix-up is
        # applied.
        obj_ctrl = re.match(
            r'[(]?S\[.*?\]\\NP.1[)]?/[(]S\[.*?\]\\NP.1[)]/NP.2', tag)
        obj_raise = re.match(
            r'[(]?S\[.*?\]\\NP.1[)]?/[(]S\[.*?\]\\NP.2/NP.3[)]', tag)
        if obj_ctrl:
            # The embedded subject takes the index of the object (NP.2).
            tag = tag.replace('NP.1)/NP.2', 'NP.2)/NP.2', 1)
        if obj_raise:
            # The embedded object takes the index of the outer subject (NP.1).
            tag = tag.replace('NP.2/NP.3)', 'NP.2/NP.1)', 1)

        # NOTE(review): the next line is a bare string expression with no
        # effect at runtime; presumably a leftover example of the target
        # format.
        r'S[adj]\NP.1/(S[to]\NP.2/NP.1)'
        # Undo the masking applied at the top.
        tag = tag.replace('*EXPL*', 'NP[expl]')
        tag = tag.replace('*THR*', 'NP[thr]')
        return tag