Exemplo n.º 1
0
def break2sentences(passage, lang="en", *args, **kwargs):
    """
    Find the sentence boundaries of a passage.

    For a labeled passage, a boundary is a terminal whose text is in
    SENTENCE_END_MARKS and which closes a top-level scene (or directly follows
    the terminal that closes one, without opening a scene itself). For an
    unlabeled passage the boundaries come from the pipeline returned by
    get_nlp(). Paragraph ends are always added as boundaries.
    :param passage: the Passage object to operate on
    :param lang: optional two-letter language code, passed on to get_nlp()
    :return: a sorted list of positions in the Passage, each denoting the closing Terminal of a sentence
    """
    del args, kwargs  # accepted for call compatibility only
    foundational = passage.layer(layer1.LAYER_ID)
    terminals = extract_terminals(passage)
    if not terminals:
        return []
    labeled = any(node.outgoing for node in foundational.all)
    if labeled:
        scene_ends = [scene.end_position for scene in foundational.top_scenes]
        scene_starts = [scene.start_position for scene in foundational.top_scenes]
        marks = []
        for terminal in terminals:
            if terminal.text not in SENTENCE_END_MARKS:
                continue
            position = terminal.position
            # The annotation does not always attach the closing mark to the
            # scene it ends. So a mark right after a scene end, which does not
            # itself start a scene, is treated as a sentence end as well.
            if position in scene_ends:
                marks.append(position)
            elif (position - 1) in scene_ends and position not in scene_starts:
                marks.append(position)
    else:
        # No annotation available: let the NLP pipeline segment the tokens
        parsed = get_nlp(lang=lang)([terminal.text for terminal in terminals])
        marks = [sentence.end for sentence in parsed.sents]
    boundaries = set(marks + break2paragraphs(passage))
    marks = sorted(boundaries)
    # Drop a boundary when everything between it and the next one is punctuation,
    # so that no sentence consists of punctuation only; the last one always stays
    if len(marks) > 1:
        kept = []
        for current, following in zip(marks[:-1], marks[1:]):
            if any(not layer0.is_punct(t) for t in terminals[current:following]):
                kept.append(current)
        kept.append(marks[-1])
        marks = kept
    return marks
Exemplo n.º 2
0
Arquivo: node.py Projeto: ml-lab/tupa
 def add_to_l1(self, l1, parent, tag, terminals, train):
     """
     Create the core.Node for this node while the final Passage is being built.

     Three cases are handled: a word terminal (linked under the parent's node),
     a node whose single child is a punctuation terminal (created through
     l1.add_punct), and any other node (created through l1.add_fnode).
     :param l1: Layer1 of the passage
     :param parent: parent node; its "node" attribute is the already-created core node
     :param tag: edge tag to link to parent
     :param terminals: all terminals of the passage, indexed by this node's "index"
     :param train: in training, so keep the original node ID in the "remarks" field
     """
     if Config().args.verify:
         assert self.node is None or self.text is not None, \
             "Trying to create the same node twice: %s, parent: %s" % (self.node.ID, parent)
     outgoing = self.outgoing
     only_edge = outgoing[0] if len(outgoing) == 1 else None
     if self.text:
         # Word terminal (punctuation terminals are created by add_punct for their parent)
         if self.node is None and parent.node is not None:
             terminal_edge = parent.node.add(EdgeTags.Terminal, terminals[self.index])
             self.node = terminal_edge.child
     elif only_edge and only_edge.child.text and layer0.is_punct(terminals[only_edge.child.index]):
         # Unary node above a punctuation terminal
         if Config().args.verify:
             assert tag == EdgeTags.Punctuation, "Tag for %s is %s" % (parent.node_id, tag)
             assert only_edge.tag == EdgeTags.Terminal, "Tag for %s is %s" % (self.node_id, only_edge.tag)
         self.node = l1.add_punct(parent.node, terminals[only_edge.child.index])
         # The child terminal is reachable through the first edge of the new node
         only_edge.child.node = self.node[0].child
     else:
         # The usual case: a plain foundational node
         self.node = l1.add_fnode(parent.node, tag, implicit=self.implicit)
     keep_original_id = train and self.node is not None and self.node_id is not None
     if keep_original_id:
         # Keep original node ID for reference
         self.node.extra["remarks"] = self.node_id
Exemplo n.º 3
0
 def add_to_l1(self, l1, parent, tag, terminals, train):
     """
     Add a new core.Node for this node when the final Passage is assembled.

     Word terminals are attached under the parent's node, a node with exactly
     one punctuation-terminal child goes through l1.add_punct, and everything
     else goes through l1.add_fnode.
     :param l1: Layer1 of the passage
     :param parent: parent node, whose "node" attribute holds its core node
     :param tag: edge tag to link to parent
     :param terminals: all terminals of the passage, looked up by "index"
     :param train: in training, so keep the original node ID in the "remarks" field
     """
     if Config().args.verify:
         assert self.node is None or self.text is not None, \
             "Trying to create the same node twice: %s, parent: %s" % (self.node.ID, parent)
     children = self.outgoing
     single = children[0] if len(children) == 1 else None
     if self.text:
         # A word terminal; punctuation is created by add_punct for the parent instead
         if self.node is None and parent.node is not None:
             created = parent.node.add(EdgeTags.Terminal, terminals[self.index])
             self.node = created.child
     elif single and single.child.text and layer0.is_punct(terminals[single.child.index]):
         # The only child is a punctuation terminal
         if Config().args.verify:
             assert tag == EdgeTags.Punctuation, "Tag for %s is %s" % (parent.node_id, tag)
             assert single.tag == EdgeTags.Terminal, "Tag for %s is %s" % (self.node_id, single.tag)
         self.node = l1.add_punct(parent.node, terminals[single.child.index])
         # Record the terminal found under the first edge of the punctuation node
         single.child.node = self.node[0].child
     else:
         # The usual case
         self.node = l1.add_fnode(parent.node, tag, implicit=self.implicit)
     if not train:
         return
     if self.node is not None and self.node_id is not None:
         # In training: keep original node ID for reference
         self.node.extra["remarks"] = self.node_id
Exemplo n.º 4
0
def break2sentences(passage):
    """
    Breaks paragraphs into sentences according to the annotation.

    A sentence is a list of terminals which ends with a mark from
    SENTENCE_END_MARKS, and is also the end of a paragraph or parallel scene.
    If the passage has no annotation, the module-level ``nlp`` pipeline is used
    to segment the terminals instead.
    :param passage: the Passage object to operate on
    :return: a sorted list of positions in the Passage, each denoting a closing
             Terminal of a sentence; empty if the passage has no terminals.
    """
    l1 = passage.layer(layer1.LAYER_ID)
    terminals = extract_terminals(passage)
    if not terminals:
        # Nothing to split; avoids running the segmenter or the paragraph
        # splitter on an empty passage
        return []
    if any(n.outgoing for n in l1.all):  # Passage is labeled
        # Sets give constant-time membership tests in the filter below
        ps_ends = {ps.end_position for ps in l1.top_scenes}
        ps_starts = {ps.start_position for ps in l1.top_scenes}
        marks = [t.position for t in terminals if t.text in SENTENCE_END_MARKS]
        # Annotations don't always include the ending period (or other mark)
        # with the parallel scene it closes. Hence, if the terminal before the
        # mark closed the parallel scene, and this mark doesn't open a scene
        # in any way (hence it probably just "hangs" there), it's a sentence end
        marks = [x for x in marks if x in ps_ends or ((x - 1) in ps_ends and x not in ps_starts)]
    else:  # Not labeled, split using spaCy
        annotated = nlp([t.text for t in terminals])
        marks = [span.end for span in annotated.sents]
    marks = sorted(set(marks + break2paragraphs(passage)))
    # Avoid punctuation-only sentences: drop a mark when everything between it
    # and the next mark is punctuation. The last mark is always kept.
    if len(marks) > 1:
        marks = [x for x, y in zip(marks[:-1], marks[1:])
                 if not all(layer0.is_punct(t) for t in terminals[x:y])] + \
                [marks[-1]]
    return marks
Exemplo n.º 5
0
 def add(self, edge_tag, node, *, edge_attrib=None):
     """
     Attach a Layer 0 node as a child of this punctuation node.

     :param edge_tag: tag of the edge to create
     :param node: the child to attach; must belong to Layer 0
     :param edge_attrib: optional edge attributes, forwarded to the parent class's add()
     :return: whatever the parent class's add() returns
     :raise ValueError: if node does not belong to Layer 0
     """
     if node.layer.ID != layer0.LAYER_ID:
         raise ValueError("Non-terminal child (%s) for %s node (%s)" %
                          (node.ID, NodeTags.Punctuation, self.ID))
     if not layer0.is_punct(node):
         # Deliberately lenient: a non-punctuation child is re-tagged as
         # punctuation rather than rejected with an error
         node.tag = layer0.NodeTags.Punct
     # Forward the caller's edge_attrib (it used to be replaced by None) and
     # hand back the result of the parent implementation
     return super().add(edge_tag, node, edge_attrib=edge_attrib)
Exemplo n.º 6
0
 def _label(dep_edge, top=False):
     """
     Choose the edge tag for a dependency edge.

     :param dep_edge: dependency edge whose "dependent" is inspected
     :param top: if true, ParallelScene is chosen unless the dependent is punctuation
     :return: one of the EdgeTags values Punctuation, ParallelScene, Process or Center
     """
     dependent = dep_edge.dependent
     relations = {edge.stripped_rel for edge in dependent}
     if dependent.terminal and layer0.is_punct(dependent.terminal):
         return EdgeTags.Punctuation
     if top or EdgeTags.ParallelScene in relations:
         return EdgeTags.ParallelScene
     scene_relations = (EdgeTags.Participant, EdgeTags.Adverbial)
     if relations.intersection(scene_relations):
         # May be State but we can't tell
         return EdgeTags.Process
     return EdgeTags.Center
Exemplo n.º 7
0
 def _build_layer0(preterminals, l1, l0):
     """
     Add edges to terminals according to alignments.

     :param preterminals: mapping from an index into l0.all to the list of parents of that terminal
     :param l1: Layer1 of the passage, used to create punctuation nodes
     :param l0: Layer0 of the passage, holding the terminals
     """
     for index, parents in preterminals.items():
         child = l0.all[index]
         if not layer0.is_punct(child):
             tag = layer1.EdgeTags.Terminal
         else:
             tag = layer1.EdgeTags.Punctuation
             # From here on the punctuation node stands in for the terminal
             child = l1.add_punct(parents[0], child)
             child.attrib[LABEL_ATTRIB] = layer1.NodeTags.Punctuation
         for parent in parents:
             # avoid multiple identical edges (e.g. :polarity~e.68 -~e.68)
             if parent in child.parents:
                 continue
             parent.add(tag, child)
Exemplo n.º 8
0
 def _build_layer0(preterminals, l1, l0):
     """
     Add edges to terminals according to alignments.

     Note that the parent list of a punctuation terminal is trimmed in place to
     its first element.
     :param preterminals: mapping from an index into l0.all to the list of parents of that terminal
     :param l1: Layer1 of the passage, used to create punctuation nodes
     :param l0: Layer0 of the passage, holding the terminals
     """
     for index, parents in preterminals.items():
         child = l0.all[index]
         if not layer0.is_punct(child):
             tag = TERMINAL_DEP
         else:
             tag = PUNCTUATION_DEP
             # From here on the punctuation node stands in for the terminal
             child = l1.add_punct(parents[0], child)
             child.attrib[LABEL_ATTRIB] = PUNCTUATION_LABEL
             # avoid multiple punctuation parents, which is mostly due to alignment errors
             del parents[1:]
         for parent in parents:
             # avoid multiple identical edges (e.g. :polarity~e.68 -~e.68)
             if parent in child.parents:
                 continue
             parent.add(tag, child)
Exemplo n.º 9
0
 def add_to_l1(self, l0, l1, parent, tag, labeled, node_labels):
     """
     Create, or link, the core.Node for this node while the final Passage is built.

     :param l0: Layer0 of the passage
     :param l1: Layer1 of the passage
     :param parent: parent node; may be None in the usual (non-terminal) case
     :param tag: edge tag to link to parent
     :param labeled: there is a reference passage, so set_node_id() is called afterwards
     :param node_labels: whether to add a node label (calls set_node_label())
     """
     outgoing = self.outgoing
     sole_edge = outgoing[0] if len(outgoing) == 1 else None
     if self.text:
         # Word terminal (punctuation is already created by add_punct for the parent)
         if parent.node is not None:
             if self.node is None:
                 new_edge = parent.node.add(EdgeTags.Terminal, self.get_terminal(l0))
                 self.node = new_edge.child
             elif self.node not in parent.node.children:
                 # Already created under another parent: just add the extra edge
                 parent.node.add(EdgeTags.Terminal, self.node)
     elif sole_edge and sole_edge.child.text and layer0.is_punct(sole_edge.child.get_terminal(l0)):
         # The only child is a punctuation terminal
         if Config().args.verify:
             assert tag == EdgeTags.Punctuation, "Punctuation parent %s's edge tag is %s" % (
                 parent.node_id, tag)
             assert sole_edge.tag == EdgeTags.Terminal, "Punctuation %s's edge tag is %s" % (
                 self.node_id, sole_edge.tag)
         if self.node is None:
             self.node = l1.add_punct(parent.node, sole_edge.child.get_terminal(l0))
             # The terminal is the child of the first edge of the new punctuation node
             sole_edge.child.node = self.node[0].child
         elif parent.node is not None and self.node not in parent.node.children:
             parent.node.add(EdgeTags.Punctuation, self.node)
     else:
         # The usual case
         assert self.node is None, "Trying to create the same node twice (multiple incoming primary edges): " + \
                                   ", ".join(map(str, self.incoming))
         parent_is_orphan = parent is not None and parent.label and parent.node is None
         if parent_is_orphan:
             # The parent has a label but no node yet: create it first, linked to the root
             parent.add_to_l1(l0, l1, None, Config().args.orphan_label, labeled, node_labels)
         # Read parent.node only now, since the call above may have just set it
         parent_node = parent.node if parent is not None else None
         self.node = l1.add_fnode(parent_node, tag, implicit=self.implicit)
     if labeled:
         # In training
         self.set_node_id()
     if node_labels:
         self.set_node_label()
Exemplo n.º 10
0
def break2sentences(passage, lang="en", *args, **kwargs):
    """
    Breaks paragraphs into sentences according to the annotation.

    A sentence is a list of terminals which ends with a mark from
    SENTENCE_END_MARKS, and is also the end of a paragraph or parallel scene.
    Punctuation directly following a sentence end is also marked (see the loop
    below). If the passage is not labeled, the pipeline returned by get_nlp()
    is used to split the terminals instead.
    :param passage: the Passage object to operate on
    :param lang: optional two-letter language code
    :return: a sorted list of positions in the Passage, each denoting a closing
             Terminal of a sentence; empty if the passage has no terminals.
    """
    del args, kwargs  # accepted and ignored
    l1 = passage.layer(layer1.LAYER_ID)
    terminals = extract_terminals(passage)
    if not terminals:
        return []
    if any(n.outgoing for n in l1.all):  # Passage is labeled
        ps_ends = [ps.end_position for ps in l1.top_scenes]
        ps_starts = [ps.start_position for ps in l1.top_scenes]
        marks = []
        for terminal in terminals:
            # Annotations don't always include the ending period (or other mark)
            # with the parallel scene it closes. Hence, if the terminal before the
            # mark closed the parallel scene, and this mark doesn't open a scene
            # in any way (hence it probably just "hangs" there), it's a sentence end.
            #
            # Since "and" binds tighter than "or", the condition below reads as
            #   (end mark and (closes a scene
            #                  or (follows a scene end and does not start a scene)))
            #   or (the previous position is already a mark and this is punctuation
            #       and not (this is a quote whose text equals that of
            #                terminals[marks[-1] - 1]))
            # marks[-1] is only evaluated after "position - 1 in marks" succeeded,
            # so marks is never empty at that point.
            # NOTE(review): terminals[marks[-1] - 1] is presumably the terminal at
            # the last marked position (1-based positions in order) - confirm.
            if terminal.text in SENTENCE_END_MARKS and \
                    (terminal.position in ps_ends or
                     (terminal.position - 1) in ps_ends and terminal.position not in ps_starts) or \
                    terminal.position - 1 in marks and layer0.is_punct(terminal) and not \
                    (terminal.text in QUOTES and terminal.text == terminals[marks[-1] - 1].text):
                marks.append(terminal.position)
    else:  # Not labeled, split using spaCy
        annotated = get_nlp(lang=lang)([t.text for t in terminals])
        marks = [span.end for span in annotated.sents]
    # Paragraph ends are always sentence ends; set() removes duplicates
    marks = sorted(set(marks + break2paragraphs(passage)))
    # Avoid punctuation-only sentences by picking the last punctuation symbol in each consecutive sequence:
    # a mark x is dropped when all of terminals[x - 1:y - 1] are punctuation, where y is the next mark.
    # The last mark is always kept.
    if len(marks) > 1:
        marks = [x for x, y in zip(marks[:-1], marks[1:]) if not all(map(layer0.is_punct, terminals[x - 1:y - 1]))] + \
                [marks[-1]]
    return marks
Exemplo n.º 11
0
 def add_to_l1(self, l0, l1, parent, tag, labeled, node_labels):
     """
     Add this node to the final Passage as a core.Node, or link an existing one.

     :param l0: Layer0 of the passage
     :param l1: Layer1 of the passage
     :param parent: parent node; may be None in the usual (non-terminal) case
     :param tag: edge tag to link to parent
     :param labeled: there is a reference passage, so set_node_id() is called afterwards
     :param node_labels: whether to add a node label (calls set_node_label())
     """
     children = self.outgoing
     single = children[0] if len(children) == 1 else None
     if self.text:
         # For word terminals (punctuation is already created by add_punct for the parent)
         if parent.node is not None:
             if self.node is None:
                 terminal_edge = parent.node.add(EdgeTags.Terminal, self.get_terminal(l0))
                 self.node = terminal_edge.child
             elif self.node not in parent.node.children:
                 # The node exists already, so only the missing edge is added
                 parent.node.add(EdgeTags.Terminal, self.node)
     elif single and single.child.text and layer0.is_punct(single.child.get_terminal(l0)):
         # Unary node over a punctuation terminal
         if Config().args.verify:
             assert tag == EdgeTags.Punctuation, \
                 "Punctuation parent %s's edge tag is %s" % (parent.node_id, tag)
             assert single.tag == EdgeTags.Terminal, \
                 "Punctuation %s's edge tag is %s" % (self.node_id, single.tag)
         if self.node is None:
             self.node = l1.add_punct(parent.node, single.child.get_terminal(l0))
             # The first edge of the punctuation node leads to the terminal
             single.child.node = self.node[0].child
         elif parent.node is not None and self.node not in parent.node.children:
             parent.node.add(EdgeTags.Punctuation, self.node)
     else:
         # The usual case
         assert self.node is None, "Trying to create the same node twice (multiple incoming primary edges): " + \
                                   ", ".join(map(str, self.incoming))
         if parent is not None and parent.label and parent.node is None:
             # The parent is an orphan and has a label: create it first, linked to the root
             parent.add_to_l1(l0, l1, None, Config().args.orphan_label, labeled, node_labels)
         # parent.node is read after the call above, which may have created it
         attach_to = parent.node if parent is not None else None
         self.node = l1.add_fnode(attach_to, tag, implicit=self.implicit)
     if labeled:
         # In training
         self.set_node_id()
     if node_labels:
         self.set_node_label()
Exemplo n.º 12
0
def attach_punct(l0, l1):
    """
    Attach every punctuation terminal that has no incoming edge.

    Each such terminal is added through l1.add_punct under the node returned
    by nearest_parent() for it.
    :param l0: Layer0 of the passage, whose nodes are scanned
    :param l1: Layer1 of the passage, which receives the punctuation nodes
    """
    for terminal in l0.all:
        if not layer0.is_punct(terminal):
            continue
        if terminal.incoming:
            # Already attached somewhere
            continue
        parent = nearest_parent(l0, terminal)
        l1.add_punct(parent, terminal)