def break2sentences(passage, lang="en", *args, **kwargs):
    """
    Split a passage into sentences and report where each sentence ends.

    If any Layer1 node has outgoing edges the passage is treated as labeled:
    a terminal whose text is in SENTENCE_END_MARKS is a sentence end when it
    closes a top-level scene, or when the terminal just before it closes one
    and the mark itself does not open a scene. Otherwise the terminal texts
    are handed to the pipeline returned by get_nlp(lang=lang) and the end
    offset of each detected sentence is used.
    Paragraph ends from break2paragraphs() are always merged in, and
    boundaries that would start a punctuation-only sentence are dropped.
    :param passage: the Passage object to operate on
    :param lang: optional two-letter language code
    :param args: ignored
    :param kwargs: ignored
    :return: a sorted list of positions in the Passage, each denoting the
             closing Terminal of a sentence (empty if there are no terminals)
    """
    del args, kwargs  # accepted but deliberately unused
    foundational = passage.layer(layer1.LAYER_ID)
    terminals = extract_terminals(passage)
    if not terminals:
        return []

    if any(node.outgoing for node in foundational.all):  # Passage is labeled
        scene_ends = [scene.end_position for scene in foundational.top_scenes]
        scene_starts = [scene.start_position for scene in foundational.top_scenes]
        marks = []
        for terminal in terminals:
            if terminal.text not in SENTENCE_END_MARKS:
                continue
            pos = terminal.position
            # The annotation does not always put the closing mark inside the
            # scene it ends, so also accept a mark that directly follows a
            # scene end as long as it does not itself start a scene.
            if pos in scene_ends or ((pos - 1) in scene_ends and pos not in scene_starts):
                marks.append(pos)
    else:  # Not labeled, split using spaCy
        pipeline = get_nlp(lang=lang)
        parsed = pipeline([terminal.text for terminal in terminals])
        marks = [sentence.end for sentence in parsed.sents]

    boundaries = sorted(set(marks + break2paragraphs(passage)))
    # Avoid punctuation-only sentences: drop a boundary when everything
    # between it and the next boundary is punctuation.
    if len(boundaries) > 1:
        kept = []
        for current, following in zip(boundaries[:-1], boundaries[1:]):
            if any(not layer0.is_punct(t) for t in terminals[current:following]):
                kept.append(current)
        kept.append(boundaries[-1])
        boundaries = kept
    return boundaries
def add_to_l1(self, l1, parent, tag, terminals, train):
    """
    Create the core.Node for this node while building the final Passage.

    Three cases are handled: a node carrying text is attached under the
    parent's node as a Terminal child; a node whose single outgoing edge leads
    to a punctuation terminal is created through l1.add_punct(); anything else
    becomes a foundational node via l1.add_fnode().
    :param l1: Layer1 of the passage
    :param parent: parent node (its ``node`` attribute is the already-created core node, if any)
    :param tag: edge tag to link to parent
    :param terminals: all terminals of the passage, indexed by node index
    :param train: in training, so keep original node IDs in the "remarks" field
    """
    if Config().args.verify:
        assert self.node is None or self.text is not None,\
            "Trying to create the same node twice: %s, parent: %s" % (self.node.ID, parent)

    sole_edge = None
    if len(self.outgoing) == 1:
        sole_edge = self.outgoing[0]

    if self.text:
        # Word terminal (punctuation terminals are created by add_punct for their parent)
        if self.node is None and parent.node is not None:
            self.node = parent.node.add(EdgeTags.Terminal, terminals[self.index]).child
    else:
        if sole_edge and sole_edge.child.text and layer0.is_punct(terminals[sole_edge.child.index]):
            # Pre-terminal of a punctuation mark
            if Config().args.verify:
                assert tag == EdgeTags.Punctuation, "Tag for %s is %s" % (parent.node_id, tag)
                assert sole_edge.tag == EdgeTags.Terminal, "Tag for %s is %s" % (self.node_id, sole_edge.tag)
            self.node = l1.add_punct(parent.node, terminals[sole_edge.child.index])
            # The terminal child is reached through the first element of the created node
            sole_edge.child.node = self.node[0].child
        else:
            # The usual case: a plain foundational node
            self.node = l1.add_fnode(parent.node, tag, implicit=self.implicit)

    if train and self.node is not None and self.node_id is not None:
        # Keep original node ID for reference
        self.node.extra["remarks"] = self.node_id
def add_to_l1(self, l1, parent, tag, terminals, train):
    """
    Add this node to Layer1 as a new core.Node when the final Passage is built.

    :param l1: Layer1 of the passage
    :param parent: parent node; ``parent.node`` is the core node to attach under
    :param tag: edge tag to link to parent
    :param terminals: all terminals of the passage, looked up by node index
    :param train: in training, so the original node ID is stored under "remarks"
    """
    if Config().args.verify:
        assert self.node is None or self.text is not None,\
            "Trying to create the same node twice: %s, parent: %s" % (self.node.ID, parent)

    # Only a node with exactly one outgoing edge can be a punctuation pre-terminal
    out_edge = self.outgoing[0] if len(self.outgoing) == 1 else None

    if self.text:
        # Word terminals; punctuation is already created by add_punct for the parent
        needs_creation = self.node is None and parent.node is not None
        if needs_creation:
            new_edge = parent.node.add(EdgeTags.Terminal, terminals[self.index])
            self.node = new_edge.child
    elif out_edge and out_edge.child.text and layer0.is_punct(terminals[out_edge.child.index]):
        if Config().args.verify:
            assert tag == EdgeTags.Punctuation, "Tag for %s is %s" % (parent.node_id, tag)
            assert out_edge.tag == EdgeTags.Terminal, "Tag for %s is %s" % (self.node_id, out_edge.tag)
        punct_node = l1.add_punct(parent.node, terminals[out_edge.child.index])
        self.node = punct_node
        out_edge.child.node = self.node[0].child
    else:
        # The usual case
        self.node = l1.add_fnode(parent.node, tag, implicit=self.implicit)

    keep_remark = train and self.node is not None and self.node_id is not None
    if keep_remark:  # In training: keep original node ID for reference
        self.node.extra["remarks"] = self.node_id
def break2sentences(passage):
    """
    Breaks paragraphs into sentences according to the annotation.

    A sentence is a list of terminals which ends with a mark from
    SENTENCE_END_MARKS, and is also the end of a paragraph or parallel scene.
    When no Layer1 node has outgoing edges (unlabeled passage), the terminal
    texts are passed to the module-level ``nlp`` pipeline instead and the end
    offset of each sentence it reports is used.
    :param passage: the Passage object to operate on
    :return: a sorted list of positions in the Passage, each denotes a closing
             Terminal of a sentence; an empty list if the passage has no terminals
    """
    l1 = passage.layer(layer1.LAYER_ID)
    terminals = extract_terminals(passage)
    if not terminals:
        # Nothing to split; avoid feeding an empty passage to the pipeline or
        # to break2paragraphs.
        return []
    if any(n.outgoing for n in l1.all):  # Passage is labeled
        # Sets give constant-time membership tests in the filter below
        ps_ends = {ps.end_position for ps in l1.top_scenes}
        ps_starts = {ps.start_position for ps in l1.top_scenes}
        marks = [t.position for t in terminals if t.text in SENTENCE_END_MARKS]
        # Annotations doesn't always include the ending period (or other mark)
        # with the parallel scene it closes. Hence, if the terminal before the
        # mark closed the parallel scene, and this mark doesn't open a scene
        # in any way (hence it probably just "hangs" there), it's a sentence end
        marks = [x for x in marks
                 if x in ps_ends or ((x - 1) in ps_ends and x not in ps_starts)]
    else:  # Not labeled, split using spaCy
        annotated = nlp([t.text for t in terminals])
        marks = [span.end for span in annotated.sents]
    marks = sorted(set(marks + break2paragraphs(passage)))
    # Avoid punctuation-only sentences: drop a boundary when every terminal
    # between it and the next boundary is punctuation
    if len(marks) > 1:
        marks = [x for x, y in zip(marks[:-1], marks[1:])
                 if not all(layer0.is_punct(t) for t in terminals[x:y])] + [marks[-1]]
    return marks
def add(self, edge_tag, node, *, edge_attrib=None):
    """
    Attach a Layer0 terminal as a child of this punctuation node.

    :param edge_tag: tag of the edge to create
    :param node: the child node; must belong to Layer0
    :param edge_attrib: optional attributes for the new edge, forwarded to the
                        parent class's add()
    :return: whatever the parent class's add() returns
    :raise ValueError: if node does not belong to Layer0
    """
    if node.layer.ID != layer0.LAYER_ID:
        raise ValueError("Non-terminal child (%s) for %s node (%s)" %
                         (node.ID, NodeTags.Punctuation, self.ID))
    if not layer0.is_punct(node):
        # A terminal not recognised as punctuation is retagged rather than rejected
        node.tag = layer0.NodeTags.Punct
    # Forward the caller's edge attributes instead of discarding them
    return super().add(edge_tag, node, edge_attrib=edge_attrib)
def _label(dep_edge, top=False):
    """
    Pick the edge tag for the dependent of a dependency edge.

    The checks are tried in order: punctuation terminal, parallel scene
    (forced by ``top`` or indicated by one of the dependent's relations),
    process (the dependent has a Participant or Adverbial relation), and
    finally center as the fallback.
    :param dep_edge: dependency edge whose dependent is being labeled
    :param top: if true, label as ParallelScene unless the dependent is punctuation
    :return: one of the EdgeTags values
    """
    child = dep_edge.dependent
    # Relations of the items yielded by iterating the dependent
    child_rels = {edge.stripped_rel for edge in child}
    terminal = child.terminal
    if terminal and layer0.is_punct(terminal):
        return EdgeTags.Punctuation
    if top or EdgeTags.ParallelScene in child_rels:
        return EdgeTags.ParallelScene
    if child_rels & {EdgeTags.Participant, EdgeTags.Adverbial}:
        return EdgeTags.Process  # May be State but we can't tell
    return EdgeTags.Center
def _build_layer0(preterminals, l1, l0):
    """
    Add edges to terminals according to alignments.

    :param preterminals: mapping from an index into ``l0.all`` to the sequence
                         of parent nodes aligned with that terminal
    :param l1: Layer1 object; its add_punct() is used for punctuation terminals
    :param l0: Layer0 object whose ``all`` sequence holds the terminals
    """
    for i, parents in preterminals.items():
        terminal = l0.all[i]
        if layer0.is_punct(terminal):
            tag = layer1.EdgeTags.Punctuation
            # From here on, `terminal` is the punctuation node created under the first parent
            terminal = l1.add_punct(parents[0], terminal)
            terminal.attrib[LABEL_ATTRIB] = layer1.NodeTags.Punctuation
            # Avoid multiple punctuation parents, which is mostly due to
            # alignment errors: only the first parent is kept. A slice is used
            # so the caller's list is left untouched.
            parents = parents[:1]
        else:
            tag = layer1.EdgeTags.Terminal
        for parent in parents:
            # avoid multiple identical edges (e.g. :polarity~e.68 -~e.68)
            if parent not in terminal.parents:
                parent.add(tag, terminal)
def _build_layer0(preterminals, l1, l0):
    """
    Connect each aligned terminal to its parent nodes.

    Punctuation terminals are wrapped through l1.add_punct() under their first
    parent, labeled with PUNCTUATION_LABEL, and their parent list is truncated
    in place to that first parent. Other terminals are linked to every listed
    parent with TERMINAL_DEP.
    :param preterminals: mapping from an index into ``l0.all`` to the list of
                         parents aligned with that terminal (lists of
                         punctuation terminals are truncated in place)
    :param l1: Layer1 object providing add_punct()
    :param l0: Layer0 object whose ``all`` sequence holds the terminals
    """
    for index, aligned in preterminals.items():
        child = l0.all[index]
        if not layer0.is_punct(child):
            edge_tag = TERMINAL_DEP
        else:
            edge_tag = PUNCTUATION_DEP
            child = l1.add_punct(aligned[0], child)
            child.attrib[LABEL_ATTRIB] = PUNCTUATION_LABEL
            # avoid multiple punctuation parents, which is mostly due to alignment errors
            del aligned[1:]
        for parent in aligned:
            if parent in child.parents:
                # avoid multiple identical edges (e.g. :polarity~e.68 -~e.68)
                continue
            parent.add(edge_tag, child)
def add_to_l1(self, l0, l1, parent, tag, labeled, node_labels):
    """
    Create (or re-link) the core.Node for this node while building the final Passage.

    A node with text is attached to the parent's node as a Terminal child; a
    node whose single outgoing edge leads to a punctuation terminal is created
    with l1.add_punct(); any other node becomes a foundational node through
    l1.add_fnode(). In the first two cases an already-created node is linked
    to the parent's node if it is not yet among its children.
    :param l0: Layer0 of the passage
    :param l1: Layer1 of the passage
    :param parent: parent node (may be None in the usual case, meaning attach with no parent)
    :param tag: edge tag to link to parent
    :param labeled: there is a reference passage, so set_node_id() is called afterwards
    :param node_labels: whether to add a node label
    """
    sole_edge = self.outgoing[0] if len(self.outgoing) == 1 else None

    if self.text:
        # Word terminals (punctuation is already created by add_punct for the parent)
        host = parent.node
        if host is not None:
            if self.node is None:
                self.node = host.add(EdgeTags.Terminal, self.get_terminal(l0)).child
            elif self.node not in host.children:
                host.add(EdgeTags.Terminal, self.node)
    elif sole_edge and sole_edge.child.text and layer0.is_punct(sole_edge.child.get_terminal(l0)):
        if Config().args.verify:
            assert tag == EdgeTags.Punctuation, "Punctuation parent %s's edge tag is %s" % (
                parent.node_id, tag)
            assert sole_edge.tag == EdgeTags.Terminal, "Punctuation %s's edge tag is %s" % (
                self.node_id, sole_edge.tag)
        if self.node is None:
            self.node = l1.add_punct(parent.node, sole_edge.child.get_terminal(l0))
            sole_edge.child.node = self.node[0].child
        elif parent.node is not None and self.node not in parent.node.children:
            parent.node.add(EdgeTags.Punctuation, self.node)
    else:
        # The usual case
        assert self.node is None, "Trying to create the same node twice (multiple incoming primary edges): " + \
            ", ".join(map(str, self.incoming))
        if parent is not None and parent.label and parent.node is None:
            # The parent is an orphan that has a label: link it to the root first
            parent.add_to_l1(l0, l1, None, Config().args.orphan_label, labeled, node_labels)
        # parent.node is read only now, after the recursive call above
        attach_to = None if parent is None else parent.node
        self.node = l1.add_fnode(attach_to, tag, implicit=self.implicit)

    if labeled:  # In training
        self.set_node_id()
    if node_labels:
        self.set_node_label()
def break2sentences(passage, lang="en", *args, **kwargs):
    """
    Split a passage into sentences and report where each sentence ends.

    For a labeled passage (some Layer1 node has outgoing edges) a terminal is
    a sentence end when its text is in SENTENCE_END_MARKS and it closes a
    top-level scene, or directly follows a scene end without opening a scene.
    A punctuation terminal right after an already chosen end is chosen too,
    unless it is a quote with the same text as the terminal found at list
    index ``marks[-1] - 1``.
    For an unlabeled passage the pipeline from get_nlp(lang=lang) decides.
    Paragraph ends from break2paragraphs() are merged in either way.
    :param passage: the Passage object to operate on
    :param lang: optional two-letter language code
    :param args: ignored
    :param kwargs: ignored
    :return: a sorted list of positions in the Passage, each denotes a closing
             Terminal of a sentence (empty if there are no terminals)
    """
    del args, kwargs  # accepted but deliberately unused
    l1 = passage.layer(layer1.LAYER_ID)
    terminals = extract_terminals(passage)
    if not terminals:
        return []

    if any(node.outgoing for node in l1.all):  # Passage is labeled
        scene_ends = [scene.end_position for scene in l1.top_scenes]
        scene_starts = [scene.start_position for scene in l1.top_scenes]
        marks = []
        for terminal in terminals:
            pos = terminal.position
            # Annotations don't always include the ending period (or other
            # mark) with the parallel scene it closes. Hence, if the terminal
            # before the mark closed the parallel scene, and this mark doesn't
            # open a scene in any way (so it probably just "hangs" there),
            # it's a sentence end.
            ends_scene = terminal.text in SENTENCE_END_MARKS and (
                pos in scene_ends or ((pos - 1) in scene_ends and pos not in scene_starts))
            if not ends_scene:
                # Otherwise only punctuation directly after a chosen end qualifies
                if (pos - 1) not in marks or not layer0.is_punct(terminal):
                    continue
                # ... and not when it is a quote equal to the compared terminal's text
                if terminal.text in QUOTES and terminal.text == terminals[marks[-1] - 1].text:
                    continue
            marks.append(pos)
    else:  # Not labeled, split using spaCy
        pipeline = get_nlp(lang=lang)
        parsed = pipeline([terminal.text for terminal in terminals])
        marks = [sentence.end for sentence in parsed.sents]

    marks = sorted(set(marks + break2paragraphs(passage)))
    # Avoid punctuation-only sentences by picking the last punctuation symbol
    # in each consecutive sequence: a boundary is dropped when the terminals
    # in terminals[boundary - 1:next_boundary - 1] are all punctuation.
    if len(marks) > 1:
        final = marks[-1]
        marks = [current for current, following in zip(marks, marks[1:])
                 if not all(layer0.is_punct(t) for t in terminals[current - 1:following - 1])]
        marks.append(final)
    return marks
def add_to_l1(self, l0, l1, parent, tag, labeled, node_labels):
    """
    Add this node to Layer1 as a core.Node when the final Passage is created.

    :param l0: Layer0 of the passage
    :param l1: Layer1 of the passage
    :param parent: parent node; None is accepted only in the usual (non-terminal) case
    :param tag: edge tag to link to parent
    :param labeled: there is a reference passage, so set_node_id() is called at the end
    :param node_labels: whether to add a node label
    """
    # Only a node with exactly one outgoing edge can be a punctuation pre-terminal
    edge = None
    if len(self.outgoing) == 1:
        edge = self.outgoing[0]

    if self.text:
        # Word terminals (punctuation is already created by add_punct for the parent)
        if parent.node is not None:
            if self.node is None:
                created = parent.node.add(EdgeTags.Terminal, self.get_terminal(l0))
                self.node = created.child
            elif self.node not in parent.node.children:
                # Node already exists: just add the missing link
                parent.node.add(EdgeTags.Terminal, self.node)
    else:
        wraps_punct = bool(edge and edge.child.text and layer0.is_punct(edge.child.get_terminal(l0)))
        if wraps_punct:
            if Config().args.verify:
                assert tag == EdgeTags.Punctuation, "Punctuation parent %s's edge tag is %s" % (parent.node_id, tag)
                assert edge.tag == EdgeTags.Terminal, "Punctuation %s's edge tag is %s" % (self.node_id, edge.tag)
            if self.node is None:
                self.node = l1.add_punct(parent.node, edge.child.get_terminal(l0))
                edge.child.node = self.node[0].child
            elif parent.node is not None and self.node not in parent.node.children:
                parent.node.add(EdgeTags.Punctuation, self.node)
        else:
            # The usual case
            assert self.node is None, "Trying to create the same node twice (multiple incoming primary edges): " + \
                ", ".join(map(str, self.incoming))
            if parent is None:
                attach_to = None
            else:
                if parent.label and parent.node is None:
                    # The parent is an orphan that has a label: link it to the root first
                    parent.add_to_l1(l0, l1, None, Config().args.orphan_label, labeled, node_labels)
                attach_to = parent.node
            self.node = l1.add_fnode(attach_to, tag, implicit=self.implicit)

    if labeled:  # In training
        self.set_node_id()
    if node_labels:
        self.set_node_label()
def attach_punct(l0, l1):
    """
    Attach every punctuation terminal that has no incoming edge.

    Each such terminal is passed to l1.add_punct() together with the parent
    returned by nearest_parent(l0, terminal).
    :param l0: Layer0 object whose ``all`` sequence holds the terminals
    :param l1: Layer1 object providing add_punct()
    """
    for terminal in l0.all:
        if not layer0.is_punct(terminal) or terminal.incoming:
            # Not punctuation, or already attached somewhere
            continue
        anchor = nearest_parent(l0, terminal)
        l1.add_punct(anchor, terminal)