def __extract_sentence_spans(in_sentence_ids):
    """Group token-level in-sentence ids into one Span per sentence.

    Ids restart (fail to increase) at each sentence boundary, so a new
    sentence begins wherever ``in_sentence_ids[i] <= in_sentence_ids[i-1]``.

    Args:
        in_sentence_ids (list(int)): For each token, its index within its
            sentence.

    Returns:
        list(Span): One document-level span per sentence.
    """
    boundaries = [0]
    for idx in range(1, len(in_sentence_ids)):
        if in_sentence_ids[idx] <= in_sentence_ids[idx - 1]:
            boundaries.append(idx)
    boundaries.append(len(in_sentence_ids))
    return [spans.Span(first, last - 1)
            for first, last in zip(boundaries, boundaries[1:])]
def get_relevant_parented_subtree(span, document):
    """ Get the parented fragment of the parse tree and the input span.

    Args:
        span (Span): A span in a document.
        document (CoNLLDocument): A document.

    Returns:
        nltk.ParentedTree: The parented fragment of the parse tree at the
        span in the document.
    """
    token_ids = document.in_sentence_ids[span.begin:span.end + 1]
    local_span = spans.Span(token_ids[0], token_ids[-1])

    sentence_id, _ = document.get_sentence_id_and_span(span)
    tree = nltk.ParentedTree.fromstring(str(document.parse[sentence_id]))

    position = tree.treeposition_spanning_leaves(local_span.begin,
                                                 local_span.end + 1)
    fragment = tree[position]

    # A bare leaf (token string) is not a subtree: back off two levels to
    # the smallest phrase node containing it.
    if fragment in tree.leaves():
        fragment = tree[position[:-2]]

    return fragment
def get_subtree_span(self, subtree_root, span, visited, without_serv=True):
    """Unite ``span`` with the token spans of the subtree at ``subtree_root``.

    Recursively walks ``self.children``; ``without_serv`` skips children
    whose "SP" property is a service POS, but only at the first level
    (recursive calls pass ``without_serv=False``). ``visited`` guards
    against cycles.
    """
    no_children = (subtree_root not in self.children
                   or len(self.children[subtree_root]) == 0)
    if no_children or visited[subtree_root]:
        # Leaf (or already seen): just add this single token's span.
        return span.unite(spans.Span(subtree_root, subtree_root))

    visited[subtree_root] = True
    for child in self.children[subtree_root]:
        is_service = (without_serv and
                      self.tokens_properties[child]["SP"] in self.SERVICE_POS)
        if not is_service:
            child_span = self.get_subtree_span(child, span, visited,
                                               without_serv=False)
            span = span.unite(child_span)
    return span
def __init__(self, identifier, sentences, coref):
    """ Construct a document from sentence and coreference information.

    Args:
        identifier (str): A unique identifier for the document.
        sentences (list): A list of sentence information. Each
            ``sentences[i]`` is a 6-tuple
            ``(tokens, pos, ner, speakers, parse, dep)``, where

            * tokens (list(str)): All tokens in the sentence.
            * pos (list(str)): All part-of-speech tags in the sentence.
            * ner (list(str)): All named entity tags in the sentence
              ("NONE" when a token has no tag).
            * speakers (list(str)): All speaker ids in the sentence.
            * parse (str): A string representation of the sentence's
              parse tree (readable by nltk).
            * dep (list(StanfordDependencies.CoNLL.Token)): All
              dependencies in the sentence, as tokens with label
              information and pointers to heads.
        coref (dict(span, int)): A mapping of mention spans to their
            coreference set id.
    """
    self.identifier = identifier
    self.coref = coref

    self.in_sentence_ids = []
    self.sentence_spans = []
    self.tokens = []
    self.pos = []
    self.ner = []
    self.speakers = []
    self.parse = []
    self.dep = []

    for tokens, pos, ner, speakers, parse, dep in sentences:
        first = len(self.tokens)
        self.sentence_spans.append(
            spans.Span(first, first + len(tokens) - 1))
        self.in_sentence_ids.extend(range(len(tokens)))
        self.tokens.extend(tokens)
        self.pos.extend(pos)
        self.ner.extend(ner)
        self.speakers.extend(speakers)
        self.parse.append(nltk.Tree.fromstring(parse))
        self.dep.append(dep)

    self.annotated_mentions = self.__get_annotated_mentions()
    self.system_mentions = []
def get_subtree_span(self, subtree_root, span, visited, without_serv=True):
    # Set the last flag if we want to drop service parts of speech at the
    # first level of the subtree; recursive calls pass without_serv=False,
    # so filtering applies only to the root's direct children.
    # Base case: token has no children (leaf) or was already visited —
    # unite the accumulated span with this single token's span.
    if subtree_root not in self.children or len(self.children[subtree_root]) == 0 or visited[subtree_root]:
        sp1 = spans.Span(subtree_root, subtree_root)
        span = span.unite(sp1)
        return span
    visited[subtree_root] = True
    for child in self.children[subtree_root]:
        # Skip first-level children whose syntactic paradigm marks them
        # as a service part of speech (when without_serv is enabled).
        if not (without_serv and self.tokens_properties[child]["SyntParadigm"] in self.SERVICE_POS):
            sp2 = self.get_subtree_span(child, span, visited, without_serv=False)
            span = span.unite(sp2)
    return span
def __get_span_to_id(column):
    """Parse a CoNLL coreference column into a span-to-set-id mapping.

    Handles single-token mentions "(id)", opening brackets "(id" and
    closing brackets "id)"; nested/crossing mentions of the same set id
    are matched via a per-id stack of open positions.

    Args:
        column (list(str)): The coreference column, one entry per token.

    Returns:
        dict(Span, int): Mention spans mapped to coreference set ids.
    """
    span_to_id = {}
    open_positions = defaultdict(list)

    for position, entry in enumerate(column):
        if entry == "-":
            continue
        for annotation in entry.split("|"):
            opens = annotation.startswith("(")
            closes = annotation.endswith(")")
            if opens and closes:
                span_to_id[spans.Span(position, position)] = \
                    int(annotation[1:-1])
            elif opens:
                open_positions[annotation[1:]].append(position)
            elif closes:
                set_id = annotation[:-1]
                begin = open_positions[set_id].pop()
                span_to_id[spans.Span(begin, position)] = int(set_id)

    return span_to_id
def __extract_system_mention_spans(document):
    """Extract candidate mention spans for every sentence in a document.

    Delegates per-sentence extraction to
    ``__extract_mention_spans_for_sentence`` and shifts the resulting
    in-sentence spans to document-level offsets.

    Args:
        document (CoNLLDocument): A document.

    Returns:
        list(Span): Sorted document-level mention spans.
    """
    doc_spans = []
    for tree, sentence_span in zip(document.parse, document.sentence_spans):
        ner_slice = document.ner[sentence_span.begin:sentence_span.end + 1]
        for local in __extract_mention_spans_for_sentence(tree, ner_slice):
            doc_spans.append(
                spans.Span(sentence_span.begin + local.begin,
                           sentence_span.begin + local.end))
    return sorted(doc_spans)
def __get_in_tree_span(parented_tree):
    """Compute the leaf-index span of a subtree within its root tree.

    Walks from the subtree up to the root, adding the leaf counts of all
    left siblings at each level to obtain the start offset.

    Args:
        parented_tree (nltk.ParentedTree): A subtree with parent pointers.

    Returns:
        Span: The subtree's leaf span relative to the whole tree.
    """
    offset = 0
    node = parented_tree
    while node.parent() is not None:
        parent = node.parent()
        for sibling in parent:
            if sibling == node:
                break
            offset += len(sibling.leaves())
        node = parent
    return spans.Span(offset, offset + len(parented_tree.leaves()) - 1)
def __get_span_from_ner(pos, ner):
    """Derive mention spans from named-entity tags.

    A maximal run of identical non-"NONE" tags forms one span; a
    possessive marker ("POS" tag) immediately after the run is absorbed
    into the span.

    Args:
        pos (list(str)): Part-of-speech tags.
        ner (list(str)): Named-entity tags, "NONE" for untagged tokens.

    Returns:
        list(Span): Sorted spans of named-entity mentions.
    """
    found = []
    index = 0
    total = len(ner)
    while index < total:
        if ner[index] == "NONE":
            index += 1
            continue
        begin = index
        # Absorb the run of identical consecutive tags.
        while (index + 1 < total and ner[index + 1] != "NONE"
               and ner[index] == ner[index + 1]):
            index += 1
        # Absorb a trailing possessive 's.
        if index + 1 < len(pos) and pos[index + 1] == "POS":
            index += 1
        found.append(spans.Span(begin, index))
        index += 1
    return sorted(found)
def adjust_head_for_nam(tokens, pos, ner_type, in_mention_span_old_head,
                        old_head):
    """ Adjust head for proper names via heuristics.

    Based on heuristics depending on the named entity type (person,
    organization, ...) and part-of-speech tags, adjust the head of a named
    entity mention to a meaningful extent useful for coreference
    resolution. For example, for the mention "Khan Younes in Southern Gaza
    Strip", this function will compute "Khan Younes" as the head.

    Args:
        tokens (list(str)): The tokens of the mention.
        pos (list(str)): The part-of-speech tags of the mention.
        ner_type (str): The named entity type of the mention. Should be
            one of PERSON, ORG, GPE, FAC, NORP, PRODUCT, EVENT, MONEY,
            WORK_OF_ART, LOC, LAW, LANGUAGE, DATE, TIME, ORDINAL,
            CARDINAL, QUANTITY, PERCENT or NONE.
        in_mention_span_old_head (spans.Span): The in-mention span of the
            old head.
        old_head (list(str)): The tokens of the old head.

    Returns:
        (Span, list(str)): The in-mention span of the adjusted head and
        the tokens of the adjusted head.
    """
    # TODO: get rid of this ugly hack
    if len(pos) == 0:
        return spans.Span(0, 0), "NOHEAD"

    # Patterns are raw strings: "\." in a plain string literal is an
    # invalid escape sequence (SyntaxWarning on Python 3.12+).
    # Default stop pattern; overridden for specific NER classes below.
    stop_regex = re.compile(r"CC|,|\.|:|;|V.*|IN|W.*|ADVP|NN$")

    if re.match("ORG.*|GPE.*|FAC.*|NORP.*|PRODUCT|EVENT|MONEY|" +
                "WORK_OF_ART|LOC.*|LAW|LANGUAGE", ner_type):
        start_regex = re.compile(r"NN(S)?|NNP(S)?")
        stop_regex = re.compile(r"V.*|IN|W.*|ADVP|,|-LRB-")
    elif ner_type == "PERSON":
        start_regex = re.compile(r"NN(S)?|NNP(S)?")
        stop_regex = re.compile(r"IN|CC|,|\.|:|;|V.*|W.*|-LRB-")
    elif re.match("DATE|TIME", ner_type):
        start_regex = re.compile(r"NN(S)?|NNP(S)?|CD")
    elif re.match("ORDINAL", ner_type):
        start_regex = re.compile(r"NN|JJ|RB")
    elif re.match("CARDINAL", ner_type):
        start_regex = re.compile(r"CD")
    elif re.match("QUANTITY|PERCENT", ner_type):
        start_regex = re.compile(r"CD|JJ|NN")
    elif ner_type == "NONE":
        start_regex = re.compile(r"NN(S)?|NNP(S)?|CD")
    else:
        # Unknown NER class: leave the head unchanged.
        logger.warning("No head adjustment rule defined for NER class " +
                       ner_type + ".")
        return in_mention_span_old_head, old_head

    # Scan left to right: the head starts at the first tag matching
    # start_regex and ends just before the first subsequent stop tag.
    head_start = -1
    position = 0
    for i in range(0, len(tokens)):
        position = i
        if head_start == -1 and start_regex.match(pos[i]):
            head_start = i
        elif head_start >= 0 and stop_regex.match(pos[i]):
            return spans.Span(head_start, i - 1), tokens[head_start:i]

    if head_start == -1:
        head_start = 0

    # Drop a trailing possessive marker ("'s") from the head.
    if pos[position] == "POS" and position == len(pos) - 1:
        position -= 1

    return spans.Span(head_start, position), tokens[head_start:position + 1]
def compute_head_information(attributes):
    """ Compute the head of the mention.

    Args:
        attributes (dict(str, object)): Attributes of the mention, must
            contain values for "tokens", "parse_tree", "pos", "ner",
            "is_apposition".

    Returns:
        (list(str), Span, int): The head, the head span (in the document)
        and the starting index of the head (in the mention).
    """
    mention_subtree = attributes["parse_tree"]
    head_finder = head_finders.HeadFinder()
    # Fallback: first token of the mention counts as the head.
    head_index = 0
    head = [attributes["tokens"][0]]
    # Only trust the parse-based head when the subtree's leaves line up
    # one-to-one with the mention's tokens.
    if len(mention_subtree.leaves()) == len(attributes["tokens"]):
        head_tree = head_finder.get_head(mention_subtree)
        head_index = get_head_index(head_tree, mention_subtree.pos())
        head = [head_tree[0]]
    in_mention_span = spans.Span(head_index, head_index)
    # For proper-name heads, widen the head via NER-based heuristics.
    if attributes["pos"][head_index].startswith("NNP"):
        in_mention_span, head = \
            head_finders.HeadFinder.adjust_head_for_nam(
                attributes["tokens"],
                attributes["pos"],
                attributes["ner"][head_index],
                in_mention_span,
                head)
        # proper name mention: head index last word of head
        # (e.g. "Obama" in "Barack Obama")
        head_index = in_mention_span.end
    # special handling for appositions
    if attributes["is_apposition"]:
        # "Secretary of State Madeleine Albright"
        # => take "Madeleine Albright" as head
        if len(mention_subtree) == 2:
            head_tree = mention_subtree[1]
            head = head_tree.leaves()
            in_mention_span = spans.Span(len(mention_subtree[0].leaves()),
                                         len(attributes["tokens"]) - 1)
            head_index = in_mention_span.end
        else:
            # More than two children: take the first NNP-headed child and
            # re-run the proper-name head adjustment on its token range.
            start = 0
            for child in mention_subtree:
                if __head_pos_starts_with(child, "NNP"):
                    end = min([start + len(child.leaves()),
                               len(attributes["tokens"])])
                    head_index = end - 1
                    in_mention_span, head = \
                        head_finders.HeadFinder.adjust_head_for_nam(
                            attributes["tokens"][start:end],
                            attributes["pos"][start:end],
                            attributes["ner"][head_index],
                            in_mention_span,
                            head)
                    break
                start += len(child.leaves())
    return head, in_mention_span, head_index
def from_document(span, document, first_in_gold_entity=False):
    """ Create a mention from a span in a document.

    All attributes of the mention are computed from the linguistic
    information found in the document. For information about the
    attributes, see the class documentation.

    Args:
        document (CoNLLDocument): The document the mention belongs to.
        span (Span): The span of the mention in the document.
        first_in_gold_entity (bool): Whether this mention is the first
            in its gold coreference entity.

    Returns:
        Mention: A mention extracted from the input span in the input
        document.
    """
    i, sentence_span = document.get_sentence_id_and_span(span)
    subtree = mention_property_computer.get_relevant_parented_subtree(span, document)
    # Basic surface attributes taken directly from the document.
    attributes = {
        "tokens": document.tokens[span.begin:span.end + 1],
        "pos": document.pos[span.begin:span.end + 1],
        "ner": document.ner[span.begin:span.end + 1],
        "sentence_id": i,
        # Re-parse the subtree strings to obtain unparented nltk.Tree
        # copies detached from the sentence tree.
        "parse_tree": nltk.Tree.fromstring(str(subtree)),
        "parent_parse_tree": nltk.Tree.fromstring(str(subtree.parent())),
        "speaker": document.speakers[span.begin],
        "antecedent": None,
        "set_id": None,
        "first_in_gold_entity": first_in_gold_entity
    }
    # Context tokens, only when they exist in the document.
    if span.begin > 0:
        attributes["preceding_token"] = document.tokens[span.begin - 1]
    if span.end + 1 < len(document.tokens):
        attributes["next_token"] = document.tokens[span.end + 1]
    if span.end + 3 < len(document.tokens):
        attributes["next_three_tokens"] = document.tokens[span.end:span.end + 3]
    # If the document is a SemanticCoNLLDocument, copy over the Compreno
    # semantic annotations (detected by attribute presence).
    if hasattr(document, 'compreno_sem_class'):
        attributes["compreno_sem_class"] = document.compreno_sem_class[span.begin:span.end + 1]
        # Neighbor semantic classes only within the same sentence
        # (in_sentence_ids > 0 means the neighbor is not sentence-initial).
        if document.in_sentence_ids[span.begin] > 0:
            attributes["compreno_left_neighbor_sem_class"] = document.compreno_sem_class[span.begin - 1]
        if span.end + 1 < len(document.tokens) and document.in_sentence_ids[span.end + 1] > 0:
            attributes["compreno_right_neighbor_sem_class"] = document.compreno_sem_class[span.end + 1]
    if hasattr(document, 'compreno_surf_slot'):
        attributes["compreno_surf_slot"] = document.compreno_surf_slot[span.begin:span.end + 1]
    if hasattr(document,
               'compreno_sem_path'):
        attributes["compreno_sem_path"] = document.compreno_sem_path[span.begin:span.end + 1]
    # Gold coreference set id, when annotated.
    if span in document.coref:
        attributes["annotated_set_id"] = document.coref[span]
    else:
        attributes["annotated_set_id"] = None
    # Derived linguistic attributes; order matters — e.g. head
    # computation reads "is_apposition", type/number/gender read the head.
    attributes["is_apposition"] = \
        mention_property_computer.is_apposition(attributes)
    attributes["grammatical_function"] = \
        mention_property_computer.get_grammatical_function(attributes)
    (head, in_mention_span, head_index) = \
        mention_property_computer.compute_head_information(attributes)
    attributes["head"] = head
    # Shift the in-mention head span to document-level offsets.
    attributes["head_span"] = spans.Span(
        span.begin + in_mention_span.begin,
        span.begin + in_mention_span.end
    )
    attributes["head_index"] = head_index
    attributes["type"] = mention_property_computer.get_type(attributes)
    attributes["fine_type"] = mention_property_computer.get_fine_type(
        attributes)
    # Citation form only makes sense for pronouns.
    if attributes["type"] == "PRO":
        attributes["citation_form"] = \
            mention_property_computer.get_citation_form(
                attributes)
    attributes["number"] = \
        mention_property_computer.compute_number(attributes)
    attributes["gender"] = \
        mention_property_computer.compute_gender(attributes)
    attributes["semantic_class"] = \
        mention_property_computer.compute_semantic_class(attributes)
    attributes["head_as_lowercase_string"] = " ".join(attributes[
        "head"]).lower()
    attributes["tokens_as_lowercase_string"] = " ".join(attributes[
        "tokens"]).lower()
    # Dependency features: locate the head token in the sentence's
    # dependency tree (dep tree indices are sentence-relative).
    dep_tree = document.dep[i]
    index = span.begin + head_index - sentence_span.begin
    # CoNLL head pointers are 1-based; 0 (-> -1 here) marks the root.
    governor_id = dep_tree[index].head - 1
    if governor_id == -1:
        attributes["governor"] = "NONE"
    else:
        attributes["governor"] = dep_tree[governor_id].form.lower()
    attributes["ancestry"] = Mention._get_ancestry(dep_tree, index)
    attributes["deprel"] = dep_tree[index].deprel
    return Mention(document, span, attributes)