def __extract_sentence_spans(in_sentence_ids):
    """Compute document-level sentence spans from per-token in-sentence ids.

    A sentence boundary is assumed wherever the in-sentence id does not
    increase compared to the previous token.

    Args:
        in_sentence_ids (list(int)): For each token, its index within its
            sentence.

    Returns:
        list(Span): One span per sentence, covering document-level token
        positions.
    """
    extracted = []
    begin = 0

    for position, current_id in enumerate(in_sentence_ids):
        if position == 0:
            continue
        if current_id <= in_sentence_ids[position - 1]:
            extracted.append(spans.Span(begin, position - 1))
            begin = position

    # Close the final sentence (for empty input this yields Span(0, -1),
    # matching the original behavior).
    extracted.append(spans.Span(begin, len(in_sentence_ids) - 1))

    return extracted
def get_relevant_parented_subtree(span, document):
    """ Get the parented fragment of the parse tree and the input span.

    Args:
        span (Span): A span in a document.
        document (CoNLLDocument): A document.

    Returns:
        nltk.ParentedTree: The parented fragment of the parse tree at the
        span in the document.
    """
    # Positions of the span's tokens relative to their sentence.
    in_sentence_ids = document.in_sentence_ids[span.begin:span.end + 1]
    in_sentence_span = spans.Span(in_sentence_ids[0], in_sentence_ids[-1])

    # Only the sentence id is needed; the sentence span was an unused local.
    sentence_id, _ = document.get_sentence_id_and_span(span)

    # Round-trip through a string to obtain a fresh ParentedTree copy of
    # the sentence's parse tree (document.parse stores plain nltk.Tree).
    sentence_tree = nltk.ParentedTree.fromstring(
        str(document.parse[sentence_id]))

    spanning_leaves = sentence_tree.treeposition_spanning_leaves(
        in_sentence_span.begin, in_sentence_span.end + 1)
    mention_subtree = sentence_tree[spanning_leaves]

    # If the tree position addresses a bare leaf (a token string), back up
    # two levels so a tree node is returned instead of a string.
    if mention_subtree in sentence_tree.leaves():
        mention_subtree = sentence_tree[spanning_leaves[:-2]]

    return mention_subtree
 def get_subtree_span(self, subtree_root, span, visited, without_serv=True):
     """Unite *span* with the token span covered by the dependency subtree
     rooted at *subtree_root*.

     Leaf nodes (no children, or already visited) contribute a single-token
     span. When *without_serv* is True, children whose "SP" property is a
     service part of speech are skipped on this level only; the recursion
     always passes without_serv=False.
     """
     no_children = (subtree_root not in self.children
                    or len(self.children[subtree_root]) == 0)
     if no_children or visited[subtree_root]:
         return span.unite(spans.Span(subtree_root, subtree_root))

     visited[subtree_root] = True

     for child in self.children[subtree_root]:
         is_service = (without_serv and
                       self.tokens_properties[child]["SP"] in
                       self.SERVICE_POS)
         if not is_service:
             child_span = self.get_subtree_span(child, span, visited,
                                                without_serv=False)
             span = span.unite(child_span)

     return span
    def __init__(self, identifier, sentences, coref):
        """ Construct a document from sentence and coreference information.

        Args:
            identifier (str): A unique identifier for the document.
            sentences(list): A list of sentence information. The ith item
                contains information about the ith sentence. Each
                ``sentences[i]`` is a 6-tuple
                ``tokens, pos, ner, speakers, parse, dep``, where

                * tokens (list(str)): All tokens in the sentence.
                * pos (list(str)): All part-of-speech tags in the sentence.
                * ner (list(str)): All named entity tags in the sentence (if a
                  token does not have a tag, the tag is set to NONE).
                * speakers (list(str)): All speaker ids in the sentence.
                * parse (str): A string representation of the sentence's parse
                  tree (should be readable by nltk)
                * dep (list(StanfordDependencies.CoNLL.Token): All dependencies
                  in the sentence represented as lists of tokens with label
                  information and pointers to heads.
            coref (dict(span, int)): A mapping of mention spans to their
                coreference set id.
        """
        self.identifier = identifier
        self.coref = coref

        # Flat, document-level token attributes plus per-sentence structures.
        self.in_sentence_ids = []
        self.sentence_spans = []
        self.tokens = []
        self.pos = []
        self.ner = []
        self.parse = []
        self.dep = []
        self.speakers = []

        for tokens, pos, ner, speakers, parse, dep in sentences:
            # Document-level position of this sentence's first token.
            first = len(self.tokens)

            self.in_sentence_ids.extend(range(len(tokens)))
            self.sentence_spans.append(
                spans.Span(first, first + len(tokens) - 1))

            self.tokens.extend(tokens)
            self.pos.extend(pos)
            self.ner.extend(ner)
            self.speakers.extend(speakers)
            self.parse.append(nltk.Tree.fromstring(parse))
            self.dep.append(dep)

        self.annotated_mentions = self.__get_annotated_mentions()
        self.system_mentions = []
 def get_subtree_span(self, subtree_root, span, visited, without_serv=True):
     """Unite *span* with the token span covered by the dependency subtree
     rooted at *subtree_root*.

     Leaf nodes (no children, or already visited) contribute a single-token
     span. The recursion always passes without_serv=False.
     """
     # Enable the without_serv flag to drop service parts of speech on the
     # first level of the subtree only.
     if subtree_root not in self.children or len(self.children[subtree_root]) == 0 or visited[subtree_root]:
         sp1 = spans.Span(subtree_root, subtree_root)
         span = span.unite(sp1)
         return span
     visited[subtree_root] = True
     for child in self.children[subtree_root]:
         # NOTE(review): a near-duplicate of this method elsewhere in the
         # file keys on "SP" instead of "SyntParadigm" -- confirm which
         # token-property schema this class uses.
         if not (without_serv and self.tokens_properties[child]["SyntParadigm"] in self.SERVICE_POS):
             sp2 = self.get_subtree_span(child, span, visited, without_serv=False)
             span = span.unite(sp2)
     return span
    def __get_span_to_id(column):
        """Parse a CoNLL coreference column into a span -> set id mapping.

        Recognizes single-token mentions "(id)", opening brackets "(id" and
        closing brackets "id)"; parallel annotations in one cell are
        separated by "|". Opening positions are kept on per-id stacks so
        nested mentions of the same set close in LIFO order.
        """
        mapping = {}
        open_positions = defaultdict(list)

        for position, entry in enumerate(column):
            if entry == "-":
                continue

            for annotation in entry.split("|"):
                opens = annotation.startswith("(")
                closes = annotation.endswith(")")

                if opens and closes:
                    mapping[spans.Span(position, position)] = \
                        int(annotation[1:-1])
                elif opens:
                    open_positions[annotation[1:]].append(position)
                elif closes:
                    set_id = annotation[:-1]
                    begin = open_positions[set_id].pop()
                    mapping[spans.Span(begin, position)] = int(set_id)

        return mapping
def __extract_system_mention_spans(document):
    """Collect candidate mention spans for every sentence of *document*.

    Per-sentence spans are produced by __extract_mention_spans_for_sentence
    and shifted by the sentence's document-level offset.

    Returns:
        list(Span): All candidate spans, sorted.
    """
    all_spans = []

    for sent_index, sent_span in enumerate(document.sentence_spans):
        tree = document.parse[sent_index]
        ner_slice = document.ner[sent_span.begin:sent_span.end + 1]

        for local_span in __extract_mention_spans_for_sentence(tree,
                                                               ner_slice):
            all_spans.append(
                spans.Span(sent_span.begin + local_span.begin,
                           sent_span.begin + local_span.end))

    return sorted(all_spans)
def __get_in_tree_span(parented_tree):
    """Compute the leaf span of *parented_tree* within its root tree.

    Walks up to the root, accumulating the leaf counts of all left
    siblings at each level to obtain the start offset.
    """
    begin = 0
    node = parented_tree

    while node.parent() is not None:
        parent = node.parent()
        for sibling in parent:
            if sibling == node:
                break
            begin += len(sibling.leaves())
        node = parent

    return spans.Span(begin, begin + len(parented_tree.leaves()) - 1)
def __get_span_from_ner(pos, ner):
    """Derive mention spans from named entity tags.

    A span is a maximal run of identical, non-"NONE" NER tags; a directly
    following possessive marker (POS tag "POS") is folded into the span.

    Returns:
        list(Span): The NER-derived spans, sorted.
    """
    found = []
    i = 0
    length = len(ner)

    while i < length:
        if ner[i] == "NONE":
            i += 1
            continue

        begin = i

        # Consume the run of identical non-NONE tags.
        while i + 1 < length and ner[i + 1] != "NONE" and ner[i] == ner[
                i + 1]:
            i += 1

        # Fold in a trailing possessive 's.
        if i + 1 < len(pos) and pos[i + 1] == "POS":
            i += 1

        found.append(spans.Span(begin, i))
        i += 1

    return sorted(found)
# example #10
    def adjust_head_for_nam(tokens, pos, ner_type, in_mention_span_old_head,
                            old_head):
        """
        Adjust head for proper names via heuristics.

        Based on heuristics depending on the named entity type (person,
        organization, ...) and part-of-speech tags, adjust the head of a
        named entity mention to a meaningful extent useful for coreference
        resolution.

        For example, for the mention "Khan Younes in Southern Gaza Strip",
        this function will compute "Khan Younes" as the head.

        Args:
            tokens (list(str)): The tokens of the mention.
            pos (list(str)): The part-of-speech tags of the mention.
            ner_type (str): The named entity type of the mention. Should be
                one of PERSON, ORG, GPE, FAC, NORP, PRODUCT, EVENT, MONEY,
                WORK_OF_ART, LOC, LAW, LANGUAGE, DATE, TIME, ORDINAL,
                CARDINAL, QUANTITY, PERCENT or NONE.
            in_mention_span_old_head (spans.Span): The in-mention span of the
                old head.
            old_head (list(str)): The tokens of the old head.

        Returns:
            (Span, list(str)): The in-mention span of the adjusted head and
                the tokens of the adjusted head.
        """
        # TODO: get rid of this ugly hack
        # NOTE(review): this path returns the plain string "NOHEAD" where
        # every other path returns a list of tokens -- confirm callers cope.
        if len(pos) == 0:
            return spans.Span(0, 0), "NOHEAD"

        # All patterns are raw strings: "\." in a plain string literal is an
        # invalid escape sequence (SyntaxWarning on modern Python).
        stop_regex = re.compile(r"CC|,|\.|:|;|V.*|IN|W.*|ADVP|NN$")

        if re.match(
                r"ORG.*|GPE.*|FAC.*|NORP.*|PRODUCT|EVENT|MONEY|"
                r"WORK_OF_ART|LOC.*|LAW|LANGUAGE", ner_type):
            start_regex = re.compile(r"NN(S)?|NNP(S)?")
            stop_regex = re.compile(r"V.*|IN|W.*|ADVP|,|-LRB-")
        elif ner_type == "PERSON":
            start_regex = re.compile(r"NN(S)?|NNP(S)?")
            stop_regex = re.compile(r"IN|CC|,|\.|:|;|V.*|W.*|-LRB-")
        elif re.match(r"DATE|TIME", ner_type):
            start_regex = re.compile(r"NN(S)?|NNP(S)?|CD")
        elif re.match(r"ORDINAL", ner_type):
            start_regex = re.compile(r"NN|JJ|RB")
        elif re.match(r"CARDINAL", ner_type):
            start_regex = re.compile(r"CD")
        elif re.match(r"QUANTITY|PERCENT", ner_type):
            start_regex = re.compile(r"CD|JJ|NN")
        elif ner_type == "NONE":
            start_regex = re.compile(r"NN(S)?|NNP(S)?|CD")
        else:
            # Lazy %-style args avoid formatting when the level is disabled;
            # the rendered message is unchanged.
            logger.warning("No head adjustment rule defined for NER class "
                           "%s.", ner_type)
            return in_mention_span_old_head, old_head

        head_start = -1

        position = 0

        # The head starts at the first tag matching start_regex and ends
        # right before the first subsequent tag matching stop_regex.
        for i, tag in enumerate(pos):
            position = i
            if head_start == -1 and start_regex.match(tag):
                head_start = i
            elif head_start >= 0 and stop_regex.match(tag):
                return spans.Span(head_start, i - 1), tokens[head_start:i]

        if head_start == -1:
            head_start = 0

        # Drop a trailing possessive marker ('s) from the head.
        if pos[position] == "POS" and position == len(pos) - 1:
            position -= 1

        return spans.Span(head_start,
                          position), tokens[head_start:position + 1]
def compute_head_information(attributes):
    """ Compute the head of the mention.

    Args:
        attributes (dict(str, object)): Attributes of the mention, must contain
            values for "tokens", "parse_tree", "pos", "ner", "is_apposition"

    Returns:
        (list(str), Span, int): The head, the head span (in the document) and
        the starting index of the head (in the mention).
    """
    mention_subtree = attributes["parse_tree"]

    head_finder = head_finders.HeadFinder()
    # Default: treat the first token as the head. This is kept whenever the
    # parse tree fragment does not cover the mention's tokens exactly.
    head_index = 0
    head = [attributes["tokens"][0]]

    if len(mention_subtree.leaves()) == len(attributes["tokens"]):
        head_tree = head_finder.get_head(mention_subtree)
        head_index = get_head_index(head_tree, mention_subtree.pos())
        head = [head_tree[0]]

    in_mention_span = spans.Span(head_index, head_index)

    # Proper-name heads (NNP*) are widened/adjusted via NER-type-specific
    # heuristics.
    if attributes["pos"][head_index].startswith("NNP"):
        in_mention_span, head = \
            head_finders.HeadFinder.adjust_head_for_nam(
                attributes["tokens"],
                attributes["pos"],
                attributes["ner"][head_index],
                in_mention_span,
                head)

    # proper name mention: head index last word of head
    # (e.g. "Obama" in "Barack Obama")
    head_index = in_mention_span.end

    # special handling for appositions
    if attributes["is_apposition"]:
        # "Secretary of State Madeleine Albright"
        # => take "Madeleine Albright" as head
        if len(mention_subtree) == 2:
            head_tree = mention_subtree[1]
            head = head_tree.leaves()
            in_mention_span = spans.Span(len(mention_subtree[0].leaves()),
                                         len(attributes["tokens"]) - 1)
            head_index = in_mention_span.end
        else:
            # More than two children: take the first child whose head POS
            # starts with NNP and re-run the proper-name adjustment on it.
            start = 0
            for child in mention_subtree:
                if __head_pos_starts_with(child, "NNP"):
                    # Clamp the child's token range to the mention length.
                    end = min([
                        start + len(child.leaves()),
                        len(attributes["tokens"])
                    ])
                    head_index = end - 1
                    in_mention_span, head = \
                        head_finders.HeadFinder.adjust_head_for_nam(
                            attributes["tokens"][start:end],
                            attributes["pos"][start:end],
                            attributes["ner"][head_index],
                            in_mention_span,
                            head)
                    break
                start += len(child.leaves())

    return head, in_mention_span, head_index
# example #12
    def from_document(span, document, first_in_gold_entity=False):
        """
        Create a mention from a span in a document.

        All attributes of the mention are computed from the linguistic
        information found in the document. For information about the
        attributes, see the class documentation.

        Args:
            span (Span): The span of the mention in the document.
            document (CoNLLDocument): The document the mention belongs to.
            first_in_gold_entity (bool): Whether this mention is the first
                one in its gold entity. Defaults to False.

        Returns:
            Mention: A mention extracted from the input span in the input
            document.
        """

        i, sentence_span = document.get_sentence_id_and_span(span)

        subtree = mention_property_computer.get_relevant_parented_subtree(span, document)

        # Surface-level attributes are sliced directly from the document's
        # flat token arrays. The str() round-trips convert the ParentedTree
        # fragment (and its parent) into plain nltk.Tree copies.
        attributes = {
            "tokens": document.tokens[span.begin:span.end + 1],
            "pos": document.pos[span.begin:span.end + 1],
            "ner": document.ner[span.begin:span.end + 1],
            "sentence_id": i,
            "parse_tree": nltk.Tree.fromstring(str(subtree)),
            "parent_parse_tree": nltk.Tree.fromstring(str(subtree.parent())),
            "speaker": document.speakers[span.begin],
            "antecedent": None,
            "set_id": None,
            "first_in_gold_entity": first_in_gold_entity
        }


        if span.begin > 0:
            attributes["preceding_token"] = document.tokens[span.begin - 1]

        if span.end + 1 < len(document.tokens):
            attributes["next_token"] = document.tokens[span.end + 1]

        # NOTE(review): this slice starts at span.end, so it includes the
        # mention's last token rather than the three tokens after it --
        # confirm whether span.end + 1 was intended.
        if span.end + 3 < len(document.tokens):
            attributes["next_three_tokens"] = document.tokens[span.end:span.end + 3]

        # Semantic (Compreno) attributes are optional and detected via duck
        # typing: only documents that carry these arrays (presumably a
        # semantic CoNLL document subclass) provide them.
        #if Document is SemanticCoNLLDocument then:
        if hasattr(document, 'compreno_sem_class'):
            attributes["compreno_sem_class"] = document.compreno_sem_class[span.begin:span.end + 1]
            # The in_sentence_ids guards keep neighbor lookups within the
            # same sentence (id 0 marks a sentence start).
            if document.in_sentence_ids[span.begin] > 0:
                attributes["compreno_left_neighbor_sem_class"] = document.compreno_sem_class[span.begin - 1]
            if span.end + 1 < len(document.tokens) and document.in_sentence_ids[span.end + 1] > 0:
                attributes["compreno_right_neighbor_sem_class"] = document.compreno_sem_class[span.end + 1]
        if hasattr(document, 'compreno_surf_slot'):
            attributes["compreno_surf_slot"] = document.compreno_surf_slot[span.begin:span.end + 1]
        if hasattr(document, 'compreno_sem_path'):
            attributes["compreno_sem_path"] = document.compreno_sem_path[span.begin:span.end + 1]

        if span in document.coref:
            attributes["annotated_set_id"] = document.coref[span]
        else:
            attributes["annotated_set_id"] = None

        # Derived attributes; is_apposition must be computed before the head,
        # since head computation special-cases appositions.
        attributes["is_apposition"] = \
            mention_property_computer.is_apposition(attributes)

        attributes["grammatical_function"] = \
            mention_property_computer.get_grammatical_function(attributes)

        (head, in_mention_span, head_index) = \
            mention_property_computer.compute_head_information(attributes)

        attributes["head"] = head
        # Translate the in-mention head span to document-level positions.
        attributes["head_span"] = spans.Span(
            span.begin + in_mention_span.begin,
            span.begin + in_mention_span.end
        )
        attributes["head_index"] = head_index

        attributes["type"] = mention_property_computer.get_type(attributes)
        attributes["fine_type"] = mention_property_computer.get_fine_type(
            attributes)

        if attributes["type"] == "PRO":
            attributes["citation_form"] = \
                mention_property_computer.get_citation_form(
                    attributes)

        attributes["number"] = \
            mention_property_computer.compute_number(attributes)
        attributes["gender"] = \
            mention_property_computer.compute_gender(attributes)

        attributes["semantic_class"] = \
            mention_property_computer.compute_semantic_class(attributes)

        attributes["head_as_lowercase_string"] = " ".join(attributes[
            "head"]).lower()

        attributes["tokens_as_lowercase_string"] = " ".join(attributes[
            "tokens"]).lower()

        dep_tree = document.dep[i]

        # Sentence-local index of the head token.
        index = span.begin + head_index - sentence_span.begin

        # dep head ids appear to be 1-based with 0 for the root, hence the
        # -1 shift; -1 after the shift therefore means "no governor".
        governor_id = dep_tree[index].head - 1

        if governor_id == -1:
            attributes["governor"] = "NONE"
        else:
            attributes["governor"] = dep_tree[governor_id].form.lower()

        attributes["ancestry"] = Mention._get_ancestry(dep_tree, index)

        attributes["deprel"] = dep_tree[index].deprel


        return Mention(document, span, attributes)