Example #1
File: gvdb.py  Project: qolina/arglinking
    def _enumerate_spans(self, sentences, max_span_width, text, labels_list):
        """
        Enumerates all spans
        """
        spans: List[Field] = []
        sentence_offset = 0
        python_spans = []
        sentence_ids = []
        for sentence_id, sentence in enumerate(sentences):
            for start, end in enumerate_spans(sentence,
                                              offset=sentence_offset,
                                              max_span_width=max_span_width):
                spans.append(SpanField(start, end, text))
                python_spans.append((start, end))
                sentence_ids.append(
                    LabelField(sentence_id,
                               label_namespace="sentence_id_tags",
                               skip_indexing=True))
            sentence_offset += len(sentence)
        python_span_dict = {span: i for i, span in enumerate(python_spans)}
        labels_idx = [python_span_dict[span] for span in labels_list]
        spans_list_field = ListField(spans)
        labels_index_field = ListField(
            [IndexField(idx, spans_list_field) for idx in labels_idx])

        sentence_id_field = ListField(sentence_ids)
        return spans_list_field, labels_index_field, sentence_id_field
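
Not from the project: a minimal standalone sketch of how the AllenNLP pieces used above (enumerate_spans, SpanField, ListField, IndexField) fit together; the toy sentences and the "tokens" indexer name are assumptions.

# Standalone sketch with assumed toy data; mirrors the span enumeration above.
from allennlp.data.dataset_readers.dataset_utils import enumerate_spans
from allennlp.data.fields import IndexField, ListField, SpanField, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token

sentences = [["John", "bought", "a", "book"], ["He", "read", "it"]]
tokens = [Token(w) for s in sentences for w in s]
text_field = TextField(tokens, {"tokens": SingleIdTokenIndexer()})

span_fields, offset = [], 0
for sentence in sentences:
    # Document-level (start, end) token indices, inclusive, up to width 2.
    for start, end in enumerate_spans(sentence, offset=offset, max_span_width=2):
        span_fields.append(SpanField(start, end, text_field))
    offset += len(sentence)

spans_list_field = ListField(span_fields)
# A gold label is encoded as the position of its span in the enumerated list.
first_label = IndexField(0, spans_list_field)
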
Example #2
File: gvdb.py  Project: qolina/arglinking
    def _get_unique_spans(self, spans, text, sentence_ids):
        # span_indices are effectively the labels
        spans, span_indices = np.unique(np.array(spans),
                                        return_inverse=True,
                                        axis=0)
        spans_list_field = ListField(
            [SpanField(int(s[0]), int(s[1]), text) for s in spans])
        labels_index_field = ListField(
            [IndexField(int(idx), spans_list_field) for idx in span_indices])

        sent_ids = [sentence_ids[(s[0], s[1])] for s in spans]
        sentence_id_field = ListField([
            LabelField(si,
                       label_namespace="sentence_id_tags",
                       skip_indexing=True) for si in sent_ids
        ])

        return spans_list_field, labels_index_field, sentence_id_field
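
The deduplication above hinges on np.unique over the stacked span array; a small standalone sketch (toy spans assumed) of the two values it returns:

import numpy as np

# Toy (start, end) spans with one duplicate; in the reader these come from
# the gold trigger/argument annotations.
spans = [(2, 4), (0, 1), (2, 4)]
unique_spans, span_indices = np.unique(np.array(spans), return_inverse=True, axis=0)
print(unique_spans)   # [[0 1]
                      #  [2 4]]   rows sorted, duplicates removed
print(span_indices)   # [1 0 1]   position of each original span in unique_spans,
                      #           used above as the IndexField targets
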
Example #3
    def _enumerate_spans(self,
                         sentences,
                         max_span_width,
                         text,
                         labels_list,
                         possible_args=None):
        """
        Enumerates all spans
        """
        spans: List[Field] = []
        sentence_offset = 0
        python_spans = []
        sentence_ids = []
        for sentence_id, sentence in enumerate(sentences):
            for start, end in enumerate_spans(sentence,
                                              offset=sentence_offset,
                                              max_span_width=max_span_width):
                if (possible_args is None) or ((start, end) in possible_args):
                    spans.append(SpanField(start, end, text))
                    python_spans.append((start, end))
                    sentence_ids.append(
                        LabelField(sentence_id,
                                   label_namespace="sentence_id_tags",
                                   skip_indexing=True))
            sentence_offset += len(sentence)
        python_span_dict = {span: i for i, span in enumerate(python_spans)}
        # Given spans might cross sentence boundaries (e.g., due to parser
        # errors), so remove them to avoid issues in the model.
        labels_idx = [python_span_dict.get(span) for span in labels_list]
        labels_idx = [i for i in labels_idx if i is not None]
        spans_list_field = ListField(spans)
        labels_index_field = ListField(
            [IndexField(idx, spans_list_field) for idx in labels_idx])

        sentence_id_field = ListField(sentence_ids)
        return spans_list_field, labels_index_field, sentence_id_field
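
A standalone sketch (toy spans assumed) of the label filtering added in this variant: gold spans that were never enumerated, for example because they cross a sentence boundary, map to None and are dropped before any IndexField is built.

# Enumerated spans for a two-sentence toy document (sentence break after token 1).
python_spans = [(0, 0), (0, 1), (1, 1), (2, 2), (2, 3), (3, 3)]
python_span_dict = {span: i for i, span in enumerate(python_spans)}

labels_list = [(0, 1), (1, 2), (3, 3)]  # (1, 2) crosses the sentence boundary
labels_idx = [python_span_dict.get(span) for span in labels_list]
labels_idx = [i for i in labels_idx if i is not None]
print(labels_idx)  # [1, 5] -- the boundary-crossing span was silently dropped
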
Example #4
File: gvdb.py  Project: qolina/arglinking
    def text_to_instance(
            self,  # type: ignore
            sentences: List[List[str]],
            sentence_start_offsets: List[int],
            # [{(event_type_start, event_type_end): [(slot, (value_start, value_end), gold_string)]}]
            doc_link_info: Optional[List[Dict[Tuple[int, int],
                                              List[Tuple[str, Tuple[int, int],
                                                         str]]]]] = None,
            genre: Optional[str] = None,
            document_id: Optional[str] = None) -> Instance:
        # pylint: disable=arguments-differ
        max_sent_len = max(len(s) for s in sentences)
        padded_sentences = [[
            self._normalize_word(sentence[i]) if i < len(sentence) else "UNK"
            for i in range(max_sent_len)
        ] for sentence in sentences]
        flattened_sentences = [
            word for sentence in padded_sentences for word in sentence
        ]

        text_lens = [len(s) for s in sentences]
        text_lens_idx = [list(range(max_sent_len)) for _ in sentences]
        text_lens_mask = [[int(idx < sent_len) for idx in idxs]
                          for sent_len, idxs in zip(text_lens, text_lens_idx)]
        sentence_offsets = [0]
        for tl in text_lens:
            sentence_offsets.append(sentence_offsets[-1] + tl)
        sentence_offsets = sentence_offsets[:-1]
        sentence_offsets.append(
            float('inf'))  # sentinel/padding for finding sentence id from span

        text_field = TextField([Token(word) for word in flattened_sentences],
                               self._token_indexers)

        if genre:
            genre_field = LabelField(self._genres[genre], skip_indexing=True)
        else:
            genre_field = LabelField(0, skip_indexing=True)

        # pack up data as: List[str], List[List[SpanField]]
        #                       |               |
        #                    labels      (trigger,arg) span pairs
        trigger_arg_pairs = [] if doc_link_info is not None else None
        triggers = [] if doc_link_info is not None else None
        args = [] if doc_link_info is not None else None
        roles = [] if doc_link_info is not None else None
        gold_strings = [] if doc_link_info is not None else None
        unique_roles = set() if doc_link_info is not None else None
        trigger_sent_ids = {} if doc_link_info is not None else None
        arg_sent_ids = {} if doc_link_info is not None else None
        skipped_args = 0
        if doc_link_info is not None:
            trigger_arg_pairs = []
            seen_trigger_spans = set()
            seen_arg_spans = set()
            for frame_data in doc_link_info:
                for trigger_span, argument_data in frame_data.items():
                    trigger_sentence_id = self._get_sentence_id(
                        trigger_span, sentence_offsets)
                    # event_type is allowed to cross sentence boundaries, since the event_type is the entire document
                    trigger_sent_ids[trigger_span] = trigger_sentence_id
                    for (role, argument_span, gold_string) in argument_data:
                        trigger_arg_pairs.append(
                            [trigger_span, argument_span, role, gold_string])
                        arg_sentence_id = self._get_sentence_id(
                            argument_span, sentence_offsets)

                        if arg_sentence_id == CROSSES_SENTENCE_BOUNDARY:
                            skipped_args += 1
                            continue

                        # If not using gold, need to make sure the labels
                        # we care about exist in the enumerated spans
                        span_size_condition = True
                        if (not self._use_gold_triggers):
                            span_size_condition &= (
                                trigger_span[1] - trigger_span[0] <
                                self._max_trigger_span_width)
                        if (not self._use_gold_arguments):
                            span_size_condition &= (
                                argument_span[1] - argument_span[0] <
                                self._max_arg_span_width)
                        if span_size_condition:
                            roles.append(LabelField(role))
                            unique_roles.add(role)
                            triggers.append(trigger_span)
                            args.append(argument_span)

                            arg_sent_ids[argument_span] = arg_sentence_id

        metadata: Dict[str, Any] = dict()
        metadata["annotation_kind"] = ""  # to play nicely with RAMS
        metadata["sentences"] = sentences
        metadata["sentence_start_offsets"] = sentence_start_offsets
        metadata["text_lens"] = np.array(text_lens_mask)
        metadata["doc_id"] = document_id
        metadata["has_gold_targets"] = doc_link_info is not None
        metadata["data_path"] = self._file_path

        # Create all triggers and all args.
        # Since the event_type is the entire document (and therefore crosses
        # sentence boundaries), replace CROSSES_SENTENCE_BOUNDARY with sentence
        # id 0 to ensure we don't get out-of-bounds issues.
        trigger_sent_ids = {
            k: (v if v != CROSSES_SENTENCE_BOUNDARY else 0)
            for k, v in trigger_sent_ids.items()
        }
        trigger_spans_field, trigger_idx_field, trigger_sentence_id_field = self._get_unique_spans(
            triggers, text_field, trigger_sent_ids)
        if self._use_gold_arguments:
            arg_spans_field, arg_idx_field, arg_sentence_id_field = self._get_unique_spans(
                args, text_field, arg_sent_ids)
        else:
            arg_spans_field, arg_idx_field, arg_sentence_id_field = self._enumerate_spans(
                sentences, self._max_arg_span_width, text_field, args)

        (metadata['triggers'], metadata['arguments'], metadata['roles'],
         metadata['gold_strings']) = zip(*trigger_arg_pairs)
        metadata_field = MetadataField(metadata)

        fields: Dict[str, Field] = {
            "text": text_field,
            "genre": genre_field,
            "metadata": metadata_field,
            "all_triggers": trigger_spans_field,
            "all_args": arg_spans_field,
            "all_trigger_sentence_ids": trigger_sentence_id_field,
            "all_arg_sentence_ids": arg_sentence_id_field,
            "target_roles": ListField(roles),
            "target_trigger_idx": trigger_idx_field,
            "target_arg_idx": arg_idx_field,
        }

        return Instance(fields)
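
_get_sentence_id is used throughout but not shown in these examples. A plausible implementation, inferred from the sentinel-terminated sentence_offsets list built above, might look like the following; this is an assumption, not the project's code, and the value of CROSSES_SENTENCE_BOUNDARY is likewise assumed.

# Hypothetical sketch of _get_sentence_id, inferred from how it is called above.
CROSSES_SENTENCE_BOUNDARY = -1  # assumed sentinel value

def _get_sentence_id(span, sentence_offsets):
    """Return the index of the sentence containing the inclusive (start, end)
    span, or the sentinel if the span straddles a sentence boundary.
    sentence_offsets holds one start offset per sentence plus float('inf')."""
    start, end = span
    for sentence_id in range(len(sentence_offsets) - 1):
        if sentence_offsets[sentence_id] <= start < sentence_offsets[sentence_id + 1]:
            if end < sentence_offsets[sentence_id + 1]:
                return sentence_id
            return CROSSES_SENTENCE_BOUNDARY
    return CROSSES_SENTENCE_BOUNDARY

print(_get_sentence_id((0, 2), [0, 4, 7, float('inf')]))  # 0
print(_get_sentence_id((3, 5), [0, 4, 7, float('inf')]))  # -1 (crosses a boundary)
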
Example #5
    def text_to_instance(
        self,  # type: ignore
        sentences: List[List[str]],
        sentence_start_offsets: List[int],
        genre: str,
        document_id: str,
        possible_triggers,  # all given spans, regardless of whether they participate in a link
        possible_args,  # all given spans, regardless of whether they participate in a link
        doc_trigger_arg_info: Optional[
            Dict[Tuple[int, int], List[Tuple[str, Tuple[int, int]]]]] = None
    ) -> Instance:
        # pylint: disable=arguments-differ
        max_sent_len = max(len(s) for s in sentences)
        padded_sentences = [[
            sentence[i] if i < len(sentence) else "UNK"
            for i in range(max_sent_len)
        ] for sentence in sentences]
        flattened_sentences = [
            word for sentence in padded_sentences for word in sentence
        ]

        text_lens = [len(s) for s in sentences]
        text_lens_idx = [list(range(max_sent_len)) for _ in sentences]
        text_lens_mask = [[int(idx < sent_len) for idx in idxs]
                          for sent_len, idxs in zip(text_lens, text_lens_idx)]
        sentence_offsets = [0]
        for tl in text_lens:
            sentence_offsets.append(sentence_offsets[-1] + tl)
        sentence_offsets = sentence_offsets[:-1]
        assert sentence_offsets == sentence_start_offsets
        sentence_offsets.append(
            float('inf'))  # sentinel/padding for finding sentence id from span

        text_field = TextField([Token(word) for word in flattened_sentences],
                               self._token_indexers)

        if genre:
            genre_field = LabelField(self._genres[genre],
                                     label_namespace="genre_labels",
                                     skip_indexing=True)
        else:
            genre_field = LabelField(0,
                                     label_namespace="genre_labels",
                                     skip_indexing=True)

        # pack up data as: List[str], List[List[SpanField]]
        #                       |               |
        #                    labels      (trig,arg) span pairs
        has_links = bool(doc_trigger_arg_info)
        trigger_arg_pairs = [] if has_links else None
        triggers = []
        args = []
        roles = [] if has_links else None
        trigger_sent_ids = {}
        arg_sent_ids = {}

        for p in possible_triggers:
            trigger_sent_ids[p] = self._get_sentence_id(p, sentence_offsets)
        for a in possible_args:
            arg_sent_ids[a] = self._get_sentence_id(a, sentence_offsets)

        if has_links:
            trigger_arg_pairs = []
            seen_trigger_spans = set()
            seen_arg_spans = set()
            for trigger_span, argument_data in doc_trigger_arg_info.items():
                for (argument_span, role) in argument_data:
                    trigger_arg_pairs.append(
                        [trigger_span, argument_span, role])

                    # If not using gold, need to make sure the labels
                    # we care about exist in the enumerated spans
                    span_size_condition = True
                    if (not self._use_gold_triggers):
                        span_size_condition &= (
                            trigger_span[1] - trigger_span[0] <
                            self._max_trigger_span_width)
                    if (not self._use_gold_arguments):
                        span_size_condition &= (
                            argument_span[1] - argument_span[0] <
                            self._max_arg_span_width)
                    if span_size_condition:
                        roles.append(LabelField(role,
                                                label_namespace="labels"))
                        triggers.append(trigger_span)
                        args.append(argument_span)

        else:
            # Just get triggers and arguments
            for trigger_span in possible_triggers:
                span_size_condition = True
                if (not self._use_gold_triggers):
                    span_size_condition &= (trigger_span[1] - trigger_span[0] <
                                            self._max_trigger_span_width)
                if span_size_condition:
                    triggers.append(trigger_span)
            for argument_span in possible_args:
                span_size_condition = True
                if (not self._use_gold_arguments):
                    span_size_condition &= (argument_span[1] - argument_span[0]
                                            < self._max_arg_span_width)
                if span_size_condition:
                    args.append(argument_span)

        metadata: Dict[str, Any] = dict()
        metadata["sentences"] = sentences
        metadata["sentence_start_offsets"] = sentence_start_offsets
        metadata["text_lens"] = np.array(text_lens_mask)
        metadata["doc_id"] = document_id
        metadata["has_gold_targets"] = has_links
        metadata["data_path"] = self._file_path
        metadata["language"] = self._language
        metadata["annotation_kind"] = self._annotation_mode

        # Create all triggers and all args
        if self._use_gold_triggers:
            trigger_spans_field, trigger_idx_field, trigger_sentence_id_field = self._get_unique_spans(
                triggers, text_field, trigger_sent_ids)
        else:
            trigger_spans_field, trigger_idx_field, trigger_sentence_id_field = self._enumerate_spans(
                sentences, self._max_trigger_span_width, text_field, triggers)
        if self._use_gold_arguments:
            arg_spans_field, arg_idx_field, arg_sentence_id_field = self._get_unique_spans(
                args, text_field, arg_sent_ids)
        else:
            #arg_spans_field, arg_idx_field, arg_sentence_id_field = self._enumerate_spans(sentences, self._max_arg_span_width, text_field, args, possible_args=possible_args)  #### EXPERIMENT: USE SYNTACTIC SPANS
            arg_spans_field, arg_idx_field, arg_sentence_id_field = self._enumerate_spans(
                sentences, self._max_arg_span_width, text_field,
                args)  #### EXPERIMENT: USE ALL SPANS

        roles_field = ListField(roles) if roles else None

        metadata['triggers'] = triggers
        metadata['arguments'] = args
        if trigger_arg_pairs:
            metadata['roles'] = list(zip(*trigger_arg_pairs))[2]

        f_sentences = [word for sentence in sentences for word in sentence]
        gold_strings = [
            " ".join(f_sentences[arg[0]:arg[1] + 1])
            for arg in metadata['arguments']
        ]
        metadata['gold_strings'] = gold_strings

        metadata_field = MetadataField(metadata)

        fields: Dict[str, Field] = {
            "text": text_field,
            "genre": genre_field,
            "metadata": metadata_field,
            "all_triggers": trigger_spans_field,
            "all_args": arg_spans_field,
            "all_trigger_sentence_ids": trigger_sentence_id_field,
            "all_arg_sentence_ids": arg_sentence_id_field,
            "target_trigger_idx": trigger_idx_field,
            "target_arg_idx": arg_idx_field,
        }

        if roles_field:
            fields["target_roles"] = roles_field

        return Instance(fields)
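
A standalone sketch (toy sentences assumed) of the padding, mask, and offset bookkeeping that opens every text_to_instance above:

sentences = [["a", "b", "c"], ["d", "e"]]
max_sent_len = max(len(s) for s in sentences)  # 3

# Pad every sentence to the same length with "UNK" placeholders.
padded_sentences = [
    [sentence[i] if i < len(sentence) else "UNK" for i in range(max_sent_len)]
    for sentence in sentences
]  # [['a', 'b', 'c'], ['d', 'e', 'UNK']]

# 1 for real tokens, 0 for padding positions.
text_lens = [len(s) for s in sentences]  # [3, 2]
text_lens_mask = [
    [int(idx < sent_len) for idx in range(max_sent_len)] for sent_len in text_lens
]  # [[1, 1, 1], [1, 1, 0]]

# Cumulative start offset of each sentence, terminated by an inf sentinel
# so a span's sentence can be found by comparing against the next offset.
sentence_offsets = [0]
for tl in text_lens:
    sentence_offsets.append(sentence_offsets[-1] + tl)
sentence_offsets = sentence_offsets[:-1]
sentence_offsets.append(float('inf'))  # [0, 3, inf]
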
Example #6
    def text_to_instance(
            self,  # type: ignore
            sentences: List[List[str]],
            sentence_start_offsets: List[int],
            pred_arg_info: Optional[Dict[Tuple[int, int],
                                         List[Tuple[str, Tuple[int,
                                                               int]]]]] = None,
            genre: Optional[str] = None,
            document_id: Optional[str] = None) -> Instance:
        # pylint: disable=arguments-differ
        max_sent_len = max(len(s) for s in sentences)
        padded_sentences = [[
            self._normalize_word(sentence[i]) if i < len(sentence) else "UNK"
            for i in range(max_sent_len)
        ] for sentence in sentences]
        flattened_sentences = [
            word for sentence in padded_sentences for word in sentence
        ]

        text_lens = [len(s) for s in sentences]
        text_lens_idx = [list(range(max_sent_len)) for _ in sentences]
        text_lens_mask = [[int(idx < sent_len) for idx in idxs]
                          for sent_len, idxs in zip(text_lens, text_lens_idx)]
        sentence_offsets = [0]
        for tl in text_lens:
            sentence_offsets.append(sentence_offsets[-1] + tl)
        sentence_offsets = sentence_offsets[:-1]
        sentence_offsets.append(
            float('inf'))  # sentinel/padding for finding sentence id from span

        text_field = TextField([Token(word) for word in flattened_sentences],
                               self._token_indexers)

        if genre:
            genre_field = LabelField(self._genres[genre], skip_indexing=True)
        else:
            genre_field = LabelField(0, skip_indexing=True)

        # pack up data as: List[str], List[List[SpanField]]
        #                       |               |
        #                    labels      (pred,arg) span pairs
        pred_arg_pairs = [] if pred_arg_info is not None else None
        preds = [] if pred_arg_info is not None else None
        args = [] if pred_arg_info is not None else None
        roles = [] if pred_arg_info is not None else None
        pred_sent_ids = {} if pred_arg_info is not None else None
        arg_sent_ids = {} if pred_arg_info is not None else None
        if pred_arg_info is not None:
            pred_arg_pairs = []
            seen_pred_spans = set()
            seen_arg_spans = set()
            for pred_span, argument_data in pred_arg_info.items():
                pred_sentence_id = self._get_sentence_id(
                    pred_span, sentence_offsets)
                pred_sent_ids[pred_span] = pred_sentence_id
                for (role, argument_span) in argument_data:
                    pred_arg_pairs.append([pred_span, argument_span, role])
                    arg_sentence_id = self._get_sentence_id(
                        argument_span, sentence_offsets)

                    # If not using gold, need to make sure the labels
                    # we care about exist in the enumerated spans
                    span_size_condition = True
                    if (not self._use_gold_triggers):
                        span_size_condition &= (pred_span[1] - pred_span[0] <
                                                self._max_trigger_span_width)
                    if (not self._use_gold_arguments):
                        span_size_condition &= (
                            argument_span[1] - argument_span[0] <
                            self._max_arg_span_width)
                    if span_size_condition:
                        roles.append(LabelField(role))
                        preds.append(pred_span)
                        args.append(argument_span)

                        arg_sent_ids[argument_span] = arg_sentence_id

        metadata: Dict[str, Any] = dict()
        metadata["sentences"] = sentences
        metadata["sentence_start_offsets"] = sentence_start_offsets
        metadata["text_lens"] = np.array(text_lens_mask)
        metadata["doc_id"] = document_id
        metadata["has_gold_targets"] = pred_arg_info is not None
        metadata["data_path"] = self._file_path
        metadata["annotation_kind"] = "SRL"

        # Create all preds and all args
        if self._use_gold_triggers:
            pred_spans_field, pred_idx_field, pred_sentence_id_field = self._get_unique_spans(
                preds, text_field, pred_sent_ids)
        else:
            pred_spans_field, pred_idx_field, pred_sentence_id_field = self._enumerate_spans(
                sentences, self._max_trigger_span_width, text_field, preds)
        if self._use_gold_arguments:
            arg_spans_field, arg_idx_field, arg_sentence_id_field = self._get_unique_spans(
                args, text_field, arg_sent_ids)
        else:
            arg_spans_field, arg_idx_field, arg_sentence_id_field = self._enumerate_spans(
                sentences, self._max_arg_span_width, text_field, args)

        (metadata['triggers'], metadata['arguments'],
         metadata['roles']) = zip(*pred_arg_pairs)
        metadata_field = MetadataField(metadata)

        fields: Dict[str, Field] = {
            "text": text_field,
            "genre": genre_field,
            "metadata": metadata_field,
            "all_triggers": pred_spans_field,
            "all_args": arg_spans_field,
            "all_trigger_sentence_ids": pred_sentence_id_field,
            "all_arg_sentence_ids": arg_sentence_id_field,
            "target_roles": ListField(roles),
            "target_trigger_idx": pred_idx_field,
            "target_arg_idx": arg_idx_field,
        }

        return Instance(fields)
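
For context, a minimal sketch of how an Instance like the ones returned above is typically consumed downstream; the toy fields and the exact AllenNLP calls are assumptions for illustration, not part of the project code.

# Standalone sketch with toy fields standing in for the readers' full field dict.
from allennlp.data import Instance, Vocabulary
from allennlp.data.fields import LabelField, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token

text_field = TextField([Token(w) for w in ["a", "tiny", "document"]],
                       {"tokens": SingleIdTokenIndexer()})
instance = Instance({"text": text_field,
                     "genre": LabelField(0, skip_indexing=True)})

vocab = Vocabulary.from_instances([instance])
instance.index_fields(vocab)          # map tokens/labels to vocabulary ids
tensors = instance.as_tensor_dict()   # padded tensors keyed by field name
print(sorted(tensors.keys()))         # ['genre', 'text']
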