    def text_to_instance(
        self,
        annotation_id: str,
        documents: Dict[str, List[str]],
        rationales: Dict[str, List[Tuple[int, int]]],
        query: str,
        label: str = None,
    ) -> Instance:  # type: ignore
        # pylint: disable=arguments-differ
        fields = {}

        tokens = []
        is_evidence = []

        document_to_span_map = {}
        document_to_span_map_whole = {}

        docwords = documents[list(documents.keys())[0]]
        query = query.split("[sep]")
        query = [x.strip() for x in query]

        for docid, docwords in documents.items():
            document_to_span_map_whole[docid] = (len(tokens),
                                                 len(tokens) + len(docwords))
            tokens += [Token(word) for word in docwords]
            document_to_span_map[docid] = (len(tokens) - len(docwords),
                                           len(tokens))

            tokens.append(Token("[SEP]"))

            rationale = [0] * len(docwords)
            if docid in rationales:
                for s, e in rationales[docid]:
                    for i in range(s, e):
                        rationale[i] = 1

            is_evidence += rationale + [1]

        always_keep_mask = [
            1 if t.text.upper() == "[SEP]" else 0 for t in tokens
        ]

        fields["document"] = TextField(tokens, self._token_indexers)
        fields["rationale"] = SequenceLabelField(
            is_evidence,
            sequence_field=fields["document"],
            label_namespace="evidence_labels")
        fields["kept_tokens"] = SequenceLabelField(
            always_keep_mask,
            sequence_field=fields["document"],
            label_namespace="kept_token_labels")

        metadata = {
            "annotation_id": annotation_id,
            "tokens": tokens,
            "document_to_span_map": document_to_span_map,
            "convert_tokens_to_instance": self.convert_tokens_to_instance,
            "document_to_span_map_whole": document_to_span_map_whole,
            "always_keep_mask": np.array(always_keep_mask)
        }

        fields["metadata"] = MetadataField(metadata)
        fields["label"] = MetadataField({
            k: v
            for k, v in zip(["A", "B", "C", "D", "E", "Label"], query +
                            [label])
        })

        return Instance(fields)
    def text_to_instance_with_spans(self, tokens, verb_indicator, tags, spans):
        instance = super().text_to_instance(tokens, verb_indicator, tags)
        metadata_dict = instance.fields['metadata'].metadata
        metadata_dict['spans'] = spans
        instance.fields['metadata'] = MetadataField(metadata_dict)
        return Instance(instance.fields)
Example #3
File: conll.py  Project: zzzshou/allennlp
    def text_to_instance(
        self,  # type: ignore
        sentences: List[List[str]],
        gold_clusters: Optional[List[List[Tuple[int, int]]]] = None,
    ) -> Instance:
        """
        Parameters
        ----------
        sentences : ``List[List[str]]``, required.
            A list of lists representing the tokenised words and sentences in the document.
        gold_clusters : ``Optional[List[List[Tuple[int, int]]]]``, optional (default = None)
            A list of all clusters in the document, represented as word spans. Each cluster
            contains some number of spans, which can be nested and overlap, but will never
            exactly match between clusters.

        Returns
        -------
        An ``Instance`` containing the following ``Fields``:
            text : ``TextField``
                The text of the full document.
            spans : ``ListField[SpanField]``
                A ListField containing the spans represented as ``SpanFields``
                with respect to the document text.
            span_labels : ``SequenceLabelField``, optional
                The id of the cluster which each possible span belongs to, or -1 if it does
                 not belong to a cluster. As these labels have variable length (it depends on
                 how many spans we are considering), we represent this as a ``SequenceLabelField``
                 with respect to the ``spans`` ``ListField``.
        """
        flattened_sentences = [
            self._normalize_word(word) for sentence in sentences
            for word in sentence
        ]

        metadata: Dict[str, Any] = {"original_text": flattened_sentences}
        if gold_clusters is not None:
            metadata["clusters"] = gold_clusters

        text_field = TextField([Token(word) for word in flattened_sentences],
                               self._token_indexers)

        cluster_dict = {}
        if gold_clusters is not None:
            for cluster_id, cluster in enumerate(gold_clusters):
                for mention in cluster:
                    cluster_dict[tuple(mention)] = cluster_id

        spans: List[Field] = []
        span_labels: Optional[
            List[int]] = [] if gold_clusters is not None else None

        sentence_offset = 0
        for sentence in sentences:
            for start, end in enumerate_spans(
                    sentence,
                    offset=sentence_offset,
                    max_span_width=self._max_span_width):
                if span_labels is not None:
                    if (start, end) in cluster_dict:
                        span_labels.append(cluster_dict[(start, end)])
                    else:
                        span_labels.append(-1)

                spans.append(SpanField(start, end, text_field))
            sentence_offset += len(sentence)

        span_field = ListField(spans)
        metadata_field = MetadataField(metadata)

        fields: Dict[str, Field] = {
            "text": text_field,
            "spans": span_field,
            "metadata": metadata_field,
        }
        if span_labels is not None:
            fields["span_labels"] = SequenceLabelField(span_labels, span_field)

        return Instance(fields)
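A minimal stand-alone sketch of the span-enumeration and cluster-labelling pattern above, using toy data and AllenNLP 0.9-style imports (the sample sentences, clusters, and max span width are made-up assumptions):

from allennlp.data import Instance
from allennlp.data.dataset_readers.dataset_utils import enumerate_spans
from allennlp.data.fields import ListField, MetadataField, SequenceLabelField, SpanField, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token

# Toy document: two sentences, two gold coreference clusters over the flattened word indices.
sentences = [["John", "met", "Mary", "."], ["He", "smiled", "."]]
gold_clusters = [[(0, 0), (4, 4)], [(2, 2)]]

flattened = [word for sentence in sentences for word in sentence]
text_field = TextField([Token(word) for word in flattened], {"tokens": SingleIdTokenIndexer()})

cluster_dict = {tuple(span): cid for cid, cluster in enumerate(gold_clusters) for span in cluster}

spans, span_labels, offset = [], [], 0
for sentence in sentences:
    for start, end in enumerate_spans(sentence, offset=offset, max_span_width=3):
        span_labels.append(cluster_dict.get((start, end), -1))   # -1 means "not in any cluster"
        spans.append(SpanField(start, end, text_field))
    offset += len(sentence)

span_field = ListField(spans)
instance = Instance({
    "text": text_field,
    "spans": span_field,
    "span_labels": SequenceLabelField(span_labels, span_field),
    "metadata": MetadataField({"original_text": flattened, "clusters": gold_clusters}),
})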
Example #4
    def text_to_instance(self, sentence: List[str],
                         ner_dict: Dict[Tuple[int, int], str], relation_dict,
                         doc_key: str, dataset: str, sentence_num: int,
                         groups: List[str], start_ix: int, end_ix: int,
                         dep_children_dict: Dict[Tuple[int, int],
                                                 List[Tuple[int, int]]]):

        sentence = [self._normalize_word(word) for word in sentence]

        text_field = TextField([Token(word) for word in sentence],
                               self._token_indexers)
        text_field_with_context = TextField([Token(word) for word in groups],
                                            self._token_indexers)

        # Put together the metadata.
        metadata = dict(sentence=sentence,
                        ner_dict=ner_dict,
                        relation_dict=relation_dict,
                        doc_key=doc_key,
                        dataset=dataset,
                        groups=groups,
                        start_ix=start_ix,
                        end_ix=end_ix,
                        sentence_num=sentence_num,
                        dep_children_dict=dep_children_dict)
        metadata_field = MetadataField(metadata)

        # Generate fields for text spans, ner labels
        spans = []
        span_ner_labels = []
        raw_spans = []

        for start, end in enumerate_spans(sentence,
                                          max_span_width=self._max_span_width):
            span_ix = (start, end)
            span_ner_labels.append(ner_dict[span_ix])
            spans.append(SpanField(start, end, text_field))
            raw_spans.append(span_ix)

        span_field = ListField(spans)

        n_tokens = len(sentence)
        candidate_indices = [(i, j) for i in range(n_tokens)
                             for j in range(n_tokens)]
        dep_adjs = []
        dep_adjs_indices = []
        for token_pair in candidate_indices:
            dep_adj_label = dep_children_dict[token_pair]
            if dep_adj_label:
                dep_adjs_indices.append(token_pair)
                dep_adjs.append(dep_adj_label)

        ner_label_field = SequenceLabelField(span_ner_labels,
                                             span_field,
                                             label_namespace="ner_labels")

        n_spans = len(spans)
        span_tuples = [(span.span_start, span.span_end) for span in spans]
        candidate_indices = [(i, j) for i in range(n_spans)
                             for j in range(n_spans)]

        relations = []
        relation_indices = []
        for i, j in candidate_indices:
            span_pair = (span_tuples[i], span_tuples[j])
            relation_label = relation_dict[span_pair]
            if relation_label:
                relation_indices.append((i, j))
                relations.append(relation_label)

        relation_label_field = AdjacencyField(
            indices=relation_indices,
            sequence_field=span_field,
            labels=relations,
            label_namespace="relation_labels")

        # Syntax
        dep_span_children_field = AdjacencyField(
            indices=dep_adjs_indices,
            sequence_field=text_field,
            labels=dep_adjs,
            label_namespace="dep_adj_labels")

        fields = dict(text=text_field_with_context,
                      spans=span_field,
                      ner_labels=ner_label_field,
                      relation_labels=relation_label_field,
                      metadata=metadata_field,
                      dep_span_children=dep_span_children_field)

        return Instance(fields)
    def text_to_instance(
        self,  # type: ignore
        question: str,
        table_lines: List[List[str]],
        target_values: List[str] = None,
        offline_search_output: List[str] = None,
    ) -> Instance:
        """
        Reads text inputs and makes an instance. We pass the ``table_lines`` to ``TableQuestionContext``, and that
        method accepts this field either as lines from CoreNLP processed tagged files that come with the dataset,
        or simply in a tsv format where each line corresponds to a row and the cells are tab-separated.

        Parameters
        ----------
        question : ``str``
            Input question
        table_lines : ``List[List[str]]``
            The table content optionally preprocessed by CoreNLP. See ``TableQuestionContext.read_from_lines``
            for the expected format.
        target_values : ``List[str]``, optional
            Target values for the denotations the logical forms should execute to. Not required for testing.
        offline_search_output : ``List[str]``, optional
            List of logical forms, produced by offline search. Not required during test.
        """
        tokenized_question = self._tokenizer.tokenize(question.lower())
        question_field = TextField(tokenized_question, self._question_token_indexers)
        metadata: Dict[str, Any] = {"question_tokens": [x.text for x in tokenized_question]}
        table_context = TableQuestionContext.read_from_lines(table_lines, tokenized_question)
        world = WikiTablesLanguage(table_context)
        world_field = MetadataField(world)
        # Note: Not passing any feature extractors when instantiating the field below. This will make
        # it use all the available extractors.
        table_field = KnowledgeGraphField(
            table_context.get_table_knowledge_graph(),
            tokenized_question,
            self._table_token_indexers,
            tokenizer=self._tokenizer,
            include_in_vocab=self._use_table_for_vocab,
            max_table_tokens=self._max_table_tokens,
        )
        production_rule_fields: List[Field] = []
        for production_rule in world.all_possible_productions():
            _, rule_right_side = production_rule.split(" -> ")
            is_global_rule = not world.is_instance_specific_entity(rule_right_side)
            field = ProductionRuleField(production_rule, is_global_rule=is_global_rule)
            production_rule_fields.append(field)
        action_field = ListField(production_rule_fields)

        fields = {
            "question": question_field,
            "metadata": MetadataField(metadata),
            "table": table_field,
            "world": world_field,
            "actions": action_field,
        }

        if target_values is not None:
            target_values_field = MetadataField(target_values)
            fields["target_values"] = target_values_field

        # We'll make each target action sequence a List[IndexField], where the index is into
        # the action list we made above.  We need to ignore the type here because mypy doesn't
        # like `action.rule` - it's hard to tell mypy that the ListField is made up of
        # ProductionRuleFields.
        action_map = {
            action.rule: i for i, action in enumerate(action_field.field_list)
        }  # type: ignore
        if offline_search_output:
            action_sequence_fields: List[Field] = []
            for logical_form in offline_search_output:
                try:
                    action_sequence = world.logical_form_to_action_sequence(logical_form)
                    index_fields: List[Field] = []
                    for production_rule in action_sequence:
                        index_fields.append(IndexField(action_map[production_rule], action_field))
                    action_sequence_fields.append(ListField(index_fields))
                except ParsingError as error:
                    logger.debug(f"Parsing error: {error.message}, skipping logical form")
                    logger.debug(f"Question was: {question}")
                    logger.debug(f"Logical form was: {logical_form}")
                    logger.debug(f"Table info was: {table_lines}")
                    continue
                except KeyError as error:
                    logger.debug(f"Missing production rule: {error.args}, skipping logical form")
                    logger.debug(f"Question was: {question}")
                    logger.debug(f"Table info was: {table_lines}")
                    logger.debug(f"Logical form was: {logical_form}")
                    continue
                except:  # noqa
                    logger.error(logical_form)
                    raise
                if len(action_sequence_fields) >= self._max_offline_logical_forms:
                    break

            if not action_sequence_fields:
                # This is not great, but we're only doing it when we're passed logical form
                # supervision, so we're expecting labeled logical forms, but we can't actually
                # produce the logical forms.  We should skip this instance.  Note that this affects
                # _dev_ and _test_ instances, too, so your metrics could be over-estimates on the
                # full test data.
                return None
            fields["target_action_sequences"] = ListField(action_sequence_fields)
        if self._output_agendas:
            agenda_index_fields: List[Field] = []
            for agenda_string in world.get_agenda(conservative=True):
                agenda_index_fields.append(IndexField(action_map[agenda_string], action_field))
            if not agenda_index_fields:
                agenda_index_fields = [IndexField(-1, action_field)]
            fields["agenda"] = ListField(agenda_index_fields)
        return Instance(fields)
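A small stand-alone sketch of the action-map bookkeeping described in the comment above: each supervised logical form becomes a list of IndexFields pointing into the actions list. The rules are toy strings and LabelField stands in for ProductionRuleField, so this is an illustration of the indexing pattern, not the real grammar:

from allennlp.data.fields import IndexField, LabelField, ListField

# Toy grammar; in the reader above these would be ProductionRuleFields.
production_rules = ["S -> A", "A -> cell", "A -> number"]
action_field = ListField(
    [LabelField(rule, label_namespace="rule_labels") for rule in production_rules])
action_map = {rule: i for i, rule in enumerate(production_rules)}

# One gold action sequence, expressed as indices into the actions ListField.
action_sequence = ["S -> A", "A -> cell"]
index_fields = [IndexField(action_map[rule], action_field) for rule in action_sequence]
target_action_sequences = ListField([ListField(index_fields)])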
Example #6
    def text_to_instance(self, sentence: List[str],
                         ner_dict: Dict[Tuple[int, int], str], relation_dict,
                         cluster_dict, trigger_dict, argument_dict,
                         doc_key: str, dataset: str, sentence_num: int,
                         groups: List[str], start_ix: int, end_ix: int):
        """
        TODO(dwadden) document me.
        """

        sentence = [self._normalize_word(word) for word in sentence]

        text_field = TextField([Token(word) for word in sentence],
                               self._token_indexers)
        text_field_with_context = TextField([Token(word) for word in groups],
                                            self._token_indexers)

        # Put together the metadata.
        metadata = dict(sentence=sentence,
                        ner_dict=ner_dict,
                        relation_dict=relation_dict,
                        cluster_dict=cluster_dict,
                        trigger_dict=trigger_dict,
                        argument_dict=argument_dict,
                        doc_key=doc_key,
                        dataset=dataset,
                        groups=groups,
                        start_ix=start_ix,
                        end_ix=end_ix,
                        sentence_num=sentence_num)
        metadata_field = MetadataField(metadata)

        # Trigger labels. One label per token in the input.
        token_trigger_labels = []
        for i in range(len(text_field)):
            token_trigger_labels.append(trigger_dict[i])

        trigger_label_field = SequenceLabelField(
            token_trigger_labels, text_field, label_namespace="trigger_labels")

        # Generate fields for text spans, ner labels, coref labels.
        spans = []
        span_ner_labels = []
        span_coref_labels = []
        for start, end in enumerate_spans(sentence,
                                          max_span_width=self._max_span_width):
            span_ix = (start, end)
            span_ner_labels.append(ner_dict[span_ix])
            span_coref_labels.append(cluster_dict[span_ix])
            spans.append(SpanField(start, end, text_field))

        span_field = ListField(spans)
        ner_label_field = SequenceLabelField(span_ner_labels,
                                             span_field,
                                             label_namespace="ner_labels")
        coref_label_field = SequenceLabelField(span_coref_labels,
                                               span_field,
                                               label_namespace="coref_labels")

        # Generate labels for relations and arguments. Only store non-null values.
        # For the arguments, by convention the first span specifies the trigger, and the second
        # specifies the argument. Ideally we'd have an adjacency field between (token, span) pairs
        # for the event arguments field, but AllenNLP doesn't make it possible to express
        # adjacencies between two different sequences.
        n_spans = len(spans)
        span_tuples = [(span.span_start, span.span_end) for span in spans]
        candidate_indices = [(i, j) for i in range(n_spans)
                             for j in range(n_spans)]

        relations = []
        relation_indices = []
        for i, j in candidate_indices:
            span_pair = (span_tuples[i], span_tuples[j])
            relation_label = relation_dict[span_pair]
            if relation_label:
                relation_indices.append((i, j))
                relations.append(relation_label)

        relation_label_field = AdjacencyField(
            indices=relation_indices,
            sequence_field=span_field,
            labels=relations,
            label_namespace="relation_labels")

        arguments = []
        argument_indices = []
        n_tokens = len(sentence)
        candidate_indices = [(i, j) for i in range(n_tokens)
                             for j in range(n_spans)]
        for i, j in candidate_indices:
            token_span_pair = (i, span_tuples[j])
            argument_label = argument_dict[token_span_pair]
            if argument_label:
                argument_indices.append((i, j))
                arguments.append(argument_label)

        argument_label_field = AdjacencyFieldAssym(
            indices=argument_indices,
            row_field=text_field,
            col_field=span_field,
            labels=arguments,
            label_namespace="argument_labels")

        # Pull it all together.
        fields = dict(text=text_field_with_context,
                      spans=span_field,
                      ner_labels=ner_label_field,
                      coref_labels=coref_label_field,
                      trigger_labels=trigger_label_field,
                      argument_labels=argument_label_field,
                      relation_labels=relation_label_field,
                      metadata=metadata_field)

        return Instance(fields)
Example #7
    def text_to_instance(
            self,  # type: ignore
            utterances: List[str],
            sql_query_labels: List[str] = None) -> Instance:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        utterances: ``List[str]``, required.
            List of utterances in the interaction, the last element is the current utterance.
        sql_query_labels: ``List[str]``, optional
            The SQL queries that are given as labels during training or validation.
        """
        utterance = utterances[-1]
        action_sequence: List[str] = []

        if not utterance:
            return None

        world = AtisWorld(utterances=utterances,
                          database_file=self._database_file)

        if sql_query_labels:
            # If there are multiple sql queries given as labels, we use the shortest
            # one for training.
            sql_query = min(sql_query_labels, key=len)
            try:
                action_sequence = world.get_action_sequence(sql_query)
            except ParseError as error:
                logger.debug(f'Parsing error: {error}')

        tokenized_utterance = self._tokenizer.tokenize(utterance.lower())
        utterance_field = TextField(tokenized_utterance, self._token_indexers)

        production_rule_fields: List[Field] = []

        for production_rule in world.all_possible_actions():
            nonterminal, _ = production_rule.split(' ->')
            # The whitespaces are not semantically meaningful, so we filter them out.
            production_rule = ' '.join([
                token for token in production_rule.split(' ') if token != 'ws'
            ])
            field = ProductionRuleField(production_rule,
                                        self._is_global_rule(nonterminal))
            production_rule_fields.append(field)

        action_field = ListField(production_rule_fields)
        action_map = {
            action.rule: i  # type: ignore
            for i, action in enumerate(action_field.field_list)
        }
        index_fields: List[Field] = []
        world_field = MetadataField(world)
        fields = {
            'utterance': utterance_field,
            'actions': action_field,
            'world': world_field,
            'linking_scores': ArrayField(world.linking_scores)
        }

        if sql_query_labels is not None:
            fields['sql_queries'] = MetadataField(sql_query_labels)
            if action_sequence:
                for production_rule in action_sequence:
                    index_fields.append(
                        IndexField(action_map[production_rule], action_field))

                action_sequence_field = ListField(index_fields)
                fields['target_action_sequence'] = action_sequence_field
            else:
                # If we are given a SQL query, but we are unable to parse it, then we will skip it.
                return None

        return Instance(fields)
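A tiny illustration of the label-selection detail in the snippet above: when several gold SQL queries are provided, the shortest one is chosen for supervision. The query strings here are made up, not real ATIS output:

sql_query_labels = [
    "SELECT DISTINCT flight.flight_id FROM flight WHERE flight.from_airport = 'BOS' AND flight.to_airport = 'DEN' ;",
    "SELECT DISTINCT flight.flight_id FROM flight WHERE flight.from_airport = 'BOS' ;",
]
sql_query = min(sql_query_labels, key=len)   # the second, shorter query is used for training
assert sql_query.endswith("flight.from_airport = 'BOS' ;")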
Example #8
    def text_to_instance(self, tokens: str, tags: str, heads: str, deps: str, metadata: Dict[str, str], label: str = None) -> Instance:  # type: ignore
        # pylint: disable=arguments-differ
        tokenized_tokens = self._tokenizer.tokenize(tokens)
        tokens_field = TextField(tokenized_tokens, self._token_indexers)
        tokenized_tags = self._tagizer.tokenize(tags)
        tags_field = TextField(tokenized_tags, self._tag_indexers)

        tokenized_heads = self._headizer.tokenize(heads)
        heads_field = TextField(tokenized_heads, self._head_indexers)

        tokenized_deps = self._depizer.tokenize(deps)
        deps_field = TextField(tokenized_deps, self._dep_indexers)

        fields = {'tokens': tokens_field, 'tags': tags_field, 'heads': heads_field, 'deps': deps_field, 'metadata': MetadataField(metadata)}
        # fields = {'tokens': tokens_field, 'metadata': MetadataField(metadata)}
        if label is not None:
            fields['label'] = LabelField(label)
        return Instance(fields)
Example #9
    def text_to_instance(
            self,  # type: ignore
            question: str,
            table_lines: List[List[str]],
            target_values: List[str],
            offline_search_output: List[str] = None) -> Instance:
        """
        Reads text inputs and makes an instance. WikitableQuestions dataset provides tables as
        TSV files pre-tagged using CoreNLP, which we use for training.

        Parameters
        ----------
        question : ``str``
            Input question
        table_lines : ``List[List[str]]``
            The table content preprocessed by CoreNLP. See ``TableQuestionContext.read_from_lines``
            for the expected format.
        target_values : ``List[str]``
        offline_search_output : List[str], optional
            List of logical forms, produced by offline search. Not required during test.
        """
        # pylint: disable=arguments-differ
        tokenized_question = self._tokenizer.tokenize(question.lower())
        question_field = TextField(tokenized_question,
                                   self._question_token_indexers)
        # TODO(pradeep): We'll need a better way to input CoreNLP processed lines.
        table_context = TableQuestionContext.read_from_lines(
            table_lines, tokenized_question)
        target_values_field = MetadataField(target_values)
        world = WikiTablesVariableFreeWorld(table_context)
        world_field = MetadataField(world)
        # Note: Not passing any feature extractors when instantiating the field below. This will make
        # it use all the available extractors.
        table_field = KnowledgeGraphField(
            table_context.get_table_knowledge_graph(),
            tokenized_question,
            self._table_token_indexers,
            tokenizer=self._tokenizer,
            include_in_vocab=self._use_table_for_vocab,
            max_table_tokens=self._max_table_tokens)
        production_rule_fields: List[Field] = []
        for production_rule in world.all_possible_actions():
            _, rule_right_side = production_rule.split(' -> ')
            is_global_rule = not world.is_instance_specific_entity(
                rule_right_side)
            field = ProductionRuleField(production_rule,
                                        is_global_rule=is_global_rule)
            production_rule_fields.append(field)
        action_field = ListField(production_rule_fields)

        fields = {
            'question': question_field,
            'table': table_field,
            'world': world_field,
            'actions': action_field,
            'target_values': target_values_field
        }

        # We'll make each target action sequence a List[IndexField], where the index is into
        # the action list we made above.  We need to ignore the type here because mypy doesn't
        # like `action.rule` - it's hard to tell mypy that the ListField is made up of
        # ProductionRuleFields.
        action_map = {
            action.rule: i
            for i, action in enumerate(action_field.field_list)
        }  # type: ignore
        if offline_search_output:
            action_sequence_fields: List[Field] = []
            for logical_form in offline_search_output:
                try:
                    expression = world.parse_logical_form(logical_form)
                except ParsingError as error:
                    logger.debug(
                        f'Parsing error: {error.message}, skipping logical form'
                    )
                    logger.debug(f'Question was: {question}')
                    logger.debug(f'Logical form was: {logical_form}')
                    logger.debug(f'Table info was: {table_lines}')
                    continue
                except:
                    logger.error(logical_form)
                    raise
                action_sequence = world.get_action_sequence(expression)
                try:
                    index_fields: List[Field] = []
                    for production_rule in action_sequence:
                        index_fields.append(
                            IndexField(action_map[production_rule],
                                       action_field))
                    action_sequence_fields.append(ListField(index_fields))
                except KeyError as error:
                    logger.debug(
                        f'Missing production rule: {error.args}, skipping logical form'
                    )
                    logger.debug(f'Question was: {question}')
                    logger.debug(f'Table info was: {table_lines}')
                    logger.debug(f'Logical form was: {logical_form}')
                    continue
                if len(action_sequence_fields) >= self._max_offline_logical_forms:
                    break

            if not action_sequence_fields:
                # This is not great, but we're only doing it when we're passed logical form
                # supervision, so we're expecting labeled logical forms, but we can't actually
                # produce the logical forms.  We should skip this instance.  Note that this affects
                # _dev_ and _test_ instances, too, so your metrics could be over-estimates on the
                # full test data.
                return None
            fields['target_action_sequences'] = ListField(
                action_sequence_fields)
        if self._output_agendas:
            agenda_index_fields: List[Field] = []
            for agenda_string in world.get_agenda(conservative=True):
                agenda_index_fields.append(
                    IndexField(action_map[agenda_string], action_field))
            if not agenda_index_fields:
                agenda_index_fields = [IndexField(-1, action_field)]
            fields['agenda'] = ListField(agenda_index_fields)
        return Instance(fields)
Example #10
def make_reading_comprehension_instance(
        question_tokens: List[Token],
        passage_tokens: List[Token],
        token_indexers: Dict[str, TokenIndexer],
        passage_text: str,
        token_spans: List[Tuple[int, int]] = None,
        answer_texts: List[str] = None,
        additional_metadata: Dict[str, Any] = None) -> Instance:
    """
    Converts a question, a passage, and an optional answer (or answers) to an ``Instance`` for use
    in a reading comprehension model.

    Creates an ``Instance`` with at least these fields: ``question`` and ``passage``, both
    ``TextFields``; and ``metadata``, a ``MetadataField``.  Additionally, if both ``answer_texts``
    and ``token_spans`` are given, the ``Instance`` has ``span_start`` and ``span_end``
    fields, which are both ``ListFields`` of ``IndexFields``.

    Parameters
    ----------
    question_tokens : ``List[Token]``
        An already-tokenized question.
    passage_tokens : ``List[Token]``
        An already-tokenized passage that contains the answer to the given question.
    token_indexers : ``Dict[str, TokenIndexer]``
        Determines how the question and passage ``TextFields`` will be converted into tensors that
        get input to a model.  See :class:`TokenIndexer`.
    passage_text : ``str``
        The original passage text.  We need this so that we can recover the actual span from the
        original passage that the model predicts as the answer to the question.  This is used in
        official evaluation scripts.
    token_spans : ``List[Tuple[int, int]]``, optional
        Indices into ``passage_tokens`` to use as the answer to the question for training.  This is
        a list because there might be several possible correct answer spans in the passage.
        Currently, we just select the most frequent span in this list (i.e., SQuAD has multiple
        annotations on the dev set; this will select the span that the most annotators gave as
        correct).
    answer_texts : ``List[str]``, optional
        All valid answer strings for the given question.  In SQuAD, e.g., the training set has
        exactly one answer per question, but the dev and test sets have several.  TriviaQA has many
        possible answers, which are the aliases for the known correct entity.  This is put into the
        metadata for use with official evaluation scripts, but not used anywhere else.
    additional_metadata : ``Dict[str, Any]``, optional
        The constructed ``metadata`` field will by default contain ``original_passage``,
        ``token_offsets``, ``question_tokens``, ``passage_tokens``, and ``answer_texts`` keys.  If
        you want any other metadata to be associated with each instance, you can pass that in here.
        This dictionary will get added to the ``metadata`` dictionary we already construct.
    """
    additional_metadata = additional_metadata or {}
    fields: Dict[str, Field] = {}
    passage_offsets = [(token.idx, token.idx + len(token.text))
                       for token in passage_tokens]

    # This is separate so we can reference it later with a known type.
    passage_field = TextField(passage_tokens, token_indexers)
    fields['passage'] = passage_field
    fields['question'] = TextField(question_tokens, token_indexers)
    metadata = {
        'original_passage': passage_text,
        'token_offsets': passage_offsets,
        'question_tokens': [token.text for token in question_tokens],
        'passage_tokens': [token.text for token in passage_tokens],
    }
    if answer_texts:
        metadata['answer_texts'] = answer_texts

    list_span_start = []
    list_span_end = []
    if token_spans:

        # There may be multiple answer annotations, so we pick the one that occurs the most.  This
        # only matters on the SQuAD dev set, and it means our computed metrics ("start_acc",
        # "end_acc", and "span_acc") aren't quite the same as the official metrics, which look at
        # all of the annotations.  This is why we have a separate official SQuAD metric calculation
        # (the "em" and "f1" metrics use the official script).
        for span_start, span_end in token_spans:
            list_span_start.append(IndexField(span_start, passage_field))
            list_span_end.append(IndexField(span_end, passage_field))

        fields['span_start'] = ListField(list_span_start)
        fields['span_end'] = ListField(list_span_end)

    metadata.update(additional_metadata)
    fields['metadata'] = MetadataField(metadata)
    return Instance(fields)
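A hedged usage sketch for the helper above, with a toy passage and a single answer span; the indexer choice is an assumption, and character offsets are attached to each passage Token because the helper reads token.idx:

from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token

words = ["The", "cat", "sat", "on", "the", "mat", "."]
passage_text = " ".join(words)
passage_tokens, offset = [], 0
for word in words:
    passage_tokens.append(Token(word, idx=offset))
    offset += len(word) + 1          # +1 for the single space between words
question_tokens = [Token(word) for word in ["What", "sat", "on", "the", "mat", "?"]]

instance = make_reading_comprehension_instance(
    question_tokens=question_tokens,
    passage_tokens=passage_tokens,
    token_indexers={"tokens": SingleIdTokenIndexer()},
    passage_text=passage_text,
    token_spans=[(1, 1)],            # the single-token answer span "cat"
    answer_texts=["cat"],
)
# instance.fields now contains: passage, question, span_start, span_end, metadata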
Example #11
    def text_to_instance(
        self,  # type: ignore
        sentence: List[Token],
        gold_clusters: Optional[List[List[Tuple[int, int]]]] = None,
    ) -> Instance:
        """
        # Parameters

        sentence : `List[Token]`, required.
            The already tokenised sentence to analyse.
        gold_clusters : `Optional[List[List[Tuple[int, int]]]]`, optional (default = None)
            A list of all clusters in the sentence, represented as word spans. Each cluster
            contains some number of spans, which can be nested and overlap, but will never
            exactly match between clusters.

        # Returns

        An `Instance` containing the following `Fields`:
            text : `TextField`
                The text of the full sentence.
            spans : `ListField[SpanField]`
                A ListField containing the spans represented as `SpanFields`
                with respect to the sentence text.
            span_labels : `SequenceLabelField`, optional
                The id of the cluster which each possible span belongs to, or -1 if it does
                 not belong to a cluster. As these labels have variable length (it depends on
                 how many spans we are considering), we represent this as a `SequenceLabelField`
                 with respect to the `spans` `ListField`.
        """
        metadata: Dict[str, Any] = {"original_text": sentence}
        if gold_clusters is not None:
            metadata["clusters"] = gold_clusters

        text_field = TextField(sentence, self._token_indexers)

        cluster_dict = {}
        if gold_clusters is not None:
            for cluster_id, cluster in enumerate(gold_clusters):
                for mention in cluster:
                    cluster_dict[tuple(mention)] = cluster_id

        spans: List[Field] = []
        span_labels: Optional[
            List[int]] = [] if gold_clusters is not None else None

        for start, end in enumerate_spans(sentence,
                                          max_span_width=self._max_span_width):
            if span_labels is not None:
                if (start, end) in cluster_dict:
                    span_labels.append(cluster_dict[(start, end)])
                else:
                    span_labels.append(-1)

            spans.append(SpanField(start, end, text_field))

        span_field = ListField(spans)
        metadata_field = MetadataField(metadata)

        fields: Dict[str, Field] = {
            "text": text_field,
            "spans": span_field,
            "metadata": metadata_field,
        }
        if span_labels is not None:
            fields["span_labels"] = SequenceLabelField(span_labels, span_field)

        return Instance(fields)
Example #12
    def text_to_instance(
            self,  # type: ignore
            rule_text: str,
            question: str,
            scenario: str,
            history: List[Dict[str, str]],
            utterance_id: str = None,
            tree_id: str = None,
            source_url: str = None,
            answer: str = None,
            evidence: List[Dict[str, str]] = None) -> Optional[Instance]:
        """
        Turn a rule text, question, scenario, and dialogue history into an ``Instance``.

        Parameters
        ----------
        rule_text : ``str``, required
            Used (with a trailing ``[SEP]``) as the CopyNet source string and as the BERT passage.
        answer : ``str``, optional (default = None)
            Used as the CopyNet target string and to derive the ``label`` field during training.

        Returns
        -------
        Instance
            An instance with the CopyNet source/target fields, the BERT input field,
            the classification ``label`` (when ``answer`` is given), and metadata.
        """

        # For CopyNet Model
        source_string = rule_text + ' [SEP]'
        target_string = answer

        # pylint: disable=arguments-differ
        tokenized_source = self._source_tokenizer.tokenize(source_string)
        tokenized_source.insert(0, Token(START_SYMBOL))
        # tokenized_source.append(Token(END_SYMBOL))  # not needed: the trailing '[SEP]' acts as the end symbol
        source_field = TextField(tokenized_source, self._source_token_indexers)

        # For each token in the source sentence, we keep track of the matching token
        # in the target sentence (which will be the OOV symbol if there is no match).
        source_to_target_field = NamespaceSwappingField(
            tokenized_source[1:-1], self._target_namespace)

        meta_fields = {
            "source_tokens": [x.text for x in tokenized_source[1:-1]]
        }
        fields_dict = {
            "source_tokens": source_field,
            "source_to_target": source_to_target_field,
        }

        # For Bert model
        passage_text = rule_text + ' [SEP]'
        question_text = question
        question_text += ' @ss@ ' + scenario
        question_text += ' @hs@ '
        for follow_up_qna in history:
            question_text += '@qs@ '
            question_text += follow_up_qna['follow_up_question'] + ' '
            question_text += follow_up_qna['follow_up_answer'] + ' '
        question_text += '@he@'
        bert_input = passage_text + ' ' + question_text

        bert_input_tokens = self._bert_tokenizer.tokenize(bert_input)
        bert_input_tokens.insert(0, Token(START_SYMBOL))
        fields_dict['bert_input'] = TextField(bert_input_tokens,
                                              self._bert_token_indexers)
        meta_fields['passage_tokens'] = self._bert_tokenizer.tokenize(
            passage_text)

        if target_string is not None:
            tokenized_target = self._target_tokenizer.tokenize(target_string)
            tokenized_target.insert(0, Token(START_SYMBOL))
            tokenized_target.append(Token(END_SYMBOL))
            target_field = TextField(tokenized_target,
                                     self._target_token_indexers)

            fields_dict["target_tokens"] = target_field
            meta_fields["target_tokens"] = [
                y.text for y in tokenized_target[1:-1]
            ]
            source_and_target_token_ids = self._tokens_to_ids(
                tokenized_source[1:-1] + tokenized_target)
            source_token_ids = source_and_target_token_ids[:len(
                tokenized_source) - 2]
            fields_dict["source_token_ids"] = ArrayField(
                np.array(source_token_ids))
            target_token_ids = source_and_target_token_ids[len(tokenized_source
                                                               ) - 2:]
            fields_dict["target_token_ids"] = ArrayField(
                np.array(target_token_ids))

            action = 'More' if answer not in ['Yes', 'No', 'Irrelevant'] else answer
            fields_dict['label'] = LabelField(action)
        else:
            source_token_ids = self._tokens_to_ids(tokenized_source[1:-1])
            fields_dict["source_token_ids"] = ArrayField(
                np.array(source_token_ids))

        meta_fields['rule_text'] = rule_text
        meta_fields['question'] = question
        meta_fields['scenario'] = scenario
        meta_fields['history'] = history
        fields_dict["metadata"] = MetadataField(meta_fields)

        return Instance(fields_dict)
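A short stand-alone sketch of how the marker-token BERT input above is assembled, using toy rule/question/history values and plain string operations:

rule_text = "You must be 18 or older."
question = "Can I apply?"
scenario = "I am 17."
history = [{"follow_up_question": "Are you a resident?", "follow_up_answer": "Yes"}]

passage_text = rule_text + ' [SEP]'
question_text = question + ' @ss@ ' + scenario + ' @hs@ '
for follow_up_qna in history:
    question_text += '@qs@ ' + follow_up_qna['follow_up_question'] + ' ' + follow_up_qna['follow_up_answer'] + ' '
question_text += '@he@'
bert_input = passage_text + ' ' + question_text
# "You must be 18 or older. [SEP] Can I apply? @ss@ I am 17. @hs@ @qs@ Are you a resident? Yes @he@"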
Example #13
    def text_to_instance(self, graph, do_print=False) -> Instance:
        """
        Does the bulk of the work of converting a graph to an ``Instance`` of ``Fields``.
        """
        # pylint: disable=arguments-differ

        fields: Dict[str, Field] = {}

        max_tgt_length = None if self.eval else 60
        d = DecompGraph(graph, drop_syntax=self.drop_syntax, order=self.order)
        list_data = d.get_list_data(bos=START_SYMBOL,
                                    eos=END_SYMBOL,
                                    bert_tokenizer=self._tokenizer,
                                    max_tgt_length=max_tgt_length,
                                    semantics_only=self.semantics_only)
        if list_data is None:
            return None

        if do_print:
            self.spot_check(graph, list_data)

        # These four fields are used for seq2seq model and target side self copy
        fields["source_tokens"] = TextField(
            tokens=[Token(x) for x in list_data["src_tokens"]],
            token_indexers=self._source_token_indexers)

        if list_data['src_token_ids'] is not None:
            fields['source_subtoken_ids'] = ArrayField(
                list_data['src_token_ids'])
            self._number_bert_ids += len(list_data['src_token_ids'])
            self._number_bert_oov_ids += len([
                bert_id for bert_id in list_data['src_token_ids']
                if bert_id == 100
            ])

        if list_data['src_token_subword_index'] is not None:
            fields['source_token_recovery_matrix'] = ArrayField(
                list_data['src_token_subword_index'])

        # Target-side input.
        # (exclude the last one <EOS>.)
        fields["target_tokens"] = TextField(
            tokens=[Token(x) for x in list_data["tgt_tokens"][:-1]],
            token_indexers=self._target_token_indexers)

        if len(list_data['tgt_tokens']) > 60:
            self.over_len += 1

        fields["source_pos_tags"] = SequenceLabelField(
            labels=list_data["src_pos_tags"],
            sequence_field=fields["source_tokens"],
            label_namespace="pos_tags")

        if list_data["tgt_pos_tags"] is not None:
            fields["target_pos_tags"] = SequenceLabelField(
                labels=list_data["tgt_pos_tags"][:-1],
                sequence_field=fields["target_tokens"],
                label_namespace="pos_tags")

        fields["target_node_indices"] = SequenceLabelField(
            labels=list_data["tgt_indices"][:-1],
            sequence_field=fields["target_tokens"],
            label_namespace="node_indices",
        )

        # Target-side output.
        # Include <BOS> here because we want it in the generation vocabulary such that
        # at the inference starting stage, <BOS> can be correctly initialized.
        fields["generation_outputs"] = TextField(
            tokens=[Token(x) for x in list_data["tgt_tokens_to_generate"]],
            token_indexers=self._generation_token_indexers)

        fields["target_copy_indices"] = SequenceLabelField(
            labels=list_data["tgt_copy_indices"],
            sequence_field=fields["generation_outputs"],
            label_namespace="target_copy_indices",
        )

        # TODO: replace it with ArrayField.
        fields["target_attention_map"] = AdjacencyField(
            indices=list_data["tgt_copy_map"],
            sequence_field=fields["generation_outputs"],
            padding_value=0)

        # These two fields for source copy

        fields["source_copy_indices"] = SequenceLabelField(
            labels=list_data["src_copy_indices"],
            sequence_field=fields["generation_outputs"],
            label_namespace="source_copy_indices",
        )

        # TODO: replace it with ArrayField.
        fields["source_attention_map"] = AdjacencyField(
            indices=list_data["src_copy_map"],
            sequence_field=TextField(
                [Token(x) for x in
                 list_data["src_copy_vocab"].get_special_tok_list() + list_data["src_tokens"]],
                None),
            padding_value=0)

        # These two fields are used in biaffine parser
        fields["edge_types"] = TextField(
            tokens=[Token(x) for x in list_data["head_tags"]],
            token_indexers=self._edge_type_indexers)

        fields["edge_heads"] = SequenceLabelField(
            labels=list_data["head_indices"],
            sequence_field=fields["edge_types"],
            label_namespace="edge_heads")

        if list_data.get('node_mask', None) is not None:
            # Valid nodes are 1; pads are 0.
            fields['valid_node_mask'] = ArrayField(list_data['node_mask'])

        if list_data.get('edge_mask', None) is not None:
            # A matrix of shape [num_nodes, num_nodes] where entry (i, j) is 1
            # if and only if (1) j < i and (2) j is not an antecedent of i.
            # TODO: try to remove the second constraint.
            fields['edge_head_mask'] = ArrayField(list_data['edge_mask'])

        # node attributes
        #print(f"tgt attr {len(list_data['tgt_attributes'])}")
        #print(list_data['tgt_attributes'])
        #print(f"target tokens {len(fields['target_tokens'])}")
        #print(fields['target_tokens'])

        fields["target_attributes"] = ContinuousLabelField(
            labels=list_data["tgt_attributes"][:-1],
            sequence_field=fields["target_tokens"],
            ontology=NODE_ONTOLOGY)

        # edge attributes
        fields["edge_attributes"] = ContinuousLabelField(
            labels=list_data["edge_attributes"][:-1],
            sequence_field=fields["target_tokens"],
            ontology=EDGE_ONTOLOGY)

        # this field is actually needed for scoring later
        fields["graph"] = MetadataField(list_data['arbor_graph'])

        # Metadata fields, good for debugging
        fields["src_tokens_str"] = MetadataField(list_data["src_tokens"])

        fields["tgt_tokens_str"] = MetadataField(
            list_data.get("tgt_tokens", []))

        fields["src_copy_vocab"] = MetadataField(list_data["src_copy_vocab"])

        fields["tag_lut"] = MetadataField(dict(pos=list_data["pos_tag_lut"]))

        fields["source_copy_invalid_ids"] = MetadataField(
            list_data['src_copy_invalid_ids'])

        fields["node_name_list"] = MetadataField(list_data['node_name_list'])
        fields["target_dynamic_vocab"] = MetadataField(dict())

        fields["instance_meta"] = MetadataField(
            dict(
                pos_tag_lut=list_data["pos_tag_lut"],
                source_dynamic_vocab=list_data["src_copy_vocab"],
                target_token_indexers=self._target_token_indexers,
            ))

        to_print_keys = ["target_attributes", "target_tokens"]
        to_print = {k: v for k, v in fields.items() if k in to_print_keys}

        return Instance(fields)
Example #14
File: srl.py  Project: danieldeutsch/gcd
    def text_to_instance(
            self,  # type: ignore
            tokens: List[Token],
            pos_tags: List[str] = None,
            chunk_tags: List[str] = None,
            ner_tags: List[str] = None,
            target_verb_lemma: str = None,
            target_verb_position: int = None,
            verb_sense: str = None,
            legal_args: List[str] = None,
            verb_annotation: List[str] = None,
            parse: str = None) -> Instance:
        """
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
        """
        sequence = TextField(tokens, self._token_indexers)
        instance_fields: Dict[str, Field] = {'tokens': sequence}
        words = [x.text for x in tokens]
        instance_fields["metadata"] = MetadataField({
            "words": words,  # used in ai2's srl model
            "pos_tags": pos_tags,
            "chunk_tags": chunk_tags,
            "ner_tags": chunk_tags,
            "target_verb_lemma": target_verb_lemma,
            "target_verb_position": target_verb_position,
            "verb_annotation": verb_annotation,
            "verb_sense": verb_sense,
            "legal_args": legal_args,
            "verb": target_verb_lemma,  # used in ai2's srl model
            "parse": parse  # for constraints for the dev set srl
        })

        # This is the position of the gold verb predicate
        # We may or may not use it (the model might predict the predicate), but the reader always sends it.
        # instance_fields["verb_pos"] = IndexField(index=target_verb_position, sequence_field=sequence)

        # TODO Allennlp uses SequenceFeatureField for an indicator vector of the verb position (find this)
        # instance_fields["verb_indicator"] = SequenceFeatureField(index=target_verb_position, sequence_field=sequence)

        verb_indicator = np.zeros(len(tokens))
        verb_indicator[target_verb_position] = 1.0
        instance_fields["verb_indicator"] = ArrayField(array=verb_indicator)

        # everyone follows the default IOB2 == BIO format here
        coded_srl = get_bio_from_spans(verb_annotation,
                                       year=self.year,
                                       core_args_only=self.core_args_only)
        coded_chunks = chunk_tags
        coded_ner = ner_tags

        if self.coding_scheme == "BIOUL":
            # coded_srl = get_bio_from_spans(verb_annotation)
            coded_chunks = to_bioul(chunk_tags,
                                    encoding=self._original_coding_scheme
                                    ) if chunk_tags is not None else None
            coded_ner = to_bioul(ner_tags,
                                 encoding=self._original_coding_scheme
                                 ) if ner_tags is not None else None

        if 'pos' in self.feature_labels:
            if pos_tags is None:
                raise ConfigurationError(
                    "Dataset reader was specified to use pos_tags as "
                    "features. Pass them to text_to_instance.")
            instance_fields['pos_tags'] = SequenceLabelField(
                pos_tags, sequence, "pos_tags")
        if 'chunk' in self.feature_labels:
            if coded_chunks is None:
                raise ConfigurationError(
                    "Dataset reader was specified to use chunk tags as "
                    "features. Pass them to text_to_instance.")
            instance_fields['chunk_tags'] = SequenceLabelField(
                coded_chunks, sequence, "chunk_tags")
        if 'ner' in self.feature_labels:
            if coded_ner is None:
                raise ConfigurationError(
                    "Dataset reader was specified to use NER tags as "
                    " features. Pass them to text_to_instance.")
            instance_fields['ner_tags'] = SequenceLabelField(
                coded_ner, sequence, "ner_tags")

        # Add "tag label" to instance
        if self.tag_label == 'srl' and coded_srl is not None:
            instance_fields['tags'] = SequenceLabelField(
                coded_srl, sequence, self.label_namespace)
        elif self.tag_label == 'pos' and pos_tags is not None:
            instance_fields['tags'] = SequenceLabelField(
                pos_tags, sequence, self.label_namespace)
        elif self.tag_label == 'chunk' and coded_chunks is not None:
            instance_fields['tags'] = SequenceLabelField(
                coded_chunks, sequence, self.label_namespace)

        return Instance(instance_fields)
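A tiny stand-alone check of the BIO-to-BIOUL recoding used above; the import path follows AllenNLP 0.9 (as in these snippets) and the tag sequence is made up:

from allennlp.data.dataset_readers.dataset_utils import to_bioul

chunk_tags = ["B-NP", "I-NP", "O", "B-VP"]
print(to_bioul(chunk_tags, encoding="BIO"))   # ['B-NP', 'L-NP', 'O', 'U-VP']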
Example #15
    def text_to_instance(
            self,  # type: ignore
            tokens: List[str],
            predicate_indices: List[int],
            token_representations: FloatTensor = None,
            labels: List[float] = None):
        """
        Parameters
        ----------
        tokens : ``List[str]``, required.
            The tokens in the sentence to be encoded.
        predicate_indices : ``List[int]``, required.
            A List of int, where each item denotes the index of a
            token to predict a value for.
        token_representations: ``FloatTensor``, optional (default=``None``)
            Precomputed token representations to use in the instance. If ``None``,
            we use a ``Contextualizer`` provided to the dataset reader to calculate
            the token representations. Shape is (seq_len, representation_dim).
        labels : ``List[float]``, optional (default=``None``)
            The target values for the tokens at ``predicate_indices``. ``None`` indicates
            that labels are not provided.

        Returns
        -------
        An ``Instance`` containing the following fields:
            raw_tokens : ListField[MetadataField]
                The raw str tokens in the sequence. Each MetadataField stores the raw string
                of a single token.
            label_indices : ``SequenceArrayField``
                Array of shape (num_labels,) corresponding to the indices of tokens
                to predict a value for.
            token_representations: ``ArrayField``
                Contains the representation of the tokens.
            labels : ``SequenceArrayField``
                The labels corresponding to each index in ``label_indices``.
        """
        fields: Dict[str, Field] = {}

        # Add raw_tokens to the field
        if self._include_raw_tokens:
            fields["raw_tokens"] = ListField(
                [MetadataField(token) for token in tokens])

        # Add label_indices to the field
        label_indices_field = SequenceArrayField(
            # Subtract 1 since original data is 1-indexed
            # Pad with -1 since 0 (usually mask token) is a valid label index
            np.array(predicate_indices, dtype="int64") - 1,
            padding_value=-1)
        fields["label_indices"] = label_indices_field

        if token_representations is None and self._contextualizer:
            # Contextualize the tokens
            token_representations = self._contextualizer([tokens])[0]

        # Add representations of the tokens at the arc indices to the field
        # If we don't have representations, use an empty numpy array.
        if token_representations is not None:
            fields["token_representations"] = ArrayField(
                token_representations.numpy())
        if labels:
            fields["labels"] = SequenceArrayField(
                np.array(labels, dtype="float32"))
        return Instance(fields)
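A quick illustration of the index handling above: the reader receives 1-indexed predicate positions, shifts them to 0-indexed, and pads with -1 so a padded slot can never be confused with the valid index 0. The helper name below is made up for the sketch; it uses plain NumPy and does not depend on the AllenNLP field classes.

import numpy as np

def to_label_indices(predicate_indices, pad_to=None):
    """Convert 1-indexed positions to a 0-indexed array, padded with -1."""
    arr = np.array(predicate_indices, dtype="int64") - 1
    if pad_to is not None and pad_to > len(arr):
        arr = np.concatenate([arr, np.full(pad_to - len(arr), -1, dtype="int64")])
    return arr

print(to_label_indices([2, 5], pad_to=4))  # [ 1  4 -1 -1]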
    def text_to_instance(
            self,  # type: ignore
            premise: List[Tuple[str, float]],  # Important type information
            hypothesis: str,
            pid: str = None,
            label: str = None) -> Instance:

        fields: Dict[str, Field] = {}

        if self.shuffle_sentences:
            # Potential improvement: shuffle the input sentences. We may want to disable this for the last several epochs.
            random.shuffle(premise)

        premise_prob_list = []
        premise_tokens_list = []

        for premise_sent, prob in premise:
            tokenized_cur_sent = self.bert_servant.tokenize(
                premise_sent, modify_from_corenlp=True)
            # cur_sent_ids = self.bert_servant.tokens_to_ids(tokenized_cur_sent)

            if self.max_l is not None:
                # Truncate to the maximum sentence length (default 60).
                tokenized_cur_sent = tokenized_cur_sent[:self.max_l]

            premise_tokens_list.extend(tokenized_cur_sent)
            prob_value = np.ones(
                (len(tokenized_cur_sent), 1), dtype=np.float32) * prob
            premise_prob_list.append(prob_value)

        premise_prob = np.concatenate(premise_prob_list, axis=0)
        # premise_tokens_id_list = self.bert_servant.tokens_to_ids(premise_tokens_list)

        hypothesis_tokens_list = self.bert_servant.tokenize(
            hypothesis, modify_from_corenlp=True)

        # print("WTF!!!, p", len(premise_tokens_list))
        # print("WTF!!!, h", len(hypothesis_tokens_list))

        if self.max_l is not None:
            hypothesis_tokens_list = hypothesis_tokens_list[:self.max_l]

        hypothesis_prob = np.ones((len(hypothesis_tokens_list), 1),
                                  dtype=np.float32)

        assert len(premise_tokens_list) == len(premise_prob)
        assert len(hypothesis_tokens_list) == len(hypothesis_prob)

        paired_tokens_sequence = ['[CLS]'] + premise_tokens_list + [
            '[SEP]'
        ] + hypothesis_tokens_list + ['[SEP]']
        token_type_ids = [0] * (2 + len(premise_tokens_list)) + [1] * (
            1 + len(hypothesis_tokens_list))

        paired_ids_seq = self.bert_servant.tokens_to_ids(
            paired_tokens_sequence)
        assert len(paired_ids_seq) == len(token_type_ids)
        fields['paired_sequence'] = BertIndexField(
            np.asarray(paired_ids_seq, dtype=np.int64))
        fields['paired_token_type_ids'] = BertIndexField(
            np.asarray(token_type_ids, dtype=np.int64))

        # End is exclusive (important for later use).
        premise_span = (1, 1 + len(premise_tokens_list))
        hypothesis_span = (premise_span[1] + 1,
                           premise_span[1] + 1 + len(hypothesis_tokens_list))

        assert len(paired_ids_seq) == 1 + (premise_span[1] - premise_span[0]) + 1 + \
               (hypothesis_span[1] - hypothesis_span[0]) + 1

        fields['bert_premise_span'] = MetadataField(premise_span)
        fields['bert_hypothesis_span'] = MetadataField(hypothesis_span)

        fields['premise_probs'] = MetadataField(premise_prob)
        fields['hypothesis_probs'] = MetadataField(hypothesis_prob)

        if label:
            fields['label'] = LabelField(label, label_namespace='labels')

        if pid:
            fields['pid'] = IdField(pid)

        return Instance(fields)
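The paired-sequence construction above follows the standard BERT convention: [CLS] premise [SEP] hypothesis [SEP], with segment id 0 for the [CLS] + premise + first [SEP] block and 1 for the rest. A minimal sketch with toy token lists (no BertServant or AllenNLP fields involved):

premise_tokens = ["the", "cat", "sat"]
hypothesis_tokens = ["a", "cat", "is", "sitting"]

paired = ["[CLS]"] + premise_tokens + ["[SEP]"] + hypothesis_tokens + ["[SEP]"]
token_type_ids = [0] * (2 + len(premise_tokens)) + [1] * (1 + len(hypothesis_tokens))

assert len(paired) == len(token_type_ids)
print(list(zip(paired, token_type_ids)))
# [('[CLS]', 0), ('the', 0), ('cat', 0), ('sat', 0), ('[SEP]', 0),
#  ('a', 1), ('cat', 1), ('is', 1), ('sitting', 1), ('[SEP]', 1)]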
    def text_to_instance(
            self,  # type: ignore
            words: List[str],
            lemmas: List[str] = None,
            lemma_rules: List[str] = None,
            upos_tags: List[str] = None,
            xpos_tags: List[str] = None,
            feats: List[str] = None,
            separate_feats: List[Dict[str, str]] = None,
            dependencies: List[Tuple[str, int]] = None,
            ids: List[str] = None,
            multiword_ids: List[str] = None,
            multiword_forms: List[str] = None,
            langs: List[str] = None) -> Instance:
        fields: Dict[str, Field] = {}

        if self.use_lang_ids:
            # use ent_type_ for lang_ids
            tokens = TextField(
                [Token(text=w, ent_type_=l) for w, l in zip(words, langs)],
                self._token_indexers)
        else:
            tokens = TextField([Token(w) for w in words], self._token_indexers)
        fields["tokens"] = tokens

        names = ["upos", "xpos", "feats", "lemmas", "langs"]
        all_tags = [upos_tags, xpos_tags, feats, lemma_rules, langs]
        for name, field in zip(names, all_tags):
            if field:
                fields[name] = SequenceLabelField(field,
                                                  tokens,
                                                  label_namespace=name)

        if dependencies is not None:
            # We don't want to expand the label namespace with an additional dummy token, so we'll
            # always give the 'ROOT_HEAD' token a label of 'root'.
            fields["head_tags"] = SequenceLabelField(
                [x[0] for x in dependencies],
                tokens,
                label_namespace="head_tags")
            fields["head_indices"] = SequenceLabelField(
                [int(x[1]) for x in dependencies],
                tokens,
                label_namespace="head_index_tags")

        if self.use_separate_feats:
            feature_seq = []
            for feat_set in separate_feats:
                dimensions = {
                    dimension.replace('[', '_').replace(']', '_'): "_"
                    for dimension in self.ud_feats_schema
                }

                if feat_set != "_":
                    for dimension in feat_set:
                        dimensions[dimension.replace('[', '_').replace(
                            ']', '_')] = feat_set[dimension]

                feature_seq.append(dimensions)

            for dimension in self.ud_feats_schema:
                d = dimension.replace('[', '_').replace(']', '_')
                labels = [f[d] for f in feature_seq]
                fields[d] = SequenceLabelField(labels,
                                               tokens,
                                               label_namespace=d)

        fields["metadata"] = MetadataField({
            "words": words,
            "upos_tags": upos_tags,
            "xpos_tags": xpos_tags,
            "feats": feats,
            "lemmas": lemmas,
            "lemma_rules": lemma_rules,
            "ids": ids,
            "multiword_ids": multiword_ids,
            "multiword_forms": multiword_forms,
            "langs": langs
        })

        return Instance(fields)
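For the separate-feats branch above, each UD feature dimension becomes its own label sequence, with '[' and ']' normalized to '_' so the dimension name can double as a field and namespace key. A small sketch with a made-up two-dimension schema; the real reader takes the schema from self.ud_feats_schema:

ud_feats_schema = ["Case", "Number[psor]"]
separate_feats = [{"Case": "Nom"}, "_", {"Number[psor]": "Sing"}]

feature_seq = []
for feat_set in separate_feats:
    dimensions = {d.replace('[', '_').replace(']', '_'): "_" for d in ud_feats_schema}
    if feat_set != "_":
        for dimension in feat_set:
            dimensions[dimension.replace('[', '_').replace(']', '_')] = feat_set[dimension]
    feature_seq.append(dimensions)

for dimension in ud_feats_schema:
    d = dimension.replace('[', '_').replace(']', '_')
    print(d, [f[d] for f in feature_seq])
# Case ['Nom', '_', '_']
# Number_psor_ ['_', '_', 'Sing']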
    def text_to_instance(
            self,  # type: ignore
            tokens: List[str],
            lemmas: List[str] = None,
            pos_tags: List[str] = None,
            arc_indices: List[Tuple[int, int]] = None,
            arc_tags: List[str] = None,
            gold_actions: List[str] = None,
            arc_descendants: List[str] = None,
            root_id: List[int] = None,
            meta_info: List[str] = None,
            tokens_range: List[Tuple[int, int]] = None,
            gold_mrps: List[str] = None,
            deprels: List[str] = None,
            lex_infos: List[List[str]] = None) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        token_field = TextField([Token(t) for t in tokens],
                                self._token_indexers)

        fields["tokens"] = token_field
        meta_dict = {"tokens": tokens}

        if arc_indices is not None and arc_tags is not None:
            meta_dict["arc_indices"] = arc_indices
            meta_dict["arc_tags"] = arc_tags
            fields["arc_tags"] = TextField([Token(a) for a in arc_tags],
                                           self._arc_tag_indexers)

        if gold_actions is not None:
            meta_dict["gold_actions"] = gold_actions
            fields["gold_actions"] = TextField(
                [Token(a) for a in gold_actions], self._action_indexers)

        if pos_tags is not None and self.pos_tags:
            fields["pos_tags"] = SequenceLabelField(pos_tags,
                                                    token_field,
                                                    label_namespace="pos")
        if arc_descendants is not None:
            meta_dict["arc_descendants"] = arc_descendants

        if root_id is not None:
            meta_dict["root_id"] = root_id[0]

        if meta_info is not None:
            meta_dict["meta_info"] = meta_info[0]

        if tokens_range is not None:
            meta_dict["tokens_range"] = tokens_range

        if gold_mrps is not None:
            meta_dict["gold_mrps"] = gold_mrps[0]
        if deprels is not None and self.deprels:
            fields["deprels"] = SequenceLabelField(deprels,
                                                   token_field,
                                                   label_namespace="deprels")

        if lex_infos is not None:
            bios, lexcat, ss, ss2 = zip(*tuple(lex_infos))
            if self.bios:
                fields["bios"] = SequenceLabelField(bios,
                                                    token_field,
                                                    label_namespace="bios")
            if self.lexcat:
                fields["lexcat"] = SequenceLabelField(lexcat,
                                                      token_field,
                                                      label_namespace="lexcat")
            if self.ss:
                fields["ss"] = SequenceLabelField(ss,
                                                  token_field,
                                                  label_namespace="ss")
            if self.ss2:
                fields["ss2"] = SequenceLabelField(ss2,
                                                   token_field,
                                                   label_namespace="ss2")

        fields["metadata"] = MetadataField(meta_dict)

        return Instance(fields)
示例#19
def make_reading_comprehension_instance(
        question_tokens: List[Token],
        passage_tokens: List[Token],
        token_indexers: Dict[str, TokenIndexer],
        passage_text: str,
        token_spans_sent: List[Tuple[int, int]] = None,
        sent_labels: List[int] = None,
        answer_texts: List[str] = None,
        passage_offsets: List[Tuple] = None,
        evd_possible_chains: List[List[int]] = None,
        ans_sent_idxs: List[int] = None,
        article_id: str = None,
        para_limit: int = 2250) -> Instance:
    """
    Parameters
    ----------
    question_tokens : ``List[Token]``
        An already-tokenized question.
    passage_tokens : ``List[Token]``
        An already-tokenized passage that contains the answer to the given question.
    token_indexers : ``Dict[str, TokenIndexer]``
        Determines how the question and passage ``TextFields`` will be converted into tensors that
        get input to a model.  See :class:`TokenIndexer`.
    passage_text : ``str``
        The original passage text.  We need this so that we can recover the actual span from the
        original passage that the model predicts as the answer to the question.  This is used in
        official evaluation scripts.
    token_spans_sent : ``List[Tuple[int, int]]``, optional
        Indices into ``passage_tokens`` giving the start and end of each sentence in the
        passage.  These spans are used to build the ``sentence_spans`` field and, together
        with ``sent_labels``, to mark which sentences are supporting facts.
    answer_texts : ``List[str]``, optional
        All valid answer strings for the given question.  In SQuAD, e.g., the training set has
        exactly one answer per question, but the dev and test sets have several.  TriviaQA has many
        possible answers, which are the aliases for the known correct entity.  This is put into the
        metadata for use with official evaluation scripts, but not used anywhere else.
    sent_labels : ``List[int]``, optional
        Binary labels, one per sentence span, indicating whether the sentence is a
        supporting fact.
    para_limit : ``int``, optional (default = 2250)
        The maximum number of passage tokens to keep; longer passages are truncated.
    """
    fields: Dict[str, Field] = {}
    limit = min(para_limit, len(passage_tokens))
    passage_tokens = passage_tokens[:limit]
    # This is separate so we can reference it later with a known type.
    passage_field = TextField(passage_tokens, token_indexers)
    # sent_spans: list of SpanField[sent_start, sent_end], denoting the start and end offset of each sentence
    # sent_labels_: list of labels, denoting whether each sentence is a supporting fact
    sent_spans, sent_labels_ = process_sent_spans(token_spans_sent,
                                                  sent_labels, passage_field,
                                                  para_limit)
    fields['sent_labels'] = ListField(sent_labels_)
    fields['sentence_spans'] = ListField(sent_spans)
    fields['passage'] = passage_field
    fields['question'] = TextField(question_tokens, token_indexers)

    # filter spans that exceed para limit so that the info in metadata is correct
    token_spans_sent = [(s, e if e < limit else limit - 1)
                        for s, e in token_spans_sent if s < limit]
    sent_labels = sent_labels[:len(token_spans_sent)]
    evd_possible_chains_ = process_evidence_chains(evd_possible_chains,
                                                   sent_labels_, fields)

    metadata = make_meta_data(passage_text, passage_offsets, question_tokens,
                              passage_tokens, token_spans_sent, sent_labels,
                              answer_texts, evd_possible_chains,
                              evd_possible_chains_, ans_sent_idxs, article_id)
    fields['metadata'] = MetadataField(metadata)
    return Instance(fields)
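The span bookkeeping above can be summarized as: truncate the passage to para_limit tokens, drop sentence spans that start past the limit, and clip spans that end past it so the metadata stays consistent with the truncated passage. A self-contained sketch of that filtering step with toy values:

para_limit = 10
passage_len = 25
token_spans_sent = [(0, 4), (5, 12), (13, 20)]
sent_labels = [1, 0, 1]

limit = min(para_limit, passage_len)
kept_spans = [(s, e if e < limit else limit - 1)
              for s, e in token_spans_sent if s < limit]
kept_labels = sent_labels[:len(kept_spans)]

print(kept_spans)   # [(0, 4), (5, 9)]
print(kept_labels)  # [1, 0]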
示例#20
    def text_to_instance(
        self,  # type: ignore
        sentence: str,
        structured_representations: List[List[List[JsonDict]]],
        labels: List[str] = None,
        target_sequences: List[List[str]] = None,
        identifier: str = None,
    ) -> Instance:
        """
        Parameters
        ----------
        sentence : ``str``
            The query sentence.
        structured_representations : ``List[List[List[JsonDict]]]``
            A list of Json representations of all the worlds. See expected format in this class' docstring.
        labels : ``List[str]`` (optional)
            List of string representations of the labels (true or false) corresponding to the
            ``structured_representations``. Not required while testing.
        target_sequences : ``List[List[str]]`` (optional)
            List of target action sequences for each element which lead to the correct denotation in
            worlds corresponding to the structured representations.
        identifier : ``str`` (optional)
            The identifier from the dataset if available.
        """
        worlds = []
        for structured_representation in structured_representations:
            boxes = {
                Box(object_list, box_id)
                for box_id, object_list in enumerate(structured_representation)
            }
            worlds.append(NlvrLanguage(boxes))
        tokenized_sentence = self._tokenizer.tokenize(sentence)
        sentence_field = TextField(tokenized_sentence, self._sentence_token_indexers)
        production_rule_fields: List[Field] = []
        instance_action_ids: Dict[str, int] = {}
        # TODO(pradeep): Assuming that possible actions are the same in all worlds. This may change
        # later.
        for production_rule in worlds[0].all_possible_productions():
            instance_action_ids[production_rule] = len(instance_action_ids)
            field = ProductionRuleField(production_rule, is_global_rule=True)
            production_rule_fields.append(field)
        action_field = ListField(production_rule_fields)
        worlds_field = ListField([MetadataField(world) for world in worlds])
        metadata: Dict[str, Any] = {"sentence_tokens": [x.text for x in tokenized_sentence]}
        fields: Dict[str, Field] = {
            "sentence": sentence_field,
            "worlds": worlds_field,
            "actions": action_field,
            "metadata": MetadataField(metadata),
        }
        if identifier is not None:
            fields["identifier"] = MetadataField(identifier)
        # Depending on the type of supervision used for training the parser, we may want either
        # target action sequences or an agenda in our instance. We check if target sequences are
        # provided, and include them if they are. If not, we'll get an agenda for the sentence, and
        # include that in the instance.
        if target_sequences:
            action_sequence_fields: List[Field] = []
            for target_sequence in target_sequences:
                index_fields = ListField(
                    [
                        IndexField(instance_action_ids[action], action_field)
                        for action in target_sequence
                    ]
                )
                action_sequence_fields.append(index_fields)
                # TODO(pradeep): Define a max length for this field.
            fields["target_action_sequences"] = ListField(action_sequence_fields)
        elif self._output_agendas:
            # TODO(pradeep): Assuming every world gives the same agenda for a sentence. This is true
            # now, but may change later too.
            agenda = worlds[0].get_agenda_for_sentence(sentence)
            assert agenda, "No agenda found for sentence: %s" % sentence
            # agenda_field contains indices into actions.
            agenda_field = ListField(
                [IndexField(instance_action_ids[action], action_field) for action in agenda]
            )
            fields["agenda"] = agenda_field
        if labels:
            labels_field = ListField(
                [LabelField(label, label_namespace="denotations") for label in labels]
            )
            fields["labels"] = labels_field

        return Instance(fields)
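The core bookkeeping in the parser readers above is the mapping from production-rule strings to positions in the actions ListField, which later lets target action sequences and agendas be expressed as IndexFields. A minimal dictionary-based sketch; the rule strings below are made up and much simpler than the real NlvrLanguage productions:

all_possible_productions = ["@start@ -> bool", "bool -> [<Set:bool>, Set]", "Set -> all_boxes"]

instance_action_ids = {}
for production_rule in all_possible_productions:
    instance_action_ids[production_rule] = len(instance_action_ids)

target_sequence = ["@start@ -> bool", "bool -> [<Set:bool>, Set]", "Set -> all_boxes"]
target_indices = [instance_action_ids[action] for action in target_sequence]
print(target_indices)  # [0, 1, 2]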
示例#21
    def make_marginal_drop_instance(
        question_tokens: List[Token],
        passage_tokens: List[Token],
        number_tokens: List[Token],
        number_indices: List[int],
        token_indexers: Dict[str, TokenIndexer],
        passage_text: str,
        answer_info: Dict[str, Any] = None,
        additional_metadata: Dict[str, Any] = None,
    ) -> Instance:
        additional_metadata = additional_metadata or {}
        fields: Dict[str, Field] = {}
        passage_offsets = [(token.idx, token.idx + len(token.text))
                           for token in passage_tokens]
        question_offsets = [(token.idx, token.idx + len(token.text))
                            for token in question_tokens]

        # This is separate so we can reference it later with a known type.
        passage_field = TextField(passage_tokens, token_indexers)
        question_field = TextField(question_tokens, token_indexers)
        fields["passage"] = passage_field
        fields["question"] = question_field
        number_index_fields: List[Field] = [
            IndexField(index, passage_field) for index in number_indices
        ]
        fields["number_indices"] = ListField(number_index_fields)
        # This field is actually not required in the model;
        # it is only used to create the `answer_as_add_sub_expressions` field, which is a `SequenceLabelField`.
        # We cannot use the `number_indices` field for creating that, because the `ListField` will not be empty
        # when we want to create a new empty field. That would lead to an error.
        numbers_in_passage_field = TextField(number_tokens, token_indexers)
        metadata = {
            "original_passage": passage_text,
            "passage_token_offsets": passage_offsets,
            "question_token_offsets": question_offsets,
            "question_tokens": [token.text for token in question_tokens],
            "passage_tokens": [token.text for token in passage_tokens],
            "number_tokens": [token.text for token in number_tokens],
            "number_indices": number_indices,
        }
        if answer_info:
            metadata["answer_texts"] = answer_info["answer_texts"]

            passage_span_fields: List[Field] = [
                SpanField(span[0], span[1], passage_field)
                for span in answer_info["answer_passage_spans"]
            ]
            if not passage_span_fields:
                passage_span_fields.append(SpanField(-1, -1, passage_field))
            fields["answer_as_passage_spans"] = ListField(passage_span_fields)

            question_span_fields: List[Field] = [
                SpanField(span[0], span[1], question_field)
                for span in answer_info["answer_question_spans"]
            ]
            if not question_span_fields:
                question_span_fields.append(SpanField(-1, -1, question_field))
            fields["answer_as_question_spans"] = ListField(
                question_span_fields)

            add_sub_signs_field: List[Field] = []
            for signs_for_one_add_sub_expression in answer_info[
                    "signs_for_add_sub_expressions"]:
                add_sub_signs_field.append(
                    SequenceLabelField(signs_for_one_add_sub_expression,
                                       numbers_in_passage_field))
            if not add_sub_signs_field:
                add_sub_signs_field.append(
                    SequenceLabelField([0] * len(number_tokens),
                                       numbers_in_passage_field))
            fields["answer_as_add_sub_expressions"] = ListField(
                add_sub_signs_field)

            count_fields: List[Field] = [
                LabelField(count_label, skip_indexing=True)
                for count_label in answer_info["counts"]
            ]
            if not count_fields:
                count_fields.append(LabelField(-1, skip_indexing=True))
            fields["answer_as_counts"] = ListField(count_fields)

        metadata.update(additional_metadata)
        fields["metadata"] = MetadataField(metadata)
        return Instance(fields)
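The offset lists built at the top of this method assume tokens that expose a character offset via token.idx (as AllenNLP's Token does); each offset is simply (start_char, start_char + token length). A small sketch with a stand-in Token type used purely for illustration:

from collections import namedtuple

Token = namedtuple("Token", ["text", "idx"])
passage_tokens = [Token("The", 0), Token("answer", 4), Token("is", 11), Token("42", 14)]

passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]
print(passage_offsets)  # [(0, 3), (4, 10), (11, 13), (14, 16)]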
示例#22
    def text_to_instance(
            self,
            item: Dict,
            entity_map: Dict,
            literals: Set,
            logical_forms: List = None) -> Instance:  # type: ignore
        qid = MetadataField(item['qid'])
        if item['qid'] in [2102902009000]:  # will exceed the maximum length constraint
            return None

        if not self._use_sparql:
            if 's_expression' in item:
                target_string = item['s_expression']
            else:
                target_string = None
        else:
            if 'sparql_query' in item:
                target_string = item['sparql_query']
            else:
                target_string = None
        item['question'] = item['question'].replace(self._delimiter, ' ')
        # if self._training:
        if self._use_constrained_vocab and len(entity_map) > 0:
            if not self._training:
                constrained_vocab = self._get_constrained_vocab(
                    entity_map, literals)
            else:
                logical_form = (item['s_expression'] if not self._use_sparql
                                else item['sparql_query'])
                domains = item['domains'] if not self._gq1 else None
                constrained_vocab = self._get_constrained_vocab(
                    entity_map,
                    literals,
                    s_expression=logical_form,
                    domains=domains)
        elif len(entity_map) == 0 and self._training:
            vocab = set()
            vocab.update(self._schema_constants)
            vocab = list(vocab)
            random.shuffle(vocab)
            vocab = set(vocab[:200])
            if not self._use_sparql:
                vocab.update(
                    [x for x in self._target_tokenizer(item['s_expression'])])
            else:
                vocab.update(
                    [x for x in self._target_tokenizer(item['sparql_query'])])

            constrained_vocab = list(vocab)
        else:
            vocab = set()
            vocab.update(self._schema_constants)
            for eid in entity_map:
                vocab.add(eid)

            for l in literals:
                vocab.add(l)

            constrained_vocab = list(vocab)

        # schema_constants = constrained_vocab[:]
        # always fix the position of END_SYMBOL, START_SYMBOL in each constrained vocab,
        # because a consistent global shared end_index / start_index is needed by BeamSearch
        # Here we also fix the position for all other syntactic constants for the convenience
        # of embeddings computing
        for k, v in {
                k: v
                for k, v in sorted(self._global_syntax_constants_vocab.items(),
                                   key=lambda x: x[1])
        }.items():
            constrained_vocab.insert(v, k)

        schema_constants = constrained_vocab[:]

        # dividing the schema constants into num_constants_per_group every group
        concat_strings = [
            '' for _ in range(
                len(schema_constants) // self._num_constants_per_group + 1)
        ]
        for i in range(
                len(schema_constants) // self._num_constants_per_group + 1):
            if (i + 1) * self._num_constants_per_group <= len(
                    schema_constants):
                right_index = (i + 1) * self._num_constants_per_group
            else:
                right_index = len(schema_constants)
            for constant in schema_constants[
                    i * self._num_constants_per_group:right_index]:
                if constant in entity_map:  # get the representation for an entity from its friendly name
                    constant = entity_map[constant]
                if constant == '.':  # '.' in SPARQL means 'and'
                    constant = 'and'
                concat_strings[i] += ' '.join(
                    re.split(r'\.|_', constant.lower())) + self._delimiter
        # handle sequence of length > 512 (dividing the schema constants into num_constants_per_group every group)
        # _source_tokenizer.tokenize will append the head [CLS] and ending [SEP] by itself
        tokenized_sources = [
            self._source_tokenizer.tokenize(item['question'] + '[SEP]' +
                                            concat_string)
            for concat_string in concat_strings
        ]

        end = []
        start = []
        for tokenized_source in tokenized_sources:
            flag = False
            for i, token in enumerate(tokenized_source):
                if flag and str(token) == self._delimiter:
                    end.append(i - 1)
                    start.append(i + 1)
                if str(token) == '[SEP]':
                    if not flag:
                        start.append(i + 1)
                    flag = True

            start = start[:-1]  # ignore the last ';'

        # unit test for concatenation
        # print(len(constrained_vocab), constrained_vocab)
        # for i, tokenized_source in enumerate(tokenized_sources):
        #     print(constrained_vocab[14 + 50*i: 14 + min(50*(i + 1), len(start))])
        #     print(start[50*i:min(50*(i + 1), len(start))])
        #     print(end[50*i:min(50*(i + 1), len(start))])
        #     print(tokenized_source)

        # source_field = ListField(
        # [TextField(tokenized_source, self._source_token_indexers) for tokenized_source in tokenized_sources])

        source_field = []
        for tokenized_source in tokenized_sources:
            chunk = TextField(tokenized_source, self._source_token_indexers)
            if len(chunk) > self._source_max_tokens:
                print('Source chunk of', len(chunk), 'tokens exceeds the limit for qid', item['qid'])
                exit(-1)
            source_field.append(chunk)
        source_field = ListField(source_field)

        # vocab_field = TextField([Token(x) for x in constrained_vocab], self._target_token_indexers)
        vocab_field = MetadataField(constrained_vocab)
        # if len(constrained_vocab) != 14 + len(start):
        if len(constrained_vocab) != len(start):
            print(entity_map)
        # assert len(constrained_vocab) == 14 + len(start)
        assert len(constrained_vocab) == len(start)

        instance_dict = {
            "source_tokens":
            source_field,  # The concatenation of utterance and schema constants
            # The start position for each schema constant in the concatenated input.
            "schema_start": MetadataField(start),
            # The end position ...
            "schema_end": MetadataField(end),
            "constrained_vocab": vocab_field,
            "ids": qid
        }

        # If you want to use F1 during training, uncomment this!
        # if 'answer' in item:
        #     answer = []
        #     for a in item['answer']:
        #         answer.append(a['answer_argument'])
        #     instance_dict['answer'] = MetadataField(answer)

        # print("num lfs: ", len(logical_forms))
        if not self._training and self._ranking_mode and logical_forms:
            lfs = []
            for lf in logical_forms:
                try:
                    lf_field = self._convert_target_to_indices(
                        lf, constrained_vocab, vocab_field)
                    lfs.append(lf_field)
                except Exception:
                    pass
            if len(lfs) == 0:
                return None
            candidates = ListField(lfs)
            instance_dict["candidates"] = candidates

            print(len(candidates))

        if target_string is not None:
            target_field = self._convert_target_to_indices(
                target_string, constrained_vocab, vocab_field)
            # The id of each target token in constrained_vocab.
            instance_dict["target_tokens"] = target_field

        return Instance(instance_dict)
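The grouping logic above splits the (possibly very long) constrained vocabulary into chunks of num_constants_per_group so that each chunk, concatenated with the question, fits under the transformer length limit. A simplified sketch of that partitioning, with made-up constants and without the entity-name substitutions the real code performs:

num_constants_per_group = 3
schema_constants = ["people.person", "music.artist", "location.city",
                    "film.actor", "book.author"]

num_groups = len(schema_constants) // num_constants_per_group + 1
groups = [schema_constants[i * num_constants_per_group:
                           min((i + 1) * num_constants_per_group, len(schema_constants))]
          for i in range(num_groups)]
print(groups)
# [['people.person', 'music.artist', 'location.city'], ['film.actor', 'book.author']]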
示例#23
    def text_to_instance(  # type: ignore
        self, utterances: List[str], sql_query_labels: List[str] = None
    ) -> Instance:
        """
        Parameters
        ----------
        utterances: ``List[str]``, required.
            List of utterances in the interaction, the last element is the current utterance.
        sql_query_labels: ``List[str]``, optional
            The SQL queries that are given as labels during training or validation.
        """
        if self._num_turns_to_concatenate:
            utterances[-1] = f" {END_OF_UTTERANCE_TOKEN} ".join(
                utterances[-self._num_turns_to_concatenate :]
            )

        utterance = utterances[-1]
        action_sequence: List[str] = []

        if not utterance:
            return None

        world = AtisWorld(utterances=utterances)

        if sql_query_labels:
            # If there are multiple sql queries given as labels, we use the shortest
            # one for training.
            sql_query = min(sql_query_labels, key=len)
            try:
                action_sequence = world.get_action_sequence(sql_query)
            except ParseError:
                action_sequence = []
                logger.debug("Parsing error")

        tokenized_utterance = self._tokenizer.tokenize(utterance.lower())
        utterance_field = TextField(tokenized_utterance, self._token_indexers)

        production_rule_fields: List[Field] = []

        for production_rule in world.all_possible_actions():
            nonterminal, _ = production_rule.split(" ->")
            # The whitespaces are not semantically meaningful, so we filter them out.
            production_rule = " ".join(
                [token for token in production_rule.split(" ") if token != "ws"]
            )
            field = ProductionRuleField(production_rule, self._is_global_rule(nonterminal))
            production_rule_fields.append(field)

        action_field = ListField(production_rule_fields)
        action_map = {
            action.rule: i for i, action in enumerate(action_field.field_list)  # type: ignore
        }
        index_fields: List[Field] = []
        world_field = MetadataField(world)
        fields = {
            "utterance": utterance_field,
            "actions": action_field,
            "world": world_field,
            "linking_scores": ArrayField(world.linking_scores),
        }

        if sql_query_labels is not None:
            fields["sql_queries"] = MetadataField(sql_query_labels)
            if self._keep_if_unparseable or action_sequence:
                for production_rule in action_sequence:
                    index_fields.append(IndexField(action_map[production_rule], action_field))
                if not action_sequence:
                    index_fields = [IndexField(-1, action_field)]
                action_sequence_field = ListField(index_fields)
                fields["target_action_sequence"] = action_sequence_field
            else:
                # If we are given a SQL query, but we are unable to parse it, and we do not specify explicitly
                # to keep it, then we will skip it.
                return None

        return Instance(fields)
示例#24
    def text_to_instance(  # type: ignore
        self,
        tokens: List[Token],
        pos_tags: List[str] = None,
        chunk_tags: List[str] = None,
        ner_tags: List[str] = None,
    ) -> Instance:
        """
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
        """

        sequence = TextField(tokens, self._token_indexers)
        instance_fields: Dict[str, Field] = {"tokens": sequence}
        instance_fields["metadata"] = MetadataField({"words": [x.text for x in tokens]})

        # Recode the labels if necessary.
        if self.coding_scheme == "BIOUL":
            coded_chunks = (
                to_bioul(chunk_tags, encoding=self._original_coding_scheme)
                if chunk_tags is not None
                else None
            )
            coded_ner = (
                to_bioul(ner_tags, encoding=self._original_coding_scheme)
                if ner_tags is not None
                else None
            )
        else:
            # the default IOB1
            coded_chunks = chunk_tags
            coded_ner = ner_tags

        # Add "feature labels" to instance
        if "pos" in self.feature_labels:
            if pos_tags is None:
                raise ConfigurationError(
                    "Dataset reader was specified to use pos_tags as "
                    "features. Pass them to text_to_instance."
                )
            instance_fields["pos_tags"] = SequenceLabelField(pos_tags, sequence, "pos_tags")
        if "chunk" in self.feature_labels:
            if coded_chunks is None:
                raise ConfigurationError(
                    "Dataset reader was specified to use chunk tags as "
                    "features. Pass them to text_to_instance."
                )
            instance_fields["chunk_tags"] = SequenceLabelField(coded_chunks, sequence, "chunk_tags")
        if "ner" in self.feature_labels:
            if coded_ner is None:
                raise ConfigurationError(
                    "Dataset reader was specified to use NER tags as "
                    " features. Pass them to text_to_instance."
                )
            instance_fields["ner_tags"] = SequenceLabelField(coded_ner, sequence, "ner_tags")

        # Add "tag label" to instance
        if self.tag_label == "ner" and coded_ner is not None:
            instance_fields["tags"] = SequenceLabelField(coded_ner, sequence, self.label_namespace)
        elif self.tag_label == "pos" and pos_tags is not None:
            instance_fields["tags"] = SequenceLabelField(pos_tags, sequence, self.label_namespace)
        elif self.tag_label == "chunk" and coded_chunks is not None:
            instance_fields["tags"] = SequenceLabelField(
                coded_chunks, sequence, self.label_namespace
            )

        return Instance(instance_fields)
    def text_to_instance(
            self,  # type: ignore
            words: List[str],
            upos_tags: List[str],
            dependencies: List[Tuple[str, int]] = None,
            entities: List[str] = None) -> Instance:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        words : ``List[str]``, required.
            The words in the sentence to be encoded.
        upos_tags : ``List[str]``, required.
            The universal dependencies POS tags for each word.
        dependencies : ``List[Tuple[str, int]]``, optional (default = None)
            A list of (head tag, head index) tuples. Indices are 1 indexed,
            meaning an index of 0 corresponds to that word being the root of
            the dependency tree.
        entities : ``List[str]``, optional (default = None)
            Entity tags for each word; these are only stored in the instance metadata.

        Returns
        -------
        An instance containing words, upos tags, dependency head tags and head
        indices as fields.
        """
        fields: Dict[str, Field] = {}

        # if self.tokenizer is not None:
        #     tokens = self.tokenizer.tokenize(" ".join(words))
        # else:
        #     tokens = [Token(t) for t in words]

        characters = [c for word in words for c in word]

        characters = [Token(c) for c in characters]
        character_field = TextField(characters, self._token_indexers)

        spans = []
        start = 0
        for word in words:
            spans.append(
                SpanField(start, start + len(word) - 1, character_field))
            start += len(word)
        character_span_field = ListField(spans)
        fields["character_spans"] = character_span_field

        # text_field = TextField(tokens, self._token_indexers)
        fields["characters"] = character_field

        fields["pos_tags"] = SequenceLabelField(upos_tags,
                                                character_span_field,
                                                label_namespace="pos")
        if dependencies is not None:
            # We don't want to expand the label namespace with an additional dummy token, so we'll
            # always give the 'ROOT_HEAD' token a label of 'root'.
            fields["head_tags"] = SequenceLabelField(
                [x[0] for x in dependencies],
                character_span_field,
                label_namespace="head_tags")
            fields["head_indices"] = SequenceLabelField(
                [int(x[1]) for x in dependencies],
                character_span_field,
                label_namespace="head_index_tags")

        fields["metadata"] = MetadataField({
            "words": words,
            "pos": upos_tags,
            "entities": entities
        })
        return Instance(fields)
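The character-span construction above gives, for every word, the inclusive start and end positions of its characters in the flattened character sequence. A minimal sketch of the same arithmetic without the AllenNLP field types:

words = ["Il", "mange", "une", "pomme"]

characters = [c for word in words for c in word]
spans = []
start = 0
for word in words:
    spans.append((start, start + len(word) - 1))  # inclusive end, as in SpanField
    start += len(word)

print(len(characters))  # 15
print(spans)            # [(0, 1), (2, 6), (7, 9), (10, 14)]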
示例#26
    def __getitem__(self, index):
        # if self.split == 'test':
        #     raise ValueError("blind test mode not supported quite yet")
        item = deepcopy(self.items[index])
        image_id = int(item['img_id'].split('-')[-1])

        with h5py.File(self.tag_feature_path, 'r') as h5:
            tag_features = np.array(h5[str(image_id)]['features'], dtype=np.float32)
            tag_boxes = np.array(h5[str(image_id)]['boxes'], dtype=np.float32)
            tag_obj_indices = np.array(h5[str(image_id)]['obj_indices'], dtype=np.int64)

        with h5py.File(self.non_tag_feature_path, 'r') as h5:
            non_tag_boxes = np.array(h5[str(image_id)]['boxes'], dtype=np.float32)
            non_tag_obj_indices = np.array(h5[str(image_id)]['obj_indices'], dtype=np.int64)
            non_tag_features = np.array(h5[str(image_id)]['features'], dtype=np.float32)
        ###################################################################
        # Load questions and answers

        non_tag_question_annotid2detidx = self.non_tag_question_annotid2detidx[item['annot_id']]
        non_tag_answer_annotid2detidx = self.non_tag_answer_annotid2detidx[item['annot_id']]
        non_tag_rationale_annotid2detidx = self.non_tag_rationale_annotid2detidx[item['annot_id']]
        
        if self.mode == 'answer':
            question_annotid2detidx = non_tag_question_annotid2detidx
            answer_annotid2detidx = non_tag_answer_annotid2detidx
        else:
            conditioned_label = item['answer_label'] if self.split != 'test' else self.conditioned_answer_choice
            q_len = len(item['question'])
            question_annotid2detidx = {}
            for k,v in non_tag_question_annotid2detidx.items():
                question_annotid2detidx[k] = v
            for k,v in non_tag_answer_annotid2detidx[conditioned_label].items():
                question_annotid2detidx[k+q_len] = v
            answer_annotid2detidx = non_tag_rationale_annotid2detidx

        if self.mode == 'rationale':
            conditioned_label = item['answer_label'] if self.split != 'test' else self.conditioned_answer_choice
            item['question'] += item['answer_choices'][conditioned_label]

        with h5py.File(self.h5fn, 'r') as h5:
            grp_items = {k: np.array(v, dtype=np.float16) for k, v in h5[str(index)].items()}

        answer_choices = item['{}_choices'.format(self.mode)]
        dets2use, old_det_to_new_ind = self._get_dets_to_use(item)
        non_tag_dets2use, non_tag_old_det_to_new_ind = self._get_non_tag_det_to_use(question_annotid2detidx, answer_annotid2detidx, len(non_tag_boxes))

        if self.add_image_as_a_box:
            assert len(dets2use) == np.max(old_det_to_new_ind)
            non_tag_old_det_to_new_ind += 1

        # shift the non_tag detection idx, effectively as appending the non_tag detections to tag detections
        non_tag_old_det_to_new_ind[np.where(non_tag_old_det_to_new_ind)[0]] += len(dets2use)

        old_det_to_new_ind = old_det_to_new_ind.tolist()
        non_tag_old_det_to_new_ind = non_tag_old_det_to_new_ind.tolist()
        ###################################################################
        # Load in BERT. We'll get contextual representations of the context and the answer choices
        # grp_items = {k: np.array(v, dtype=np.float16) for k, v in self.get_h5_group(index).items()}
        with h5py.File(self.h5fn, 'r') as h5:
            grp_items = {k: np.array(v, dtype=np.float16) for k, v in h5[str(index)].items()}

        # Essentially we need to condition on the right answer choice here, if we're doing QA->R. We will always
        # condition on the `conditioned_answer_choice.`
        condition_key = self.conditioned_answer_choice if self.split == "test" and self.mode == "rationale" else ""

        instance_dict = {}
        if 'endingonly' not in self.embs_to_load:
            questions_tokenized, question_tags = zip(*[_my_fix_tokenization(
                item['question'],
                grp_items[f'ctx_{self.mode}{condition_key}{i}'],
                old_det_to_new_ind,
                item['objects'],
                non_tag_old_det_to_new_ind,
                question_annotid2detidx,
                token_indexers=self.token_indexers,
                pad_ind=0 if self.add_image_as_a_box else -1,
            ) for i in range(4)])
            instance_dict['question'] = ListField(questions_tokenized)
            instance_dict['question_tags'] = ListField(question_tags)

        answers_tokenized, answer_tags = zip(*[_my_fix_tokenization(
            answer,
            grp_items[f'answer_{self.mode}{condition_key}{i}'],
            old_det_to_new_ind,
            item['objects'],
            non_tag_old_det_to_new_ind,
            answer_annotid2detidx[i],
            token_indexers=self.token_indexers,
            pad_ind=0 if self.add_image_as_a_box else -1,
        ) for i, answer in enumerate(answer_choices)])

        instance_dict['answers'] = ListField(answers_tokenized)
        instance_dict['answer_tags'] = ListField(answer_tags)
        if self.split != 'test':
            instance_dict['label'] = LabelField(item['{}_label'.format(self.mode)], skip_indexing=True)
        instance_dict['metadata'] = MetadataField({'annot_id': item['annot_id'], 'ind': index, 'movie': item['movie'],
                                                   'img_fn': item['img_fn'],
                                                   'question_number': item['question_number'],
                                                   'img_id':item['img_id']})

        ##node
        node_tokenized, node_tags = zip(*[_fix_word(
            i,
            index,
            item['annot_id'],
            self.h5fn_graph,
            self.h5fn_word,
            pad_ind=0
        ) for i in range(4)])
        instance_dict['node'] = ListField(node_tokenized)

        ##visual concept
        visual_concept_tokenized, visual_concept_tags = zip(*[_fix_visual_concept(
            item['visual_concept'],
            item['visual_concept_num'],
            self.h5fn_word,
            pad_ind=0
        ) for i in range(4)])
        instance_dict['visual_concept'] = ListField(visual_concept_tokenized)

        ##adj
        adj_result, adj_len = zip(*[_fix_adj(
            i,
            index,
            item['annot_id'],
            self.h5fn_graph,
            pad_ind=0
        ) for i in range(4)])
        instance_dict['adjacent'] = ListField(adj_result)

        ###################################################################
        # Load image now and rescale it. Might have to subtract the mean and whatnot here too.
        #image = load_image(os.path.join(VCR_IMAGES_DIR, item['img_fn']))
        #image, window, img_scale, padding = resize_image(image, random_pad=self.is_train)
        #image = to_tensor_and_normalize(image)
        #c, h, w = image.shape
        ###################################################################
        # Load boxes.
        with open(os.path.join(VCR_IMAGES_DIR, item['metadata_fn']), 'r') as f:
            metadata = json.load(f)

        # Chop off the final dimension, that's the confidence
        tag_boxes = np.array(metadata['boxes'])[dets2use, :-1]
        if self.add_image_as_a_box:
            tag_boxes = np.row_stack(([1,1,700,700], tag_boxes)) # here we just use dummy box for background
        non_tag_boxes = non_tag_boxes[non_tag_dets2use]
        boxes = np.concatenate((tag_boxes, non_tag_boxes))

        if self.add_image_as_a_box:
            dets2use = dets2use + 1
            dets2use = np.insert(dets2use, 0, 0)

        tag_det_features = tag_features[dets2use]
        non_tag_det_features = non_tag_features[non_tag_dets2use]
        det_features = np.concatenate((tag_det_features, non_tag_det_features))

        instance_dict['det_features'] = ArrayField(det_features, padding_value=0)
        assert (det_features.shape[0] == boxes.shape[0])

        instance_dict['boxes'] = ArrayField(boxes, padding_value=-1)

        instance = Instance(instance_dict)
        instance.index_fields(self.vocab)
        return None, instance
    def text_to_instance(
            self,  # type: ignore
            item_id: str,
            question: str,
            choice_list: List[str],
            answer_id: int = None,
            context: str = None,
            choice_context_list: List[str] = None,
            debug: int = -1) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}

        qa_fields = []
        segment_ids_fields = []
        qa_tokens_list = []
        binary_labels_fields = []
        for idx, choice in enumerate(choice_list):
            choice_context = context
            if choice_context_list is not None and choice_context_list[
                    idx] is not None:
                choice_context = choice_context_list[idx]
            qa_tokens, segment_ids = self.transformer_features_from_qa(
                question, choice, choice_context)

            qa_field = TextField(qa_tokens, self._token_indexers)
            segment_ids_field = SequenceLabelField(segment_ids, qa_field)
            binary_labels_field = LabelField(1 if answer_id == idx else 0,
                                             skip_indexing=True)
            qa_fields.append(qa_field)
            qa_tokens_list.append(qa_tokens)
            segment_ids_fields.append(segment_ids_field)
            binary_labels_fields.append(binary_labels_field)
            if debug > 0:
                logger.info(f"qa_tokens = {qa_tokens}")
                logger.info(f"segment_ids = {segment_ids}")

        fields['question'] = ListField(qa_fields)
        fields['segment_ids'] = ListField(segment_ids_fields)
        if answer_id is not None:
            fields['label'] = LabelField(answer_id, skip_indexing=True)
            fields['binary_labels'] = ListField(binary_labels_fields)

        metadata = {
            "id": item_id,
            "question_text": question,
            "choice_text_list": choice_list,
            "correct_answer_index": answer_id,
            "question_tokens_list": qa_tokens_list,
            "context": context,
            "choice_context_list": choice_context_list,
            "training": self._training
            # "question_tokens": [x.text for x in question_tokens],
            # "choice_tokens_list": [[x.text for x in ct] for ct in choices_tokens_list],
        }

        if debug > 0:
            logger.info(f"context = {context}")
            logger.info(f"choice_context_list = {choice_context_list}")
            logger.info(f"answer_id = {answer_id}")
            # logger.info(f"binary_labels = {fields['binary_labels']}")

        fields["metadata"] = MetadataField(metadata)

        return Instance(fields)
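For the multiple-choice reader above, the per-choice binary labels are just a one-hot view of the answer index over the choice list; a tiny sketch of that loop in isolation:

choice_list = ["red", "green", "blue", "yellow"]
answer_id = 2

binary_labels = [1 if answer_id == idx else 0 for idx, _ in enumerate(choice_list)]
print(binary_labels)  # [0, 0, 1, 0]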
示例#28
    def text_to_instance(
        self,
        sentence: str,
        identifier: str,
        image_ids: List[str],
        logical_form: str = None,
        attention_mode: int = None,
        box_annotation: Dict = None,
        denotation: str = None,
    ) -> Instance:
        tokenized_sentence = self._tokenizer.tokenize(sentence)
        sentence_field = TextField(tokenized_sentence, self._token_indexers)

        world = VisualReasoningNlvr2Language(None, None, None, None, None,
                                             None)

        production_rule_fields: List[Field] = []
        instance_action_ids: Dict[str, int] = {}
        for production_rule in world.all_possible_productions():
            instance_action_ids[production_rule] = len(instance_action_ids)
            field = ProductionRuleField(production_rule, is_global_rule=True)
            production_rule_fields.append(field)

        action_field = ListField(production_rule_fields)

        boxes2 = []
        feats2 = []
        max_num_boxes = 0
        for key in image_ids:
            if self.img_data is not None:
                img_info = self.img_data[key]
            else:
                split_name = "train"
                if "dev" in key:
                    split_name = "valid"
                img_info = pickle.load(
                    open(
                        os.path.join(self._image_feat_cache_dir,
                                     split_name + "_obj36.tsv", key),
                        "rb",
                    ))
            boxes = img_info["boxes"].copy()
            feats = img_info["features"].copy()
            assert len(boxes) == len(feats)

            # Normalize the boxes (to 0 ~ 1)
            img_h, img_w = img_info["img_h"], img_info["img_w"]
            boxes[..., (0, 2)] /= img_w
            boxes[..., (1, 3)] /= img_h
            np.testing.assert_array_less(boxes, 1 + 1e-5)
            np.testing.assert_array_less(-boxes, 0 + 1e-5)

            if boxes.shape[0] > self._max_boxes:
                boxes = boxes[:self._max_boxes, :]
                feats = feats[:self._max_boxes, :]
            max_num_boxes = max(max_num_boxes, boxes.shape[0])
            boxes2.append(boxes)
            feats2.append(feats)
        boxes3 = [
            np.zeros((max_num_boxes, img_boxes.shape[-1]))
            for img_boxes in boxes2
        ]
        feats3 = [
            np.zeros((max_num_boxes, img_feats.shape[-1]))
            for img_feats in feats2
        ]
        for i in range(len(boxes2)):
            boxes3[i][:boxes2[i].shape[0], :] = boxes2[i]
            feats3[i][:feats2[i].shape[0], :] = feats2[i]
        boxes2 = boxes3
        feats2 = feats3
        feats = np.stack(feats2)
        boxes = np.stack(boxes2)
        metadata: Dict[str, Any] = {
            "utterance": sentence,
            "tokenized_utterance": tokenized_sentence,
            "identifier": identifier,
        }

        fields: Dict[str, Field] = {
            "sentence": sentence_field,
            "actions": action_field,
            "metadata": MetadataField(metadata),
            "image_id": MetadataField(identifier[:-2]),
            "visual_feat": ArrayField(feats),
            "pos": ArrayField(boxes),
        }
        if denotation is not None:
            fields["denotation"] = LabelField(denotation, skip_indexing=True)

        if logical_form:
            lisp_exp = annotation_to_lisp_exp(logical_form)
            target_sequence = world.logical_form_to_action_sequence(lisp_exp)
            index_field = [
                IndexField(instance_action_ids[action], action_field)
                for action in target_sequence
            ]
            fields["target_action_sequence"] = ListField(index_field)

            module_attention = annotation_to_module_attention(logical_form)
            target_attention = target_sequence_to_target_attn(
                target_sequence, module_attention)
            gold_question_attentions = self._assign_attention_to_tokens(
                target_attention, sentence, attention_mode)
            attn_index_field = [
                ListField(
                    [IndexField(att, sentence_field) for att in target_att])
                for target_att in gold_question_attentions
            ]
            fields["gold_question_attentions"] = ListField(attn_index_field)
            if box_annotation is None and len(self.box_annotations) > 0:
                fields["gold_box_annotations"] = MetadataField([])
            elif box_annotation is not None:
                modules = logical_form.split("\n")
                children = [[] for _ in modules]
                for j, module in enumerate(modules):
                    num_periods = len(module) - len(module.strip("."))
                    for k in range(j + 1, len(modules)):
                        num_periods_k = len(modules[k]) - len(
                            modules[k].strip("."))
                        if num_periods_k <= num_periods:
                            break
                        if num_periods_k == num_periods + 1:
                            children[j].append(k)
                for j in range(len(modules) - 1, -1, -1):
                    if modules[j].strip(".") == "in_left_image":
                        box_annotation[j] = {}
                        box_annotation[j]["module"] = modules[j].strip(".")
                        box_annotation[j][0] = box_annotation[j + 1][0]
                        box_annotation[j][1] = []
                        """for k in children[j]:
                            box_annotation[k][0] = box_annotation[k][0]
                            box_annotation[k][1] = []"""
                    elif modules[j].strip(".") == "in_right_image":
                        box_annotation[j] = {}
                        box_annotation[j]["module"] = modules[j].strip(".")
                        box_annotation[j][1] = box_annotation[j + 1][1]
                        box_annotation[j][0] = []
                    elif modules[j].strip(".") in {
                            "in_one_image", "in_other_image"
                    }:
                        box_annotation[j] = {}
                        box_annotation[j]["module"] = modules[j].strip(".")
                        box_annotation[j][0] = box_annotation[j + 1][0]
                        box_annotation[j][1] = box_annotation[j + 1][1]
                        """for k in children[j]:
                            box_annotation[k][0] = []
                            box_annotation[k][1] = box_annotation[k][1]"""
                keys = sorted(list(box_annotation.keys()))
                # print(identifier, keys)
                # print(box_annotation)
                # print(target_sequence)
                module_boxes = [(
                    mod,
                    box_annotation[mod]["module"],
                    [box_annotation[mod][0], box_annotation[mod][1]],
                ) for mod in keys]
                gold_boxes, gold_counts = target_sequence_to_target_boxes(
                    target_sequence, module_boxes, children)
                # print(identifier, target_sequence, module_boxes, gold_boxes)
                fields["gold_box_annotations"] = MetadataField(gold_boxes)
            metadata["gold"] = world.action_sequence_to_logical_form(
                target_sequence)
            fields["valid_target_sequence"] = ArrayField(
                np.array(1, dtype=np.int32))
        else:
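            # No annotated logical form: add dummy targets and mark the instance as
            # having no valid target sequence.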
            fields["target_action_sequence"] = ListField(
                [IndexField(0, action_field)])
            fields["gold_question_attentions"] = ListField(
                [ListField([IndexField(0, sentence_field)])])
            fields["valid_target_sequence"] = ArrayField(
                np.array(0, dtype=np.int32))
            if len(self.box_annotations) > 0:
                fields["gold_box_annotations"] = MetadataField([])
        return Instance(fields)
Example #29
    def text_to_instance(
        self,  # type: ignore
        tokens: List[Token],
        #pos_tags: List[str] = None,
        #chunk_tags: List[str] = None,
        ner_tags: List[str] = None
    ) -> Instance:
        """
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
        """
        sequence = TextField(tokens, self._token_indexers)
        instance_fields: Dict[str, Field] = {'tokens': sequence}

        def _remove_BI(_one_tag):
            if _one_tag == 'O':
                return _one_tag
            else:
                return _one_tag[2:]

        if self.coding_scheme == "BIOUL":
            coded_ner = (to_bioul(ner_tags, encoding=self._original_coding_scheme)
                         if ner_tags is not None else None)
        else:
            # the default IOB1
            coded_ner = ner_tags

        # TODO:
        # ner_tags -> spans of NE
        # return something like spans, span_labels ("O" if span not in golden_spans, "PER", "LOC"... otherwise)
        spans: List[Field] = []
        span_labels: List[str] = []

        gold_spans: List[Field] = []
        gold_span_labels: List[str] = []

        assert len(ner_tags) == len(tokens), "sentence:%s but ner_tags:%s" % (
            str(tokens), str(ner_tags))
        ner_gold_spans = _extract_spans(
            ner_tags
        )  # ner_gold_spans: Dict[tuple(startid, endid), str(entity_type)]
        #print(self._max_span_width), exit(0)
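        # Enumerate every candidate span up to max_span_width and label it with the
        # gold entity type, or 'O' if it is not a gold span.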
        for start, end in enumerate_spans(ner_tags,
                                          offset=0,
                                          max_span_width=self._max_span_width):
            span_labels.append(ner_gold_spans.get((start, end), 'O'))
            spans.append(SpanField(start, end, sequence))

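        # Collect the gold entity spans themselves, keeping a dict copy (without 'O'
        # spans) for the metadata field.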
        _dict_gold_spans = {}
        for ky, val in ner_gold_spans.items():
            gold_span_labels.append(val)
            gold_spans.append(SpanField(ky[0], ky[1], sequence))
            if val != 'O':
                _dict_gold_spans[ky] = val

        instance_fields["metadata"] = MetadataField({
            "words": [x.text for x in tokens],
            "gold_spans":
            _dict_gold_spans
        })

        assert len(spans) == len(
            span_labels), "span length not equal to span label length..."
        span_field = ListField(spans)  # a list of (start, end) tuples...

        # contains all possible spans and their tags
        instance_fields['spans'] = span_field
        instance_fields['span_labels'] = SequenceLabelField(
            span_labels, span_field, "span_tags")

        # only contains gold_spans and their tags
        # e.g. (0,0,O), (1,1,O), (2,3,PER), (4,4,O) for 'I am Donald Trump .'
        gold_span_field = ListField(gold_spans)
        instance_fields['gold_spans'] = gold_span_field
        instance_fields['gold_span_labels'] = SequenceLabelField(
            gold_span_labels, gold_span_field, "span_tags")

        # Add "feature labels" to instance
        if 'ner' in self.feature_labels:
            if coded_ner is None:
                raise ConfigurationError(
                    "Dataset reader was specified to use NER tags as "
                    " features. Pass them to text_to_instance.")
            instance_fields['ner_tags'] = SequenceLabelField(
                coded_ner, sequence, "token_tags")

        # Add "tag label" to instance
        if self.tag_label == 'ner' and coded_ner is not None:
            instance_fields['tags'] = SequenceLabelField(
                coded_ner, sequence, 'token_tags')

        return Instance(instance_fields)
Example #30
    def __getitem__(self, index):
        # if self.split == 'test':
        #     raise ValueError("blind test mode not supported quite yet")
        item = deepcopy(self.items[index])

        ###################################################################
        # Load questions and answers
        if self.mode == 'rationale':
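            # In rationale (QA->R) mode the question is conditioned on an answer: the
            # gold answer during train/val, a fixed answer choice at test time.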
            conditioned_label = item['answer_label'] if self.split != 'test' else self.conditioned_answer_choice
            item['question'] += item['answer_choices'][conditioned_label]

        answer_choices = item['{}_choices'.format(self.mode)]
        dets2use, old_det_to_new_ind = self._get_dets_to_use(item)

        ###################################################################
        # Load in BERT. We'll get contextual representations of the context and the answer choices
        # grp_items = {k: np.array(v, dtype=np.float16) for k, v in self.get_h5_group(index).items()}
        with h5py.File(self.h5fn, 'r') as h5:
            grp_items = {k: np.array(v, dtype=np.float16) for k, v in h5[str(index)].items()}

        # Essentially we need to condition on the right answer choice here, if we're doing QA->R. We will always
        # condition on the `conditioned_answer_choice.`
        condition_key = self.conditioned_answer_choice if self.split == "test" and self.mode == "rationale" else ""

        instance_dict = {}
        if 'endingonly' not in self.embs_to_load:
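            # Tokenize the question once per answer choice, aligning the tokens with the
            # precomputed BERT context embeddings and the re-indexed detection tags.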
            questions_tokenized, question_tags = zip(*[_fix_tokenization(
                item['question'],
                grp_items[f'ctx_{self.mode}{condition_key}{i}'],
                old_det_to_new_ind,
                item['objects'],
                token_indexers=self.token_indexers,
                pad_ind=0 if self.add_image_as_a_box else -1
            ) for i in range(4)])
            instance_dict['question'] = ListField(questions_tokenized)
            instance_dict['question_tags'] = ListField(question_tags)

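        # Tokenize each answer choice, aligned with its precomputed BERT embedding and
        # the re-indexed detections.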
        answers_tokenized, answer_tags = zip(*[_fix_tokenization(
            answer,
            grp_items[f'answer_{self.mode}{condition_key}{i}'],
            old_det_to_new_ind,
            item['objects'],
            token_indexers=self.token_indexers,
            pad_ind=0 if self.add_image_as_a_box else -1
        ) for i, answer in enumerate(answer_choices)])

        instance_dict['answers'] = ListField(answers_tokenized)
        instance_dict['answer_tags'] = ListField(answer_tags)
        if self.split != 'test':
            instance_dict['label'] = LabelField(item['{}_label'.format(self.mode)], skip_indexing=True)
        instance_dict['metadata'] = MetadataField({'annot_id': item['annot_id'], 'ind': index, 'movie': item['movie'],
                                                   'img_fn': item['img_fn'],
                                                   'question_number': item['question_number']})

        ###################################################################
        # Load the image and rescale it; normalization is handled by to_tensor_and_normalize below.
        image = load_image(os.path.join(VCR_IMAGES_DIR, item['img_fn']))
        image, window, img_scale, padding = resize_image(image, random_pad=self.is_train)
        image = to_tensor_and_normalize(image)
        c, h, w = image.shape

        ###################################################################
        # Load boxes.
        with open(os.path.join(VCR_IMAGES_DIR, item['metadata_fn']), 'r') as f:
            metadata = json.load(f)

        # [nobj, 14, 14]
        segms = np.stack([make_mask(mask_size=14, box=metadata['boxes'][i], polygons_list=metadata['segms'][i])
                          for i in dets2use])

        # Chop off the final dimension, which is the detection confidence
        boxes = np.array(metadata['boxes'])[dets2use, :-1]
        # Rescale the boxes to the resized image and shift them by the padding offsets
        boxes *= img_scale
        boxes[:, :2] += np.array(padding[:2])[None]
        boxes[:, 2:] += np.array(padding[:2])[None]
        obj_labels = [self.coco_obj_to_ind[item['objects'][i]] for i in dets2use.tolist()]
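        # Optionally treat the whole image as an extra box: prepend the image window,
        # an all-ones segmentation mask, and the background label.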
        if self.add_image_as_a_box:
            boxes = np.row_stack((window, boxes))
            segms = np.concatenate((np.ones((1, 14, 14), dtype=np.float32), segms), 0)
            obj_labels = [self.coco_obj_to_ind['__background__']] + obj_labels

        instance_dict['segms'] = ArrayField(segms, padding_value=0)
        instance_dict['objects'] = ListField([LabelField(x, skip_indexing=True) for x in obj_labels])

        assert np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2]))
        assert np.all((boxes[:, 1] >= 0.) & (boxes[:, 1] < boxes[:, 3]))
        assert np.all((boxes[:, 2] <= w))
        assert np.all((boxes[:, 3] <= h))
        instance_dict['boxes'] = ArrayField(boxes, padding_value=-1)

        instance = Instance(instance_dict)
        instance.index_fields(self.vocab)
        return image, instance