Exemplo n.º 1
0
    def text_to_instance(
            self,  # type: ignore
            tokens: List[Token],
            pos_tags: List[str] = None,
            chunk_tags: List[str] = None,
            ner_tags: List[str] = None) -> Instance:
        """
        Turn one pre-tokenized sentence into an ``Instance``.

        The tokens always become the ``tokens`` text field and their surface
        strings are stored under ``metadata["words"]``. Chunk and NER tags are
        passed through ``to_bioul`` (with its default encoding) when
        ``self.coding_scheme`` is ``"BIOUL"`` and are used unchanged otherwise;
        POS tags are never re-encoded.

        Every label type listed in ``self.feature_labels`` is added as its own
        ``SequenceLabelField`` (``pos_tags``, ``chunk_tags``, ``ner_tags``). The
        label type selected by ``self.tag_label`` is added as ``tags`` when the
        matching tags were supplied.

        Raises ``ConfigurationError`` if a label type requested through
        ``self.feature_labels`` was not passed in.
        """
        # pylint: disable=arguments-differ
        text_field = TextField(tokens, self._token_indexers)
        fields: Dict[str, Field] = {'tokens': text_field}
        fields["metadata"] = MetadataField(
            {"words": [token.text for token in tokens]})

        # Start from the raw tags and only re-encode the ones that were given.
        chunk_labels, ner_labels = chunk_tags, ner_tags
        if self.coding_scheme == "BIOUL":
            if chunk_tags is not None:
                chunk_labels = to_bioul(chunk_tags)
            if ner_tags is not None:
                ner_labels = to_bioul(ner_tags)

        # One row per optional feature:
        # (key in feature_labels, labels, field name == namespace, error text)
        feature_table = (
            ('pos', pos_tags, 'pos_tags',
             "Dataset reader was specified to use pos_tags as "
             "features. Pass them to text_to_instance."),
            ('chunk', chunk_labels, 'chunk_tags',
             "Dataset reader was specified to use chunk tags as "
             "features. Pass them to text_to_instance."),
            ('ner', ner_labels, 'ner_tags',
             "Dataset reader was specified to use NER tags as "
             " features. Pass them to text_to_instance."),
        )
        for key, labels, field_name, complaint in feature_table:
            if key not in self.feature_labels:
                continue
            if labels is None:
                raise ConfigurationError(complaint)
            fields[field_name] = SequenceLabelField(
                labels, text_field, field_name)

        # The prediction target: first (and only possible) match wins.
        tag_candidates = (('ner', ner_labels),
                          ('pos', pos_tags),
                          ('chunk', chunk_labels))
        for label_kind, labels in tag_candidates:
            if self.tag_label == label_kind and labels is not None:
                fields['tags'] = SequenceLabelField(
                    labels, text_field, self.label_namespace)
                break

        return Instance(fields)
Exemplo n.º 2
0
def _write_bioul_sentence(bioul_file, tokens, labels,
                          sentiment_tag_convert) -> None:
    '''
    Converts the BIO `labels` of one sentence to BIOUL, renames their
    sentiment part and writes one `TOKEN LABEL` line per token.

    :param bioul_file: Open, writable text file handle.
    :param tokens: Tokens of the sentence.
    :param labels: BIO labels aligned with `tokens`.
    :param sentiment_tag_convert: Maps the sentiment part of a label (the
                                  text after the hyphen) to its replacement.
    '''
    converted_labels = []
    for label in to_bioul(labels, encoding='BIO'):
        parts = label.split('-')
        if len(parts) != 1:
            # Unpacking deliberately insists on exactly one hyphen.
            bio_tag, sentiment_tag = parts
            label = f'{bio_tag}-{sentiment_tag_convert[sentiment_tag]}'
        converted_labels.append(label)
    # All labels are converted before anything is written, so a bad label
    # never leaves a half-written sentence behind.
    bioul_file.writelines(f'{token} {label}\n'
                          for token, label in zip(tokens, converted_labels))


def from_bio_to_bioul(bio_fp: Path, bioul_fp: Path) -> None:
    '''
    :NOTE: This also removes lines that start with `#` and changes the
           Sentiment labels with the following dictionary:
           `{'positive': 'POS', 'neutral': 'NEU', 'negative': 'NEG'}`

    :NOTE: Every blank input line closes the current sentence and produces
           one blank output line. A final sentence that is not followed by a
           blank line is written without a trailing blank line.

    :param bio_fp: File path to the data that is in CONLL like format:
                   one `TOKEN LABEL` pair per line, where sentences are
                   split by empty new lines. The label format is in
                   BIO = Beginning of, inside of, outside.
    :param bioul_fp: File path to save the data that is in `bio_fp` to
                     this file but in BIOUL
    :raises ValueError: If a data line does not split into exactly two
                        whitespace separated fields, or a label contains
                        more than one hyphen.
    :raises KeyError: If the sentiment part of a label is not one of
                      `positive`, `neutral` or `negative`.
    '''
    sentiment_tag_convert = {
        'positive': 'POS',
        'neutral': 'NEU',
        'negative': 'NEG'
    }
    # The output is only ever written, so plain 'w' is sufficient.
    with bioul_fp.open('w') as bioul_file, bio_fp.open('r') as bio_file:
        tokens = []
        labels = []
        for line in bio_file:
            if not line.strip():
                # Sentence boundary: flush what has been collected so far.
                _write_bioul_sentence(bioul_file, tokens, labels,
                                      sentiment_tag_convert)
                bioul_file.write('\n')
                tokens = []
                labels = []
            elif line.startswith('#'):
                # Comment lines are dropped from the output.
                continue
            else:
                token, label = line.split()
                tokens.append(token)
                labels.append(label)
        if tokens:
            # The file did not end with a blank line: flush the last sentence.
            _write_bioul_sentence(bioul_file, tokens, labels,
                                  sentiment_tag_convert)
Exemplo n.º 3
0
    def text_to_instance(self, # type: ignore
                         tokens: List[Token],
                         pos_tags: List[str] = None,
                         chunk_tags: List[str] = None,
                         ner_tags: List[str] = None) -> Instance:
        """
        Create an ``Instance`` from an already tokenized sentence; this reader
        has no tokenizer of its own.

        Chunk and NER tags are converted with ``to_bioul`` (reading them as
        ``self._original_coding_scheme``) when ``self.coding_scheme`` is
        ``"BIOUL"``; otherwise they are used as given. Label types named in
        ``self.feature_labels`` become ``pos_tags`` / ``chunk_tags`` /
        ``ner_tags`` fields, and the type named by ``self.tag_label`` becomes
        the ``tags`` field if those tags were supplied.

        Raises ``ConfigurationError`` when a feature label type is requested
        but the corresponding tags are ``None``.
        """
        # pylint: disable=arguments-differ
        sentence_field = TextField(tokens, self._token_indexers)
        fields: Dict[str, Field] = {'tokens': sentence_field}
        fields["metadata"] = MetadataField({"words": [tok.text for tok in tokens]})

        def _recode(raw_tags):
            # Tags pass through untouched unless BIOUL is requested and tags exist.
            if self.coding_scheme != "BIOUL" or raw_tags is None:
                return raw_tags
            return to_bioul(raw_tags, encoding=self._original_coding_scheme)

        def _add_feature(field_name, labels, complaint):
            # The field name doubles as the label namespace.
            if labels is None:
                raise ConfigurationError(complaint)
            fields[field_name] = SequenceLabelField(labels, sentence_field, field_name)

        chunk_labels = _recode(chunk_tags)
        ner_labels = _recode(ner_tags)

        # Optional input features.
        if 'pos' in self.feature_labels:
            _add_feature('pos_tags', pos_tags,
                         "Dataset reader was specified to use pos_tags as "
                         "features. Pass them to text_to_instance.")
        if 'chunk' in self.feature_labels:
            _add_feature('chunk_tags', chunk_labels,
                         "Dataset reader was specified to use chunk tags as "
                         "features. Pass them to text_to_instance.")
        if 'ner' in self.feature_labels:
            _add_feature('ner_tags', ner_labels,
                         "Dataset reader was specified to use NER tags as "
                         " features. Pass them to text_to_instance.")

        # Prediction target, chosen by self.tag_label.
        target_labels = None
        if self.tag_label == 'ner':
            target_labels = ner_labels
        elif self.tag_label == 'pos':
            target_labels = pos_tags
        elif self.tag_label == 'chunk':
            target_labels = chunk_labels
        if target_labels is not None:
            fields['tags'] = SequenceLabelField(target_labels, sentence_field,
                                                self.label_namespace)

        return Instance(fields)
Exemplo n.º 4
0
    def text_to_instance(  # type: ignore
        self,
        tokens: List[Token],
        ner_tags: List[str] = None,
    ) -> Instance:
        """
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.

        Parameters
        ----------
        tokens : the tokens of one sentence; they become the ``tokens`` field
            and their text is stored under ``metadata["words"]``.
        ner_tags : optional per-token NER tags. When given, they are converted
            with ``to_bioul`` if ``self.coding_scheme`` is ``"BIOUL"``, stored
            under ``metadata["tags"]`` and added as the ``tags`` label field.
            When omitted, ``metadata["tags"]`` is ``None`` and no ``tags``
            field is created, so unlabeled sentences can still be turned into
            instances.
        """

        sequence = TextField(tokens, self._token_indexers)
        instance_fields: Dict[str, Field] = {"tokens": sequence}

        # Recode the labels if necessary; anything but BIOUL keeps the tags as given.
        coded_ner = ner_tags
        if ner_tags is not None and self.coding_scheme == "BIOUL":
            coded_ner = to_bioul(ner_tags,
                                 encoding=self._original_coding_scheme)

        instance_fields["metadata"] = MetadataField({
            "words": [x.text for x in tokens],
            "tags": coded_ner,
        })

        # A label field cannot be built from ``None``, so only add it when
        # tags were actually supplied.
        if coded_ner is not None:
            instance_fields["tags"] = SequenceLabelField(coded_ner, sequence,
                                                         self.label_namespace)

        return Instance(instance_fields)
Exemplo n.º 5
0
    def text_to_instance(
            self,  # type: ignore
            tokens: List[Token],
            pos_tags: List[str] = None,
            chunk_tags: List[str] = None) -> Instance:
        """
        Build an ``Instance`` from a pre-tokenized sentence with optional POS
        and chunk tags.

        Chunk tags are converted with ``to_bioul`` (reading them as
        ``self._original_label_encoding``) when ``self.label_encoding`` is
        ``"BIOUL"``. Label types in ``self.feature_labels`` are added as
        ``pos_tags`` / ``chunk_tags`` fields. The ``tags`` field uses
        ``self.label_namespace`` for POS targets and
        ``self._chunk_label_namespace`` for chunk targets.

        Raises ``ConfigurationError`` when a requested feature label type was
        not passed in.
        """
        # pylint: disable=arguments-differ
        token_field = TextField(tokens, self._token_indexers)
        fields: Dict[str, Field] = {'tokens': token_field}
        word_strings = [token.text for token in tokens]
        fields["metadata"] = MetadataField({"words": word_strings})

        # Only BIOUL requires re-encoding, and only when chunk tags exist.
        if self.label_encoding == "BIOUL" and chunk_tags is not None:
            chunk_labels = to_bioul(
                chunk_tags, encoding=self._original_label_encoding)
        else:
            chunk_labels = chunk_tags

        # Optional input features.
        wants_pos = 'pos' in self.feature_labels
        if wants_pos and pos_tags is None:
            raise ConfigurationError(
                "Dataset reader was specified to use pos_tags as "
                "features. Pass them to text_to_instance.")
        if wants_pos:
            fields['pos_tags'] = SequenceLabelField(
                pos_tags, token_field, "pos_tags")

        wants_chunk = 'chunk' in self.feature_labels
        if wants_chunk and chunk_labels is None:
            raise ConfigurationError(
                "Dataset reader was specified to use chunk tags as "
                "features. Pass them to text_to_instance.")
        if wants_chunk:
            fields['chunk_tags'] = SequenceLabelField(
                chunk_labels, token_field, "chunk_tags")

        # Prediction target.
        if self.tag_label == 'pos' and pos_tags is not None:
            tag_field = SequenceLabelField(
                pos_tags, token_field, self.label_namespace)
            fields['tags'] = tag_field
        elif self.tag_label == 'chunk' and chunk_labels is not None:
            tag_field = SequenceLabelField(
                chunk_labels,
                token_field,
                label_namespace=self._chunk_label_namespace)
            fields['tags'] = tag_field

        return Instance(fields)
Exemplo n.º 6
0
    def text_to_instance(  # type: ignore
            self,
            tokens: List[Token],
            pos_tags: List[str] = None,
            chunk_tags: List[str] = None) -> Instance:
        """
        Convert one pre-tokenized sentence into an ``Instance``.

        Chunk tags are re-encoded with ``to_bioul`` (from
        ``self._original_coding_scheme``) when ``self.coding_scheme`` is
        ``"BIOUL"``. Label types listed in ``self.feature_labels`` are added as
        ``pos_tags`` / ``chunk_tags`` fields, and the type selected by
        ``self.tag_label`` is added as ``tags`` under ``self.label_namespace``
        when those tags were given.

        Raises ``ConfigurationError`` when a requested feature label type was
        not passed in.
        """

        text_field = TextField(tokens, self._token_indexers)
        fields: Dict[str, Field] = {"tokens": text_field}
        fields["metadata"] = MetadataField(
            {"words": [token.text for token in tokens]})

        # Keep the tags as given unless BIOUL is requested and tags exist.
        chunk_labels = chunk_tags
        if self.coding_scheme == "BIOUL" and chunk_tags is not None:
            chunk_labels = to_bioul(chunk_tags,
                                    encoding=self._original_coding_scheme)

        # (key in feature_labels, labels, field name == namespace, error text)
        optional_features = (
            ("pos", pos_tags, "pos_tags",
             "Dataset reader was specified to use pos_tags as "
             "features. Pass them to text_to_instance."),
            ("chunk", chunk_labels, "chunk_tags",
             "Dataset reader was specified to use chunk tags as "
             "features. Pass them to text_to_instance."),
        )
        for key, labels, field_name, complaint in optional_features:
            if key in self.feature_labels:
                if labels is None:
                    raise ConfigurationError(complaint)
                fields[field_name] = SequenceLabelField(
                    labels, text_field, field_name)

        # Prediction target: at most one candidate can match tag_label.
        for label_kind, labels in (("pos", pos_tags), ("chunk", chunk_labels)):
            if self.tag_label == label_kind and labels is not None:
                fields["tags"] = SequenceLabelField(
                    labels, text_field, self.label_namespace)
                break

        return Instance(fields)
Exemplo n.º 7
0
 def text_to_instance(self, # type: ignore
                      tokens: List[Token],
                      ner_tags: List[str] = None) -> Instance:
     """
     Build an ``Instance`` from pre-tokenized input (this class has no tokenizer).

     The instance always holds the ``tokens`` text field and a ``metadata``
     field listing the token strings under ``words``. If ``ner_tags`` is
     given it is added as ``tags``, converted from BIO with ``to_bioul``
     first when ``self._coding_scheme`` is ``"BIOUL"``.
     """
     # pylint: disable=arguments-differ
     token_field = TextField(tokens, self._token_indexers)
     fields: Dict[str, Field] = {'tokens': token_field}
     word_strings = [token.text for token in tokens]
     fields["metadata"] = MetadataField({"words": word_strings})

     # Unlabeled input: nothing more to add.
     if ner_tags is None:
         return Instance(fields)

     labels = ner_tags
     if self._coding_scheme == "BIOUL":
         labels = to_bioul(ner_tags, encoding="BIO")
     fields['tags'] = SequenceLabelField(labels, token_field)
     return Instance(fields)
Exemplo n.º 8
0
def offsets_from_tags(
    doc: Doc,
    tags: List[str],
    label_encoding: Optional[str] = "BIOUL",
    only_token_spans: bool = False,
) -> List[Dict]:
    """Turn a BIOUL or BIO tag sequence into a list of span offsets.

    Parameters
    ----------
    doc
        A spaCy Doc created with `text` and the backbone tokenizer
    tags
        One BIOUL or BIO tag per token of `doc`
    label_encoding
        Encoding of `tags`. Only "BIO" triggers a conversion (to BIOUL);
        any other value leaves the tags as they are.
    only_token_spans
        If True, the character offsets are left out and each entry only holds
        token indices and the label. Default is False

    Returns
    -------
    offsets
        One dict per span with the keys "start_token", "end_token" and "label",
        plus the character offsets "start" and "end" unless `only_token_spans`
        is set:
        `{"start": int, "end": int, "start_token": int, "end_token": int, "label": str}`

    Raises
    ------
    ValueError
        If `doc` and `tags` differ in length.
    """
    # spaCy's biluo_tags_to_offsets does not validate the lengths itself.
    if len(doc) != len(tags):
        raise ValueError(
            f"Number of tokens and tags must be the same, "
            f"but 'len({list(doc)}) != len({tags})"
        )

    biluo_tags = to_bioul(tags, encoding="BIO") if label_encoding == "BIO" else tags

    offsets = []
    for char_start, char_end, label in biluo_tags_to_offsets(doc, biluo_tags):
        # Map the character offsets back onto token indices.
        token_span = doc.char_span(char_start, char_end)
        entry = {
            "start_token": token_span.start,
            "end_token": token_span.end,
            "label": label,
        }
        if not only_token_spans:
            entry["start"] = char_start
            entry["end"] = char_end
        offsets.append(entry)
    return offsets
Exemplo n.º 9
0
    def text_to_instance(self, # type: ignore
                         tokens: List[Token],
                         chunk_tags: List[str] = None) -> Instance:
        """
        Build an ``Instance`` from pre-tokenized input (this class has no tokenizer).

        Without ``chunk_tags`` the instance only contains the ``tokens`` field.
        With them, the tags are clipped by ``self.clip_chunks_by_max_length``,
        re-encoded to BIOUL if needed, converted by
        ``self.convert_bioul_to_segmental`` into the ``tags`` field, and the
        segment boundaries are exposed through three list fields:
        ``seg_starts`` / ``seg_ends`` (token index where each segment opens /
        closes) and ``seg_map`` (for every token inside a segment, the index of
        that segment).
        """
        # pylint: disable=arguments-differ
        sentence = TextField(tokens, self._token_indexers)
        instance_fields: Dict[str, Field] = {'tokens': sentence}
        if chunk_tags is None:
            return Instance(instance_fields)

        chunk_tags = self.clip_chunks_by_max_length(chunk_tags)

        # Recode the labels only when going from BIO to BIOUL.
        wants_bioul = self.coding_scheme == "BIOUL"
        if wants_bioul and self._original_coding_scheme == "BIO":
            chunk_tags = to_bioul(chunk_tags, encoding=self._original_coding_scheme)

        # O is treated as a regular span label here. An O span never needs to
        # cover more than one token, so every O becomes the unit span U-O.
        chunk_tags = [tag if tag != 'O' else 'U-O' for tag in chunk_tags]

        tags, namespace = self.convert_bioul_to_segmental(chunk_tags)
        tag_field = SequenceLabelField(tags, sentence, namespace)
        instance_fields["tags"] = tag_field

        seg_starts, seg_ends, seg_map = [], [], []
        seg_count = 0
        for position, tag in enumerate(chunk_tags):
            # U- tags both open and close a segment on the same token.
            if tag.startswith(('B-', 'U-')):
                start = position
                seg_starts.append(IndexField(start, sentence))
            if tag.startswith(('L-', 'U-')):
                end = position
                assert end - start < self._max_span_width
                seg_ends.append(IndexField(end, sentence))
                # One entry per token of the segment, all pointing at its index.
                seg_map.extend(
                    [IndexField(seg_count, tag_field) for _ in range(start, end + 1)])
                seg_count += 1

        instance_fields['seg_ends'] = ListField(seg_ends)
        instance_fields['seg_starts'] = ListField(seg_starts)
        instance_fields['seg_map'] = ListField(seg_map)

        return Instance(instance_fields)
Exemplo n.º 10
0
    def text_to_instance(
            self,  # type: ignore
            filename: str,
            tokens: List[Token],
            ner_tags: List[str] = None,
            weights: List[float] = None) -> Instance:
        """
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.

        Parameters
        ----------
        filename : stored as-is in the ``dataset`` metadata field.
        tokens : the tokens of one sentence; they become the ``tokens`` field
            and their text is stored under ``metadata["words"]``.
        ner_tags : optional per-token NER tags, converted with ``to_bioul``
            when ``self.coding_scheme`` is ``"BIOUL"``.
        weights : accepted for interface compatibility only. Nothing in this
            method reads it, and no weight is attached to the instance.

        Raises
        ------
        ConfigurationError
            If ``'ner'`` is listed in ``self.feature_labels`` but no
            ``ner_tags`` were passed.
        """
        # pylint: disable=arguments-differ
        sequence = TextField(tokens, self._token_indexers)
        instance_fields: Dict[str, Field] = {
            'tokens': sequence,
            "metadata": MetadataField({"words": [x.text for x in tokens]})
        }
        # NOTE: ``weights`` is deliberately not dereferenced. The previous
        # ``weights[0]`` lookup fed an unused local and raised IndexError
        # for an empty sentence or an empty weights list.

        instance_fields["dataset"] = MetadataField(filename)

        # Recode the labels if necessary; anything but BIOUL keeps the tags as given.
        coded_ner = ner_tags
        if self.coding_scheme == "BIOUL" and ner_tags is not None:
            coded_ner = to_bioul(ner_tags,
                                 encoding=self._original_coding_scheme)

        # Add "feature labels" to instance
        if 'ner' in self.feature_labels:
            if coded_ner is None:
                raise ConfigurationError(
                    "Dataset reader was specified to use NER tags as "
                    " features. Pass them to text_to_instance.")
            instance_fields['ner_tags'] = SequenceLabelField(
                coded_ner, sequence, "ner_tags")

        # Add "tag label" to instance
        if self.tag_label == 'ner' and coded_ner is not None:
            instance_fields['tags'] = SequenceLabelField(
                coded_ner, sequence, self.label_namespace)

        return Instance(instance_fields)
Exemplo n.º 11
0
 def text_to_instance(
         self,  # type: ignore
         tokens,
         ner_tags=None):
     u"""
     Build an ``Instance`` from pre-tokenized input (this class has no tokenizer).

     The instance always holds the ``tokens`` text field and a ``metadata``
     field listing the token strings under ``words``. If ``ner_tags`` is
     given it is added as ``tags``, converted from BIO with ``to_bioul``
     first when ``self._coding_scheme`` is ``"BIOUL"``.
     """
     # pylint: disable=arguments-differ
     token_field = TextField(tokens, self._token_indexers)
     word_strings = [token.text for token in tokens]
     fields = {u'tokens': token_field}
     fields[u"metadata"] = MetadataField({u"words": word_strings})

     # Unlabeled input: nothing more to add.
     if ner_tags is None:
         return Instance(fields)

     labels = ner_tags
     if self._coding_scheme == u"BIOUL":
         labels = to_bioul(ner_tags, encoding=u"BIO")
     fields[u'tags'] = SequenceLabelField(labels, token_field)
     return Instance(fields)
Exemplo n.º 12
0
    def text_to_instance(
        self,  # type: ignore
        tokens: List[Token],
        ner_tags: List[str] = None,
    ) -> Instance:
        """
        Build an ``Instance`` from pre-tokenized input (this class has no tokenizer).

        ``tokens`` become the ``tokens`` text field and their strings are kept
        under ``metadata["words"]``. When ``ner_tags`` is supplied it is added
        as the ``tags`` label field; if ``self._coding_scheme`` is ``"BIOUL"``
        the tags are first converted from BIO with ``to_bioul``.
        """

        text_field = TextField(tokens, self._token_indexers)
        fields: Dict[str, Field] = {"tokens": text_field}
        fields["metadata"] = MetadataField(
            {"words": [token.text for token in tokens]})

        has_labels = ner_tags is not None
        if has_labels:
            # Convert only when the target scheme asks for it.
            labels = (to_bioul(ner_tags, encoding="BIO")
                      if self._coding_scheme == "BIOUL" else ner_tags)
            fields["tags"] = SequenceLabelField(labels, text_field)
        return Instance(fields)
Exemplo n.º 13
0
    def text_to_instance(self,
                         tokens: List[Token],
                         tags: Optional[List[str]] = None) -> Instance:
        """
        Build an ``Instance`` from pre-tokenized input (this class has no
        tokenizer).

        The instance holds the ``tokens`` text field and a ``metadata`` field
        with the token strings under ``words``. If ``tags`` is given it is
        added as the ``tags`` label field in ``self.label_namespace``,
        converted with ``to_bioul`` from ``self._original_coding_scheme`` when
        ``self.coding_scheme`` is ``"BIOUL"``.
        """
        text_field = TextField(tokens, self._token_indexers)
        fields: Dict[str, Field] = {'tokens': text_field}
        fields["metadata"] = MetadataField(
            {"words": [token.text for token in tokens]})

        # Unlabeled input: return the instance without a label field.
        if tags is None:
            return Instance(fields)

        labels = tags
        if self.coding_scheme == "BIOUL":
            labels = to_bioul(tag_sequence=tags,
                              encoding=self._original_coding_scheme)
        fields['tags'] = SequenceLabelField(
            labels, text_field, self.label_namespace)
        return Instance(fields)
Exemplo n.º 14
0
    def text_to_instance(self, # type: ignore
                         tokens: List[Token],
                         ner_tags: List[str] = None) -> Instance:
        """
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.

        Besides the ``tokens`` field the instance contains:

        * ``spans`` / ``span_labels``: every span produced by
          ``enumerate_spans`` (up to ``self._max_span_width``), labeled with
          its entry in the gold spans or ``'O'`` when it has none.
        * ``gold_spans`` / ``gold_span_labels``: only the spans returned by
          ``_extract_spans`` and their labels.
        * ``metadata``: the token strings under ``words`` and the gold spans
          whose label is not ``'O'`` under ``gold_spans``.
        * ``tags``: the token-level tags (converted with ``to_bioul`` when
          ``self.coding_scheme`` is ``"BIOUL"``), only if ``self.tag_label``
          is ``'ner'``.

        Raises
        ------
        ValueError
            If ``ner_tags`` is ``None``. The parameter keeps its ``None``
            default for signature compatibility, but every span field is
            derived from the tags, so they are required.
        AssertionError
            If ``ner_tags`` and ``tokens`` differ in length.
        """
        if ner_tags is None:
            # Previously this surfaced as an obscure TypeError further down.
            raise ValueError(
                "ner_tags must be provided: the span fields are derived from them.")

        sequence = TextField(tokens, self._token_indexers)
        instance_fields: Dict[str, Field] = {'tokens': sequence}

        if self.coding_scheme == "BIOUL":
            coded_ner = to_bioul(ner_tags,
                                 encoding=self._original_coding_scheme)
        else:
            # the default IOB1
            coded_ner = ner_tags

        assert len(ner_tags) == len(tokens), "sentence:%s but ner_tags:%s"%(str(tokens), str(ner_tags))

        # Spans are extracted from the tags as given, not from the recoded ones.
        # NOTE(review): presumably Dict[(start, end), entity_type] - confirm against _extract_spans.
        ner_gold_spans = _extract_spans(ner_tags)

        # All candidate spans, each labeled with its gold type or 'O'.
        spans: List[Field] = []
        span_labels: List[str] = []
        for start, end in enumerate_spans(ner_tags, offset=0, max_span_width=self._max_span_width):
            span_labels.append(ner_gold_spans.get((start, end), 'O'))
            spans.append(SpanField(start, end, sequence))

        # Gold spans only; the metadata copy additionally drops 'O' spans.
        gold_spans: List[Field] = []
        gold_span_labels: List[str] = []
        _dict_gold_spans = {}
        for ky, val in ner_gold_spans.items():
            gold_span_labels.append(val)
            gold_spans.append(SpanField(ky[0], ky[1], sequence))
            if val != 'O':
                _dict_gold_spans[ky] = val

        instance_fields["metadata"] = MetadataField({"words": [x.text for x in tokens],
                                                     "gold_spans": _dict_gold_spans})

        # contains all possible spans and their tags
        span_field = ListField(spans)
        instance_fields['spans'] = span_field
        instance_fields['span_labels'] = SequenceLabelField(span_labels, span_field, "span_tags")

        # only contain gold_spans and their tags
        # e.g. (0,0,O), (1,1,O), (2,3,PER), (4,4,O) for 'I am Donald Trump .'
        gold_span_field = ListField(gold_spans)
        instance_fields['gold_spans'] = gold_span_field
        instance_fields['gold_span_labels'] = SequenceLabelField(gold_span_labels,
                                                                 gold_span_field, "span_tags")

        # Add "tag label" to instance
        if self.tag_label == 'ner':
            instance_fields['tags'] = SequenceLabelField(coded_ner, sequence,
                                                         'token_tags')
        return Instance(instance_fields)
Exemplo n.º 15
0
    def text_to_instance(
            self,  # type: ignore
            tokens: List[Token],
            pos_tags: List[str] = None,
            ner_tags: List[str] = None) -> Instance:
        """
        Build an ``Instance`` whose ``tags`` field holds one score vector per
        token instead of a single label. ``pos_tags`` is accepted but not read.

        The NER tags are first recoded according to ``self.coding_scheme``:
        ``"BIOUL"`` converts them with ``to_bioul``, ``"B"`` collapses every
        non-``O`` tag to ``U-MNT``, anything else keeps them as given. Tags
        ending in ``-`` are then replaced by ``O``.

        For each token a vector of length ``len(self.alltags)`` is created:

        * a non-``O`` tag, or an ``O`` tag under the ``"trust_labels"``
          strategy, gets ``0`` at the tag's index and ``-10000`` elsewhere;
        * an ``O`` tag under the ``"uniform"`` strategy gets all zeros.

        Raises ``ConfigurationError`` for an ``O`` tag when ``self.strategy``
        is neither ``"trust_labels"`` nor ``"uniform"``.
        """

        # Recode the labels if necessary.
        scheme = self.coding_scheme
        if scheme == "BIOUL":
            recoded = to_bioul(ner_tags,
                               encoding=self._original_coding_scheme)
        elif scheme == "B":
            # Binary mention detection: everything that is not O is a mention.
            recoded = [t if t == "O" else "U-MNT" for t in ner_tags]
        else:
            # the default IOB1
            recoded = ner_tags

        # A tag that ends in a dash has no type; treat it as outside.
        coded_ner = ["O" if t[-1] == "-" else t for t in recoded]

        # pylint: disable=arguments-differ
        sequence = TextField(tokens, self._token_indexers)
        words = [token.text for token in tokens]
        metadata = MetadataField({"words": words, "orig_tags": coded_ner})
        hard_labels = SequenceLabelField(coded_ner, sequence,
                                         label_namespace="labels")
        instance_fields: Dict[str, Field] = {
            'tokens': sequence,
            "metadata": metadata,
            "donotuse": hard_labels,
        }

        def _peaked_scores(tag):
            # All mass on the given tag: 0 there, -10000 everywhere else.
            scores = np.full(len(self.alltags), -10000.0)
            scores[self.alltags[tag]] = 0
            return ArrayField(scores)

        tag_marginals = []
        for tag in coded_ner:
            # Non-O labels are always trusted; O labels only under "trust_labels".
            if tag != "O" or self.strategy == "trust_labels":
                tag_marginals.append(_peaked_scores(tag))
            elif self.strategy == "uniform":
                # Express ignorance over all possibilities.
                tag_marginals.append(ArrayField(np.zeros(len(self.alltags))))
            else:
                raise ConfigurationError("Unknown strategy: " +
                                         self.strategy)

        instance_fields['tags'] = ListField(tag_marginals)

        return Instance(instance_fields)
Exemplo n.º 16
0
    def text_to_instance(
            self,  # type: ignore
            tokens: List[Token],
            pos_tags: List[str] = None,
            chunk_tags: List[str] = None,
            ner_tags: List[str] = None,
            target_verb_lemma: str = None,
            target_verb_position: int = None,
            verb_sense: str = None,
            legal_args: List[str] = None,
            verb_annotation: List[str] = None,
            parse: str = None) -> Instance:
        """
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.

        All arguments are copied into the ``metadata`` field (the verb lemma
        under both ``target_verb_lemma`` and ``verb``). In addition the
        instance contains:

        * ``verb_indicator``: an array with one entry per token, ``1.0`` at
          ``target_verb_position`` and ``0.0`` elsewhere (all zeros when no
          position is given).
        * ``pos_tags`` / ``chunk_tags`` / ``ner_tags``: added for each label
          type listed in ``self.feature_labels``; chunk and NER tags are
          converted with ``to_bioul`` when ``self.coding_scheme`` is
          ``"BIOUL"``.
        * ``tags``: the SRL tags from ``get_bio_from_spans``, the POS tags or
          the chunk tags, depending on ``self.tag_label`` (``'srl'``,
          ``'pos'`` or ``'chunk'``; NER tags are never used as the target
          here).

        Raises
        ------
        ConfigurationError
            If a label type listed in ``self.feature_labels`` was not passed.
        """
        sequence = TextField(tokens, self._token_indexers)
        instance_fields: Dict[str, Field] = {'tokens': sequence}
        words = [x.text for x in tokens]
        instance_fields["metadata"] = MetadataField({
            "words": words,  # used in ai2's srl model
            "pos_tags": pos_tags,
            "chunk_tags": chunk_tags,
            # Fixed: this entry used to store the chunk tags.
            "ner_tags": ner_tags,
            "target_verb_lemma": target_verb_lemma,
            "target_verb_position": target_verb_position,
            "verb_annotation": verb_annotation,
            "verb_sense": verb_sense,
            "legal_args": legal_args,
            "verb": target_verb_lemma,  # used in ai2's srl model
            "parse": parse  # for constraints for the dev set srl
        })

        # Position of the gold verb predicate as an indicator vector. The
        # model may or may not use it, but the reader always sends it.
        # TODO Allennlp uses SequenceFeatureField for a indicator vector of the verb position (Find this)
        verb_indicator = np.zeros(len(tokens))
        if target_verb_position is not None:
            # Indexing a NumPy array with None adds an axis, so assigning
            # through it would set every token to 1.0; skip it instead.
            verb_indicator[target_verb_position] = 1.0
        instance_fields["verb_indicator"] = ArrayField(array=verb_indicator)

        # everyone follows the default IOB2 == BIO format here
        coded_srl = get_bio_from_spans(verb_annotation,
                                       year=self.year,
                                       core_args_only=self.core_args_only)
        coded_chunks = chunk_tags
        coded_ner = ner_tags

        # The SRL tags stay in BIO; only chunk and NER tags are re-encoded.
        if self.coding_scheme == "BIOUL":
            coded_chunks = to_bioul(chunk_tags,
                                    encoding=self._original_coding_scheme
                                    ) if chunk_tags is not None else None
            coded_ner = to_bioul(ner_tags,
                                 encoding=self._original_coding_scheme
                                 ) if ner_tags is not None else None

        # Add "feature labels" to instance
        if 'pos' in self.feature_labels:
            if pos_tags is None:
                raise ConfigurationError(
                    "Dataset reader was specified to use pos_tags as "
                    "features. Pass them to text_to_instance.")
            instance_fields['pos_tags'] = SequenceLabelField(
                pos_tags, sequence, "pos_tags")
        if 'chunk' in self.feature_labels:
            if coded_chunks is None:
                raise ConfigurationError(
                    "Dataset reader was specified to use chunk tags as "
                    "features. Pass them to text_to_instance.")
            instance_fields['chunk_tags'] = SequenceLabelField(
                coded_chunks, sequence, "chunk_tags")
        if 'ner' in self.feature_labels:
            if coded_ner is None:
                raise ConfigurationError(
                    "Dataset reader was specified to use NER tags as "
                    " features. Pass them to text_to_instance.")
            instance_fields['ner_tags'] = SequenceLabelField(
                coded_ner, sequence, "ner_tags")

        # Add "tag label" to instance
        if self.tag_label == 'srl' and coded_srl is not None:
            instance_fields['tags'] = SequenceLabelField(
                coded_srl, sequence, self.label_namespace)
        elif self.tag_label == 'pos' and pos_tags is not None:
            instance_fields['tags'] = SequenceLabelField(
                pos_tags, sequence, self.label_namespace)
        elif self.tag_label == 'chunk' and coded_chunks is not None:
            instance_fields['tags'] = SequenceLabelField(
                coded_chunks, sequence, self.label_namespace)

        return Instance(instance_fields)
Exemplo n.º 17
0
    def _read_dataset(self,
                      file_path: str,
                      count_only: bool = False,
                      keep_idx: Optional[Set[int]] = None):
        """
        Generate one instance per sentence found in ``file_path``.

        Parameters
        ----------
        file_path: str, required
            The path to the data file; it is resolved through ``cached_path``
            before being opened.
        count_only: bool, optional (default=``False``)
            If True, the integer ``1`` is yielded for every sentence instead of
            an instance, and ``keep_idx`` is not consulted. This is useful for
            quickly counting the sentences in the data file, since creating
            instances is relatively expensive.
        keep_idx: Set[int], optional (default=``None``)
            If not None, only sentences whose running index is in this set are
            turned into instances; the others are skipped.
        """
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, "r") as data_file:
            if count_only:
                logger.info("Counting instances in file at: %s", file_path)
            else:
                logger.info("Reading instances from lines in file at: %s", file_path)

            index = 0
            # groupby alternates between runs of divider lines and runs of
            # sentence lines; only the latter are of interest.
            for is_divider, lines in itertools.groupby(data_file, _is_divider):
                if is_divider:
                    continue
                if count_only:
                    yield 1
                    continue
                if keep_idx is not None and index not in keep_idx:
                    index += 1
                    continue

                # Transpose the rows into columns; the fields need lists, not tuples.
                rows = [line.strip().split() for line in lines]
                tokens, _, _, ner_tags = [list(column) for column in zip(*rows)]

                # Contextualize the tokens if a Contextualizer was provided.
                # TODO (nfliu): How can we make this batched?
                # Would make contextualizers that use the GPU much faster.
                token_representations = (
                    self._contextualizer([tokens])[0] if self._contextualizer else None)

                # Recode the labels if necessary.
                coded_ner = ner_tags
                if self._label_encoding == "BIOUL" and ner_tags is not None:
                    coded_ner = to_bioul(ner_tags)

                yield self.text_to_instance(
                    tokens,
                    token_representations,
                    coded_ner)
                index += 1