def instance_stream(
        file_path: str,
        tokenizer: PretrainedTransformerTokenizer,
        token_indexers: Dict[str, PretrainedTransformerIndexer],
        model_input_size: int = 512,
        normalize: bool = False) -> Generator[Instance, None, None]:
    file_names = os.listdir(file_path)
    for fn in file_names:
        if args.file_suffix not in fn:  # keep only files matching the configured suffix, e.g. '.json' or '.comm'
            continue

        if args.file_suffix == '.json':
            with open(os.path.join(file_path, fn)) as f:
                data = json.load(f)
            sentences = data['sentences']
            doc_key = data['doc_key']
        else:
            doc = CementDocument.from_communication_file(
                file_path=os.path.join(file_path, fn))
            sentences = list(doc.iterate_sentences())
            doc_key = str(doc.comm.id)

        if normalize:
            sentences = [[normalize_token(t) for t in sent]
                         for sent in sentences]

        tokenized_context_sentences: List[Tuple[List[List[Token]],
                                                List[Tuple[int, int]],
                                                List[str]]] = []
        for sent in sentences:
            tokenized_sent, offsets = tokenizer.intra_word_tokenize(sent)
            if len(tokenized_sent) > model_input_size:
                logger.info('Segmented long sentence.')
                tokenized_context_sentences.append(
                    (segment_long_sentence(tokenized_sent,
                                           model_input_size), offsets, sent))
            else:
                tokenized_context_sentences.append(
                    ([tokenized_sent], offsets, sent))

        for sent_id, (sent_token_list, sent_offsets,
                      sent) in enumerate(tokenized_context_sentences):
            for i, sent_tokens in enumerate(sent_token_list):
                # print(f'{[doc_key, str(sent_id)]}')
                yield construct_instance(tokens=sent_tokens,
                                         offsets=sent_offsets,
                                         key=[doc_key, str(sent_id)],
                                         segment=i,
                                         raw_sentence=sent,
                                         token_indexers=token_indexers)
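A minimal usage sketch for `instance_stream` (hypothetical paths and settings; it assumes the surrounding module defines `args`, `construct_instance`, `segment_long_sentence`, `normalize_token`, and `logger`):

from allennlp.data.token_indexers import PretrainedTransformerIndexer
from allennlp.data.tokenizers import PretrainedTransformerTokenizer

model_name = "bert-base-cased"
tokenizer = PretrainedTransformerTokenizer(model_name)
indexers = {"tokens": PretrainedTransformerIndexer(model_name)}

# Stream AllenNLP Instances, one per (possibly segmented) sentence.
for instance in instance_stream(file_path="data/docs",  # hypothetical directory
                                tokenizer=tokenizer,
                                token_indexers=indexers,
                                model_input_size=512,
                                normalize=True):
    print(instance)
    break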
Example #2
    def test_intra_word_tokenize(self):
        tokenizer = PretrainedTransformerTokenizer("bert-base-cased")

        sentence = "A, [MASK] AllenNLP sentence.".split(" ")
        expected_tokens = [
            "[CLS]",
            "A",
            ",",
            "[MASK]",
            "Allen",
            "##NL",
            "##P",
            "sentence",
            ".",
            "[SEP]",
        ]
        expected_offsets = [(1, 2), (3, 3), (4, 6), (7, 8)]
        tokens, offsets = tokenizer.intra_word_tokenize(sentence)
        tokens = [t.text for t in tokens]
        assert tokens == expected_tokens
        assert offsets == expected_offsets

        # sentence pair
        sentence_1 = "A, [MASK] AllenNLP sentence.".split(" ")
        sentence_2 = "A sentence.".split(" ")
        expected_tokens = [
            "[CLS]",
            "A",
            ",",
            "[MASK]",
            "Allen",
            "##NL",
            "##P",
            "sentence",
            ".",
            "[SEP]",
            "A",
            "sentence",
            ".",
            "[SEP]",
        ]
        expected_offsets_a = [(1, 2), (3, 3), (4, 6), (7, 8)]
        expected_offsets_b = [(10, 10), (11, 12)]
        tokens, offsets_a, offsets_b = tokenizer.intra_word_tokenize_sentence_pair(
            sentence_1, sentence_2)
        tokens = [t.text for t in tokens]
        assert tokens == expected_tokens
        assert offsets_a == expected_offsets_a
        assert offsets_b == expected_offsets_b
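The offsets returned by `intra_word_tokenize` are inclusive `(start, end)` indices into the wordpiece sequence, one pair per input word. A small sketch (using the same sentence as the test above) of mapping each word back to its wordpieces:

from allennlp.data.tokenizers import PretrainedTransformerTokenizer

tokenizer = PretrainedTransformerTokenizer("bert-base-cased")
words = "A, [MASK] AllenNLP sentence.".split(" ")
tokens, offsets = tokenizer.intra_word_tokenize(words)

for word, (start, end) in zip(words, offsets):
    pieces = [t.text for t in tokens[start:end + 1]]  # offsets are inclusive
    print(word, "->", pieces)
# e.g. "AllenNLP" -> ["Allen", "##NL", "##P"], matching expected_offsets (4, 6) above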
Example #3
    def test_intra_word_tokenize_whitespaces(self):
        tokenizer = PretrainedTransformerTokenizer("bert-base-cased")

        sentence = ["A,", " ", "[MASK]", "AllenNLP", "\u007f", "sentence."]
        expected_tokens = [
            "[CLS]",
            "A",
            ",",
            "[MASK]",
            "Allen",
            "##NL",
            "##P",
            "sentence",
            ".",
            "[SEP]",
        ]
        expected_offsets = [(1, 2), None, (3, 3), (4, 6), None, (7, 8)]
        tokens, offsets = tokenizer.intra_word_tokenize(sentence)
        tokens = [t.text for t in tokens]
        assert tokens == expected_tokens
        assert offsets == expected_offsets
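Words that produce no wordpieces (whitespace or control characters, as in the test above) get `None` in place of an offset pair, so downstream code has to guard against it. A minimal helper sketch:

from typing import List, Optional, Tuple

from allennlp.data.tokenizers import Token


def word_to_pieces(tokens: List[Token],
                   offsets: List[Optional[Tuple[int, int]]],
                   word_index: int) -> List[str]:
    """Return the wordpiece strings for one input word, or [] if it produced none."""
    span = offsets[word_index]
    if span is None:  # whitespace-only / control-character words have no wordpieces
        return []
    start, end = span
    return [t.text for t in tokens[start:end + 1]]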
Example #4
def make_coref_instance(
    sentences: List[List[str]],
    token_indexers: Dict[str, TokenIndexer],
    max_span_width: int,
    gold_clusters: Optional[List[List[Tuple[int, int]]]] = None,
    wordpiece_modeling_tokenizer: PretrainedTransformerTokenizer = None,
    max_sentences: int = None,
) -> Instance:
    """
    # Parameters

    sentences : `List[List[str]]`, required.
        A list of lists representing the tokenised words and sentences in the document.
    token_indexers : `Dict[str, TokenIndexer]`
        This is used to index the words in the document.  See :class:`TokenIndexer`.
    max_span_width : `int`, required.
        The maximum width of candidate spans to consider.
    gold_clusters : `Optional[List[List[Tuple[int, int]]]]`, optional (default = `None`)
        A list of all clusters in the document, represented as word spans with absolute indices
        in the entire document. Each cluster contains some number of spans, which can be nested
        and overlap. If there are exact matches between clusters, they will be resolved
        using `_canonicalize_clusters`.
    wordpiece_modeling_tokenizer : `PretrainedTransformerTokenizer`, optional (default = `None`)
        If not None, this dataset reader does subword tokenization using the supplied tokenizer
        and distributes the labels to the resulting wordpieces. All the modeling will then be based
        on wordpieces. If this is left as `None` (the default), the user is expected to use a
        `PretrainedTransformerMismatchedIndexer` and `PretrainedTransformerMismatchedEmbedder`,
        and the modeling will be at the word level.
    max_sentences : `int`, optional (default = `None`)
        The maximum number of sentences in each document to keep. By default all sentences are kept.

    # Returns

    An `Instance` containing the following `Fields`:
        text : `TextField`
            The text of the full document.
        spans : `ListField[SpanField]`
            A ListField containing the spans represented as `SpanFields`
            with respect to the document text.
        span_labels : `SequenceLabelField`, optional
            The id of the cluster each possible span belongs to, or -1 if it does not
            belong to a cluster. As these labels have variable length (it depends on how
            many spans we are considering), we represent them as a `SequenceLabelField`
            with respect to the `spans` `ListField`.
    """
    if max_sentences is not None and len(sentences) > max_sentences:
        sentences = sentences[:max_sentences]
        total_length = sum(len(sentence) for sentence in sentences)

        if gold_clusters is not None:
            new_gold_clusters = []

            for cluster in gold_clusters:
                new_cluster = []
                for mention in cluster:
                    if mention[1] < total_length:
                        new_cluster.append(mention)
                if new_cluster:
                    new_gold_clusters.append(new_cluster)

            gold_clusters = new_gold_clusters

    flattened_sentences = [
        _normalize_word(word) for sentence in sentences for word in sentence
    ]

    if wordpiece_modeling_tokenizer is not None:
        flat_sentences_tokens, offsets = wordpiece_modeling_tokenizer.intra_word_tokenize(
            flattened_sentences)
        flattened_sentences = [t.text for t in flat_sentences_tokens]
    else:
        flat_sentences_tokens = [Token(word) for word in flattened_sentences]

    text_field = TextField(flat_sentences_tokens, token_indexers)

    cluster_dict = {}
    if gold_clusters is not None:
        gold_clusters = _canonicalize_clusters(gold_clusters)

        if wordpiece_modeling_tokenizer is not None:
            for cluster in gold_clusters:
                for mention_id, mention in enumerate(cluster):
                    start = offsets[mention[0]][0]
                    end = offsets[mention[1]][1]
                    cluster[mention_id] = (start, end)

        for cluster_id, cluster in enumerate(gold_clusters):
            for mention in cluster:
                cluster_dict[tuple(mention)] = cluster_id

    spans: List[Field] = []
    span_labels: Optional[
        List[int]] = [] if gold_clusters is not None else None

    sentence_offset = 0
    for sentence in sentences:
        for start, end in enumerate_spans(sentence,
                                          offset=sentence_offset,
                                          max_span_width=max_span_width):
            if wordpiece_modeling_tokenizer is not None:
                start = offsets[start][0]
                end = offsets[end][1]

                # `enumerate_spans` uses word-level width limit; here we apply it to wordpieces
                # We have to do this check here because we use a span width embedding that has
                # only `max_span_width` entries, and since we are doing wordpiece
                # modeling, the span width embedding operates on wordpiece lengths. So a check
                # here is necessary or else we wouldn't know how many entries there would be.
                if end - start + 1 > max_span_width:
                    continue
                # We also don't generate spans that contain special tokens
                if start < wordpiece_modeling_tokenizer.num_added_start_tokens:
                    continue
                if (end >= len(flat_sentences_tokens) -
                        wordpiece_modeling_tokenizer.num_added_end_tokens):
                    continue

            if span_labels is not None:
                if (start, end) in cluster_dict:
                    span_labels.append(cluster_dict[(start, end)])
                else:
                    span_labels.append(-1)

            spans.append(SpanField(start, end, text_field))
        sentence_offset += len(sentence)

    span_field = ListField(spans)

    metadata: Dict[str, Any] = {"original_text": flattened_sentences}
    if gold_clusters is not None:
        metadata["clusters"] = gold_clusters
    metadata_field = MetadataField(metadata)

    fields: Dict[str, Field] = {
        "text": text_field,
        "spans": span_field,
        "metadata": metadata_field,
    }
    if span_labels is not None:
        fields["span_labels"] = SequenceLabelField(span_labels, span_field)

    return Instance(fields)
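A minimal sketch of calling `make_coref_instance` with word-level modeling (no `wordpiece_modeling_tokenizer`), as suggested by the docstring; sentences and cluster indices are illustrative only:

from allennlp.data.token_indexers import PretrainedTransformerMismatchedIndexer

sentences = [["Barack", "Obama", "went", "to", "Paris", "."],
             ["He", "loved", "it", "."]]
# Absolute word indices over the whole document: "Barack Obama" corefers with "He".
gold_clusters = [[(0, 1), (6, 6)]]
indexers = {"tokens": PretrainedTransformerMismatchedIndexer("bert-base-cased")}

instance = make_coref_instance(
    sentences=sentences,
    token_indexers=indexers,
    max_span_width=5,
    gold_clusters=gold_clusters,
)
print(sorted(instance.fields.keys()))  # ['metadata', 'span_labels', 'spans', 'text']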
Example #5
def make_coref_instance(
    sentences: List[List[str]],
    token_indexers: Dict[str, TokenIndexer],
    max_span_width: int,
    document_id: Optional[str] = None,
    words: List[str] = None,
    gold_clusters: Optional[List[List[Tuple[int, int]]]] = None,
    srl_frames: Optional[List[Tuple[int, List[Tuple[int, int, str]]]]] = None,
    include_srl: bool = False,
    named_entities: Optional[List[str]] = None,
    named_entity_spans: Optional[List[Tuple[int, int, str]]] = None,
    include_ner: bool = False,
    include_coref: bool = True,
    wordpiece_modeling_tokenizer: PretrainedTransformerTokenizer = None,
    max_sentences: int = None,
    remove_singleton_clusters: bool = True,
    span_label_map: Dict[Tuple[int,int], str] = None,
    language: str = None,
    sentence_objects: List[OntonotesSentence] = None,
    parallel_sentences: List[List[str]] = None,
) -> Instance:

    """
    # Parameters

    sentences : `List[List[str]]`, required.
        A list of lists representing the tokenised words and sentences in the document.
    token_indexers : `Dict[str, TokenIndexer]`
        This is used to index the words in the document.  See :class:`TokenIndexer`.
    max_span_width : `int`, required.
        The maximum width of candidate spans to consider.
    gold_clusters : `Optional[List[List[Tuple[int, int]]]]`, optional (default = `None`)
        A list of all clusters in the document, represented as word spans with absolute indices
        in the entire document. Each cluster contains some number of spans, which can be nested
        and overlap. If there are exact matches between clusters, they will be resolved
        using `_canonicalize_clusters`.
    wordpiece_modeling_tokenizer : `PretrainedTransformerTokenizer`, optional (default = `None`)
        If not None, this dataset reader does subword tokenization using the supplied tokenizer
        and distributes the labels to the resulting wordpieces. All the modeling will then be based
        on wordpieces. If this is left as `None` (the default), the user is expected to use a
        `PretrainedTransformerMismatchedIndexer` and `PretrainedTransformerMismatchedEmbedder`,
        and the modeling will be at the word level.
    max_sentences : `int`, optional (default = `None`)
        The maximum number of sentences in each document to keep. By default all sentences are kept.
    remove_singleton_clusters : `bool`, optional (default = `True`)
        Some datasets contain singleton clusters (mentions with no coreferents). This option
        allows them to be removed.

    # Returns

    An `Instance` containing the following `Fields`:

        text : `TextField`
            The text of the full document.
        spans : `ListField[SpanField]`
            A ListField containing the spans represented as `SpanFields`
            with respect to the document text.
        span_labels : `SequenceLabelField`, optional
            The id of the cluster each possible span belongs to, or -1 if it does not
            belong to a cluster. As these labels have variable length (it depends on how
            many spans we are considering), we represent them as a `SequenceLabelField`
            with respect to the `spans` `ListField`.
    """
    if max_sentences is not None and len(sentences) > max_sentences:
        sentences = sentences[:max_sentences]
        total_length = sum(len(sentence) for sentence in sentences)

        if gold_clusters is not None:
            new_gold_clusters = []

            for cluster in gold_clusters:
                new_cluster = []
                for mention in cluster:
                    if mention[1] < total_length:
                        new_cluster.append(mention)
                if new_cluster:
                    new_gold_clusters.append(new_cluster)

            gold_clusters = new_gold_clusters

    flattened_sentences = [_normalize_word(word) for sentence in sentences for word in sentence]
    sentences = [[_normalize_word(word) for word in sentence] for sentence in sentences]
    if parallel_sentences is not None:
        parallel_sentences = [[_normalize_word(word) for word in sentence] for sentence in parallel_sentences]
        flattened_parallel_sentences = [word for sentence in parallel_sentences for word in sentence]
    if words is not None:
        flattened_sentences = [_normalize_word(word) for word in words]
    if language is not None and language == "arabic":
        flattened_sentences = [clean_arabic_text(word.split("#")[0]) for word in flattened_sentences]
        sentences = [[clean_arabic_text(word.split("#")[0]) for word in sentence] for sentence in sentences]
        if parallel_sentences is not None:
            parallel_sentences = [[clean_arabic_text(word.split("#")[0]) for word in sentence] for sentence in parallel_sentences]
            flattened_parallel_sentences = [word for sentence in parallel_sentences for word in sentence]

    if wordpiece_modeling_tokenizer is not None:
        flat_sentences_tokens, offsets = wordpiece_modeling_tokenizer.intra_word_tokenize(
            flattened_sentences
        )
        flattened_sentences = [t.text for t in flat_sentences_tokens]
        if parallel_sentences is not None:
            # Tokenize the parallel text (not the main text), and keep its offsets separate
            # so the `offsets` used below to remap gold clusters are not overwritten.
            flat_parallel_sentences_tokens, _parallel_offsets = wordpiece_modeling_tokenizer.intra_word_tokenize(
                flattened_parallel_sentences
            )
    else:
        flat_sentences_tokens = [Token(word) for word in flattened_sentences]
        if parallel_sentences is not None:
            flat_parallel_sentences_tokens = [Token(word) for word in flattened_parallel_sentences]

    text_field = TextField(flat_sentences_tokens, token_indexers)

    cluster_dict = {}
    if gold_clusters is not None:
        gold_clusters = _canonicalize_clusters(gold_clusters)
        if remove_singleton_clusters:
            gold_clusters = [cluster for cluster in gold_clusters if len(cluster) > 1]

        if wordpiece_modeling_tokenizer is not None:
            for cluster in gold_clusters:
                for mention_id, mention in enumerate(cluster):
                    start = offsets[mention[0]][0]
                    end = offsets[mention[1]][1]
                    cluster[mention_id] = (start, end)

        for cluster_id, cluster in enumerate(gold_clusters):
            for mention in cluster:
                cluster_dict[tuple(mention)] = cluster_id

    spans: List[Field] = []
    span_index_map: Dict[Tuple[int, int], int] = {}
    token_same_sentence_spans: Dict[int, List[Tuple[int, int]]] = {}
    token_sentence_start_end_map: Dict[int, Tuple[int, int]] = {}
    sentence_index_span_map: Dict[int, Tuple[int, int]] = {}
    span_labels: Optional[List[Union[int,str]]] = [] if gold_clusters is not None else None

    sentence_offset = 0
    sentence_offsets = []
    for sent_index, sentence in enumerate(sentences):
        sentence_spans = []
        sentence_index_span_map[sent_index] = []
        for start, end in enumerate_spans(
            sentence, offset=sentence_offset, max_span_width=max_span_width
        ):
            if wordpiece_modeling_tokenizer is not None:
                start = offsets[start][0]
                end = offsets[end][1]

                # `enumerate_spans` uses word-level width limit; here we apply it to wordpieces
                # We have to do this check here because we use a span width embedding that has
                # only `max_span_width` entries, and since we are doing wordpiece
                # modeling, the span width embedding operates on wordpiece lengths. So a check
                # here is necessary or else we wouldn't know how many entries there would be.
                if end - start + 1 > max_span_width:
                    continue
                # We also don't generate spans that contain special tokens
                if start < len(wordpiece_modeling_tokenizer.single_sequence_start_tokens):
                    continue
                if end >= len(flat_sentences_tokens) - len(
                    wordpiece_modeling_tokenizer.single_sequence_end_tokens
                ):
                    continue

            if span_labels is not None:
                if (start, end) in cluster_dict:
                    span_labels.append(cluster_dict[(start, end)])
                else:
                    span_labels.append(-1)
            if span_label_map is not None:
                if (start, end) in span_label_map:
                    span_labels[-1] = span_label_map[(start, end)]
                else:
                    span_labels[-1] = "O"

            if end <= len(flat_sentences_tokens)-1:
                span = (start, end)
                span_index_map[span] = len(spans)
                sentence_spans.append(len(spans))
                spans.append(SpanField(start, end, text_field))
                sentence_index_span_map[sent_index].append((start, end))
        for i in range(len(sentence)):
            token_same_sentence_spans[i+sentence_offset] = sentence_spans
            token_sentence_start_end_map[i+sentence_offset] = (sentence_offset, sentence_offset+len(sentence)-1)
        sentence_offsets.append(sentence_offset)
        sentence_offset += len(sentence)

    if len(spans) == 0:
        return None
    span_field = ListField(spans)

    metadata: Dict[str, Any] = {
        "original_text": flattened_sentences,
        "sentence_offsets": sentence_offsets,
        "sentences": sentences,
        "sentence_index_span_map": sentence_index_span_map,
        "span_index_map": span_index_map,
    }
    if gold_clusters is not None:
        metadata["clusters"] = gold_clusters
    if language is not None:
        metadata["language"] = language
    if sentence_objects is not None:
        metadata["sentence_objects"] = sentence_objects
    metadata_field = MetadataField(metadata)

    fields: Dict[str, Field] = {
        "text": text_field,
        "spans": span_field,
        "metadata": metadata_field,
    }
    if span_labels is not None and include_coref:
        fields["span_labels"] = SequenceLabelField(span_labels, span_field, label_namespace="span_labels")
    if parallel_sentences is not None:
        fields["parallel_text"] = TextField(flat_parallel_sentences_tokens, token_indexers)
    if include_srl and srl_frames is not None:
        predicate_span_pairs = []
        pair_labels = []
        filtered_srl_frames = []
        for predicate_index, arguments in srl_frames:
            filtered_arguments = []
            covered_spans = set()
            for (start, end, arg_type) in arguments:
                if (start, end) in span_index_map and (start, end) not in covered_spans:
                    if start == predicate_index == end:
                        continue
                    predicate_span_pairs.append((predicate_index, span_index_map[(start, end)]))
                    pair_labels.append(arg_type)
                    filtered_arguments.append((start, end, arg_type))
                    covered_spans.add((start, end))
            # Keep the frame's non-verb arguments for metadata, even if some were filtered above.
            arguments_without_predicate = [arg for arg in arguments if arg[-1] != "V"]
            if len(arguments_without_predicate) > 0:
                filtered_srl_frames.append((predicate_index, arguments_without_predicate))
            # Diagnostic output: a frame contains duplicate argument spans.
            if len(set([arg[:2] for arg in arguments])) < len([arg[:2] for arg in arguments]):
                print(predicate_index, arguments)
                print(flattened_sentences)
            # Diagnostic output: some frames were dropped (checked once, at the last predicate).
            if len(filtered_srl_frames) < len(srl_frames) and predicate_index == srl_frames[-1][0]:
                print(predicate_index, arguments, filtered_arguments)
                print('B', srl_frames, filtered_srl_frames)
                print(flattened_sentences)
        # if len(predicate_span_pairs) > 0:
        fields["srl_labels"] = AsymmetricAdjacencyField(predicate_span_pairs, text_field, span_field, labels=pair_labels, label_namespace="srl_labels")
        srl_seq_label_fields = []
        srl_seq_labels = []
        srl_seq_indices = []
        srl_seq_words = []
        predicate_indices = []
        max_seq_length = 0
        for frame in srl_frames:
            predicate_index, arguments = frame
            if predicate_index >= len(flat_sentences_tokens):
                continue
            sentence_start, sentence_end = token_sentence_start_end_map[predicate_index]
            seq_labels = ["O" for _ in range(sentence_start, sentence_end+1)]
            seq_labels[predicate_index-sentence_start] = "B-V"
            for (start, end, arg_type) in arguments:
                if any([seq_labels[idx-sentence_start] != "O" for idx in range(start, end+1)]):
                    continue
                seq_labels[start-sentence_start] = "B-"+arg_type
                for i in range(start+1, end+1):
                    seq_labels[i-sentence_start] = "I-"+arg_type
            srl_seq_indices.append(list(range(sentence_start, sentence_end+1)))
            sentence_field = TextField(flat_sentences_tokens[sentence_start:sentence_end+1], token_indexers)
            seq_label_field = SequenceLabelField(seq_labels, sentence_field, label_namespace="srl_seq_labels")
            srl_seq_label_fields.append(seq_label_field)
            predicate_indices.append(predicate_index)
            srl_seq_labels.append(seq_labels)
            srl_seq_words.append([word for word in flattened_sentences[sentence_start:sentence_end+1]])
            max_seq_length = max(max_seq_length, sentence_end+1-sentence_start)
        if len(srl_seq_label_fields) > 0 and named_entity_spans is None:
            fields["srl_seq_labels"] = ListField(srl_seq_label_fields)
            srl_seq_indices = [seq+[-1 for _ in range(max_seq_length-len(seq))] for seq in srl_seq_indices]
            fields["srl_seq_indices"] = ArrayField(np.array(srl_seq_indices, dtype=np.int64), dtype=np.int64, padding_value=-1)
            fields["srl_seq_predicates"] = ArrayField(np.array(predicate_indices, dtype=np.int64), dtype=np.int64, padding_value=-1)
            metadata["srl_seq_labels"] = srl_seq_labels
            metadata["srl_seq_words"] = srl_seq_words
        metadata["srl_frames"] = filtered_srl_frames
        # Mask linking each token to every candidate span in its sentence.
        word_span_coincidence = []
        for token in range(len(flat_sentences_tokens)):
            for span_index in token_same_sentence_spans[token]:
                word_span_coincidence.append((token, span_index))
        fields["word_span_mask"] = AsymmetricAdjacencyField(word_span_coincidence, text_field, span_field, padding_value=0)
    if include_ner and named_entities is not None:
        remap = {"B-OTHER": "O", "I-OTHER": "O", "B-NUMBER": "B-QUANTITY", "I-NUMBER": "I-QUANTITY"}
        named_entities = [ent if ent not in remap else remap[ent] for ent in named_entities]
        if wordpiece_modeling_tokenizer is not None:
            converted_named_entities = ["O" for _ in flat_sentences_tokens]
            for index, ne in enumerate(named_entities):
                if ne != "O":
                    converted_named_entities[offsets[index][0]] = ne
                    for i in range(offsets[index][0]+1, offsets[index][1]+1):
                        converted_named_entities[i] = "I-"+ne[2:]
            named_entities = converted_named_entities
        fields["ner_seq_labels"] = SequenceLabelField(named_entities[:len(flat_sentences_tokens)], text_field, label_namespace="ner_seq_labels")
        metadata["ner_seq_labels"] = named_entities[:len(flat_sentences_tokens)]
        if named_entity_spans is not None:
            ner_span_label_map = {(start, end): label for (start, end, label) in named_entity_spans}
            ner_span_labels = [None for _ in span_index_map]
            for span in span_index_map:
                if span in ner_span_label_map:
                    ner_span_labels[span_index_map[span]] = ner_span_label_map[span]
                else:
                    ner_span_labels[span_index_map[span]] = "None"
            fields["ner_span_labels"] = SequenceLabelField(ner_span_labels, span_field, label_namespace="ner_span_labels")
    metadata["document_id"] = document_id

    return Instance(fields)
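A sketch of calling this extended variant with SRL enabled (frame and argument indices are illustrative; it assumes the module's extra field classes such as `AsymmetricAdjacencyField`, used above, are available):

from allennlp.data.token_indexers import PretrainedTransformerMismatchedIndexer

sentences = [["Barack", "Obama", "went", "to", "Paris", "."],
             ["He", "loved", "it", "."]]
gold_clusters = [[(0, 1), (6, 6)]]
# One frame per predicate: (predicate_index, [(arg_start, arg_end, arg_type), ...]).
srl_frames = [(2, [(0, 1, "ARG0"), (4, 4, "ARG1"), (2, 2, "V")])]
indexers = {"tokens": PretrainedTransformerMismatchedIndexer("bert-base-cased")}

instance = make_coref_instance(
    sentences=sentences,
    token_indexers=indexers,
    max_span_width=5,
    document_id="doc-0",
    gold_clusters=gold_clusters,
    srl_frames=srl_frames,
    include_srl=True,
)
print(sorted(instance.fields.keys()))
# includes srl_labels, srl_seq_* and word_span_mask fields
# alongside text, spans, span_labels, and metadata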