Example #1
File: test_data.py Project: zwl0591/flair
def test_tagged_corpus_statistics():
    train_sentence = Sentence("I love Berlin.",
                              labels=[Label("class_1")],
                              use_tokenizer=segtok_tokenizer)
    dev_sentence = Sentence("The sun is shining.",
                            labels=[Label("class_2")],
                            use_tokenizer=segtok_tokenizer)
    test_sentence = Sentence("Berlin is sunny.",
                             labels=[Label("class_1")],
                             use_tokenizer=segtok_tokenizer)

    class_to_count_dict = Corpus._get_class_to_count(
        [train_sentence, dev_sentence, test_sentence])

    assert "class_1" in class_to_count_dict
    assert "class_2" in class_to_count_dict
    assert 2 == class_to_count_dict["class_1"]
    assert 1 == class_to_count_dict["class_2"]

    tokens_in_sentences = Corpus._get_tokens_per_sentence(
        [train_sentence, dev_sentence, test_sentence])

    assert 3 == len(tokens_in_sentences)
    assert 4 == tokens_in_sentences[0]
    assert 5 == tokens_in_sentences[1]
    assert 4 == tokens_in_sentences[2]
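
These test snippets are shown without their import headers. A minimal preamble that would make this one runnable (module paths moved around across flair releases, so treat the exact paths as an assumption):

from flair.data import Corpus, Label, Sentence, segtok_tokenizer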
Example #2
    def _obtain_labels(
            self, feature,
            sentences) -> (List[List[Label]], List[List[List[Label]]]):
        """
        Returns a tuple of two lists:
         - The first list corresponds to the most likely `Label` per token in each sentence.
         - The second list contains a probability distribution over all `Labels` for each token
           in a sentence for all sentences.
        """
        lengths = [len(sentence.tokens) for sentence in sentences]
        tags = []
        all_tags = []
        for feats, length in zip(feature, lengths):
            if self.use_crf:
                confidences, tag_seq, scores = self._viterbi_decode(
                    feats[:length])
            else:
                tag_seq = []
                confidences = []
                scores = []
                for backscore in feats[:length]:
                    softmax = F.softmax(backscore, dim=0)
                    _, idx = torch.max(backscore, 0)
                    prediction = idx.item()
                    tag_seq.append(prediction)
                    confidences.append(softmax[prediction].item())
                    scores.append(softmax.tolist())
            tags.append([
                Label(self.tag_dictionary.get_item_for_index(tag), conf)
                for conf, tag in zip(confidences, tag_seq)
            ])
            all_tags.append([[
                Label(self.tag_dictionary.get_item_for_index(score_id), score)
                for score_id, score in enumerate(score_dist)
            ] for score_dist in scores])
        return tags, all_tags
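
The non-CRF branch above derives a confidence per token by softmaxing its emission scores and reading off the probability of the argmax tag. A self-contained sketch of that step, using dummy tensors instead of flair features:

import torch
import torch.nn.functional as F

# Emission scores for 4 tokens over 5 tags (random stand-ins).
emissions = torch.randn(4, 5)
probs = F.softmax(emissions, dim=1)                 # per-token distribution over tags
confidences, predictions = torch.max(probs, dim=1)
# predictions[i] is the most likely tag index for token i,
# confidences[i] the softmax probability assigned to it.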
Example #3
def test_tagged_corpus_statistics():
    train_sentence = Sentence('I love Berlin.',
                              labels=[Label('class_1')],
                              use_tokenizer=True)
    dev_sentence = Sentence('The sun is shining.',
                            labels=[Label('class_2')],
                            use_tokenizer=True)
    test_sentence = Sentence('Berlin is sunny.',
                             labels=[Label('class_1')],
                             use_tokenizer=True)

    class_to_count_dict = TaggedCorpus._get_class_to_count(
        [train_sentence, dev_sentence, test_sentence])

    assert ('class_1' in class_to_count_dict)
    assert ('class_2' in class_to_count_dict)
    assert (2 == class_to_count_dict['class_1'])
    assert (1 == class_to_count_dict['class_2'])

    tokens_in_sentences = TaggedCorpus._get_tokens_per_sentence(
        [train_sentence, dev_sentence, test_sentence])

    assert (3 == len(tokens_in_sentences))
    assert (4 == tokens_in_sentences[0])
    assert (5 == tokens_in_sentences[1])
    assert (4 == tokens_in_sentences[2])
Example #4
    def _obtain_labels(
        self,
        feature: torch.Tensor,
        sentences: List[Sentence],
        transitions: Optional[Parameter],
        get_all_tags: bool = False,
    ) -> (List[List[Label]], List[List[List[Label]]]):
        """
        Returns a tuple of two lists:
         - The first list corresponds to the most likely `Label` per token in each sentence.
         - The second list contains a probability distribution over all `Labels` for each token
           in a sentence for all sentences.
        """

        lengths: List[int] = [len(sentence.tokens) for sentence in sentences]

        tags = []
        all_tags = []
        feature = feature.detach().cpu()

        for feats, length in zip(feature, lengths):
            if self.use_crf:
                confidences, tag_seq, scores = self._viterbi_decode(
                    feats[:length], all_scores=get_all_tags, transitions=transitions
                )
            else:
                tag_seq = []
                confidences = []
                scores = []
                for backscore in feats[:length]:
                    softmax = F.softmax(backscore, dim=0)
                    _, idx = torch.max(backscore, 0)
                    prediction = idx.item()
                    tag_seq.append(prediction)
                    confidences.append(softmax[prediction].item())
                    scores.append(softmax.tolist())

            tags.append(
                [
                    Label(self.tag_dictionary.get_item_for_index(tag), conf)
                    for conf, tag in zip(confidences, tag_seq)
                ]
            )

            if get_all_tags:
                all_tags.append(
                    [
                        [
                            Label(
                                self.tag_dictionary.get_item_for_index(score_id), score
                            )
                            for score_id, score in enumerate(score_dist)
                        ]
                        for score_dist in scores
                    ]
                )

        return tags, all_tags
Example #5
def test_label_set_confidence():
    label = Label(data_point=None, value="class_1", score=3.2)

    assert 3.2 == label.score
    assert "class_1" == label.value

    label.score = 0.2

    assert 0.2 == label.score
Example #6
def test_label_set_confidence():
    label = Label('class_1', 3.2)

    assert (1.0 == label.score)
    assert ('class_1' == label.value)

    label.score = 0.2

    assert (0.2 == label.score)
Example #7
def test_label_set_confidence():
    label = Label("class_1", 3.2)

    assert 1.0 == label.score
    assert "class_1" == label.value

    label.score = 0.2

    assert 0.2 == label.score
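
Examples #6 and #7 build a Label with score 3.2 yet assert a score of 1.0: in the flair version those tests target, a score outside [0.0, 1.0] falls back to full confidence, whereas example #5 shows a later API that keeps the raw value. A minimal sketch of a setter with the older behavior (not flair's actual code):

class Label:
    def __init__(self, value: str, score: float = 1.0):
        self.value = value
        self.score = score

    @property
    def score(self) -> float:
        return self._score

    @score.setter
    def score(self, score: float):
        # out-of-range scores fall back to 1.0, matching the asserts above
        self._score = score if 0.0 <= score <= 1.0 else 1.0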
Example #8
def test_tagged_corpus_make_label_dictionary():
    sentence_1 = Sentence('sentence 1', labels=[Label('class_1')])
    sentence_2 = Sentence('sentence 2', labels=[Label('class_2')])
    sentence_3 = Sentence('sentence 3', labels=[Label('class_1')])
    corpus = Corpus([sentence_1, sentence_2, sentence_3], [], [])
    label_dict = corpus.make_label_dictionary()
    assert (2 == len(label_dict))
    assert ('<unk>' not in label_dict.get_items())
    assert ('class_1' in label_dict.get_items())
    assert ('class_2' in label_dict.get_items())
Example #9
    def _obtain_labels(
        self,
        feature: torch.Tensor,
        batch_sentences: List[Sentence],
        transitions: Optional[np.ndarray],
        get_all_tags: bool,
    ) -> (List[List[Label]], List[List[List[Label]]]):
        """
        Returns a tuple of two lists:
         - The first list corresponds to the most likely `Label` per token in each sentence.
         - The second list contains a probability distribution over all `Labels` for each token
           in a sentence for all sentences.
        """

        lengths: List[int] = [
            len(sentence.tokens) for sentence in batch_sentences
        ]

        tags = []
        all_tags = []
        feature = feature.cpu()
        if self.use_crf:
            feature = feature.numpy()
        else:
            for index, length in enumerate(lengths):
                feature[index, length:] = 0
            softmax_batch = F.softmax(feature, dim=2).cpu()
            scores_batch, prediction_batch = torch.max(softmax_batch, dim=2)
            feature = zip(softmax_batch, scores_batch, prediction_batch)

        for feats, length in zip(feature, lengths):
            if self.use_crf:
                confidences, tag_seq, scores = self._viterbi_decode(
                    feats=feats[:length],
                    transitions=transitions,
                    all_scores=get_all_tags,
                )
            else:
                softmax, score, prediction = feats
                confidences = score[:length].tolist()
                tag_seq = prediction[:length].tolist()
                scores = softmax[:length].tolist()

            tags.append([
                Label(self.tag_dictionary.get_item_for_index(tag), conf)
                for conf, tag in zip(confidences, tag_seq)
            ])

            if get_all_tags:
                all_tags.append([[
                    Label(self.tag_dictionary.get_item_for_index(score_id),
                          score) for score_id, score in enumerate(score_dist)
                ] for score_dist in scores])

        return tags, all_tags
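
In the non-CRF path above, the whole batch is softmaxed at once after zeroing every row beyond a sentence's length. A standalone sketch of that masking step with dummy tensors:

import torch
import torch.nn.functional as F

batch = torch.randn(2, 6, 5)         # 2 sentences, padded to 6 tokens, 5 tags
lengths = [4, 6]
for index, length in enumerate(lengths):
    batch[index, length:] = 0        # blank out emission scores at padding positions
softmax_batch = F.softmax(batch, dim=2)
scores_batch, prediction_batch = torch.max(softmax_batch, dim=2)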
Example #10
File: test_data.py Project: woocoder/flair
def test_label_set_confidence():
    label = Label('class_1', 3.2)

    assert (0.0 == label.confidence)
    assert ('class_1' == label.name)

    label.confidence = 0.2

    assert (0.2 == label.confidence)

    with pytest.raises(ValueError):
        label.name = ''
Example #11
def test_tagged_corpus_make_label_dictionary():
    sentence_1 = Sentence("sentence 1", labels=[Label("class_1")])
    sentence_2 = Sentence("sentence 2", labels=[Label("class_2")])
    sentence_3 = Sentence("sentence 3", labels=[Label("class_1")])

    corpus: Corpus = Corpus([sentence_1, sentence_2, sentence_3], [], [])

    label_dict = corpus.make_label_dictionary()

    assert 2 == len(label_dict)
    assert "<unk>" not in label_dict.get_items()
    assert "class_1" in label_dict.get_items()
    assert "class_2" in label_dict.get_items()
Example #12
    def forward_pass(
        self,
        sentences: Union[List[Sentence], Sentence],
        return_label_candidates: bool = False,
    ):
        if not isinstance(sentences, list):
            sentences = [sentences]

        self.embeddings.embed(sentences)

        names = self.embeddings.get_names()

        # get all tokens in this mini-batch
        all_tokens = [token for sentence in sentences for token in sentence]

        all_embeddings = [token.get_embedding(names) for token in all_tokens]

        embedding_tensor = torch.stack(all_embeddings)

        scores = self.linear(embedding_tensor)

        labels = [[token.get_tag(self.label_type).value]
                  for token in all_tokens]

        if return_label_candidates:
            empty_label_candidates = [
                Label(value=None, score=0.0) for token in all_tokens
            ]
            return scores, labels, all_tokens, empty_label_candidates

        return scores, labels
Example #13
    def _obtain_labels(self, feature, lengths) -> List[List[List[Label]]]:
        tags = []

        for feats, length in zip(feature, lengths):
            if self.use_crf:
                loc_tag = []
                for i, left_right in enumerate(self.feature_sizes):
                    left, right = left_right
                    local_feats = feats[:, left:right]
                    confidences, tag_seq = self._viterbi_decode(
                        local_feats[:length], i)

                    loc_tag.append([
                        Label(self.tag_dictionaries[i].get_item_for_index(tag),
                              conf) for conf, tag in zip(confidences, tag_seq)
                    ])
                tags.append(loc_tag)
            else:
                tag_seq = []
                confidences = []
                for backscore in feats[:length]:
                    softmax = F.softmax(backscore, dim=0)
                    _, idx = torch.max(backscore, 0)
                    prediction = idx.item()
                    tag_seq.append(prediction)
                    confidences.append(softmax[prediction].item())

            # tags.append([Label(self.tag_dictionary.get_item_for_index(tag), conf)
            #              for conf, tag in zip(confidences, tag_seq)])

        return tags
Example #14
    def forward_pass(
        self,
        sentences: Union[List[DataPoint], DataPoint],
        return_label_candidates: bool = False,
    ):

        # embed sentences
        self.document_embeddings.embed(sentences)

        # make tensor for all embedded sentences in batch
        embedding_names = self.document_embeddings.get_names()
        text_embedding_list = [
            sentence.get_embedding(embedding_names).unsqueeze(0)
            for sentence in sentences
        ]
        text_embedding_tensor = torch.cat(text_embedding_list,
                                          0).to(flair.device)

        # send through decoder to get logits
        scores = self.decoder(text_embedding_tensor)

        labels = []
        for sentence in sentences:
            labels.append([
                label.value for label in sentence.get_labels(self.label_type)
            ])

        # minimal return is scores and labels
        return_tuple = (scores, labels)

        if return_label_candidates:
            label_candidates = [Label(value=None) for sentence in sentences]
            return_tuple += (sentences, label_candidates)

        return return_tuple
Example #15
    def _obtain_labels(self, feature, sentences) -> List[List[Label]]:

        sentences.sort(key=lambda x: len(x), reverse=True)

        lengths: List[int] = [len(sentence.tokens) for sentence in sentences]

        tags = []

        for feats, length in zip(feature, lengths):
            if self.use_crf:
                confidences, tag_seq = self._viterbi_decode(feats[:length])
            else:
                tag_seq = []
                confidences = []
                for backscore in feats[:length]:
                    softmax = F.softmax(backscore, dim=0)
                    _, idx = torch.max(backscore, 0)
                    prediction = idx.item()
                    tag_seq.append(prediction)
                    confidences.append(softmax[prediction].item())

            tags.append(
                [
                    Label(self.tag_dictionary.get_item_for_index(tag), conf)
                    for conf, tag in zip(confidences, tag_seq)
                ]
            )

        return tags
Example #16
    def predict(
        self,
        sentences: Union[Sentence, List[Sentence]],
        mini_batch_size: int = 32,
        embedding_storage_mode="none",
    ) -> List[Sentence]:

        with torch.no_grad():
            if type(sentences) is Sentence:
                sentences = [sentences]

            filtered_sentences = self._filter_empty_sentences(sentences)

            # remove previous embeddings
            store_embeddings(filtered_sentences, "none")

            batches = [
                filtered_sentences[x : x + mini_batch_size]
                for x in range(0, len(filtered_sentences), mini_batch_size)
            ]

            for batch in batches:
                scores = self.forward(batch)

                for (sentence, score) in zip(batch, scores.tolist()):
                    sentence.labels = [Label(value=str(score[0]))]

                # clearing token embeddings to save memory
                store_embeddings(batch, storage_mode=embedding_storage_mode)

            return sentences
Example #17
    def _predict_label_prob(self, label_scores) -> List[Label]:
        softmax = torch.nn.functional.softmax(label_scores, dim=0)
        label_probs = []
        for idx, conf in enumerate(softmax):
            label = self.label_dictionary.get_item_for_index(idx)
            label_probs.append(Label(label, conf.item()))
        return label_probs
Example #18
def test_tagged_corpus_downsample():
    sentence = Sentence("I love Berlin.",
                        labels=[Label("class_1")],
                        use_tokenizer=segtok_tokenizer)

    corpus: Corpus = Corpus(
        [
            sentence,
            sentence,
            sentence,
            sentence,
            sentence,
            sentence,
            sentence,
            sentence,
            sentence,
            sentence,
        ],
        [],
        [],
    )

    assert 10 == len(corpus.train)

    corpus.downsample(percentage=0.3, only_downsample_train=True)

    assert 3 == len(corpus.train)
Example #19
    def forward_pass(
        self,
        sentences: Union[List[DataPoint], DataPoint],
        return_label_candidates: bool = False,
    ):

        self.embeddings.embed(sentences)

        names = self.embeddings.get_names()

        # get all tokens in this mini-batch
        all_tokens = [token for sentence in sentences for token in sentence]

        all_embeddings = [token.get_embedding(names) for token in all_tokens]

        embedding_tensor = torch.stack(all_embeddings)

        scores = self.linear(embedding_tensor)

        labels = [[token.get_tag(self.label_type).value]
                  for token in all_tokens]

        # minimal return is scores and labels
        return_tuple = (scores, labels)

        if return_label_candidates:
            empty_label_candidates = [
                Label(value=None, score=None) for token in all_tokens
            ]
            return_tuple += (all_tokens, empty_label_candidates)

        return return_tuple
Example #20
    def _standard_inference(self, features: torch.tensor, batch: list,
                            probabilities_for_all_classes: bool):
        """
        Softmax over emission scores from forward propagation.
        :param features: sentence tensor from forward propagation
        :param batch: list of sentence
        :param probabilities_for_all_classes: whether to return score for each tag in tag dictionary
        """
        softmax_batch = F.softmax(features, dim=1).cpu()
        scores_batch, prediction_batch = torch.max(softmax_batch, dim=1)
        predictions = []
        all_tags = []

        for sentence in batch:
            scores = scores_batch[:len(sentence)]
            predictions_for_sentence = prediction_batch[:len(sentence)]
            predictions.append([
                Label(self.tag_dictionary.get_item_for_index(prediction),
                      score.item()) for token, score, prediction in zip(
                          sentence, scores, predictions_for_sentence)
            ])
            scores_batch = scores_batch[len(sentence):]
            prediction_batch = prediction_batch[len(sentence):]

        if probabilities_for_all_classes:
            lengths = [len(sentence) for sentence in batch]
            all_tags = self._all_scores_for_token(softmax_batch, lengths)

        return predictions, all_tags
Example #21
    def forward_pass(
        self,
        datapairs: Union[List[DataPoint], DataPoint],
        return_label_candidates: bool = False,
    ):

        if isinstance(datapairs, DataPair):
            datapairs = [datapairs]

        embedding_names = self.document_embeddings.get_names()

        if self.embed_separately:  # embed both sentences separately, concatenate the resulting vectors
            first_elements = [pair.first for pair in datapairs]
            second_elements = [pair.second for pair in datapairs]

            self.document_embeddings.embed(first_elements)
            self.document_embeddings.embed(second_elements)

            text_embedding_list = [
                torch.cat([
                    a.get_embedding(embedding_names),
                    b.get_embedding(embedding_names)
                ], 0).unsqueeze(0)
                for (a, b) in zip(first_elements, second_elements)
            ]

        else:  # concatenate the sentences and embed together
            concatenated_sentences = [
                Sentence(pair.first.to_tokenized_string() + self.sep +
                         pair.second.to_tokenized_string(),
                         use_tokenizer=False) for pair in datapairs
            ]

            self.document_embeddings.embed(concatenated_sentences)

            text_embedding_list = [
                sentence.get_embedding(embedding_names).unsqueeze(0)
                for sentence in concatenated_sentences
            ]

        text_embedding_tensor = torch.cat(text_embedding_list,
                                          0).to(flair.device)

        # linear layer
        scores = self.decoder(text_embedding_tensor)

        labels = []
        for pair in datapairs:
            labels.append(
                [label.value for label in pair.get_labels(self.label_type)])

        # minimal return is scores and labels
        return_tuple = (scores, labels)

        if return_label_candidates:
            label_candidates = [Label(value=None) for pair in datapairs]
            return_tuple += (datapairs, label_candidates)

        return return_tuple
Example #22
def test_tagged_corpus_downsample():
    sentence = Sentence(u'I love Berlin.', labels=[
                        Label(u'class_1')], use_tokenizer=True)
    corpus = TaggedCorpus([sentence, sentence, sentence, sentence, sentence,
                           sentence, sentence, sentence, sentence, sentence], [], [])
    assert (10 == len(corpus.train))
    corpus.downsample(percentage=0.3, only_downsample_train=True)
    assert (3 == len(corpus.train))
Example #23
    def _get_multi_label(self, label_scores) -> List[Label]:
        labels = []
        sigmoid = torch.nn.Sigmoid()
        results = list(map(lambda x: sigmoid(x), label_scores))
        for idx, conf in enumerate(results):
            if conf > self.multi_label_threshold:
                label = self.label_dictionary.get_item_for_index(idx)
                labels.append(Label(label, conf.item()))
        return labels
Example #24
    def _get_multi_label(self, label_scores) -> List[Label]:
        labels = []

        for idx, conf in enumerate(label_scores):
            if conf > self.multi_label_threshold:
                label = self.label_dictionary.get_item_for_index(idx)
                labels.append(Label(label, conf.item()))

        return labels
Example #25
def _predict_sentence(models, sentence):
    with torch.no_grad():
        tags = []  # [List[List[Label]]]
        for i, model in enumerate(models):
            seq = deepcopy(sentence)
            model.predict(seq)
            batch_tag = [token.get_tag("ner").value for token in seq]
            tags.append(batch_tag)
        tags = [Label(vote(token)) for token in np.transpose(tags, (1, 0))]

    return tags
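
`vote` is not shown in this snippet; a plausible majority-vote helper, consistent with how it is called on one token's tags across models (hypothetical, not from the source project):

from collections import Counter

def vote(tag_candidates) -> str:
    # hypothetical helper: return the tag predicted by the most models
    return Counter(tag_candidates).most_common(1)[0][0]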
Example #26
def make_flair_sentences(X, y=None, tag_type=None):
    sentences = [Sentence(' '.join(tokens)) for tokens in X]

    if y is not None:
        assert tag_type, 'Tag type is required if tags (y) are defined'

        for sentence, tags in zip(sentences, y):
            for (token, tag) in zip(sentence.tokens, tags):
                token.add_tag_label(tag_type, Label(tag))

    return sentences
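
A hypothetical call, assuming X holds pre-tokenized sentences and y the matching per-token tag strings:

X = [["Berlin", "is", "sunny", "."]]
y = [["S-LOC", "O", "O", "O"]]
sentences = make_flair_sentences(X, y, tag_type="ner")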
Example #27
File: viterbi.py Project: symeneses/flair
    def _all_scores_for_token(self, scores: torch.tensor,
                              lengths: torch.tensor):
        """
        Returns all scores for each tag in tag dictionary.
        :param scores: Scores for current sentence.
        """
        scores = scores.numpy()
        prob_tags_per_sentence = []
        for scores_sentence, length in zip(scores, lengths.values):
            scores_sentence = scores_sentence[:length]
            prob_tags_per_sentence.append([[
                Label(self.tag_dictionary.get_item_for_index(score_id), score)
                for score_id, score in enumerate(score_dist)
            ] for score_dist in scores_sentence])
        return prob_tags_per_sentence
Example #28
    def _all_scores_for_token(self, scores: torch.Tensor, lengths: List[int]):
        """
        Returns all scores for each tag in tag dictionary.
        :param scores: Scores for current sentence.
        """
        scores = scores.numpy()
        prob_all_tags = [[
            Label(self.label_dictionary.get_item_for_index(score_id), score)
            for score_id, score in enumerate(score_dist)
        ] for score_dist in scores]

        prob_tags_per_sentence = []
        previous = 0
        for length in lengths:
            prob_tags_per_sentence.append(
                prob_all_tags[previous:previous + length])
            previous += length  # advance the offset past this sentence's tokens
        return prob_tags_per_sentence
Example #29
    def _obtain_labels(self, feature, lengths):
        tags = []
        for feats, length in zip(feature, lengths):
            if self.use_crf:
                confidences, tag_seq = self._viterbi_decode(feats[:length])
            else:
                tag_seq = []
                confidences = []
                for backscore in feats[:length]:
                    softmax = F.softmax(backscore, dim=0)
                    _, idx = torch.max(backscore, 0)
                    prediction = idx.item()
                    tag_seq.append(prediction)
                    confidences.append(softmax[prediction].item())
            tags.append([
                Label(self.tag_dictionary.get_item_for_index(tag), conf)
                for conf, tag in zip(confidences, tag_seq)
            ])
        return tags
Example #30
    def predict(self,
                sentences: Union[Sentence, List[Sentence]],
                mini_batch_size: int = 32,
                embedding_storage_mode='none') -> List[Sentence]:
        with torch.no_grad():
            if type(sentences) is Sentence:
                sentences = [sentences]
            filtered_sentences = self._filter_empty_sentences(sentences)
            store_embeddings(filtered_sentences, 'none')
            batches = [
                filtered_sentences[x:x + mini_batch_size]
                for x in range(0, len(filtered_sentences), mini_batch_size)
            ]
            for batch in batches:
                scores = self.forward(batch)
                for sentence, score in zip(batch, scores.tolist()):
                    sentence.labels = [Label(value=str(score[0]))]
                store_embeddings(batch, storage_mode=embedding_storage_mode)
            return sentences