def test_tagged_corpus_statistics():
    train_sentence = Sentence(
        "I love Berlin.", labels=[Label("class_1")], use_tokenizer=segtok_tokenizer
    )
    dev_sentence = Sentence(
        "The sun is shining.", labels=[Label("class_2")], use_tokenizer=segtok_tokenizer
    )
    test_sentence = Sentence(
        "Berlin is sunny.", labels=[Label("class_1")], use_tokenizer=segtok_tokenizer
    )

    class_to_count_dict = Corpus._get_class_to_count(
        [train_sentence, dev_sentence, test_sentence]
    )

    assert "class_1" in class_to_count_dict
    assert "class_2" in class_to_count_dict
    assert 2 == class_to_count_dict["class_1"]
    assert 1 == class_to_count_dict["class_2"]

    tokens_in_sentences = Corpus._get_tokens_per_sentence(
        [train_sentence, dev_sentence, test_sentence]
    )

    assert 3 == len(tokens_in_sentences)
    assert 4 == tokens_in_sentences[0]
    assert 5 == tokens_in_sentences[1]
    assert 4 == tokens_in_sentences[2]
def _obtain_labels(
    self, feature, sentences
) -> (List[List[Label]], List[List[List[Label]]]):
    """
    Returns a tuple of two lists:
     - The first list corresponds to the most likely `Label` per token in each sentence.
     - The second list contains a probability distribution over all `Labels` for each token
       in a sentence for all sentences.
    """
    lengths = [len(sentence.tokens) for sentence in sentences]

    tags = []
    all_tags = []
    for feats, length in zip(feature, lengths):
        if self.use_crf:
            confidences, tag_seq, scores = self._viterbi_decode(feats[:length])
        else:
            tag_seq = []
            confidences = []
            scores = []
            for backscore in feats[:length]:
                softmax = F.softmax(backscore, dim=0)
                _, idx = torch.max(backscore, 0)
                prediction = idx.item()
                tag_seq.append(prediction)
                confidences.append(softmax[prediction].item())
                scores.append(softmax.tolist())

        tags.append(
            [
                Label(self.tag_dictionary.get_item_for_index(tag), conf)
                for conf, tag in zip(confidences, tag_seq)
            ]
        )

        all_tags.append(
            [
                [
                    Label(self.tag_dictionary.get_item_for_index(score_id), score)
                    for score_id, score in enumerate(score_dist)
                ]
                for score_dist in scores
            ]
        )

    return tags, all_tags
def test_tagged_corpus_statistics():
    train_sentence = Sentence(
        'I love Berlin.', labels=[Label('class_1')], use_tokenizer=True
    )
    dev_sentence = Sentence(
        'The sun is shining.', labels=[Label('class_2')], use_tokenizer=True
    )
    test_sentence = Sentence(
        'Berlin is sunny.', labels=[Label('class_1')], use_tokenizer=True
    )

    class_to_count_dict = TaggedCorpus._get_class_to_count(
        [train_sentence, dev_sentence, test_sentence]
    )

    assert 'class_1' in class_to_count_dict
    assert 'class_2' in class_to_count_dict
    assert 2 == class_to_count_dict['class_1']
    assert 1 == class_to_count_dict['class_2']

    tokens_in_sentences = TaggedCorpus._get_tokens_per_sentence(
        [train_sentence, dev_sentence, test_sentence]
    )

    assert 3 == len(tokens_in_sentences)
    assert 4 == tokens_in_sentences[0]
    assert 5 == tokens_in_sentences[1]
    assert 4 == tokens_in_sentences[2]
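# A minimal, dependency-free sketch of what _get_class_to_count computes for
# the three sentences above: a histogram of label values. collections.Counter
# stands in for the corpus helper here; it is not the library's implementation.
from collections import Counter

label_values = ["class_1", "class_2", "class_1"]
class_to_count = Counter(label_values)
assert 2 == class_to_count["class_1"]
assert 1 == class_to_count["class_2"]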
def _obtain_labels(
    self,
    feature: torch.Tensor,
    sentences: List[Sentence],
    transitions: Optional[Parameter],
    get_all_tags: bool = False,
) -> (List[List[Label]], List[List[List[Label]]]):
    """
    Returns a tuple of two lists:
     - The first list corresponds to the most likely `Label` per token in each sentence.
     - The second list contains a probability distribution over all `Labels` for each token
       in a sentence for all sentences.
    """
    lengths: List[int] = [len(sentence.tokens) for sentence in sentences]

    tags = []
    all_tags = []
    feature = feature.detach().cpu()
    for feats, length in zip(feature, lengths):
        if self.use_crf:
            confidences, tag_seq, scores = self._viterbi_decode(
                feats[:length], all_scores=get_all_tags, transitions=transitions
            )
        else:
            tag_seq = []
            confidences = []
            scores = []
            for backscore in feats[:length]:
                softmax = F.softmax(backscore, dim=0)
                _, idx = torch.max(backscore, 0)
                prediction = idx.item()
                tag_seq.append(prediction)
                confidences.append(softmax[prediction].item())
                scores.append(softmax.tolist())

        tags.append(
            [
                Label(self.tag_dictionary.get_item_for_index(tag), conf)
                for conf, tag in zip(confidences, tag_seq)
            ]
        )

        if get_all_tags:
            all_tags.append(
                [
                    [
                        Label(self.tag_dictionary.get_item_for_index(score_id), score)
                        for score_id, score in enumerate(score_dist)
                    ]
                    for score_dist in scores
                ]
            )

    return tags, all_tags
def test_label_set_confidence():
    label = Label(data_point=None, value="class_1", score=3.2)

    assert 3.2 == label.score
    assert "class_1" == label.value

    label.score = 0.2

    assert 0.2 == label.score
def test_label_set_confidence():
    label = Label('class_1', 3.2)

    assert 1.0 == label.score
    assert 'class_1' == label.value

    label.score = 0.2

    assert 0.2 == label.score
def test_label_set_confidence():
    label = Label("class_1", 3.2)

    assert 1.0 == label.score
    assert "class_1" == label.value

    label.score = 0.2

    assert 0.2 == label.score
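# The test variants above encode a version-dependent detail: in the older API,
# Label's score setter clamps out-of-range confidences to 1.0, while the newer
# data_point-based constructor keeps the raw value. A standalone sketch of the
# clamping behavior those asserts imply (an illustration, not Flair's actual
# implementation):
class ClampedLabel:
    def __init__(self, value, score=1.0):
        self.value = value
        self.score = score  # goes through the property setter below

    @property
    def score(self):
        return self._score

    @score.setter
    def score(self, score):
        # fall back to full confidence when the score is not a valid probability
        self._score = score if 0.0 <= score <= 1.0 else 1.0

label = ClampedLabel("class_1", 3.2)
assert 1.0 == label.score
label.score = 0.2
assert 0.2 == label.score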
def test_tagged_corpus_make_label_dictionary():
    sentence_1 = Sentence('sentence 1', labels=[Label('class_1')])
    sentence_2 = Sentence('sentence 2', labels=[Label('class_2')])
    sentence_3 = Sentence('sentence 3', labels=[Label('class_1')])
    corpus = Corpus([sentence_1, sentence_2, sentence_3], [], [])

    label_dict = corpus.make_label_dictionary()

    assert 2 == len(label_dict)
    assert '<unk>' not in label_dict.get_items()
    assert 'class_1' in label_dict.get_items()
    assert 'class_2' in label_dict.get_items()
def _obtain_labels(
    self,
    feature: torch.Tensor,
    batch_sentences: List[Sentence],
    transitions: Optional[np.ndarray],
    get_all_tags: bool,
) -> (List[List[Label]], List[List[List[Label]]]):
    """
    Returns a tuple of two lists:
     - The first list corresponds to the most likely `Label` per token in each sentence.
     - The second list contains a probability distribution over all `Labels` for each token
       in a sentence for all sentences.
    """
    lengths: List[int] = [len(sentence.tokens) for sentence in batch_sentences]

    tags = []
    all_tags = []
    feature = feature.cpu()
    if self.use_crf:
        feature = feature.numpy()
    else:
        for index, length in enumerate(lengths):
            feature[index, length:] = 0
        softmax_batch = F.softmax(feature, dim=2).cpu()
        scores_batch, prediction_batch = torch.max(softmax_batch, dim=2)
        feature = zip(softmax_batch, scores_batch, prediction_batch)

    for feats, length in zip(feature, lengths):
        if self.use_crf:
            confidences, tag_seq, scores = self._viterbi_decode(
                feats=feats[:length],
                transitions=transitions,
                all_scores=get_all_tags,
            )
        else:
            softmax, score, prediction = feats
            confidences = score[:length].tolist()
            tag_seq = prediction[:length].tolist()
            scores = softmax[:length].tolist()

        tags.append(
            [
                Label(self.tag_dictionary.get_item_for_index(tag), conf)
                for conf, tag in zip(confidences, tag_seq)
            ]
        )

        if get_all_tags:
            all_tags.append(
                [
                    [
                        Label(self.tag_dictionary.get_item_for_index(score_id), score)
                        for score_id, score in enumerate(score_dist)
                    ]
                    for score_dist in scores
                ]
            )

    return tags, all_tags
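# Hedged usage sketch for the _obtain_labels variants above: attach the most
# likely Label per token back onto the batch. `tagger`, `features`, `sentences`
# and `transitions` are hypothetical stand-ins for objects the model's forward
# pass would produce.
tags, all_tags = tagger._obtain_labels(
    feature=features,
    batch_sentences=sentences,
    transitions=transitions,
    get_all_tags=False,
)
for sentence, sentence_tags in zip(sentences, tags):
    for token, label in zip(sentence.tokens, sentence_tags):
        token.add_tag_label("ner", label)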
def test_label_set_confidence():
    label = Label('class_1', 3.2)

    assert 0.0 == label.confidence
    assert 'class_1' == label.name

    label.confidence = 0.2

    assert 0.2 == label.confidence

    with pytest.raises(ValueError):
        label.name = ''
def test_tagged_corpus_make_label_dictionary():
    sentence_1 = Sentence("sentence 1", labels=[Label("class_1")])
    sentence_2 = Sentence("sentence 2", labels=[Label("class_2")])
    sentence_3 = Sentence("sentence 3", labels=[Label("class_1")])
    corpus: Corpus = Corpus([sentence_1, sentence_2, sentence_3], [], [])

    label_dict = corpus.make_label_dictionary()

    assert 2 == len(label_dict)
    assert "<unk>" not in label_dict.get_items()
    assert "class_1" in label_dict.get_items()
    assert "class_2" in label_dict.get_items()
def forward_pass(
    self,
    sentences: Union[List[Sentence], Sentence],
    return_label_candidates: bool = False,
):
    if not isinstance(sentences, list):
        sentences = [sentences]

    self.embeddings.embed(sentences)
    names = self.embeddings.get_names()

    # get all tokens in this mini-batch
    all_tokens = [token for sentence in sentences for token in sentence]
    all_embeddings = [token.get_embedding(names) for token in all_tokens]
    embedding_tensor = torch.stack(all_embeddings)

    scores = self.linear(embedding_tensor)
    labels = [[token.get_tag(self.label_type).value] for token in all_tokens]

    if return_label_candidates:
        empty_label_candidates = [Label(value=None, score=0.0) for token in all_tokens]
        return scores, labels, all_tokens, empty_label_candidates

    return scores, labels
def _obtain_labels(self, feature, lengths) -> List[List[List[Label]]]:
    tags = []
    for feats, length in zip(feature, lengths):
        if self.use_crf:
            loc_tag = []
            for i, left_right in enumerate(self.feature_sizes):
                left, right = left_right
                local_feats = feats[:, left:right]
                confidences, tag_seq = self._viterbi_decode(local_feats[:length], i)
                loc_tag.append(
                    [
                        Label(self.tag_dictionaries[i].get_item_for_index(tag), conf)
                        for conf, tag in zip(confidences, tag_seq)
                    ]
                )
            tags.append(loc_tag)
        else:
            tag_seq = []
            confidences = []
            for backscore in feats[:length]:
                softmax = F.softmax(backscore, dim=0)
                _, idx = torch.max(backscore, 0)
                prediction = idx.item()
                tag_seq.append(prediction)
                confidences.append(softmax[prediction].item())
            # restore the previously commented-out append so the non-CRF branch
            # also returns labels; assumes the first tag dictionary applies here
            tags.append(
                [
                    [
                        Label(self.tag_dictionaries[0].get_item_for_index(tag), conf)
                        for conf, tag in zip(confidences, tag_seq)
                    ]
                ]
            )
    return tags
def forward_pass(
    self,
    sentences: Union[List[DataPoint], DataPoint],
    return_label_candidates: bool = False,
):
    # embed sentences
    self.document_embeddings.embed(sentences)

    # make tensor for all embedded sentences in batch
    embedding_names = self.document_embeddings.get_names()
    text_embedding_list = [
        sentence.get_embedding(embedding_names).unsqueeze(0) for sentence in sentences
    ]
    text_embedding_tensor = torch.cat(text_embedding_list, 0).to(flair.device)

    # send through decoder to get logits
    scores = self.decoder(text_embedding_tensor)

    labels = []
    for sentence in sentences:
        labels.append([label.value for label in sentence.get_labels(self.label_type)])

    # minimal return is scores and labels
    return_tuple = (scores, labels)

    if return_label_candidates:
        label_candidates = [Label(value=None) for sentence in sentences]
        return_tuple += (sentences, label_candidates)

    return return_tuple
def _obtain_labels(self, feature, sentences) -> List[List[Label]]:
    sentences.sort(key=lambda x: len(x), reverse=True)
    lengths: List[int] = [len(sentence.tokens) for sentence in sentences]

    tags = []
    for feats, length in zip(feature, lengths):
        if self.use_crf:
            confidences, tag_seq = self._viterbi_decode(feats[:length])
        else:
            tag_seq = []
            confidences = []
            for backscore in feats[:length]:
                softmax = F.softmax(backscore, dim=0)
                _, idx = torch.max(backscore, 0)
                prediction = idx.item()
                tag_seq.append(prediction)
                confidences.append(softmax[prediction].item())

        tags.append(
            [
                Label(self.tag_dictionary.get_item_for_index(tag), conf)
                for conf, tag in zip(confidences, tag_seq)
            ]
        )

    return tags
def predict(
    self,
    sentences: Union[Sentence, List[Sentence]],
    mini_batch_size: int = 32,
    embedding_storage_mode="none",
) -> List[Sentence]:
    with torch.no_grad():
        if type(sentences) is Sentence:
            sentences = [sentences]

        filtered_sentences = self._filter_empty_sentences(sentences)

        # remove previous embeddings
        store_embeddings(filtered_sentences, "none")

        batches = [
            filtered_sentences[x : x + mini_batch_size]
            for x in range(0, len(filtered_sentences), mini_batch_size)
        ]

        for batch in batches:
            scores = self.forward(batch)

            for sentence, score in zip(batch, scores.tolist()):
                sentence.labels = [Label(value=str(score[0]))]

            # clearing token embeddings to save memory
            store_embeddings(batch, storage_mode=embedding_storage_mode)

    return sentences
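# Hedged usage sketch for the predict() method above; `regressor` is a
# hypothetical trained model exposing this method. Each sentence comes back
# carrying a single Label whose value is the stringified regression score.
sentence = Sentence("The movie was surprisingly good.")
regressor.predict([sentence], mini_batch_size=8)
print(sentence.labels)  # one Label, e.g. with value "0.87"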
def _predict_label_prob(self, label_scores) -> List[Label]:
    softmax = torch.nn.functional.softmax(label_scores, dim=0)
    label_probs = []
    for idx, conf in enumerate(softmax):
        label = self.label_dictionary.get_item_for_index(idx)
        label_probs.append(Label(label, conf.item()))
    return label_probs
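# Hedged usage sketch: turn one row of logits into a full Label distribution
# and pick the argmax; `classifier` and `label_scores` are hypothetical
# stand-ins for a trained model and its per-class scores.
label_probs = classifier._predict_label_prob(label_scores)
best = max(label_probs, key=lambda label: label.score)
print(best.value, best.score)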
def test_tagged_corpus_downsample():
    sentence = Sentence(
        "I love Berlin.", labels=[Label("class_1")], use_tokenizer=segtok_tokenizer
    )

    corpus: Corpus = Corpus(
        [
            sentence,
            sentence,
            sentence,
            sentence,
            sentence,
            sentence,
            sentence,
            sentence,
            sentence,
            sentence,
        ],
        [],
        [],
    )

    assert 10 == len(corpus.train)

    corpus.downsample(percentage=0.3, only_downsample_train=True)

    assert 3 == len(corpus.train)
def forward_pass(
    self,
    sentences: Union[List[DataPoint], DataPoint],
    return_label_candidates: bool = False,
):
    self.embeddings.embed(sentences)
    names = self.embeddings.get_names()

    # get all tokens in this mini-batch
    all_tokens = [token for sentence in sentences for token in sentence]
    all_embeddings = [token.get_embedding(names) for token in all_tokens]
    embedding_tensor = torch.stack(all_embeddings)

    scores = self.linear(embedding_tensor)
    labels = [[token.get_tag(self.label_type).value] for token in all_tokens]

    # minimal return is scores and labels
    return_tuple = (scores, labels)

    if return_label_candidates:
        empty_label_candidates = [Label(value=None, score=None) for token in all_tokens]
        return_tuple += (all_tokens, empty_label_candidates)

    return return_tuple
def _standard_inference(
    self, features: torch.Tensor, batch: list, probabilities_for_all_classes: bool
):
    """
    Softmax over emission scores from forward propagation.

    :param features: sentence tensor from forward propagation
    :param batch: list of sentences
    :param probabilities_for_all_classes: whether to return the score for each tag in the tag dictionary
    """
    softmax_batch = F.softmax(features, dim=1).cpu()
    scores_batch, prediction_batch = torch.max(softmax_batch, dim=1)
    predictions = []
    all_tags = []

    for sentence in batch:
        scores = scores_batch[: len(sentence)]
        predictions_for_sentence = prediction_batch[: len(sentence)]
        predictions.append(
            [
                Label(self.tag_dictionary.get_item_for_index(prediction), score.item())
                for token, score, prediction in zip(
                    sentence, scores, predictions_for_sentence
                )
            ]
        )
        scores_batch = scores_batch[len(sentence) :]
        prediction_batch = prediction_batch[len(sentence) :]

    if probabilities_for_all_classes:
        lengths = [len(sentence) for sentence in batch]
        all_tags = self._all_scores_for_token(softmax_batch, lengths)

    return predictions, all_tags
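# A self-contained illustration of the softmax/argmax pattern used by
# _standard_inference above, on a made-up (num_tokens x num_tags) emission
# matrix; only torch is assumed.
import torch
import torch.nn.functional as F

features = torch.tensor([[2.0, 0.5, 0.1], [0.2, 1.5, 0.3]])  # 2 tokens, 3 tags
softmax_batch = F.softmax(features, dim=1)
scores, predictions = torch.max(softmax_batch, dim=1)
# `predictions` holds tag indices, `scores` their softmax confidences
assert predictions.tolist() == [0, 1]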
def forward_pass(
    self,
    datapairs: Union[List[DataPoint], DataPoint],
    return_label_candidates: bool = False,
):
    if isinstance(datapairs, DataPair):
        datapairs = [datapairs]

    embedding_names = self.document_embeddings.get_names()

    if self.embed_separately:
        # embed both sentences separately and concatenate the resulting vectors
        first_elements = [pair.first for pair in datapairs]
        second_elements = [pair.second for pair in datapairs]

        self.document_embeddings.embed(first_elements)
        self.document_embeddings.embed(second_elements)

        text_embedding_list = [
            torch.cat(
                [a.get_embedding(embedding_names), b.get_embedding(embedding_names)], 0
            ).unsqueeze(0)
            for (a, b) in zip(first_elements, second_elements)
        ]
    else:
        # concatenate the sentences and embed together
        concatenated_sentences = [
            Sentence(
                pair.first.to_tokenized_string()
                + self.sep
                + pair.second.to_tokenized_string(),
                use_tokenizer=False,
            )
            for pair in datapairs
        ]
        self.document_embeddings.embed(concatenated_sentences)

        text_embedding_list = [
            sentence.get_embedding(embedding_names).unsqueeze(0)
            for sentence in concatenated_sentences
        ]

    text_embedding_tensor = torch.cat(text_embedding_list, 0).to(flair.device)

    # linear layer
    scores = self.decoder(text_embedding_tensor)

    labels = []
    for pair in datapairs:
        labels.append([label.value for label in pair.get_labels(self.label_type)])

    # minimal return is scores and labels
    return_tuple = (scores, labels)

    if return_label_candidates:
        label_candidates = [Label(value=None) for pair in datapairs]
        return_tuple += (datapairs, label_candidates)

    return return_tuple
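# Hedged usage sketch for the pair classifier's forward_pass above; assumes
# Flair's DataPair wrapper with .first/.second attributes, as the method itself
# does. `pair_classifier` is a hypothetical trained instance.
pair = DataPair(Sentence("A man is eating."), Sentence("Someone is having a meal."))
scores, labels = pair_classifier.forward_pass([pair])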
def test_tagged_corpus_downsample():
    sentence = Sentence(
        'I love Berlin.', labels=[Label('class_1')], use_tokenizer=True
    )

    corpus = TaggedCorpus(
        [
            sentence,
            sentence,
            sentence,
            sentence,
            sentence,
            sentence,
            sentence,
            sentence,
            sentence,
            sentence,
        ],
        [],
        [],
    )

    assert 10 == len(corpus.train)

    corpus.downsample(percentage=0.3, only_downsample_train=True)

    assert 3 == len(corpus.train)
def _get_multi_label(self, label_scores) -> List[Label]:
    labels = []

    sigmoid = torch.nn.Sigmoid()
    results = [sigmoid(x) for x in label_scores]
    for idx, conf in enumerate(results):
        if conf > self.multi_label_threshold:
            label = self.label_dictionary.get_item_for_index(idx)
            labels.append(Label(label, conf.item()))

    return labels
def _get_multi_label(self, label_scores) -> List[Label]:
    labels = []
    for idx, conf in enumerate(label_scores):
        if conf > self.multi_label_threshold:
            label = self.label_dictionary.get_item_for_index(idx)
            labels.append(Label(label, conf.item()))
    return labels
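# Self-contained sketch of the sigmoid-plus-threshold selection the two
# _get_multi_label variants above implement: every tag whose independent
# probability clears the threshold is emitted, so one input can carry several
# labels at once. Only torch is assumed; the numbers are made up.
import torch

label_scores = torch.tensor([2.0, -1.0, 0.3])
threshold = 0.5
probabilities = torch.sigmoid(label_scores)
selected = [idx for idx, conf in enumerate(probabilities) if conf > threshold]
assert selected == [0, 2]  # sigmoid(2.0) and sigmoid(0.3) exceed 0.5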
def _predict_sentence(models, sentence):
    with torch.no_grad():
        tags = []  # List[List[str]]: one tag sequence per model
        for model in models:
            seq = deepcopy(sentence)
            model.predict(seq)
            batch_tag = [token.get_tag("ner").value for token in seq]
            tags.append(batch_tag)
        # transpose to per-token lists and majority-vote over the ensemble
        tags = [Label(vote(token)) for token in np.transpose(tags, (1, 0))]
        return tags
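# `vote` is referenced above but not defined in this snippet. A plausible
# majority-vote helper (an assumption, not the original implementation):
from collections import Counter

def vote(predictions):
    """Return the most common tag among the ensemble's predictions for one token."""
    return Counter(predictions).most_common(1)[0][0]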
def make_flair_sentences(X, y=None, tag_type=None):
    sentences = [Sentence(' '.join(tokens)) for tokens in X]
    if y is not None:
        assert tag_type, 'Tag type is required if tags (y) are defined'
        for sentence, tags in zip(sentences, y):
            for token, tag in zip(sentence.tokens, tags):
                token.add_tag_label(tag_type, Label(tag))
    return sentences
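# Hedged usage sketch for make_flair_sentences above: token lists in, tagged
# Flair Sentence objects out. The tag values are illustrative BIO labels.
X = [["John", "lives", "in", "Berlin"]]
y = [["B-PER", "O", "O", "B-LOC"]]
sentences = make_flair_sentences(X, y, tag_type="ner")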
def _all_scores_for_token(self, scores: torch.Tensor, lengths):
    """
    Returns all scores for each tag in tag dictionary.

    :param scores: scores for the current sentence batch
    :param lengths: lengths of the sentences in the batch (exposes `.values`)
    """
    scores = scores.numpy()
    prob_tags_per_sentence = []
    for scores_sentence, length in zip(scores, lengths.values):
        scores_sentence = scores_sentence[:length]
        prob_tags_per_sentence.append(
            [
                [
                    Label(self.tag_dictionary.get_item_for_index(score_id), score)
                    for score_id, score in enumerate(score_dist)
                ]
                for score_dist in scores_sentence
            ]
        )
    return prob_tags_per_sentence
def _all_scores_for_token(self, scores: torch.Tensor, lengths: List[int]):
    """
    Returns all scores for each tag in tag dictionary.

    :param scores: scores for the current sentence batch
    :param lengths: lengths of the sentences in the batch
    """
    scores = scores.numpy()
    prob_all_tags = [
        [
            Label(self.label_dictionary.get_item_for_index(score_id), score)
            for score_id, score in enumerate(score_dist)
        ]
        for score_dist in scores
    ]

    prob_tags_per_sentence = []
    previous = 0
    for length in lengths:
        prob_tags_per_sentence.append(prob_all_tags[previous : previous + length])
        previous += length  # accumulate the offset; `previous = length` mis-slices when lengths differ
    return prob_tags_per_sentence
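# The offset fix above (`previous += length` rather than `previous = length`)
# matters whenever sentence lengths differ. A dependency-free demonstration of
# the corrected slicing, splitting a flat per-token list back into sentences:
flat = ["t0", "t1", "t2", "t3", "t4", "t5"]
lengths = [2, 3, 1]
per_sentence, previous = [], 0
for length in lengths:
    per_sentence.append(flat[previous:previous + length])
    previous += length
assert per_sentence == [["t0", "t1"], ["t2", "t3", "t4"], ["t5"]]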
def _obtain_labels(self, feature, lengths):
    tags = []
    for feats, length in zip(feature, lengths):
        if self.use_crf:
            confidences, tag_seq = self._viterbi_decode(feats[:length])
        else:
            tag_seq = []
            confidences = []
            for backscore in feats[:length]:
                softmax = F.softmax(backscore, dim=0)
                _, idx = torch.max(backscore, 0)
                prediction = idx.item()
                tag_seq.append(prediction)
                confidences.append(softmax[prediction].item())
        tags.append(
            [
                Label(self.tag_dictionary.get_item_for_index(tag), conf)
                for conf, tag in zip(confidences, tag_seq)
            ]
        )
    return tags
def predict(
    self,
    sentences: Union[Sentence, List[Sentence]],
    mini_batch_size: int = 32,
    embedding_storage_mode='none',
) -> List[Sentence]:
    with torch.no_grad():
        if type(sentences) is Sentence:
            sentences = [sentences]

        filtered_sentences = self._filter_empty_sentences(sentences)

        # remove previous embeddings
        store_embeddings(filtered_sentences, 'none')

        batches = [
            filtered_sentences[x:x + mini_batch_size]
            for x in range(0, len(filtered_sentences), mini_batch_size)
        ]

        for batch in batches:
            scores = self.forward(batch)

            for sentence, score in zip(batch, scores.tolist()):
                sentence.labels = [Label(value=str(score[0]))]

            # clear token embeddings to save memory
            store_embeddings(batch, storage_mode=embedding_storage_mode)

    return sentences