Example No. 1
# imports this example needs (create_sentences / predict_few_shot are project
# helpers, sketched after the example)
import os

import pandas as pd
from flair.data import Corpus
from flair.datasets import SentenceDataset
from flair.models.text_classification_model import TARSClassifier
from flair.trainers import ModelTrainer
def run_zero_shot(train_tweets, train_y, val_tweets, val_y):
    """
    Fine-tunes the pre-trained zero-shot (TARS) classification model on the tweet data
    and writes validation predictions to CSV

    @param train_tweets: the tweets that will be used for training
    @param train_y: the training labels
    @param val_tweets: the tweets that will be used for validation
    @param val_y: the validation labels
    @return: None
    """
    # 1. Load our pre-trained TARS model for English
    print("Zero shot")
    # download https://nlp.informatik.hu-berlin.de/resources/models/tars-base/tars-base.pt
    tars = TARSClassifier.load(
        os.path.join(os.path.dirname(__file__), "..", "..", "saved_models",
                     "tars-base.pt"))

    train_tweets["output"] = train_y.iloc[:]
    train = train_tweets.apply(create_sentences, axis=1).tolist()
    train = SentenceDataset(train)

    val_tweets["output"] = val_y.iloc[:]
    val = val_tweets.apply(create_sentences, axis=1).tolist()
    val = SentenceDataset(val)

    corpus = Corpus(train=train, test=val)

    tars.add_and_switch_to_new_task(
        "POSITIVE_NEGATIVE", label_dictionary=corpus.make_label_dictionary())

    trainer = ModelTrainer(tars, corpus)

    # 4. train model
    trainer.train(
        base_path='../../data/zero_shot',  # path to store the model artifacts
        learning_rate=0.02,  # use very small learning rate
        mini_batch_size=16,  # small mini-batch size since corpus is tiny
        max_epochs=10,  # terminate after 10 epochs
    )

    print("DONE TRAINING")
    tars = TARSClassifier.load('../../data/zero_shot/final-model.pt')  # the trainer writes final-model.pt under base_path

    val_tweets["pred"] = val_tweets.apply(predict_few_shot,
                                          args=(tars, ),
                                          axis=1)
    val_tweets["pred"] = val_tweets["pred"].apply(lambda x: 1
                                                  if x == "positive" else -1)

    pred = pd.DataFrame(list(val_tweets["pred"]), columns=['Prediction'])
    pred.index += 1
    pred.insert(0, 'Id', pred.index)

    pred.to_csv("../../predictions/zero_shot_pred.csv", index=False)
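Example 1 calls two helpers, create_sentences and predict_few_shot, that are not shown in the listing. A minimal hypothetical sketch, assuming the tweets live in a "tweet" column and labels map to "positive"/"negative":

# Hypothetical sketch of the helpers Example 1 assumes; the column name
# ("tweet") and the label scheme are guesses, not the original implementation.
from flair.data import Sentence

def create_sentences(row):
    # wrap one tweet in a flair Sentence and attach its gold label
    sentence = Sentence(row["tweet"])
    sentence.add_label("POSITIVE_NEGATIVE",
                       "positive" if row["output"] == 1 else "negative")
    return sentence

def predict_few_shot(row, tars):
    # classify one tweet and return the predicted label value
    sentence = Sentence(row["tweet"])
    tars.predict(sentence)
    labels = sentence.get_labels()
    return labels[0].value if labels else "negative"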
Example No. 2
    def to_corpus(self, cache=False) -> Corpus:
        data_file = (TRANSFORMED_ROOT / self.filename).with_suffix(".pickle.xz")
        if data_file.exists():
            with lzma.open(data_file) as fd:
                return pickle.load(fd)

        dataset = Tox21().to_df()

        def plain_tokenizer(text: str) -> Iterable[Token]:
            res = []
            for tok in text.split():
                res.append(Token(tok))
            return res

        def iterate_dataframe(dataset: pd.DataFrame) -> Iterable[Sentence]:
            for _, row in dataset.iterrows():
                res = encoder(row.smiles)
                if not res:
                    continue
                res = res.replace("]", "] ").replace(".", "DOT ")
                sent = Sentence(res.strip(), use_tokenizer=plain_tokenizer)
                for col, val in row.items():
                    if isinstance(val, float):
                        if val == 1.0:
                            sent.add_label(None, col.replace(" ", "_") + "_P ")
                        if val == 0.0:
                            sent.add_label(None, col.replace(" ", "_") + "_N ")
                yield sent

        train = dataset.sample(frac=0.7, random_state=18)
        dataset = dataset.drop(train.index)
        dev = dataset.sample(frac=0.333334, random_state=18)
        test = dataset.drop(dev.index)

        train = SentenceDataset(list(iterate_dataframe(train)))
        dev = SentenceDataset(list(iterate_dataframe(dev)))
        test = SentenceDataset(list(iterate_dataframe(test)))

        corpus = Corpus(train, dev, test, "Molecules")

        if cache:
            TRANSFORMED_ROOT.mkdir(parents=True, exist_ok=True)
            with lzma.open(data_file, "wb") as fd:
                pickle.dump(corpus, fd)

        return corpus
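A hypothetical usage of to_corpus; the enclosing dataset class name is an assumption:

# Hypothetical: the enclosing class name is a guess.
corpus = MoleculeDataset().to_corpus(cache=True)  # a second run loads the .pickle.xz cache
print(corpus)  # Corpus prints its train/dev/test split sizes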
Example No. 3
    def _predict(self, sentences, tagger):
        tokenizer = SegtokTokenizer()
        dataset = SentenceDataset(
            [Sentence(text, tokenizer) for text in sentences])
        tagger.predict(dataset,
                       mini_batch_size=self.mini_batch_size,
                       embedding_storage_mode=self.embedding_storage_mode,
                       verbose=self.verbose)
        return list(dataset)
def spelling_aug(corpus):
    aug = naw.SpellingAug()
    augmented_sentences = []

    # go through all train sentences and collect the augmented variants
    for sentence in corpus.train:
        augmented_texts = aug.augment(sentence.to_plain_string(), n=3)
        for augmented_text in augmented_texts:
            augmented_sentences.append(Sentence(augmented_text))

    # make a new corpus with the augmented sentences
    corpus = Corpus(train=SentenceDataset(augmented_sentences),
                    dev=corpus.dev,
                    test=corpus.test)
    return corpus
Example No. 5
    def train(self):
        from flair.data import Corpus
        from flair.datasets import SentenceDataset
        from flair.data import Sentence

        self.classes = utils.read_class_titles(settings.CAT_DEPTH)
        self.classes['NOCAT'] = 'NOCAT'

        train = SentenceDataset([
            Sentence(row['titlen']).add_label('law_topic',
                                              self.classes[row['cat1']])
            for i, row in self.df_train.iterrows()
        ])

        # make a corpus with train and test split
        self.corpus = Corpus(train=train, dev=train)

        # 1. load base TARS
        tars = self._load_pretained_model()

        # 2. make the model aware of the desired set of labels from the new corpus
        tars.add_and_switch_to_new_task(
            "LAW_TOPIC", label_dictionary=self.corpus.make_label_dictionary())

        # 3. initialize the text classifier trainer with your corpus
        from flair.trainers import ModelTrainer
        trainer = ModelTrainer(tars, self.corpus)

        # 4. train model
        path = settings.WORKING_DIR
        trainer.train(
            base_path=path,  # path to store the model artifacts
            learning_rate=5e-2,  # 5 epochs with 0.2 was bad; 5 epochs with 0.1 looked ok
            mini_batch_size=settings.MINIBATCH,
            # mini_batch_chunk_size=4,  # optionally set this if the transformer is too much for your machine
            max_epochs=settings.EPOCHS,
            train_with_dev=False,
            save_final_model=False,
            param_selection_mode=True,  # True to avoid model saves
            shuffle=False,  # shuffling already done upstream
        )

        # from flair.models.text_classification_model import TARSClassifier
        # self.model = TARSClassifier.load(
        #     os.path.join(path, 'best-model.pt')
        # )

        self.model = tars
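Example 5 leans on a project-specific settings module and utils helpers that are not shown. A hypothetical minimal settings stub (all values are guesses) that makes the training call runnable:

# Hypothetical stand-in for the project's settings module; values are guesses.
class settings:
    CAT_DEPTH = 1            # category depth passed to utils.read_class_titles
    WORKING_DIR = "./work"   # base_path for trainer artifacts
    MINIBATCH = 16           # mini-batch size
    EPOCHS = 10              # number of training epochs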
Example No. 6
    def to_corpus(self) -> Corpus:
        dataset = Tox21().to_df()

        def plain_tokenizer(text: str) -> Iterable[Token]:
            res = []
            for tok in text.split():
                res.append(Token(tok))
            return res

        def iterate_dataframe(dataset: pd.DataFrame) -> Iterable[Sentence]:
            for _, row in dataset.iterrows():
                res = encoder(row.smiles)
                if not res:
                    continue
                res = res.replace("]", "] ").replace(".", "DOT ")
                sent = Sentence(res.strip(), use_tokenizer=plain_tokenizer)
                for col, val in row.items():
                    if isinstance(val, float):
                        if val == 1.0:
                            sent.add_label(None, col.replace(" ", "_") + "_P ")
                        if val == 0.0:
                            sent.add_label(None, col.replace(" ", "_") + "_N ")
                yield sent

        train = dataset.sample(frac=0.7, random_state=18)
        dataset = dataset.drop(train.index)
        dev = dataset.sample(frac=0.333334, random_state=18)
        test = dataset.drop(dev.index)

        train = SentenceDataset(list(iterate_dataframe(train)))
        dev = SentenceDataset(list(iterate_dataframe(dev)))
        test = SentenceDataset(list(iterate_dataframe(test)))

        corpus = Corpus(train, dev, test, "Molecules")

        return corpus
def capitalization_aug(corpus):
    augmented_sentences = []

    # go through all train sentences
    for sentence in corpus.train:
        augmented_sentence: Sentence = Sentence()
        for token in sentence:
            token.text = token.text.lower()
            augmented_sentence.add_token(token)

        # append to augmented sentences
        if len(augmented_sentence) > 0:
            augmented_sentences.append(augmented_sentence)

    # make a new corpus with the augmented sentences
    corpus = Corpus(train=SentenceDataset(augmented_sentences),
                    dev=corpus.dev,
                    test=corpus.test)
    return corpus
def punctuation_aug(corpus):
    augmented_sentences = []

    # go through all train sentences
    for sentence in corpus.train:
        punc = '''!()-[]{};:'"\, <>./?@#$%^&*_~'''
        augmented_sentence: Sentence = Sentence()
        for token in sentence:
            if token.text not in punc:
                augmented_sentence.add_token(token)

        # append to augmented sentences
        if len(augmented_sentence) > 0:
            augmented_sentences.append(augmented_sentence)

    # make a new corpus with the augmented sentences
    corpus = Corpus(train=SentenceDataset(augmented_sentences),
                    dev=corpus.dev,
                    test=corpus.test)
    return corpus
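The three augmentation helpers above share the same corpus-in, corpus-out shape, so they compose. A hypothetical pipeline, assuming flair's TREC_6 corpus as the starting point and the imports the helpers rely on:

# Hypothetical pipeline; TREC_6 is just an example corpus choice.
import nlpaug.augmenter.word as naw
from flair.data import Corpus, Sentence
from flair.datasets import TREC_6, SentenceDataset

corpus = TREC_6()
corpus = spelling_aug(corpus)        # inject spelling errors
corpus = capitalization_aug(corpus)  # lowercase every token
corpus = punctuation_aug(corpus)     # drop punctuation tokens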
Example No. 9
    def predict(self,
                sentences: Union[List[Sentence], Sentence],
                mini_batch_size: int = 32,
                num_workers: int = 8,
                print_tree: bool = False,
                embedding_storage_mode="none",
                ) -> None:
        """
        Predict arcs and tags for Dependency Parser task
        :param sentences: a Sentence or a List of Sentence
        :param mini_batch_size: mini batch size to use
        :param print_tree: set to True to print dependency parser of sentence as tree shape
        :param embedding_storage_mode: default is 'none' which is always best. Only set to 'cpu' or 'gpu' if
        you wish to not only predict, but also keep the generated embeddings in CPU or GPU memory respectively.
        'gpu' to store embeddings in GPU memory.
        """

        sentences = SentenceDataset(sentences)
        data_loader = DataLoader(sentences,
                                 batch_size=mini_batch_size,
                                 num_workers=num_workers)

        for batch in data_loader:
            with torch.no_grad():
                score_arc, score_rel = self.forward(batch)
                arc_prediction, relation_prediction = self._obtain_labels_(score_arc, score_rel)

            for sentence_index, (sentence, sent_tags, sent_arcs) in enumerate(zip(batch, relation_prediction, arc_prediction)):
                for token_index, (token, tag, head_id) in enumerate(zip(sentence.tokens, sent_tags, sent_arcs)):
                    token.add_tag(self.tag_type,
                                  tag,
                                  score_rel[sentence_index][token_index])
                    
                    token.head_id = int(head_id)

                if print_tree:
                    tree_printer(sentence, self.tag_type)
                    print("-" * 50)
            store_embeddings(batch, storage_mode=embedding_storage_mode)
def ocr_aug(corpus):
    aug = nac.OcrAug(tokenizer=whitespace_tokenizer)
    # go through all train sentences
    augmented_sentences = []
    for sentence in corpus.train:
        augmented_texts = aug.augment(sentence.to_tokenized_string(), n=3)
        for augmented_text in augmented_texts:
            augmented_sentence: Sentence = Sentence()
            augmented_token_texts = augmented_text.split(" ")
            for augmented_token_text, original_token in zip(augmented_token_texts, sentence):
                # make a new token
                augmented_token = Token(augmented_token_text)
                # transfer annotations over to augmented token
                augmented_token.annotation_layers = original_token.annotation_layers
                # add augmented token to augmented sentence
                augmented_sentence.add_token(augmented_token)
            # add augmented sentence to list of all augmented sentences
            augmented_sentences.append(augmented_sentence)

    corpus = Corpus(train=SentenceDataset(augmented_sentences),
                    dev=corpus.dev,
                    test=corpus.test)

    return corpus
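ocr_aug references a whitespace_tokenizer that is not shown; a plausible minimal definition for nlpaug's tokenizer hook (an assumption, not the original):

# Hypothetical definition; any callable str -> List[str] works as an nlpaug tokenizer.
def whitespace_tokenizer(text: str):
    # keep tokens aligned with flair's to_tokenized_string() output
    return text.split(" ")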
Example No. 11
    def evaluate(self,
                 sentences: Union[List[DataPoint], Dataset],
                 out_path: Union[str, Path] = None,
                 embedding_storage_mode: str = "none",
                 mini_batch_size: int = 32,
                 num_workers: int = 8,
                 main_score_type: Tuple[str, str] = ("micro avg", 'f1-score'),
                 return_predictions: bool = False) -> (Result, float):

        # read Dataset into data loader (if list of sentences passed, make Dataset first)
        if not isinstance(sentences, Dataset):
            sentences = SentenceDataset(sentences)
        data_loader = DataLoader(sentences,
                                 batch_size=mini_batch_size,
                                 num_workers=num_workers)

        # use scikit-learn to evaluate
        y_true = []
        y_pred = []

        with torch.no_grad():
            eval_loss = 0

            lines: List[str] = []
            batch_count: int = 0

            for batch in data_loader:
                batch_count += 1

                # remove previously predicted labels
                [sentence.remove_labels('predicted') for sentence in batch]

                # get the gold labels
                true_values_for_batch = [
                    sentence.get_labels(self.label_type) for sentence in batch
                ]

                # predict for batch
                loss = self.predict(
                    batch,
                    embedding_storage_mode=embedding_storage_mode,
                    mini_batch_size=mini_batch_size,
                    label_name='predicted',
                    return_loss=True)

                eval_loss += loss

                sentences_for_batch = [
                    sent.to_plain_string() for sent in batch
                ]

                # get the predicted labels
                predictions = [
                    sentence.get_labels('predicted') for sentence in batch
                ]

                for sentence, prediction, true_value in zip(
                        sentences_for_batch,
                        predictions,
                        true_values_for_batch,
                ):
                    eval_line = "{}\t{}\t{}\n".format(sentence, true_value,
                                                      prediction)
                    lines.append(eval_line)

                for predictions_for_sentence, true_values_for_sentence in zip(
                        predictions, true_values_for_batch):

                    true_values_for_sentence = [
                        label.value for label in true_values_for_sentence
                    ]
                    predictions_for_sentence = [
                        label.value for label in predictions_for_sentence
                    ]

                    y_true_instance = np.zeros(len(self.label_dictionary),
                                               dtype=int)
                    for i in range(len(self.label_dictionary)):
                        if self.label_dictionary.get_item_for_index(
                                i) in true_values_for_sentence:
                            y_true_instance[i] = 1
                    y_true.append(y_true_instance.tolist())

                    y_pred_instance = np.zeros(len(self.label_dictionary),
                                               dtype=int)
                    for i in range(len(self.label_dictionary)):
                        if self.label_dictionary.get_item_for_index(
                                i) in predictions_for_sentence:
                            y_pred_instance[i] = 1
                    y_pred.append(y_pred_instance.tolist())

                store_embeddings(batch, embedding_storage_mode)

            # If return_predictions is False, remove the predicted labels again.
            # Caveat: predictions are only retained in the sentences if the
            # ClassificationCorpus was created with memory_mode="full". With
            # memory_mode="partial", predicted labels are never stored in the
            # sentences, so this removal is a no-op and predictions are not
            # accessible outside this eval routine either way. TODO: fix this

            if not return_predictions:
                for sentence in sentences:
                    sentence.annotation_layers['predicted'] = []

            if out_path is not None:
                with open(out_path, "w", encoding="utf-8") as outfile:
                    outfile.write("".join(lines))

            # make "classification report"
            target_names = []
            for i in range(len(self.label_dictionary)):
                target_names.append(
                    self.label_dictionary.get_item_for_index(i))
            classification_report = metrics.classification_report(
                y_true,
                y_pred,
                digits=4,
                target_names=target_names,
                zero_division=0)
            classification_report_dict = metrics.classification_report(
                y_true,
                y_pred,
                digits=4,
                target_names=target_names,
                zero_division=0,
                output_dict=True)

            # get scores
            micro_f_score = round(
                metrics.fbeta_score(y_true,
                                    y_pred,
                                    beta=self.beta,
                                    average='micro',
                                    zero_division=0), 4)
            accuracy_score = round(metrics.accuracy_score(y_true, y_pred), 4)
            macro_f_score = round(
                metrics.fbeta_score(y_true,
                                    y_pred,
                                    beta=self.beta,
                                    average='macro',
                                    zero_division=0), 4)
            precision_score = round(
                metrics.precision_score(y_true,
                                        y_pred,
                                        average='macro',
                                        zero_division=0), 4)
            recall_score = round(
                metrics.recall_score(y_true,
                                     y_pred,
                                     average='macro',
                                     zero_division=0), 4)

            detailed_result = ("\nResults:"
                               f"\n- F-score (micro) {micro_f_score}"
                               f"\n- F-score (macro) {macro_f_score}"
                               f"\n- Accuracy {accuracy_score}"
                               '\n\nBy class:\n' + classification_report)

            # line for log file
            if not self.multi_label:
                log_header = "ACCURACY"
                log_line = f"\t{accuracy_score}"
            else:
                log_header = "PRECISION\tRECALL\tF1\tACCURACY"
                log_line = f"{precision_score}\t" \
                           f"{recall_score}\t" \
                           f"{macro_f_score}\t" \
                           f"{accuracy_score}"

            result = Result(main_score=classification_report_dict[
                main_score_type[0]][main_score_type[1]],
                            log_line=log_line,
                            log_header=log_header,
                            detailed_results=detailed_result,
                            classification_report=classification_report_dict)

            eval_loss /= batch_count

            return result, eval_loss
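A hypothetical call of the evaluate method above; classifier and corpus are stand-in names for a trained model and its data:

# Hypothetical usage; 'classifier' and 'corpus' are assumptions.
result, eval_loss = classifier.evaluate(corpus.test,
                                        out_path="predictions.txt",
                                        mini_batch_size=32)
print(result.detailed_results)
print("eval loss:", float(eval_loss))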
Example No. 12
    def evaluate(
        self,
        sentences: Union[List[DataPoint], Dataset],
        out_path: Union[str, Path] = None,
        embedding_storage_mode: str = "none",
        mini_batch_size: int = 32,
        num_workers: int = 8,
    ) -> (Result, float):

        # read Dataset into data loader (if list of sentences passed, make Dataset first)
        if not isinstance(sentences, Dataset):
            sentences = SentenceDataset(sentences)
        data_loader = DataLoader(sentences,
                                 batch_size=mini_batch_size,
                                 num_workers=num_workers)

        # use scikit-learn to evaluate
        y_true = []
        y_pred = []

        with torch.no_grad():
            eval_loss = 0

            lines: List[str] = []
            batch_count: int = 0
            for batch in data_loader:

                batch_count += 1

                # remove previously predicted labels
                [sentence.remove_labels('predicted') for sentence in batch]

                # get the gold labels
                true_values_for_batch = [
                    sentence.get_labels(self.label_type) for sentence in batch
                ]

                # predict for batch
                loss = self.predict(
                    batch,
                    embedding_storage_mode=embedding_storage_mode,
                    mini_batch_size=mini_batch_size,
                    label_name='predicted',
                    return_loss=True)

                eval_loss += loss

                sentences_for_batch = [
                    sent.to_plain_string() for sent in batch
                ]

                # get the predicted labels
                predictions = [
                    sentence.get_labels('predicted') for sentence in batch
                ]

                for sentence, prediction, true_value in zip(
                        sentences_for_batch,
                        predictions,
                        true_values_for_batch,
                ):
                    eval_line = "{}\t{}\t{}\n".format(sentence, true_value,
                                                      prediction)
                    lines.append(eval_line)

                for predictions_for_sentence, true_values_for_sentence in zip(
                        predictions, true_values_for_batch):

                    true_values_for_sentence = [
                        label.value for label in true_values_for_sentence
                    ]
                    predictions_for_sentence = [
                        label.value for label in predictions_for_sentence
                    ]

                    y_true_instance = np.zeros(len(self.label_dictionary),
                                               dtype=int)
                    for i in range(len(self.label_dictionary)):
                        if self.label_dictionary.get_item_for_index(
                                i) in true_values_for_sentence:
                            y_true_instance[i] = 1
                    y_true.append(y_true_instance.tolist())

                    y_pred_instance = np.zeros(len(self.label_dictionary),
                                               dtype=int)
                    for i in range(len(self.label_dictionary)):
                        if self.label_dictionary.get_item_for_index(
                                i) in predictions_for_sentence:
                            y_pred_instance[i] = 1
                    y_pred.append(y_pred_instance.tolist())

                store_embeddings(batch, embedding_storage_mode)

            # remove predicted labels
            for sentence in sentences:
                sentence.annotation_layers['predicted'] = []

            if out_path is not None:
                with open(out_path, "w", encoding="utf-8") as outfile:
                    outfile.write("".join(lines))

            # make "classification report"
            target_names = []
            for i in range(len(self.label_dictionary)):
                target_names.append(
                    self.label_dictionary.get_item_for_index(i))
            classification_report = metrics.classification_report(
                y_true,
                y_pred,
                digits=4,
                target_names=target_names,
                zero_division=0)

            # get scores
            micro_f_score = round(
                metrics.fbeta_score(y_true,
                                    y_pred,
                                    beta=self.beta,
                                    average='micro',
                                    zero_division=0), 4)
            accuracy_score = round(metrics.accuracy_score(y_true, y_pred), 4)
            macro_f_score = round(
                metrics.fbeta_score(y_true,
                                    y_pred,
                                    beta=self.beta,
                                    average='macro',
                                    zero_division=0), 4)
            precision_score = round(
                metrics.precision_score(y_true,
                                        y_pred,
                                        average='macro',
                                        zero_division=0), 4)
            recall_score = round(
                metrics.recall_score(y_true,
                                     y_pred,
                                     average='macro',
                                     zero_division=0), 4)

            detailed_result = ("\nResults:"
                               f"\n- F-score (micro) {micro_f_score}"
                               f"\n- F-score (macro) {macro_f_score}"
                               f"\n- Accuracy {accuracy_score}"
                               '\n\nBy class:\n' + classification_report)

            # line for log file
            if not self.multi_label:
                log_header = "ACCURACY"
                log_line = f"\t{accuracy_score}"
            else:
                log_header = "PRECISION\tRECALL\tF1\tACCURACY"
                log_line = f"{precision_score}\t" \
                           f"{recall_score}\t" \
                           f"{macro_f_score}\t" \
                           f"{accuracy_score}"

            result = Result(
                main_score=micro_f_score,
                log_line=log_line,
                log_header=log_header,
                detailed_results=detailed_result,
            )

            eval_loss /= batch_count

            return result, eval_loss
Example No. 13
        tag = tkn.get_tag("ner").value
        pref, tag_no_pref = _split_tag(tag)
        if tag_no_pref is None:
            break
        tag_no_pref_encoded = tag_no_pref.encode("utf-8")
        if tag_no_pref_encoded in tag_dictionary_no_prefix.idx2item and tag_countdown[
                tag_dictionary_no_prefix.item2idx[tag_no_pref_encoded]] > 0:
            corpus_sents.append(sent)
            tag_countdown[
                tag_dictionary_no_prefix.item2idx[tag_no_pref_encoded]] -= 1
            sent_picked = True

print("sents for training: " + str(len(corpus_sents)))
print("amount of items in dict: " + str(len(tag_dictionary.item2idx)))

training_dataset = SentenceDataset(corpus_sents)
training_corpus = Corpus(train=training_dataset,
                         dev=corpus_small.dev,
                         test=corpus_small.test,
                         sample_missing_splits=False)
trainer = ModelTrainer(tagger, training_corpus, optimizer=torch.optim.AdamW)
tag_dictionary = training_corpus.make_label_dictionary(tag_type)
tagger.add_and_switch_to_new_task("fewshot-moviecomplex-simple-to-conll3",
                                  tag_dictionary=tag_dictionary,
                                  tag_type=tag_type)
trainer.train(
    base_path='resources/v3/fewshot-moviecomplex-simple-to-conll3-k' + str(k),
    learning_rate=5.0e-5,
    mini_batch_size=32,
    mini_batch_chunk_size=None,
    max_epochs=10,
Example No. 14
    def evaluate(self,
                 sentences: Union[List[DataPoint], Dataset],
                 out_path: Union[str, Path] = None,
                 embedding_storage_mode: str = "none",
                 mini_batch_size: int = 32,
                 num_workers: int = 8,
                 **kwargs) -> Result:

        # read Dataset into data loader (if list of sentences passed, make Dataset first)
        if not isinstance(sentences, Dataset):
            sentences = SentenceDataset(sentences)
        data_loader = DataLoader(sentences,
                                 batch_size=mini_batch_size,
                                 num_workers=num_workers)

        with torch.no_grad():
            eval_loss = 0

            metric = MetricRegression("Evaluation")

            lines: List[str] = []
            total_count = 0
            for batch_nr, batch in enumerate(data_loader):

                if isinstance(batch, Sentence):
                    batch = [batch]

                scores, loss = self.forward_labels_and_loss(batch)

                true_values = []
                for sentence in batch:
                    total_count += 1
                    for label in sentence.labels:
                        true_values.append(float(label.value))

                results = []
                for score in scores:
                    if type(score[0]) is Label:
                        results.append(float(score[0].score))
                    else:
                        results.append(float(score[0]))

                eval_loss += loss

                metric.true.extend(true_values)
                metric.pred.extend(results)

                for sentence, prediction, true_value in zip(
                        batch, results, true_values):
                    eval_line = "{}\t{}\t{}\n".format(
                        sentence.to_original_text(), true_value, prediction)
                    lines.append(eval_line)

                store_embeddings(batch, embedding_storage_mode)

            eval_loss /= total_count

            if out_path is not None:
                with open(out_path, "w", encoding="utf-8") as outfile:
                    outfile.write("".join(lines))

            log_line = f"{metric.mean_squared_error()}\t{metric.spearmanr()}\t{metric.pearsonr()}"
            log_header = "MSE\tSPEARMAN\tPEARSON"

            detailed_result = (
                f"AVG: mse: {metric.mean_squared_error():.4f} - "
                f"mae: {metric.mean_absolute_error():.4f} - "
                f"pearson: {metric.pearsonr():.4f} - "
                f"spearman: {metric.spearmanr():.4f}")

            result: Result = Result(
                main_score=metric.pearsonr(),
                loss=eval_loss,
                log_header=log_header,
                log_line=log_line,
                detailed_results=detailed_result,
            )

            return result
Example No. 15
    def predict(
        self,
        sentences: Union[List[Sentence], Sentence],
        mini_batch_size: int = 32,
        multi_class_prob: bool = False,
        verbose: bool = False,
        label_name: Optional[str] = None,
        return_loss=False,
        embedding_storage_mode="none",
    ):
        """
        Predicts the class labels for the given sentences. The labels are directly added to the sentences.
        :param sentences: list of sentences
        :param mini_batch_size: mini batch size to use
        :param multi_class_prob: set to True to return probabilities for all classes (multi-class)
        :param verbose: set to True to display a progress bar
        :param return_loss: set to True to return loss
        :param label_name: set this to change the name of the label type that is predicted
        :param embedding_storage_mode: default is 'none', which is usually best. Only set to 'cpu' or 'gpu' if
        you wish to not only predict, but also keep the generated embeddings in CPU or GPU memory respectively.
        """
        if label_name is None:
            label_name = self.label_type if self.label_type is not None else 'label'

        with torch.no_grad():
            if not sentences:
                return sentences

            if isinstance(sentences, DataPoint):
                sentences = [sentences]

            # filter empty sentences
            if isinstance(sentences[0], Sentence):
                sentences = [
                    sentence for sentence in sentences if len(sentence) > 0
                ]
            if len(sentences) == 0: return sentences

            # reverse sort all sequences by their length
            rev_order_len_index = sorted(range(len(sentences)),
                                         key=lambda k: len(sentences[k]),
                                         reverse=True)

            reordered_sentences: List[Union[DataPoint, str]] = [
                sentences[index] for index in rev_order_len_index
            ]

            dataloader = DataLoader(
                dataset=SentenceDataset(reordered_sentences),
                batch_size=mini_batch_size)
            # progress bar for verbosity
            if verbose:
                dataloader = tqdm(dataloader)

            overall_loss = 0
            batch_no = 0
            for batch in dataloader:

                batch_no += 1

                if verbose:
                    dataloader.set_description(
                        f"Inferencing on batch {batch_no}")

                # stop if all sentences are empty
                if not batch:
                    continue

                scores = self.forward(batch)

                if return_loss:
                    overall_loss += self._calculate_loss(scores, batch)

                predicted_labels = self._obtain_labels(
                    scores, predict_prob=multi_class_prob)

                for (sentence, labels) in zip(batch, predicted_labels):
                    for label in labels:
                        if self.multi_label or multi_class_prob:
                            sentence.add_label(label_name, label.value,
                                               label.score)
                        else:
                            sentence.set_label(label_name, label.value,
                                               label.score)

                # clearing token embeddings to save memory
                store_embeddings(batch, storage_mode=embedding_storage_mode)

            if return_loss:
                return overall_loss / batch_no
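A hypothetical usage of the predict method above with a pre-trained flair classifier; the "sentiment" model id is illustrative:

# Hypothetical usage; the model id is an example choice.
from flair.data import Sentence
from flair.models import TextClassifier

classifier = TextClassifier.load("sentiment")
sentence = Sentence("flair makes text classification straightforward")
classifier.predict(sentence)
print(sentence.labels)  # e.g. [POSITIVE (0.99)]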
Example No. 16
    def predict(
        self,
        sentences: Union[List[Sentence], Sentence],
        mini_batch_size=32,
        verbose: bool = False,
        label_name: Optional[str] = None,
        return_loss=False,
        embedding_storage_mode="none",
        label_threshold: float = 0.5,
        multi_label: Optional[bool] = None,
    ):
        """
        Predict sequence tags for Named Entity Recognition task
        :param sentences: a Sentence or a List of Sentence
        :param mini_batch_size: size of the minibatch, usually bigger is more rapid but consume more memory,
        up to a point when it has no more effect.
        :param all_tag_prob: True to compute the score for each tag on each token,
        otherwise only the score of the best tag is returned
        :param verbose: set to True to display a progress bar
        :param return_loss: set to True to return loss
        :param label_name: set this to change the name of the label type that is predicted
        :param embedding_storage_mode: default is 'none' which is always best. Only set to 'cpu' or 'gpu' if
        you wish to not only predict, but also keep the generated embeddings in CPU or GPU memory respectively.
        'gpu' to store embeddings in GPU memory.
        """
        if not label_name:
            label_name = self.get_current_label_type()

        if multi_label is None:
            multi_label = self.is_current_task_multi_label()

        if not sentences:
            return sentences

        if isinstance(sentences, Sentence):
            sentences = [sentences]

        # set context if not set already
        previous_sentence = None
        for sentence in sentences:
            if sentence.is_context_set(): continue
            sentence._previous_sentence = previous_sentence
            sentence._next_sentence = None
            if previous_sentence: previous_sentence._next_sentence = sentence
            previous_sentence = sentence

        # reverse sort all sequences by their length
        rev_order_len_index = sorted(range(len(sentences)),
                                     key=lambda k: len(sentences[k]),
                                     reverse=True)

        reordered_sentences: List[Union[Sentence, str]] = [
            sentences[index] for index in rev_order_len_index
        ]

        dataloader = DataLoader(dataset=SentenceDataset(reordered_sentences),
                                batch_size=mini_batch_size)

        # progress bar for verbosity
        if verbose:
            dataloader = tqdm(dataloader)

        overall_loss = 0
        overall_count = 0
        batch_no = 0
        with torch.no_grad():
            for batch in dataloader:

                batch_no += 1

                if verbose:
                    dataloader.set_description(
                        f"Inferencing on batch {batch_no}")

                batch = self._filter_empty_sentences(batch)
                # stop if all sentences are empty
                if not batch:
                    continue

                # go through each sentence in the batch
                for sentence in batch:

                    # always remove tags first
                    sentence.remove_labels(label_name)

                    all_labels = [
                        label.decode("utf-8") for label in
                        self.get_current_label_dictionary().idx2item
                    ]

                    best_label = None
                    for label in all_labels:
                        tars_sentence = self._get_tars_formatted_sentence(
                            label, sentence)

                        loss_and_count = self.tars_model.predict(
                            tars_sentence,
                            label_name=label_name,
                            return_loss=True,
                            return_probabilities_for_all_classes=True
                            if label_threshold < 0.5 else False,
                        )

                        overall_loss += loss_and_count[0].item()
                        overall_count += loss_and_count[1]

                        # add all labels that according to TARS match the text and are above threshold
                        for predicted_tars_label in tars_sentence.get_labels(
                                label_name):
                            if predicted_tars_label.value == self.LABEL_MATCH \
                                    and predicted_tars_label.score > label_threshold:
                                # do not add labels below confidence threshold
                                sentence.add_label(label_name, label,
                                                   predicted_tars_label.score)

                    # only use label with highest confidence if enforcing single-label predictions
                    if not multi_label:
                        if len(sentence.get_labels(label_name)) > 0:
                            # get all label scores and do an argmax to get the best label
                            label_scores = torch.tensor([
                                label.score
                                for label in sentence.get_labels(label_name)
                            ],
                                                        dtype=torch.float)
                            best_label = sentence.get_labels(label_name)[
                                torch.argmax(label_scores)]

                            # remove previously added labels and only add the best label
                            sentence.remove_labels(label_name)
                            sentence.add_label(typename=label_name,
                                               value=best_label.value,
                                               score=best_label.score)

                # clearing token embeddings to save memory
                store_embeddings(batch, storage_mode=embedding_storage_mode)

        if return_loss:
            return overall_loss, overall_count
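The method above backs TARS few-/zero-shot inference; a hypothetical zero-shot call (the candidate label set is illustrative):

# Hypothetical usage; labels and input text are example choices.
from flair.data import Sentence
from flair.models.text_classification_model import TARSClassifier

tars = TARSClassifier.load("tars-base")
sentence = Sentence("Bitcoin dropped ten percent overnight")
tars.predict_zero_shot(sentence, ["finance", "sports", "politics"])
print(sentence.labels)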
Example No. 17
    def evaluate(
            self,
            sentences: Union[List[Sentence], Dataset],
            out_path: Union[str, Path] = None,
            embedding_storage_mode: str = "none",
            mini_batch_size: int = 32,
            num_workers: int = 8,
    ) -> (Result, float):

        # read Dataset into data loader (if list of sentences passed, make Dataset first)
        if not isinstance(sentences, Dataset):
            sentences = SentenceDataset(sentences)
        data_loader = DataLoader(sentences, batch_size=mini_batch_size, num_workers=num_workers)

        # if span F1 needs to be used, use separate eval method
        if self._requires_span_F1_evaluation():
            return self._evaluate_with_span_F1(data_loader, embedding_storage_mode, mini_batch_size, out_path)

        # else, use scikit-learn to evaluate
        y_true = []
        y_pred = []
        labels = Dictionary(add_unk=False)

        eval_loss = 0
        batch_no: int = 0

        lines: List[str] = []

        for batch in data_loader:

            # predict for batch
            loss = self.predict(batch,
                                embedding_storage_mode=embedding_storage_mode,
                                mini_batch_size=mini_batch_size,
                                label_name='predicted',
                                return_loss=True)
            eval_loss += loss
            batch_no += 1

            for sentence in batch:

                for token in sentence:
                    # add gold tag
                    gold_tag = token.get_tag(self.tag_type).value
                    y_true.append(labels.add_item(gold_tag))

                    # add predicted tag
                    predicted_tag = token.get_tag('predicted').value
                    y_pred.append(labels.add_item(predicted_tag))

                    # for file output
                    lines.append(f'{token.text} {gold_tag} {predicted_tag}\n')

                lines.append('\n')

        if out_path:
            with open(Path(out_path), "w", encoding="utf-8") as outfile:
                outfile.write("".join(lines))

        eval_loss /= batch_no

        # use sklearn
        from sklearn import metrics

        # make "classification report"
        target_names = []
        for i in range(len(labels)):
            target_names.append(labels.get_item_for_index(i))
        classification_report = metrics.classification_report(y_true, y_pred, digits=4, target_names=target_names,
                                                              zero_division=1)

        # get scores
        micro_f_score = round(metrics.fbeta_score(y_true, y_pred, beta=self.beta, average='micro'), 4)
        macro_f_score = round(metrics.fbeta_score(y_true, y_pred, beta=self.beta, average='macro'), 4)
        accuracy_score = round(metrics.accuracy_score(y_true, y_pred), 4)

        detailed_result = (
                "\nResults:"
                f"\n- F-score (micro) {macro_f_score}"
                f"\n- F-score (macro) {micro_f_score}"
                f"\n- Accuracy {accuracy_score}"
                '\n\nBy class:\n' + classification_report
        )

        # line for log file
        log_header = "ACCURACY"
        log_line = f"\t{accuracy_score}"

        result = Result(
            main_score=micro_f_score,
            log_line=log_line,
            log_header=log_header,
            detailed_results=detailed_result,
        )
        return result, eval_loss
Example No. 18
    def predict(
        self,
        sentences: Union[List[Sentence], Sentence],
        mini_batch_size: int = 32,
        return_probabilities_for_all_classes: bool = False,
        verbose: bool = False,
        label_name: Optional[str] = None,
        return_loss=False,
        embedding_storage_mode="none",
    ):
        """
        Predicts the class labels for the given sentences. The labels are directly added to the sentences.
        :param sentences: list of sentences
        :param mini_batch_size: mini batch size to use
        :param return_probabilities_for_all_classes : return probabilities for all classes instead of only best predicted
        :param verbose: set to True to display a progress bar
        :param return_loss: set to True to return loss
        :param label_name: set this to change the name of the label type that is predicted
        :param embedding_storage_mode: default is 'none', which is usually best. Only set to 'cpu' or 'gpu' if
        you wish to not only predict, but also keep the generated embeddings in CPU or GPU memory respectively.
        """
        if label_name is None:
            label_name = self.label_type if self.label_type is not None else "label"

        with torch.no_grad():
            if not sentences:
                return sentences

            if isinstance(sentences, DataPoint):
                sentences = [sentences]

            # filter empty sentences
            if isinstance(sentences[0], DataPoint):
                sentences = [
                    sentence for sentence in sentences if len(sentence) > 0
                ]
            if len(sentences) == 0:
                return sentences

            # reverse sort all sequences by their length
            rev_order_len_index = sorted(range(len(sentences)),
                                         key=lambda k: len(sentences[k]),
                                         reverse=True)

            reordered_sentences: List[Union[DataPoint, str]] = [
                sentences[index] for index in rev_order_len_index
            ]

            dataloader = DataLoader(
                dataset=SentenceDataset(reordered_sentences),
                batch_size=mini_batch_size)
            # progress bar for verbosity
            if verbose:
                dataloader = tqdm(dataloader)

            overall_loss = 0
            batch_no = 0
            label_count = 0
            for batch in dataloader:

                batch_no += 1

                if verbose:
                    dataloader.set_description(
                        f"Inferencing on batch {batch_no}")

                # stop if all sentences are empty
                if not batch:
                    continue

                scores, gold_labels, data_points, label_candidates = self.forward_pass(
                    batch, return_label_candidates=True)
                # remove previously predicted labels of this type
                for sentence in data_points:
                    sentence.remove_labels(label_name)

                if return_loss:
                    overall_loss += self._calculate_loss(scores,
                                                         gold_labels)[0]
                    label_count += len(label_candidates)

                # if anything could possibly be predicted
                if len(label_candidates) > 0:
                    if self.multi_label:
                        sigmoided = torch.sigmoid(
                            scores)  # size: (n_sentences, n_classes)
                        n_labels = sigmoided.size(1)
                        for s_idx, (data_point, label_candidate) in enumerate(
                                zip(data_points, label_candidates)):
                            for l_idx in range(n_labels):
                                label_value = self.label_dictionary.get_item_for_index(
                                    l_idx)
                                if label_value == 'O': continue
                                label_threshold = self._get_label_threshold(
                                    label_value)
                                label_score = sigmoided[s_idx, l_idx].item()
                                if label_score > label_threshold or return_probabilities_for_all_classes:
                                    label = label_candidate.spawn(
                                        value=label_value, score=label_score)
                                    data_point.add_complex_label(
                                        label_name, label)
                    else:
                        softmax = torch.nn.functional.softmax(scores, dim=-1)

                        if return_probabilities_for_all_classes:
                            n_labels = softmax.size(1)
                            for s_idx, (data_point,
                                        label_candidate) in enumerate(
                                            zip(data_points,
                                                label_candidates)):
                                for l_idx in range(n_labels):
                                    label_value = self.label_dictionary.get_item_for_index(
                                        l_idx)
                                    if label_value == 'O': continue
                                    label_score = softmax[s_idx, l_idx].item()
                                    label = label_candidate.spawn(
                                        value=label_value, score=label_score)
                                    data_point.add_complex_label(
                                        label_name, label)
                        else:
                            conf, idx = torch.max(softmax, dim=-1)
                            for data_point, label_candidate, c, i in zip(
                                    data_points, label_candidates, conf, idx):
                                label_value = self.label_dictionary.get_item_for_index(
                                    i.item())
                                if label_value == 'O': continue
                                label = label_candidate.spawn(
                                    value=label_value, score=c.item())
                                data_point.add_complex_label(label_name, label)

                store_embeddings(batch, storage_mode=embedding_storage_mode)

            if return_loss:
                return overall_loss, label_count
Example No. 19
    def predict(
        self,
        sentences: Union[List[Sentence], Sentence, List[str], str],
        mini_batch_size: int = 32,
        embedding_storage_mode="none",
        multi_class_prob: bool = False,
        verbose: bool = False,
        use_tokenizer: Union[bool, Callable[[str], List[Token]]] = space_tokenizer,
    ) -> List[Sentence]:
        """
        Predicts the class labels for the given sentences. The labels are directly added to the sentences.
        :param sentences: list of sentences
        :param mini_batch_size: mini batch size to use
        :param embedding_storage_mode: 'none' for the minimum memory footprint, 'cpu' to store embeddings in Ram,
        'gpu' to store embeddings in GPU memory.
        :param multi_class_prob: set to True to return probabilities for all classes (multi-class)
        :param verbose: set to True to display a progress bar
        :param use_tokenizer: a custom tokenizer used when strings are provided (default is the space-based tokenizer).
        :return: the list of sentences containing the labels
        """
        with torch.no_grad():
            if not sentences:
                return sentences

            if isinstance(sentences, Sentence) or isinstance(sentences, str):
                sentences = [sentences]

            if (flair.device.type == "cuda") and embedding_storage_mode == "cpu":
                log.warning(
                    "You are inferring on GPU with parameter 'embedding_storage_mode' set to 'cpu'."
                    "This option will slow down your inference, usually 'none' (default value) "
                    "is a better choice."
                )

            # reverse sort all sequences by their length
            rev_order_len_index = sorted(
                range(len(sentences)), key=lambda k: len(sentences[k]), reverse=True
            )
            original_order_index = sorted(
                range(len(rev_order_len_index)), key=lambda k: rev_order_len_index[k]
            )

            reordered_sentences: List[Union[Sentence, str]] = [
                sentences[index] for index in rev_order_len_index
            ]

            if isinstance(sentences[0], Sentence):
                # remove previous embeddings
                store_embeddings(reordered_sentences, "none")
                dataset = SentenceDataset(reordered_sentences)
            else:
                dataset = StringDataset(
                    reordered_sentences, use_tokenizer=use_tokenizer
                )
            dataloader = DataLoader(
                dataset=dataset, batch_size=mini_batch_size, collate_fn=lambda x: x
            )

            # progress bar for verbosity
            if verbose:
                dataloader = tqdm(dataloader)

            results: List[Sentence] = []
            for i, batch in enumerate(dataloader):
                if verbose:
                    dataloader.set_description(f"Inferencing on batch {i}")
                results += batch
                batch = self._filter_empty_sentences(batch)
                # stop if all sentences are empty
                if not batch:
                    continue

                scores = self.forward(batch)
                predicted_labels = self._obtain_labels(
                    scores, predict_prob=multi_class_prob
                )

                for (sentence, labels) in zip(batch, predicted_labels):
                    sentence.labels = labels

                # clearing token embeddings to save memory
                store_embeddings(batch, storage_mode=embedding_storage_mode)

            results: List[Union[Sentence, str]] = [
                results[index] for index in original_order_index
            ]
            assert len(sentences) == len(results)
            return results
Example No. 20
money = "money"
tech = "tech"

# training dataset labeled with four topics: finance, money, tech, and crypto
train = SentenceDataset([
    Sentence('Are You Trading or Gambling?').add_label(label_name, finance),
    Sentence('Amazon capitalization reached trillion dollars').add_label(
        label_name, finance),
    Sentence('Finance dictionary: SPACs and IPOs').add_label(
        label_name, finance),
    Sentence('Developer salaries development').add_label(label_name, money),
    Sentence('My annual income as developer since 2008').add_label(
        label_name, money),
    Sentence('How to maximize your income as a developer').add_label(
        label_name, money),
    Sentence('Levels.fyi salary information in tech').add_label(
        label_name, money),
    Sentence('New version of ruby').add_label(label_name, tech),
    Sentence('Python: 30 years in').add_label(label_name, tech),
    Sentence(
        'Things I learned developing D3 library for visualization').add_label(
            label_name, tech),
    Sentence('Bitcoin price most volatile since 2019').add_label(
        label_name, crypto),
    Sentence('Cryptocurrency mining consumes as much energy as some countries'
             ).add_label(label_name, crypto),
    Sentence('Bitcoin is a scam').add_label(label_name, crypto),
])

# test dataset with gold topic labels for evaluation
test = SentenceDataset([
    Sentence('Coinbase S-1 filing').add_label(label_name, finance),
Example No. 21
    def evaluate(
        self,
        sentences: Union[List[Sentence], Dataset],
        out_path: Union[str, Path] = None,
        embedding_storage_mode: str = "none",
        mini_batch_size: int = 32,
        num_workers: int = 8,
        wsd_evaluation: bool = False,
        **kwargs,
    ) -> Tuple[Result, float]:

        # read Dataset into data loader (if list of sentences passed, make Dataset first)
        if not isinstance(sentences, Dataset):
            sentences = SentenceDataset(sentences)
        data_loader = DataLoader(sentences,
                                 batch_size=mini_batch_size,
                                 num_workers=num_workers)

        eval_loss = 0
        eval_count = 0

        batch_no: int = 0

        metric = Metric("Evaluation", beta=self.beta)

        lines: List[str] = []

        y_true = []
        y_pred = []

        for batch in data_loader:

            # predict for batch
            loss_and_count = self.predict(
                batch,
                embedding_storage_mode=embedding_storage_mode,
                mini_batch_size=mini_batch_size,
                label_name='predicted',
                return_loss=True)

            eval_loss += loss_and_count[0]
            eval_count += loss_and_count[1]
            batch_no += 1

            for sentence in batch:

                # make list of gold tags
                gold_spans = sentence.get_spans(self.get_current_tag_type())
                gold_tags = [(span.tag, repr(span)) for span in gold_spans]

                # make list of predicted tags
                predicted_spans = sentence.get_spans("predicted")
                predicted_tags = [(span.tag, repr(span))
                                  for span in predicted_spans]

                # check for true positives, false positives and false negatives
                for tag, prediction in predicted_tags:
                    if (tag, prediction) in gold_tags:
                        metric.add_tp(tag)
                    else:
                        metric.add_fp(tag)

                for tag, gold in gold_tags:
                    if (tag, gold) not in predicted_tags:
                        metric.add_fn(tag)

                tags_gold = []
                tags_pred = []

                # also write to file in BIO format to use old conlleval script
                if out_path:
                    for token in sentence:
                        # check if in gold spans
                        gold_tag = 'O'
                        for span in gold_spans:
                            if token in span:
                                gold_tag = 'B-' + span.tag if token == span[
                                    0] else 'I-' + span.tag
                        tags_gold.append(gold_tag)

                        predicted_tag = 'O'
                        # check if in predicted spans
                        for span in predicted_spans:
                            if token in span:
                                predicted_tag = 'B-' + span.tag if token == span[
                                    0] else 'I-' + span.tag
                        tags_pred.append(predicted_tag)

                        lines.append(
                            f'{token.text} {gold_tag} {predicted_tag}\n')
                    lines.append('\n')

                y_true.append(tags_gold)
                y_pred.append(tags_pred)

        if out_path:
            with open(Path(out_path), "w", encoding="utf-8") as outfile:
                outfile.write("".join(lines))

        detailed_result = (
            "\nResults:"
            f"\n- F1-score (micro) {metric.micro_avg_f_score():.4f}"
            f"\n- F1-score (macro) {metric.macro_avg_f_score():.4f}"
            '\n\nBy class:')

        for class_name in metric.get_classes():
            detailed_result += (
                f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
                f"fn: {metric.get_fn(class_name)} - precision: "
                f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
                f"f1-score: "
                f"{metric.f_score(class_name):.4f}")

        result = Result(
            main_score=metric.micro_avg_f_score(),
            log_line=
            f"{metric.precision():.4f}\t{metric.recall():.4f}\t{metric.micro_avg_f_score():.4f}",
            log_header="PRECISION\tRECALL\tF1",
            detailed_results=detailed_result,
        )

        return result, eval_loss / eval_count
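
An illustrative call of the evaluate() above; tagger and corpus stand in for a trained model and its data and are assumptions, not objects defined in this snippet:

result, eval_loss = tagger.evaluate(corpus.test, out_path='eval_predictions.txt')  # tagger/corpus assumed
print(result.log_header)
print(result.log_line)
print(result.detailed_results)
print(f'loss: {eval_loss:.4f}')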
Exemplo n.º 22
0
    def predict(
        self,
        sentences: Union[List[Sentence], Sentence],
        mini_batch_size=32,
        verbose: bool = False,
        label_name: Optional[str] = None,
        return_loss=False,
        embedding_storage_mode="none",
    ):
        """
        Predict sequence tags for Named Entity Recognition task
        :param sentences: a Sentence or a List of Sentence
        :param mini_batch_size: size of the minibatch, usually bigger is more rapid but consume more memory,
        up to a point when it has no more effect.
        :param all_tag_prob: True to compute the score for each tag on each token,
        otherwise only the score of the best tag is returned
        :param verbose: set to True to display a progress bar
        :param return_loss: set to True to return loss
        :param label_name: set this to change the name of the label type that is predicted
        :param embedding_storage_mode: default is 'none' which is always best. Only set to 'cpu' or 'gpu' if
        you wish to not only predict, but also keep the generated embeddings in CPU or GPU memory respectively.
        'gpu' to store embeddings in GPU memory.
        """
        if label_name is None:
            label_name = self.get_current_tag_type()

        if not sentences:
            return sentences

        if isinstance(sentences, Sentence):
            sentences = [sentences]

        # set context if not set already
        previous_sentence = None
        for sentence in sentences:
            if sentence.is_context_set(): continue
            sentence._previous_sentence = previous_sentence
            sentence._next_sentence = None
            if previous_sentence: previous_sentence._next_sentence = sentence
            previous_sentence = sentence

        # reverse sort all sequences by their length
        rev_order_len_index = sorted(range(len(sentences)),
                                     key=lambda k: len(sentences[k]),
                                     reverse=True)

        reordered_sentences: List[Union[Sentence, str]] = [
            sentences[index] for index in rev_order_len_index
        ]

        dataloader = DataLoader(dataset=SentenceDataset(reordered_sentences),
                                batch_size=mini_batch_size)

        # progress bar for verbosity
        if verbose:
            dataloader = tqdm(dataloader)

        overall_loss = 0
        overall_count = 0
        batch_no = 0
        with torch.no_grad():
            for batch in dataloader:

                batch_no += 1

                if verbose:
                    dataloader.set_description(
                        f"Inferencing on batch {batch_no}")

                batch = self._filter_empty_sentences(batch)
                # stop if all sentences are empty
                if not batch:
                    continue

                # go through each sentence in the batch
                for sentence in batch:

                    # always remove tags first
                    for token in sentence:
                        token.remove_labels(label_name)

                    all_labels = [
                        label.decode("utf-8")
                        for label in self.get_current_tag_dictionary().idx2item
                    ]

                    for label in all_labels:
                        tars_sentence = self._get_tars_formatted_sentence(
                            label, sentence)

                        label_length = 0 if not self.prefix else len(
                            label.split(" ")) + len(self.separator.split(" "))

                        loss_and_count = self.tars_model.predict(
                            tars_sentence,
                            label_name=label_name,
                            all_tag_prob=True,
                            return_loss=True)
                        overall_loss += loss_and_count[0].item()
                        overall_count += loss_and_count[1]

                        for span in tars_sentence.get_spans(label_name):
                            for token in span:
                                corresponding_token = sentence.get_token(
                                    token.idx - label_length)
                                if corresponding_token is None: continue
                                if corresponding_token.get_tag(label_name).value != '' and \
                                        corresponding_token.get_tag(label_name).score > token.get_tag(label_name).score:
                                    continue
                                corresponding_token.add_tag(
                                    label_name,
                                    token.get_tag(label_name).value + label,
                                    token.get_tag(label_name).score,
                                )

                # clearing token embeddings to save memory
                store_embeddings(batch, storage_mode=embedding_storage_mode)

        if return_loss:
            return overall_loss, overall_count
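
A hedged usage sketch for this TARS-style zero-shot tagger; TARSTagger and the 'tars-ner' model name follow the public Flair API and are assumptions, not taken from this snippet:

from flair.data import Sentence
from flair.models import TARSTagger  # import path may differ across Flair versions

tars = TARSTagger.load('tars-ner')  # assumed pre-trained TARS NER model
tars.add_and_switch_to_new_task('toy-ner', ['person', 'location'], label_type='ner')
sentence = Sentence('George Washington went to Washington')
tars.predict(sentence)
print(sentence.to_tagged_string())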
Exemplo n.º 23
0
    def evaluate(
            self,
            data_points: Union[List[DataPoint], Dataset],
            gold_label_type: str,
            out_path: Union[str, Path] = None,
            embedding_storage_mode: str = "none",
            mini_batch_size: int = 32,
            num_workers: int = 8,
            main_evaluation_metric: Tuple[str, str] = ("micro avg", "f1-score"),
            gold_label_dictionary: Optional[Dictionary] = None,
        ) -> Result:
        
        if not isinstance(data_points, Dataset):
            data_points = SentenceDataset(data_points)
        data_loader = DataLoader(data_points,
                                 batch_size=mini_batch_size,
                                 num_workers=num_workers)

        lines: List[str] = ["token gold_tag gold_arc predicted_tag predicted_arc\n"]

        average_over = 0
        eval_loss_arc = 0
        eval_loss_rel = 0

        y_true = []
        y_pred = []

        parsing_metric = ParsingMetric()

        for batch in data_loader:
            average_over += 1
            with torch.no_grad():
                score_arc, score_rel = self.forward(batch)
                loss_arc, loss_rel = self._calculate_loss(score_arc, score_rel, batch)
                arc_prediction, relation_prediction = self._obtain_labels_(score_arc, score_rel)
                
            parsing_metric(arc_prediction, relation_prediction, batch, gold_label_type)
            
            eval_loss_arc += loss_arc
            eval_loss_rel += loss_rel

            for (sentence, arcs, sent_tags) in zip(batch, arc_prediction, relation_prediction):
                for (token, arc, tag) in zip(sentence.tokens, arcs, sent_tags):
                    token: Token = token
                    token.add_tag_label("predicted", Label(tag))
                    token.add_tag_label("predicted_head_id",
                                        Label(str(int(arc))))

                    # append both to file for evaluation
                    eval_line = "{} {} {} {} {}\n".format(token.text,
                                                          token.get_tag(gold_label_type).value,
                                                          str(token.head_id),
                                                          tag,
                                                          str(int(arc)))
                    lines.append(eval_line)
                lines.append("\n")

            for sentence in batch:
                
                gold_tags = [token.get_tag(gold_label_type).value for token in sentence.tokens]
                predicted_tags = [tag.tag for tag in sentence.get_spans("predicted")]

                y_pred += [self.relations_dictionary.get_idx_for_item(tag)
                           for tag in predicted_tags]
                y_true += [self.relations_dictionary.get_idx_for_item(tag)
                           for tag in gold_tags]

            store_embeddings(batch, embedding_storage_mode)

        eval_loss_arc /= average_over
        eval_loss_rel /= average_over

        if out_path is not None:
            with open(out_path, "w", encoding="utf-8") as outfile:
                outfile.write("".join(lines))

        classification_report_dict = sklearn.metrics.classification_report(y_true,
                                                                           y_pred,
                                                                           target_names=self.relations_dictionary.idx2item,
                                                                           zero_division=0,
                                                                           output_dict=True,
                                                                           labels=range(len(self.relations_dictionary)))

        accuracy_score = round(sklearn.metrics.accuracy_score(y_true, y_pred), 4)

        precision_score = round(classification_report_dict["micro avg"]["precision"], 4)
        recall_score = round(classification_report_dict["micro avg"]["recall"], 4)
        micro_f_score = round(classification_report_dict["micro avg"]["f1-score"], 4)
        macro_f_score = round(classification_report_dict["macro avg"]["f1-score"], 4)

        main_score = classification_report_dict[main_evaluation_metric[0]][main_evaluation_metric[1]]

        detailed_result = (
            f"\nUAS : {parsing_metric.get_uas():.4f} - LAS : {parsing_metric.get_las():.4f}"
            f"\neval loss rel : {eval_loss_rel:.4f} - eval loss arc : {eval_loss_arc:.4f}"
            f"\nF-Score: micro : {micro_f_score} - macro : {macro_f_score}"
            f"\n Accuracy: {accuracy_score} - Precision {precision_score} - Recall {recall_score}"
        )
        log_header = "PRECISION\tRECALL\tF1\tACCURACY"
        log_line = f"{precision_score}\t" f"{recall_score}\t" f"{micro_f_score}\t" f"{accuracy_score}"

        result = Result(
            main_score=main_score,
            log_line=log_line,
            log_header=log_header,
            detailed_results=detailed_result,
            classification_report=classification_report_dict,
            loss=eval_loss_rel+eval_loss_arc
        )
        return result
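
For reference, the ParsingMetric above reports UAS (the fraction of tokens whose predicted head is correct) and LAS (head and relation label both correct). A self-contained toy illustration of these two quantities, with made-up data:

# toy gold and predicted (head, relation) pairs, one per token
gold = [(2, 'nsubj'), (0, 'root'), (2, 'obj')]
pred = [(2, 'nsubj'), (0, 'root'), (1, 'obj')]

uas = sum(g[0] == p[0] for g, p in zip(gold, pred)) / len(gold)
las = sum(g == p for g, p in zip(gold, pred)) / len(gold)
print(f'UAS : {uas:.4f} - LAS : {las:.4f}')  # UAS : 0.6667 - LAS : 0.6667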
Exemplo n.º 24
0
train = SentenceDataset(
    [

        Sentence('email').add_label('contact_type', 'email'),
        Sentence('21 Jan: email client about signing them up for phase 2 of Project Alpha').add_label('contact_type', 'email'),
        Sentence('Project Alpha: email client about signing them up for phase 2').add_label('contact_type', 'email'),
        Sentence('emailed client').add_label('contact_type', 'email'),
        Sentence('sent an email to the client about the project').add_label('contact_type', 'email'),
        Sentence('sent email to the client').add_label('contact_type', 'email'),
        Sentence('e-mailed to the client').add_label('contact_type', 'email'),
        Sentence('e-mailing to the client').add_label('contact_type', 'email'),
        Sentence('e-mail to the client').add_label('contact_type', 'email'),
        Sentence('sent email to the client about the new offer').add_label('contact_type', 'email'),
        Sentence('as I planned yesterday I emailed client').add_label('contact_type', 'email'),
        Sentence('emailing recent discussions').add_label('contact_type', 'email'),
        Sentence('today(project alpha) emailing recent discussions').add_label('contact_type', 'email'),
        Sentence('emailed client to schedule a meeting next week').add_label('contact_type', 'email'),
        Sentence('sent an email to client to schedule a meeting next week').add_label('contact_type', 'email'),
        Sentence('sent an email to client to set up a skype call').add_label('contact_type', 'email'),

        Sentence('21 Jan: call with client about signing them up for phase 2 of Project').add_label('contact_type', 'call'),
        Sentence('21 Jan: phone call with client about signing them up for phase 2 of Project').add_label('contact_type', 'call'),
        Sentence('21 Jan: skype with client about signing them up for phase 2 of Project').add_label('contact_type', 'call'),
        Sentence('Project Alpha: call with client about signing them up for phase 2').add_label('contact_type', 'call'),
        Sentence('Project Alpha: phone call with client about signing them up for phase 2').add_label('contact_type', 'call'),
        Sentence('Project Alpha: skype with client about signing them up for phase 2').add_label('contact_type', 'call'),
        Sentence('Phoned client about the phase 2 of the Project').add_label('contact_type', 'call'),
        Sentence('about the phase 2 of the Project, I called client').add_label('contact_type', 'call'),
        Sentence('phoning client about the project').add_label('contact_type', 'call'),
        Sentence('call with client').add_label('contact_type', 'call'),
        Sentence('calling').add_label('contact_type', 'call'),
        Sentence('skype call').add_label('contact_type', 'call'),
        Sentence('skype video call with Mary to discuss').add_label('contact_type', 'call'),
        Sentence('skyped client').add_label('contact_type', 'call'),
        Sentence('called client to inform them').add_label('contact_type', 'call'),
        Sentence('have a call about the project').add_label('contact_type', 'call'),
        Sentence('give a call').add_label('contact_type', 'call'),
        Sentence('telephone call with client about the project').add_label('contact_type', 'call'),
        Sentence('today: telephone call with client about the project').add_label('contact_type', 'call'),
        Sentence('18 December: telephone call with client about the project').add_label('contact_type', 'call'),
        Sentence('December 18th: telephone call with client about the project').add_label('contact_type', 'call'),
        Sentence('called client to schedule a meeting next week').add_label('contact_type', 'call'),
        Sentence('called client to set up a meeting next week').add_label('contact_type', 'call'),

        Sentence('21 Jan: meeting with client about signing them up for phase 2 of Project Alpha').add_label('contact_type', 'meeting'),
        Sentence('21 Jan: meet with client about signing them up for phase 2 of Project Alpha').add_label('contact_type', 'meeting'),
        Sentence('Project Alpha: meeting with client about signing them up for phase 2').add_label('contact_type', 'meeting'),
        Sentence('meet up with them').add_label('contact_type', 'meeting'),
        Sentence('meeting with client').add_label('contact_type', 'meeting'),
        Sentence('met client to discuss project').add_label('contact_type', 'meeting'),
        Sentence('meet with client at their office to review project').add_label('contact_type', 'meeting'),
        Sentence('meet').add_label('contact_type', 'meeting'),
        Sentence('set up a meeting').add_label('contact_type', 'meeting'),
        Sentence('joined a meeting').add_label('contact_type', 'meeting'),
        Sentence('participate in a meeting').add_label('contact_type', 'meeting'),
        Sentence('represent client at a meeting').add_label('contact_type', 'meeting'),
        Sentence('10 October: represent client at a meeting').add_label('contact_type', 'meeting'),
        Sentence('October 10th: represent client at a meeting').add_label('contact_type', 'meeting'),
        Sentence('met with client and decided to discuss it later over a call').add_label('contact_type', 'meeting'),
        Sentence('met with client and agreed to continue over email').add_label('contact_type', 'meeting'),

    ])
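
A hedged sketch of how a hand-labeled dataset like this is typically used for few-shot training of a TARS classifier, following the add_and_switch_to_new_task / ModelTrainer pattern used elsewhere in these examples; the task name, model identifier and output path are illustrative:

from flair.data import Corpus
from flair.models import TARSClassifier  # import path varies across Flair versions
from flair.trainers import ModelTrainer

corpus = Corpus(train=train)  # recent Flair versions sample missing dev/test splits from train
tars = TARSClassifier.load('tars-base')  # assumed model identifier; a local tars-base.pt path also works
tars.add_and_switch_to_new_task('CONTACT_TYPE', label_dictionary=corpus.make_label_dictionary())
trainer = ModelTrainer(tars, corpus)
trainer.train(base_path='contact_type_model', learning_rate=0.02, mini_batch_size=16, max_epochs=10)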
Exemplo n.º 25
0
    def evaluate(
        self,
        data_points: Union[List[DataPoint], Dataset],
        gold_label_type: str,
        out_path: Union[str, Path] = None,
        embedding_storage_mode: str = "none",
        mini_batch_size: int = 32,
        num_workers: int = 8,
        main_evaluation_metric: Tuple[str, str] = ("micro avg", "f1-score"),
        exclude_labels: List[str] = [],
        gold_label_dictionary: Optional[Dictionary] = None,
    ) -> Result:
        import numpy as np
        import sklearn.metrics  # ensure the metrics submodule is available

        # read Dataset into data loader (if list of sentences passed, make Dataset first)
        if not isinstance(data_points, Dataset):
            data_points = SentenceDataset(data_points)
        data_loader = DataLoader(data_points,
                                 batch_size=mini_batch_size,
                                 num_workers=num_workers)

        with torch.no_grad():

            # loss calculation
            eval_loss = 0
            average_over = 0

            # variables for printing
            lines: List[str] = []

            # variables for computing scores
            all_spans: List[str] = []
            all_true_values = {}
            all_predicted_values = {}

            sentence_id = 0
            for batch in data_loader:

                # remove any previously predicted labels
                for datapoint in batch:
                    datapoint.remove_labels('predicted')

                # predict for batch
                loss_and_count = self.predict(
                    batch,
                    embedding_storage_mode=embedding_storage_mode,
                    mini_batch_size=mini_batch_size,
                    label_name='predicted',
                    return_loss=True)

                if isinstance(loss_and_count, tuple):
                    average_over += loss_and_count[1]
                    eval_loss += loss_and_count[0]
                else:
                    eval_loss += loss_and_count

                # get the gold labels
                for datapoint in batch:

                    for gold_label in datapoint.get_labels(gold_label_type):
                        representation = str(
                            sentence_id) + ': ' + gold_label.identifier

                        value = gold_label.value
                        if gold_label_dictionary and gold_label_dictionary.get_idx_for_item(
                                value) == 0:
                            value = '<unk>'

                        if representation not in all_true_values:
                            all_true_values[representation] = [value]
                        else:
                            all_true_values[representation].append(value)

                        if representation not in all_spans:
                            all_spans.append(representation)

                    for predicted_span in datapoint.get_labels("predicted"):
                        representation = str(
                            sentence_id) + ': ' + predicted_span.identifier

                        # add to all_predicted_values
                        if representation not in all_predicted_values:
                            all_predicted_values[representation] = [
                                predicted_span.value
                            ]
                        else:
                            all_predicted_values[representation].append(
                                predicted_span.value)

                        if representation not in all_spans:
                            all_spans.append(representation)

                    sentence_id += 1

                store_embeddings(batch, embedding_storage_mode)

                # make printout lines
                if out_path:
                    lines.extend(
                        self._print_predictions(batch, gold_label_type))

            # write all_predicted_values to out_file if set
            if out_path:
                with open(Path(out_path), "w", encoding="utf-8") as outfile:
                    outfile.write("".join(lines))

            # make the evaluation dictionary
            evaluation_label_dictionary = Dictionary(add_unk=False)
            evaluation_label_dictionary.add_item("O")
            for true_values in all_true_values.values():
                for label in true_values:
                    evaluation_label_dictionary.add_item(label)
            for predicted_values in all_predicted_values.values():
                for label in predicted_values:
                    evaluation_label_dictionary.add_item(label)

            # finally, compute numbers
            y_true = []
            y_pred = []

            for span in all_spans:

                true_values = all_true_values[
                    span] if span in all_true_values else ['O']
                predicted_values = all_predicted_values[
                    span] if span in all_predicted_values else ['O']

                y_true_instance = np.zeros(len(evaluation_label_dictionary),
                                           dtype=int)
                for true_value in true_values:
                    y_true_instance[evaluation_label_dictionary.
                                    get_idx_for_item(true_value)] = 1
                y_true.append(y_true_instance.tolist())

                y_pred_instance = np.zeros(len(evaluation_label_dictionary),
                                           dtype=int)
                for predicted_value in predicted_values:
                    y_pred_instance[evaluation_label_dictionary.
                                    get_idx_for_item(predicted_value)] = 1
                y_pred.append(y_pred_instance.tolist())

        # now, calculate evaluation numbers
        target_names = []
        labels = []

        counter = Counter()
        counter.update(
            list(itertools.chain.from_iterable(all_true_values.values())))
        counter.update(
            list(itertools.chain.from_iterable(all_predicted_values.values())))

        for label_name, count in counter.most_common():
            if label_name == 'O': continue
            if label_name in exclude_labels: continue
            target_names.append(label_name)
            labels.append(
                evaluation_label_dictionary.get_idx_for_item(label_name))

        # compute scores only if there is more than one gold label or prediction in total
        if len(all_true_values) + len(all_predicted_values) > 1:
            classification_report = sklearn.metrics.classification_report(
                y_true,
                y_pred,
                digits=4,
                target_names=target_names,
                zero_division=0,
                labels=labels,
            )

            classification_report_dict = sklearn.metrics.classification_report(
                y_true,
                y_pred,
                target_names=target_names,
                zero_division=0,
                output_dict=True,
                labels=labels,
            )

            accuracy_score = round(
                sklearn.metrics.accuracy_score(y_true, y_pred), 4)

            precision_score = round(
                classification_report_dict["micro avg"]["precision"], 4)
            recall_score = round(
                classification_report_dict["micro avg"]["recall"], 4)
            micro_f_score = round(
                classification_report_dict["micro avg"]["f1-score"], 4)
            macro_f_score = round(
                classification_report_dict["macro avg"]["f1-score"], 4)

            main_score = classification_report_dict[main_evaluation_metric[0]][
                main_evaluation_metric[1]]

        else:
            # issue error and default all evaluation numbers to 0.
            log.error(
                "No gold labels and no predicted labels found! This could be an error in your corpus "
                "or in how you initialize the trainer!")
            accuracy_score = precision_score = recall_score = micro_f_score = macro_f_score = main_score = 0.
            classification_report = ""
            classification_report_dict = {}

        detailed_result = ("\nResults:"
                           f"\n- F-score (micro) {micro_f_score}"
                           f"\n- F-score (macro) {macro_f_score}"
                           f"\n- Accuracy {accuracy_score}"
                           "\n\nBy class:\n" + classification_report)

        # line for log file
        log_header = "PRECISION\tRECALL\tF1\tACCURACY"
        log_line = f"{precision_score}\t" f"{recall_score}\t" f"{micro_f_score}\t" f"{accuracy_score}"

        if average_over > 0:
            eval_loss /= average_over

        result = Result(main_score=main_score,
                        log_line=log_line,
                        log_header=log_header,
                        detailed_results=detailed_result,
                        classification_report=classification_report_dict,
                        loss=eval_loss)

        return result
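
The scoring above turns each span into a multi-hot row over the evaluation label dictionary before calling sklearn. A tiny self-contained illustration of that reduction, with made-up labels:

import sklearn.metrics

# columns correspond to an evaluation dictionary ['O', 'PER', 'LOC']; rows are spans
y_true = [[0, 1, 0], [0, 0, 1], [1, 0, 0]]
y_pred = [[0, 1, 0], [0, 1, 0], [1, 0, 0]]

report = sklearn.metrics.classification_report(
    y_true, y_pred, target_names=['PER', 'LOC'], labels=[1, 2],
    zero_division=0, output_dict=True)
print(round(report['micro avg']['f1-score'], 4))  # 0.5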
Exemplo n.º 26
0
    def predict(
            self,
            sentences: Union[List[Sentence], Sentence],
            mini_batch_size=32,
            all_tag_prob: bool = False,
            verbose: bool = False,
            label_name: Optional[str] = None,
            return_loss=False,
            embedding_storage_mode="none",
    ):
        """
        Predict sequence tags for Named Entity Recognition task
        :param sentences: a Sentence or a List of Sentence
        :param mini_batch_size: size of the minibatch, usually bigger is more rapid but consume more memory,
        up to a point when it has no more effect.
        :param all_tag_prob: True to compute the score for each tag on each token,
        otherwise only the score of the best tag is returned
        :param verbose: set to True to display a progress bar
        :param return_loss: set to True to return loss
        :param label_name: set this to change the name of the label type that is predicted
        :param embedding_storage_mode: default is 'none' which is always best. Only set to 'cpu' or 'gpu' if
        you wish to not only predict, but also keep the generated embeddings in CPU or GPU memory respectively.
        'gpu' to store embeddings in GPU memory.
        """
        if label_name is None:
            label_name = self.tag_type

        with torch.no_grad():
            if not sentences:
                return sentences

            if isinstance(sentences, Sentence):
                sentences = [sentences]

            # reverse sort all sequences by their length
            rev_order_len_index = sorted(
                range(len(sentences)), key=lambda k: len(sentences[k]), reverse=True
            )

            reordered_sentences: List[Union[Sentence, str]] = [
                sentences[index] for index in rev_order_len_index
            ]

            dataloader = DataLoader(
                dataset=SentenceDataset(reordered_sentences), batch_size=mini_batch_size
            )

            if self.use_crf:
                transitions = self.transitions.detach().cpu().numpy()
            else:
                transitions = None

            # progress bar for verbosity
            if verbose:
                dataloader = tqdm(dataloader)

            overall_loss = 0
            batch_no = 0
            for batch in dataloader:

                batch_no += 1

                if verbose:
                    dataloader.set_description(f"Inferencing on batch {batch_no}")

                batch = self._filter_empty_sentences(batch)
                # stop if all sentences are empty
                if not batch:
                    continue

                feature = self.forward(batch)

                if return_loss:
                    overall_loss += self._calculate_loss(feature, batch)

                tags, all_tags = self._obtain_labels(
                    feature=feature,
                    batch_sentences=batch,
                    transitions=transitions,
                    get_all_tags=all_tag_prob,
                )

                for (sentence, sent_tags) in zip(batch, tags):
                    for (token, tag) in zip(sentence.tokens, sent_tags):
                        token.add_tag_label(label_name, tag)

                # all_tags will be empty if all_tag_prob is set to False, so the for loop will be avoided
                for (sentence, sent_all_tags) in zip(batch, all_tags):
                    for (token, token_all_tags) in zip(sentence.tokens, sent_all_tags):
                        token.add_tags_proba_dist(label_name, token_all_tags)

                # clearing token embeddings to save memory
                store_embeddings(batch, storage_mode=embedding_storage_mode)

            if return_loss:
                return overall_loss / batch_no
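
An illustrative call of the predict() above with per-token probability distributions; tagger stands in for a trained sequence tagger and is an assumption:

from flair.data import Sentence

sentence = Sentence('George Washington went to Washington')
loss = tagger.predict([sentence], all_tag_prob=True, return_loss=True)  # tagger is an assumed trained model
for token in sentence:
    print(token.text, token.get_tag(tagger.tag_type).value)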
Exemplo n.º 27
0
    def predict(
        self,
        sentences: Union[List[Sentence], Sentence, List[str], str],
        mini_batch_size=32,
        embedding_storage_mode="none",
        all_tag_prob: bool = False,
        verbose: bool = False,
        use_tokenizer: Union[bool, Callable[[str], List[Token]]] = space_tokenizer,
    ) -> List[Sentence]:
        """
        Predict sequence tags for Named Entity Recognition task
        :param sentences: a Sentence or a string or a List of Sentence or a List of string.
        :param mini_batch_size: size of the minibatch, usually bigger is more rapid but consume more memory,
        up to a point when it has no more effect.
        :param embedding_storage_mode: 'none' for the minimum memory footprint, 'cpu' to store embeddings in Ram,
        'gpu' to store embeddings in GPU memory.
        :param all_tag_prob: True to compute the score for each tag on each token,
        otherwise only the score of the best tag is returned
        :param verbose: set to True to display a progress bar
        :param use_tokenizer: a custom tokenizer when string are provided (default is space based tokenizer).
        :return: List of Sentence enriched by the predicted tags
        """
        with torch.no_grad():
            if not sentences:
                return sentences

            if isinstance(sentences, Sentence) or isinstance(sentences, str):
                sentences = [sentences]

            if (flair.device.type == "cuda") and embedding_storage_mode == "cpu":
                log.warning(
                    "You are inferring on GPU with parameter 'embedding_storage_mode' set to 'cpu'."
                    "This option will slow down your inference, usually 'none' (default value) "
                    "is a better choice."
                )

            # reverse sort all sequences by their length
            rev_order_len_index = sorted(
                range(len(sentences)), key=lambda k: len(sentences[k]), reverse=True
            )
            original_order_index = sorted(
                range(len(rev_order_len_index)), key=lambda k: rev_order_len_index[k]
            )

            reordered_sentences: List[Union[Sentence, str]] = [
                sentences[index] for index in rev_order_len_index
            ]

            if isinstance(sentences[0], Sentence):
                # remove previous embeddings
                store_embeddings(reordered_sentences, "none")
                dataset = SentenceDataset(reordered_sentences)
            else:
                dataset = StringDataset(
                    reordered_sentences, use_tokenizer=use_tokenizer
                )
            dataloader = DataLoader(
                dataset=dataset, batch_size=mini_batch_size, collate_fn=lambda x: x
            )

            if self.use_crf:
                transitions = self.transitions.detach().cpu().numpy()
            else:
                transitions = None

            # progress bar for verbosity
            if verbose:
                dataloader = tqdm(dataloader)

            results: List[Sentence] = []
            for i, batch in enumerate(dataloader):

                if verbose:
                    dataloader.set_description(f"Inferencing on batch {i}")
                results += batch
                batch = self._filter_empty_sentences(batch)
                # stop if all sentences are empty
                if not batch:
                    continue

                feature: torch.Tensor = self.forward(batch)
                tags, all_tags = self._obtain_labels(
                    feature=feature,
                    batch_sentences=batch,
                    transitions=transitions,
                    get_all_tags=all_tag_prob,
                )

                for (sentence, sent_tags) in zip(batch, tags):
                    for (token, tag) in zip(sentence.tokens, sent_tags):
                        token.add_tag_label(self.tag_type, tag)

                # all_tags will be empty if all_tag_prob is set to False, so the for loop will be avoided
                for (sentence, sent_all_tags) in zip(batch, all_tags):
                    for (token, token_all_tags) in zip(sentence.tokens, sent_all_tags):
                        token.add_tags_proba_dist(self.tag_type, token_all_tags)

                # clearing token embeddings to save memory
                store_embeddings(batch, storage_mode=embedding_storage_mode)

            results: List[Union[Sentence, str]] = [
                results[index] for index in original_order_index
            ]
            assert len(sentences) == len(results)
            return results
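
Unlike the variant before it, this predict() also accepts raw strings and returns the enriched sentences; a hedged usage sketch, assuming 'ner' resolves to a pre-trained model exposing this signature:

from flair.models import SequenceTagger

tagger = SequenceTagger.load('ner')  # assumed pre-trained model
tagged = tagger.predict(['George Washington went to Washington'], mini_batch_size=8)
print(tagged[0].to_tagged_string())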
Exemplo n.º 28
0
    def predict(self,
                sentences: Union[List[Sentence], Sentence],
                label_name='predicted',
                mini_batch_size: int = 16,
                embedding_storage_mode="none",
                return_loss=False,
                print_prediction=False,
                ):
        '''
        Predict lemmas of words for a given (list of) sentence(s).
        :param sentences: sentences to predict
        :param label_name: label name used for the predicted lemmas
        :param mini_batch_size: number of tokens that are sent through the RNN simultaneously, assuming batching_in_rnn is set to True
        :param embedding_storage_mode: default is 'none', which is always best. Only set to 'cpu' or 'gpu' if
            you wish to not only predict but also keep the generated embeddings in CPU or GPU memory respectively.
        :param return_loss: whether to compute and return the loss; setting it to True only makes sense if labels are provided
        :param print_prediction: if True, lemmatized sentences will be printed to the console.
        '''
        if isinstance(sentences, Sentence):
            sentences = [sentences]

        # filter empty sentences
        sentences = [sentence for sentence in sentences if len(sentence) > 0]
        if len(sentences) == 0:
            return sentences

        # max length of the predicted sequences
        if not self.dependent_on_input:
            max_length = self.max_sequence_length
        else:
            max_length = max([len(token.text) + 1 for sentence in sentences for token in sentence])

        # for printing
        line_to_print = ''

        overall_loss = 0
        number_tokens_in_total = 0

        with torch.no_grad():

            dataloader = DataLoader(dataset=SentenceDataset(sentences), batch_size=mini_batch_size)

            for batch in dataloader:

                # stop if all sentences are empty
                if not batch: continue

                # remove previously predicted labels of this type
                for sentence in batch:
                    for token in sentence:
                        token.remove_labels(label_name)

                # create list of tokens in batch
                tokens_in_batch = [token for sentence in batch for token in sentence]
                number_tokens = len(tokens_in_batch)
                number_tokens_in_total += number_tokens

                # encode inputs
                hidden, all_encoder_outputs = self.encode(batch)

                # create input for first pass (batch_size, 1, input_size), first letter is special character <S>
                # sequence length is always set to one in prediction
                input_indices = self.start_index * torch.ones(number_tokens, dtype=torch.long,
                                                              device=flair.device).unsqueeze(1)

                # option 1: greedy decoding
                if self.beam_size == 1:

                    # predictions
                    predicted = [[] for _ in range(number_tokens)]

                    for decode_step in range(max_length):

                        # decode next character
                        output_vectors, hidden = self.decode(input_indices, hidden, all_encoder_outputs)

                        log_softmax_probs = torch.nn.functional.log_softmax(output_vectors, dim=2)
                        # pick top beam size many outputs with highest probabilities
                        input_indices = log_softmax_probs.argmax(dim=2)

                        for i in range(number_tokens):
                            if len(predicted[i]) > 0 and predicted[i][-1] == self.end_index: continue
                            predicted[i].append(input_indices[i].item())

                    for t_id, token in enumerate(tokens_in_batch):
                        predicted_lemma = ''.join(
                            self.char_dictionary.get_item_for_index(idx) if idx != self.end_index else ""
                            for idx in predicted[t_id])
                        token.set_label(typename=label_name, value=predicted_lemma)

                # option 2: beam search
                else:
                    output_vectors, hidden = self.decode(input_indices, hidden, all_encoder_outputs)

                    log_softmax_probs = torch.nn.functional.log_softmax(output_vectors, dim=2).squeeze(1)
                    # make sure no dummy symbol <> or start symbol <S> is predicted
                    log_softmax_probs[:, self.dummy_index] = -inf
                    log_softmax_probs[:, self.start_index] = -inf

                    # pick top beam size many outputs with highest probabilities
                    log_probabilities, leading_indices = log_softmax_probs.topk(self.beam_size, 1)
                    # leading_indices and probabilities have size (batch_size, beam_size)

                    # keep scores of beam_size many hypothesis for each token in the batch
                    scores = log_probabilities.view(-1, 1)
                    # stack all leading indices of all hypothesis and corresponding hidden states in two tensors
                    leading_indices = leading_indices.view(-1, 1)  # this vector goes through RNN in each iteration

                    hidden_states_beam = torch.stack(self.beam_size * [hidden], dim=2).view(self.rnn_layers, -1,
                                                                                            self.rnn_hidden_size)

                    # save sequences so far
                    sequences = torch.tensor([[i.item()] for i in leading_indices], device=flair.device)

                    # keep track of how many hypothesis were completed for each token
                    n_completed = [0 for _ in range(number_tokens)]  # cpu
                    final_candidates = [[] for _ in range(number_tokens)]  # cpu

                    # if attention is used, expand the encoder outputs to beam size (otherwise keep this as None)
                    batched_encoding_output = torch.stack(self.beam_size * [all_encoder_outputs], dim=1).view(
                        self.beam_size * number_tokens, -1, self.rnn_hidden_size) if self.use_attention else None

                    for j in range(1, max_length):

                        output_vectors, hidden_states_beam = self.decode(leading_indices,
                                                                         hidden_states_beam,
                                                                         batched_encoding_output)

                        # decode with log softmax
                        out_log_probs = torch.nn.functional.log_softmax(output_vectors, dim=2)
                        # make sure no dummy symbol <> or start symbol <S> is predicted
                        out_log_probs[:, 0, self.dummy_index] = -inf
                        out_log_probs[:, 0, self.start_index] = -inf
                        log_probabilities, index_candidates = out_log_probs.topk(self.beam_size, 2)
                        log_probabilities.squeeze_(1)
                        index_candidates.squeeze_(1)

                        # check if an end symbol <E> has been predicted and, in that case, set hypothesis aside
                        end_symbols = (index_candidates == self.end_index).nonzero(as_tuple=False)
                        for position in end_symbols:

                            # if the sequence has already ended, do not record it as a candidate
                            if sequences[position[0], -1].item() == self.end_index: continue

                            # index of the token in the list tokens_in_batch
                            token_number = torch.div(position[0], self.beam_size, rounding_mode='trunc')
                            seq = sequences[position[0], :]  # hypothesis sequence
                            # hypothesis score, normalized by sequence length (+1 for the end symbol)
                            score = (scores[position[0]] + log_probabilities[position[0], position[1]]) / (len(seq) + 1)

                            final_candidates[token_number].append((seq, score))
                            # TODO: remove token if the number of completed hypotheses exceeds a given value
                            n_completed[token_number] += 1

                            # set the score of the corresponding entry to -inf so it will not be expanded
                            log_probabilities[position[0], position[1]] = -inf

                        # get leading_indices for next expansion
                        # find highest scoring hypothesis among beam_size*beam_size possible ones for each token

                        # take beam_size many copies of scores vector and add scores of possible new extensions
                        # size (beam_size*batch_size, beam_size)
                        hypothesis_scores = torch.cat(self.beam_size * [scores], dim=1) + log_probabilities

                        # reshape to vector of size (batch_size, beam_size*beam_size), each row contains beam_size*beam_size scores of the new possible hypothesis
                        hypothesis_scores_per_token = hypothesis_scores.view(number_tokens, self.beam_size ** 2)

                        # choose beam_size best for each token - size (batch_size, beam_size)
                        best_scores, indices_per_token = hypothesis_scores_per_token.topk(self.beam_size, 1)

                        # out of indices_per_token we now need to recompute the original indices of the hypothesis in a list of length beam_size*batch_size
                        # where the first beam_size indices belong to the first token, the next beam_size to the second token, and so on
                        beam_numbers = []
                        seq_numbers = []

                        for i, row in enumerate(indices_per_token):
                            beam_numbers.extend(i * self.beam_size + index.item() // self.beam_size for index in row)

                            seq_numbers.extend(index.item() % self.beam_size for index in row)

                        # with these indices we can compute the tensors for the next iteration
                        # expand sequences with corresponding index
                        sequences = torch.cat(
                            (sequences[beam_numbers], index_candidates[beam_numbers, seq_numbers].unsqueeze(1)), dim=1)

                        # add log-probabilities to the scores
                        scores = scores[beam_numbers] + log_probabilities[beam_numbers, seq_numbers].unsqueeze(1)

                        # save new leading indices
                        leading_indices = index_candidates[beam_numbers, seq_numbers].unsqueeze(1)

                        # save corresponding hidden states
                        hidden_states_beam = hidden_states_beam[:, beam_numbers, :]

                    # it may happen that no end symbol <E> is predicted for a token in all of the max_length iterations
                    # in that case we append one of the final sequences without an end symbol to final_candidates
                    best_scores, indices = scores.view(number_tokens, -1).topk(1, 1)

                    for j, (score, index) in enumerate(zip(best_scores.squeeze(1), indices.squeeze(1))):
                        if len(final_candidates[j]) == 0:
                            beam = j * self.beam_size + index.item()
                            final_candidates[j].append((sequences[beam, :], score / max_length))

                    # get best final hypothesis for each token
                    output_sequences = []
                    for l in final_candidates:
                        l_ordered = sorted(l, key=lambda tup: tup[1], reverse=True)
                        output_sequences.append(l_ordered[0])

                    # get characters from index sequences and add predicted label to token
                    for i, seq in enumerate(output_sequences):
                        predicted_lemma = ''
                        for idx in seq[0]:
                            predicted_lemma += self.char_dictionary.get_item_for_index(idx)
                        line_to_print += predicted_lemma
                        line_to_print += ' '
                        tokens_in_batch[i].add_tag(tag_type=label_name, tag_value=predicted_lemma)

                if return_loss:
                    overall_loss += self.forward_loss(batch)[0].item()

                store_embeddings(batch, storage_mode=embedding_storage_mode)

            if print_prediction:
                print(line_to_print)

            if return_loss:
                return overall_loss, number_tokens_in_total
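
The hypothesis score in the beam search above is the accumulated log-probability normalized by sequence length (len(seq) + 1, counting the end symbol). A self-contained toy sketch of why such normalization is used, independent of the model:

import math

def length_normalized_score(log_probs):
    # cumulative log-probability divided by sequence length
    return sum(log_probs) / len(log_probs)

short = [math.log(0.5)] * 2
long = [math.log(0.6)] * 4
print(length_normalized_score(short))  # -0.6931: joint probability 0.25
print(length_normalized_score(long))   # -0.5108: joint probability ~0.13, yet preferred
# without normalization, longer hypotheses would be systematically penalized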