Example No. 1
    def finetune(self, Xs, Y=None, batch_size=None):
        # Convert indico-style annotations into finetune's subsequence / label format.
        Xs, Y_new = indico_to_finetune_sequence(Xs,
                                                labels=Y,
                                                multi_label=self.multi_label,
                                                none_value="<PAD>")
        Y = Y_new if Y is not None else None
        return super().finetune(Xs, Y=Y, batch_size=batch_size)
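A minimal illustrative sketch of the two label formats this wrapper converts between; the values are hypothetical and mirror the round-trip test in Example No. 2:

    # indico-style labels: one list of annotation dicts per document
    indico_labels = [[{'start': 7, 'end': 13, 'label': '2', 'text': 'Is the'}]]

    # finetune-style labels: per-document lists of text subsequences, each paired
    # with a tuple of labels; "<PAD>" marks unlabelled spans, and multi_label=True
    # lets one subsequence carry several labels at once
    finetune_x = [["Indico ", "Is the", " best"]]
    finetune_y = [[("<PAD>",), ("2",), ("<PAD>",)]]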
Example No. 2
    def test_three_overlapping_labels(self):
        raw = ["Indico Is the best"]
        finetunex = [["Indico ", "Is the", " best"]]
        finetuney = [[("<PAD>", ), ("1", "2", "3"), ("1", "3")]]
        indicox_pred, indicoy_pred = finetune_to_indico_sequence(
            raw, finetunex, finetuney)
        indicoy = [[{
            'start': 7,
            'end': 13,
            'label': '2',
            'text': 'Is the'
        }, {
            'start': 7,
            'end': 18,
            'label': '1',
            'text': 'Is the best'
        }, {
            'start': 7,
            'end': 18,
            'label': '3',
            'text': 'Is the best'
        }]]
        self.assertEqual(indicoy, indicoy_pred)
        self.assertEqual(raw, indicox_pred)

        finetunex_pred, finetuney_pred = indico_to_finetune_sequence(
            raw, indicoy)
        self.assertEqual(finetunex_pred, finetunex)
        self.assertCountEqual(finetuney[0][0], finetuney_pred[0][0])
        self.assertCountEqual(finetuney[0][1], finetuney_pred[0][1])
        self.assertCountEqual(finetuney[0][2], finetuney_pred[0][2])
Example No. 3
    def finetune(self, X, Y, batch_size=None):
        """
        :param X: A list of text snippets. Format: [batch_size]
        :param Y: A list of lists of annotations. Format: [batch_size, n_annotations], where each annotation is of the form:
            {'start': char_idx, 'end': char_idx, 'label': 'label'}
        :param batch_size: Integer number of examples per batch. When N_GPUS > 1, this number
                           corresponds to the number of training examples provided to each GPU.
        :param val_size: Float fraction or int number that represents the size of the validation set.
        :param val_interval: The interval at which validation is performed, measured in number of steps.
        """
        X, Y = indico_to_finetune_sequence(X, Y, none_value="<PAD>")
        self.target_type = SEQUENCE_LABELING
        return self._finetune(X, Y, batch_size=batch_size)
Example No. 4
    def finetune(self, X, Y=None, batch_size=None):
        """
        :param X: A list of text snippets. Format: [batch_size]
        :param Y: A list of lists of annotations. Format: [batch_size, n_annotations], where each annotation is of the form:
            {'start': 0, 'end': 5, 'label': 'class', 'text': 'sample text'}
        :param batch_size: Integer number of examples per batch. When N_GPUS > 1, this number
                           corresponds to the number of training examples provided to each GPU.
        :param val_size: Float fraction or int number that represents the size of the validation set.
        :param val_interval: The interval at which validation is performed, measured in number of steps.
        """
        # With no labels provided, only the language model is fine-tuned.
        fit_language_model_only = (Y is None)
        X, Y = indico_to_finetune_sequence(X, Y, none_value="<PAD>")
        arr_encoded = self._text_to_ids(X, Y=Y)
        labels = None if fit_language_model_only else arr_encoded.labels
        return self._training_loop(arr_encoded, Y=labels, batch_size=batch_size)
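A hedged usage sketch of the two modes above, assuming a model instance (here simply called `model`) that exposes this finetune method; the texts and annotations are invented for illustration:

    texts = ["Indico is based in Boston."]
    model.finetune(texts)  # Y is None -> only the language model is fit

    annotations = [[{'start': 0, 'end': 6, 'label': 'ORG', 'text': 'Indico'}]]
    model.finetune(texts, annotations)  # supervised sequence labelling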
Example No. 5
    def predict(self, X, max_length=None):
        """
        Produces a list of most likely class labels as determined by the fine-tuned model.

        :param X: A list / array of text, shape [batch]
        :param max_length: the number of tokens to be included in the document representation.
                           Providing more than `max_length` tokens as input will result in truncatindiion.
        :returns: list of class labels.
        """
        doc_subseqs, _ = indico_to_finetune_sequence(X)

        arr_encoded = self._text_to_ids(doc_subseqs)
        labels = self._predict(doc_subseqs, max_length=max_length)
        all_subseqs = []
        all_labels = []
        for text, label_seq, position_seq in zip(X, labels, arr_encoded.char_locs):
            start_of_token = 0
            doc_subseqs = []
            doc_labels = []

            for label, position in zip(label_seq, position_seq):
                if position == -1:
                    # indicates padding / special tokens
                    continue

                # if there is no current subsequence yet
                # or the current subsequence has a different label
                if not doc_subseqs or label != doc_labels[-1]:
                    # start new subsequence
                    doc_subseqs.append(text[start_of_token:position])
                    doc_labels.append(label)
                else:
                    # continue appending to current subsequence
                    doc_subseqs[-1] += text[start_of_token:position]

                start_of_token = position
            all_subseqs.append(doc_subseqs)
            all_labels.append(doc_labels)

        _, doc_annotations = finetune_to_indico_sequence(
            raw_texts=X,
            subseqs=all_subseqs,
            labels=all_labels,
            subtoken_predictions=self.config.subtoken_predictions
        )

        return doc_annotations
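Illustrative sketch of the return value, assuming a fitted model instance called `model`: one list of indico-style annotation dicts per input document, in the same form the finetune methods above accept (values are hypothetical):

    predictions = model.predict(["Indico Is the best"])
    # e.g. [[{'start': 7, 'end': 18, 'label': '1', 'text': 'Is the best'}]]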
Example No. 6
    def predict_proba(self, X, max_length=None):
        """
        Produces a list of most likely class labels as determined by the fine-tuned model.

        :param X: A list / array of text, shape [batch]
        :param max_length: the number of tokens to be included in the document representation.
                           Providing more than `max_length` tokens as input will result in truncatindiion.
        :returns: list of class labels.
        """
        doc_subseqs, _ = indico_to_finetune_sequence(X)
        arr_encoded = self._text_to_ids_with_labels(doc_subseqs)
        batch_probas = self._predict_proba(X, max_length=max_length)
        result = []
        for token_seq, proba_seq in zip(arr_encoded.tokens, batch_probas):
            result.append(list(zip(token_seq, proba_seq)))
        return result
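Illustrative sketch of the output structure, assuming a fitted model instance called `model`: each document yields one (sub-word token, per-class probability vector) pair per token (values are hypothetical):

    probas = model.predict_proba(["Indico Is the best"])
    # e.g. [[('Indico', array([0.1, 0.7, 0.1, 0.1])),
    #        ('Is',     array([0.2, 0.2, 0.5, 0.1])),
    #        ...]]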
Example No. 7
    def predict(self, X, max_length=None):
        """
        Produces a list of most likely class labels as determined by the fine-tuned model.

        :param X: A list / array of text, shape [batch]
        :param max_length: the number of tokens to be included in the document representation.
                           Providing more than `max_length` tokens as input will result in truncatindiion.
        :returns: list of class labels.
        """
        doc_subseqs, _ = indico_to_finetune_sequence(X)

        max_length = max_length or self.config.max_length
        chunk_size = max_length - 2
        step_size = chunk_size // 3
        
        arr_encoded = self._text_to_ids(doc_subseqs)

        labels = []
        batch_probas = []
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")
            for xmb, mmb in self._infer_prep(doc_subseqs, max_length=max_length):
                output = self._eval(self.predict_op,
                    feed_dict={
                        self.X: xmb,
                        self.M: mmb,
                        self.do_dropout: DROPOUT_OFF
                    }
                )
                prediction, probas = output.get(self.predict_op)
                batch_probas.extend(probas)
                formatted_predictions = self.label_encoder.inverse_transform(prediction)
                labels.extend(formatted_predictions)

        all_subseqs = []
        all_labels = []
        all_probs = []

        doc_idx = -1

        for chunk_idx, (label_seq, position_seq, proba_seq) in enumerate(zip(labels, arr_encoded.char_locs, batch_probas)):
            start_of_doc = arr_encoded.token_ids[chunk_idx][0][0] == self.encoder.start
            end_of_doc = (
                chunk_idx + 1 >= len(arr_encoded.char_locs) or 
                arr_encoded.token_ids[chunk_idx + 1][0][0] == self.encoder.start
            )

            """
            Chunk idx for prediction.  Dividers at `step_size` increments.
            [  1  |  1  |  2  |  3  |  3  ]
            """
            if start_of_doc:
                # if this is the first chunk in a document, start accumulating from scratch
                doc_subseqs = []
                doc_labels = []
                doc_probs = []
                doc_idx += 1
                prob_accum = 0
                start_of_token = 0
                if not end_of_doc:
                    # predict only on first two thirds
                    label_seq, position_seq, proba_seq = label_seq[:step_size*2], position_seq[:step_size*2], proba_seq[:step_size*2]
            else:
                if end_of_doc:
                    # predict on the rest of sequence
                    label_seq, position_seq, proba_seq = label_seq[step_size:], position_seq[step_size:], proba_seq[step_size:]
                else:
                    # predict only on middle third
                    label_seq, position_seq, proba_seq = label_seq[step_size:step_size*2], position_seq[step_size: step_size*2], proba_seq[step_size:step_size*2]

            for label, position, proba in zip(label_seq, position_seq, proba_seq):
                if position == -1:
                    # indicates padding / special tokens
                    continue

                # if there is no current subsequence yet
                # or the current subsequence has a different label
                if not doc_subseqs or label != doc_labels[-1]:
                    # start new subsequence
                    doc_subseqs.append(X[doc_idx][start_of_token:position])
                    doc_labels.append(label)
                    doc_probs.append([proba])
                else:
                    # continue appending to current subsequence
                    doc_subseqs[-1] += X[doc_idx][start_of_token:position]
                    doc_probs[-1].append(proba)

                start_of_token = position

            if end_of_doc:
                # last chunk in a document

                prob_dicts = []
                for prob_seq in doc_probs:
                    # format probabilities as dictionary
                    probs = np.mean(np.vstack(prob_seq), axis=0)
                    prob_dicts.append(dict(zip(self.label_encoder.classes_, probs)))
                
                all_subseqs.append(doc_subseqs)
                all_labels.append(doc_labels)
                all_probs.append(prob_dicts)

        _, doc_annotations = finetune_to_indico_sequence(
            raw_texts=X,
            subseqs=all_subseqs,
            labels=all_labels,
            probs=all_probs,
            subtoken_predictions=self.config.subtoken_predictions
        )

        return doc_annotations
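A minimal, dependency-free sketch of the overlap scheme in the method above: long documents are split into chunks of `chunk_size = max_length - 2` tokens with `step_size = chunk_size // 3`, and each chunk contributes only the slice of its predictions indicated below. Assuming chunks advance by `step_size` tokens, as the divider diagram in the code suggests, each token ends up predicted by exactly one chunk:

    def chunk_slice(step_size, start_of_doc, end_of_doc):
        # Which token positions of a chunk contribute to the final prediction.
        if start_of_doc and end_of_doc:
            return slice(None)                   # the document fits in one chunk
        if start_of_doc:
            return slice(None, step_size * 2)    # first chunk: first two thirds
        if end_of_doc:
            return slice(step_size, None)        # last chunk: everything after the first third
        return slice(step_size, step_size * 2)   # interior chunk: middle third only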