Example #1
    def convert_predict_batch(self, args, data, tokenizer, batchify_fn,
                              label_list):
        examples = []
        for example in data:
            example = convert_example(example,
                                      label_list,
                                      tokenizer,
                                      max_seq_length=args.max_seq_length)
            examples.append(example)

        return examples
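
A minimal driver sketch for this helper. It assumes `predictor` is an instance of the surrounding deploy script's predictor class and that `args`, `tokenizer`, and `label_list` are the objects that script already builds; the input record format is illustrative:

    # Hypothetical inputs; adjust the field names to what convert_example expects.
    data = [{"sentence": "This movie was great."}]
    examples = predictor.convert_predict_batch(
        args, data, tokenizer, batchify_fn=None,
        label_list=["negative", "positive"])
    # batchify_fn is accepted for signature compatibility but unused in the
    # body above, so the caller applies it (or passes None) separately.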
Example #2
    def predict(self, data, tokenizer):
        """
        Predicts the data labels.

        Args:
            data (obj:`List(str)`): The batch data whose each element is a raw text.
            tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer` 
                which contains most of the methods. Users should refer to the superclass for more information regarding methods.

        Returns:
            results(obj:`dict`): All the predictions labels.
        """
        if args.benchmark:
            self.autolog.times.start()

        examples = []
        for text in data:
            input_ids, segment_ids = convert_example(text,
                                                     tokenizer,
                                                     is_test=True)
            examples.append((input_ids, segment_ids))

        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"
                ),  # input
            Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"
                ),  # segment
        ): fn(samples)

        if args.benchmark:
            self.autolog.times.stamp()

        input_ids, segment_ids = batchify_fn(examples)
        self.input_handles[0].copy_from_cpu(input_ids)
        self.input_handles[1].copy_from_cpu(segment_ids)
        self.predictor.run()
        logits = self.output_handle.copy_to_cpu()
        if args.benchmark:
            self.autolog.times.stamp()

        probs = softmax(logits, axis=1)
        # Label indices derived from the probabilities; kept for reference,
        # though this method returns the probabilities themselves.
        idx = np.argmax(probs, axis=1)
        idx = idx.tolist()

        if args.benchmark:
            self.autolog.times.end(stamp=True)

        return probs
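
A hedged usage sketch: `predictor` stands for an initialized inference object from the deploy script, and the ERNIE tokenizer is an illustrative choice, not something the snippet prescribes:

    import numpy as np
    from paddlenlp.transformers import ErnieTokenizer

    tokenizer = ErnieTokenizer.from_pretrained("ernie-1.0")  # illustrative model name
    texts = ["This product works well.", "Terrible battery life."]
    probs = predictor.predict(texts, tokenizer)  # (len(texts), num_classes) probabilities
    labels = np.argmax(probs, axis=1)  # recover label indices from the probabilities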
Example #3
    def predict(self, data, vocabs):
        word_vocab, _, rel_vocab = vocabs
        word_pad_index = word_vocab.to_indices("[PAD]")
        word_bos_index = word_vocab.to_indices("[CLS]")
        word_eos_index = word_vocab.to_indices("[SEP]")
        examples = []
        for text in data:
            example = {
                "FORM": text["FORM"],
                "CPOS": text["CPOS"],
            }
            example = convert_example(
                example,
                vocabs=vocabs,
                mode="test",
            )
            examples.append(example)

        batches = [
            examples[idx:idx + args.batch_size]
            for idx in range(0, len(examples), args.batch_size)
        ]

        arcs, rels = [], []
        for batch in batches:
            words = batchify_fn(batch)[0]
            words, position = flat_words(words, word_pad_index)
            self.input_handles[0].copy_from_cpu(words)
            self.input_handles[1].copy_from_cpu(position)
            self.predictor.run()
            s_arc = self.output_handle[0].copy_to_cpu()
            s_rel = self.output_handle[1].copy_to_cpu()
            words = self.output_handle[2].copy_to_cpu()

            mask = np.logical_and(
                np.logical_and(words != word_pad_index,
                               words != word_bos_index),
                words != word_eos_index,
            )

            arc_preds, rel_preds = decode(s_arc, s_rel, mask, args.tree)

            arcs.extend([arc_pred[m] for arc_pred, m in zip(arc_preds, mask)])
            rels.extend([rel_pred[m] for rel_pred, m in zip(rel_preds, mask)])

        arcs = [[str(s) for s in seq] for seq in arcs]
        rels = [rel_vocab.to_tokens(seq) for seq in rels]
        return arcs, rels
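
A sketch of how this parser method might be invoked, assuming `predictor` and the `(word_vocab, feat_vocab, rel_vocab)` triple come from the dependency-parsing deploy script; the sample sentence and tag set are illustrative:

    # One pre-tokenized sentence with coarse POS tags (illustrative format).
    data = [{"FORM": ["He", "likes", "apples"], "CPOS": ["PRP", "VBZ", "NNS"]}]
    arcs, rels = predictor.predict(data, vocabs)
    # arcs[i][j] is the head index of token j in sentence i,
    # and rels[i][j] is the corresponding relation label.
    print(list(zip(data[0]["FORM"], arcs[0], rels[0])))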
Example #4
    def predict(self, data, tokenizer, batch_size=1, threshold=0.5):
        """
        Predicts the data labels.

        Args:
            data (obj:`List(Example)`): The processed data whose each element is a Example (numedtuple) object.
                A Example object contains `text`(word_ids) and `se_len`(sequence length).
            tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer` 
                which contains most of the methods. Users should refer to the superclass for more information regarding methods.
            batch_size(obj:`int`, defaults to 1): The number of batch.
            threshold(obj:`int`, defaults to 0.5): The threshold for converting probabilities to labels.

        Returns:
            results(obj:`dict`): All the predictions labels.
        """
        examples = []
        for text in data:
            example = {"text": text}
            input_ids, segment_ids = convert_example(
                example,
                tokenizer,
                max_seq_length=self.max_seq_length,
                is_test=True)
            examples.append((input_ids, segment_ids))

        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
        ): fn(samples)

        # Separate the data into batches.
        batches = [
            examples[idx:idx + batch_size]
            for idx in range(0, len(examples), batch_size)
        ]

        results = []
        for batch in batches:
            input_ids, segment_ids = batchify_fn(batch)
            self.input_handles[0].copy_from_cpu(input_ids)
            self.input_handles[1].copy_from_cpu(segment_ids)
            self.predictor.run()
            logits = paddle.to_tensor(self.output_handle.copy_to_cpu())
            probs = F.sigmoid(logits)
            preds = (probs.numpy() > threshold).astype(int)
            results.extend(preds)
        return results
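
A short usage sketch for this multi-label variant; `predictor` and the tokenizer choice are assumptions, and the threshold simply binarizes the per-label sigmoid probabilities:

    from paddlenlp.transformers import ErnieTokenizer

    tokenizer = ErnieTokenizer.from_pretrained("ernie-1.0")  # illustrative model name
    texts = ["A query that may trigger several labels at once."]
    preds = predictor.predict(texts, tokenizer, batch_size=8, threshold=0.5)
    # preds[i] is a 0/1 vector; the nonzero positions are the predicted labels.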
Example #5
    def predict(self, data, tokenizer):
        """
        Predicts the data labels.

        Args:
            data (obj:`List(str)`): The batch data whose each element is a raw text.
            tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer` 
                which contains most of the methods. Users should refer to the superclass for more information regarding methods.

        Returns:
            results(obj:`dict`): All the predictions labels.
        """

        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
        ): fn(samples)

        all_embeddings = []
        examples = []
        for text in tqdm(data):
            input_ids, segment_ids = convert_example(
                text,
                tokenizer,
                max_seq_length=self.max_seq_length,
                pad_to_max_seq_len=True)
            examples.append((input_ids, segment_ids))
            if len(examples) >= 100:  # run inference once 100 examples accumulate
                input_ids, segment_ids = batchify_fn(examples)
                self.input_handles[0].copy_from_cpu(input_ids)
                self.input_handles[1].copy_from_cpu(segment_ids)
                self.predictor.run()
                logits = self.output_handle.copy_to_cpu()
                all_embeddings.append(logits)
                examples = []

        if examples:  # flush the remaining partial batch
            input_ids, segment_ids = batchify_fn(examples)
            self.input_handles[0].copy_from_cpu(input_ids)
            self.input_handles[1].copy_from_cpu(segment_ids)
            self.predictor.run()
            logits = self.output_handle.copy_to_cpu()
            all_embeddings.append(logits)

        all_embeddings = np.concatenate(all_embeddings, axis=0)
        np.save('corpus_embedding', all_embeddings)
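
Because this method returns nothing and persists its result, a caller reloads the saved array afterwards; a minimal sketch, assuming `predictor`, `corpus_texts`, and `tokenizer` are already set up:

    import numpy as np

    predictor.predict(corpus_texts, tokenizer)  # writes corpus_embedding.npy
    embeddings = np.load("corpus_embedding.npy")
    print(embeddings.shape)  # (num_texts, embedding_dim)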
Example #6
def do_predict(data,
               model,
               tokenizer,
               viterbi_decoder,
               tags_to_idx,
               idx_to_tags,
               batch_size=1,
               summary_num=2):

    examples = []
    for text in data:
        example = {"tokens": list(text)}
        input_ids, token_type_ids, seq_len = convert_example(example,
                                                             tokenizer,
                                                             args.max_seq_len,
                                                             is_test=True)

        examples.append((input_ids, token_type_ids, seq_len))

    batches = [
        examples[idx:idx + batch_size]
        for idx in range(0, len(examples), batch_size)
    ]

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'
            ),  # input_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int64'
            ),  # token_type_ids
        Stack(dtype='int64'),  # seq_len
    ): fn(samples)

    all_pred_tags = []

    model.eval()
    for batch in batches:
        input_ids, token_type_ids, seq_len = batchify_fn(batch)
        input_ids = paddle.to_tensor(input_ids)
        token_type_ids = paddle.to_tensor(token_type_ids)
        seq_len = paddle.to_tensor(seq_len)
        pred_tags = model(input_ids, token_type_ids, lengths=seq_len)
        all_pred_tags.extend(pred_tags.numpy().tolist())
    results = decode(data, all_pred_tags, summary_num, idx_to_tags)
    return results
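
A hedged invocation sketch; `model`, `tokenizer`, `viterbi_decoder`, and the two tag maps are assumed to come from the surrounding script's setup code:

    texts = ["An illustrative input sentence."]
    results = do_predict(texts, model, tokenizer, viterbi_decoder,
                         tags_to_idx, idx_to_tags, batch_size=8)
    # Each entry in results pairs the input text with its decoded tag sequence.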
Example #7
def do_predict(data,
               model,
               tokenizer,
               batch_size=1,
               max_cls_len=5,
               summary_num=2):
    examples = []
    for text in data:
        example = {"text": text}
        input_ids, token_type_ids, label_indices = convert_example(
            example, tokenizer, max_seq_len=args.max_seq_len, is_test=True)
        examples.append((input_ids, token_type_ids, label_indices))

    batches = [
        examples[idx:idx + batch_size]
        for idx in range(0, len(examples), batch_size)
    ]

    batchify_fn = lambda samples, fn=Tuple(
        Stack(dtype='int64'),  # input_ids
        Stack(dtype='int64'),  # token_type_ids
        Stack(dtype='int64'),  # label_indices
    ): fn(samples)

    name_dict, bk_tree, id_vocabs, vocab_ids = construct_dict_map(
        tokenizer, os.path.join(args.data_dir, "name_category_map.json"))

    all_scores_can = []
    all_preds_can = []
    pred_ids = []

    model.eval()
    for batch in batches:
        input_ids, token_type_ids, label_indices = batchify_fn(batch)

        input_ids = paddle.to_tensor(input_ids)
        token_type_ids = paddle.to_tensor(token_type_ids)
        logits = model(input_ids, token_type_ids).numpy()
        for i, l in zip(label_indices, logits):
            score = l[i[0]:i[-1] + 1, vocab_ids]
            # Find topk candidates of scores and predicted indices.
            score_can, pred_id_can = find_topk(score, k=4, axis=-1)

            all_scores_can.extend([score_can.tolist()])
            all_preds_can.extend([pred_id_can.tolist()])
            pred_ids.extend([pred_id_can[:, 0].tolist()])

    results = []
    for i, d in enumerate(data):
        label = decode(pred_ids[i], id_vocabs)

        result = {
            'text': d,
            'label': label,
        }

        if label not in name_dict:
            scores_can = all_scores_can[i]
            pred_ids_can = all_preds_can[i]
            labels_can = search(scores_can, pred_ids_can, 0, [], 0)
            labels_can.sort(key=lambda d: -d[1])
            for labels in labels_can:
                cls_label_can = decode(labels[0], id_vocabs)
                if cls_label_can in name_dict:
                    result['label'] = cls_label_can
                    break
            else:
                # No candidate matched a known name; fall back to the most
                # similar word found in the BK-tree.
                labels_can = bk_tree.search_similar_word(label)
                result['label'] = labels_can[0][0]

        result['category'] = name_dict[result['label']]
        results.append(result)
    return results
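
A final usage sketch, assuming the model, tokenizer, and the name_category_map.json file under args.data_dir are already in place; the noun phrases are illustrative:

    data = ["apple", "rose"]  # noun phrases to classify
    results = do_predict(data, model, tokenizer, batch_size=2)
    for r in results:
        print(r["text"], r["label"], r["category"])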