def convert_predict_batch(self, args, data, tokenizer, batchify_fn, label_list):
    """
    Converts every raw example in `data` with `convert_example`.

    Args:
        args: Object whose `max_seq_length` attribute is forwarded to
            `convert_example`.
        data: Iterable of raw examples.
        tokenizer: Tokenizer forwarded to `convert_example`.
        batchify_fn: Not used by this method; kept so the signature stays
            compatible with existing callers.
        label_list: Label list forwarded to `convert_example`.

    Returns:
        list: The values returned by `convert_example`, in input order.
    """
    return [
        convert_example(
            sample, label_list, tokenizer, max_seq_length=args.max_seq_length)
        for sample in data
    ]
def predict(self, data, tokenizer):
    """
    Runs the inference predictor on one batch of raw texts.

    When the module-level `args.benchmark` flag is set, `self.autolog.times`
    is started before preprocessing, stamped after preprocessing and after
    inference, and ended after postprocessing.

    Args:
        data (obj:`List(str)`): The batch data whose each element is a raw text.
        tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from
            :class:`~paddlenlp.transformers.PretrainedTokenizer`; it is passed
            to `convert_example` and supplies `pad_token_id` for padding.

    Returns:
        probs: The result of `softmax(logits, axis=1)` on the predictor
            output (presumably one row per input text). Note that the argmax
            indices are computed but not returned.
    """
    benchmark = args.benchmark
    if benchmark:
        self.autolog.times.start()

    # Preprocess: one (input_ids, segment_ids) pair per text.
    encoded = (convert_example(text, tokenizer, is_test=True) for text in data)
    examples = [(ids, segs) for ids, segs in encoded]

    # NOTE(review): segment ids are padded with pad_token_id rather than
    # pad_token_type_id -- confirm this is intended.
    collate = Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),  # segment
    )

    if benchmark:
        self.autolog.times.stamp()

    # Inference: feed both padded arrays and fetch the raw output.
    input_ids, segment_ids = collate(examples)
    ids_handle, seg_handle = self.input_handles[0], self.input_handles[1]
    ids_handle.copy_from_cpu(input_ids)
    seg_handle.copy_from_cpu(segment_ids)
    self.predictor.run()
    logits = self.output_handle.copy_to_cpu()

    if benchmark:
        self.autolog.times.stamp()

    # Postprocess. `idx` is not returned; it is kept so the timed
    # postprocessing work stays the same.
    probs = softmax(logits, axis=1)
    idx = np.argmax(probs, axis=1).tolist()

    if benchmark:
        self.autolog.times.end(stamp=True)

    return probs
def predict(self, data, vocabs):
    """
    Predicts arcs and relations for the given sentences.

    Args:
        data: Iterable of mappings; the "FORM" and "CPOS" entries of each
            item are forwarded to `convert_example`.
        vocabs: A 3-element sequence `(word_vocab, _, rel_vocab)`. It is also
            passed unchanged to `convert_example`.

    Returns:
        tuple: `(arcs, rels)`. `arcs` is a list with one list of strings per
            masked prediction sequence; `rels` holds the result of
            `rel_vocab.to_tokens` for each masked relation sequence.
    """
    word_vocab, _, rel_vocab = vocabs
    pad_id = word_vocab.to_indices("[PAD]")
    bos_id = word_vocab.to_indices("[CLS]")
    eos_id = word_vocab.to_indices("[SEP]")

    examples = [
        convert_example(
            {"FORM": item["FORM"], "CPOS": item["CPOS"]},
            vocabs=vocabs,
            mode="test",
        )
        for item in data
    ]

    step = args.batch_size
    arcs, rels = [], []
    for start in range(0, len(examples), step):
        batch = examples[start:start + step]

        words = batchify_fn(batch)[0]
        words, position = flat_words(words, pad_id)
        self.input_handles[0].copy_from_cpu(words)
        self.input_handles[1].copy_from_cpu(position)
        self.predictor.run()

        s_arc = self.output_handle[0].copy_to_cpu()
        s_rel = self.output_handle[1].copy_to_cpu()
        # The mask is built from the words returned by the predictor
        # (third output), not from the array that was fed in.
        words = self.output_handle[2].copy_to_cpu()

        # Keep only positions that are none of [PAD], [CLS], [SEP].
        not_pad = words != pad_id
        not_bos = words != bos_id
        not_eos = words != eos_id
        mask = np.logical_and(np.logical_and(not_pad, not_bos), not_eos)

        arc_preds, rel_preds = decode(s_arc, s_rel, mask, args.tree)
        arcs.extend(arc_pred[m] for arc_pred, m in zip(arc_preds, mask))
        rels.extend(rel_pred[m] for rel_pred, m in zip(rel_preds, mask))

    arcs = [list(map(str, seq)) for seq in arcs]
    rels = [rel_vocab.to_tokens(seq) for seq in rels]
    return arcs, rels
def predict(self, data, tokenizer, batch_size=1, threshold=0.5):
    """
    Predicts thresholded labels for the given texts.

    Args:
        data: Iterable of texts. Each element is wrapped as `{"text": text}`
            before being passed to `convert_example`.
        tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from
            :class:`~paddlenlp.transformers.PretrainedTokenizer`; it is passed
            to `convert_example` and supplies `pad_token_id` for padding.
        batch_size(obj:`int`, defaults to 1): The number of examples per batch.
        threshold(obj:`float`, defaults to 0.5): A sigmoid probability
            strictly greater than this value is mapped to 1, otherwise 0.

    Returns:
        results(obj:`list`): The rows of the thresholded integer arrays of
            all batches, in input order.
    """
    encoded = (
        convert_example(
            {"text": text},
            tokenizer,
            max_seq_length=self.max_seq_length,
            is_test=True)
        for text in data
    )
    examples = [(ids, segs) for ids, segs in encoded]

    # NOTE(review): segment ids are padded with pad_token_id rather than
    # pad_token_type_id -- confirm this is intended.
    collate = Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
    )

    results = []
    # Walk over the examples in consecutive slices of `batch_size`.
    for start in range(0, len(examples), batch_size):
        input_ids, segment_ids = collate(examples[start:start + batch_size])
        self.input_handles[0].copy_from_cpu(input_ids)
        self.input_handles[1].copy_from_cpu(segment_ids)
        self.predictor.run()

        logits = paddle.to_tensor(self.output_handle.copy_to_cpu())
        probs = F.sigmoid(logits).numpy()
        results.extend((probs > threshold).astype(int))
    return results
def predict(self, data, tokenizer, batch_size=100, save_path='corpus_embedding'):
    """
    Runs the predictor over all texts and saves the stacked outputs to disk.

    The texts are converted one by one, grouped into batches of `batch_size`
    examples (the last batch may be smaller), and the predictor output of
    every batch is concatenated along axis 0 and written with `np.save`.

    Args:
        data: Iterable of raw texts; it is wrapped in `tqdm` for progress
            reporting.
        tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from
            :class:`~paddlenlp.transformers.PretrainedTokenizer`; it is passed
            to `convert_example` and supplies `pad_token_id` for padding.
        batch_size(obj:`int`, defaults to 100): Number of examples fed to the
            predictor per run.
        save_path(obj:`str`, defaults to 'corpus_embedding'): Target passed to
            `np.save` (NumPy appends ".npy" when the name lacks it).

    Returns:
        None. The result is only written to `save_path`.

    Raises:
        ValueError: If `batch_size` is smaller than 1, or if `data` yields no
            texts (there is nothing to concatenate and save).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer, got %r" % (batch_size,))

    # NOTE(review): segment ids are padded with pad_token_id rather than
    # pad_token_type_id -- confirm this is intended.
    collate = Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
    )

    def _run_batch(batch):
        """Pads one batch, runs the predictor and returns its output array."""
        input_ids, segment_ids = collate(batch)
        self.input_handles[0].copy_from_cpu(input_ids)
        self.input_handles[1].copy_from_cpu(segment_ids)
        self.predictor.run()
        return self.output_handle.copy_to_cpu()

    all_embeddings = []
    pending = []
    for text in tqdm(data):
        input_ids, segment_ids = convert_example(
            text,
            tokenizer,
            max_seq_length=self.max_seq_length,
            pad_to_max_seq_len=True)
        pending.append((input_ids, segment_ids))
        if len(pending) >= batch_size:
            all_embeddings.append(_run_batch(pending))
            pending = []

    # Flush the final, possibly smaller, batch.
    if pending:
        all_embeddings.append(_run_batch(pending))

    if not all_embeddings:
        # np.concatenate would fail on an empty list with a less helpful message.
        raise ValueError("data is empty: no embeddings to save")

    np.save(save_path, np.concatenate(all_embeddings, axis=0))
def do_predict(data, model, tokenizer, viterbi_decoder, tags_to_idx, idx_to_tags, batch_size=1, summary_num=2):
    """
    Tags every text in `data` with `model` and decodes the predictions.

    Args:
        data: Sequence of texts. Each text is split with `list(text)` and
            wrapped as `{"tokens": [...]}` for `convert_example`.
        model: Callable model; it is switched to eval mode and invoked as
            `model(input_ids, token_type_ids, lengths=seq_len)`.
        tokenizer: Passed to `convert_example`; supplies `pad_token_id` and
            `pad_token_type_id` for padding.
        viterbi_decoder: Not used by this function; kept for interface
            compatibility.
        tags_to_idx: Not used by this function; kept for interface
            compatibility.
        idx_to_tags: Forwarded to `decode`.
        batch_size (int, defaults to 1): Number of examples per model call.
        summary_num (int, defaults to 2): Forwarded to `decode`.

    Returns:
        The value returned by `decode(data, all_pred_tags, summary_num,
        idx_to_tags)`.
    """
    encoded = (
        convert_example(
            {"tokens": list(text)}, tokenizer, args.max_seq_len, is_test=True)
        for text in data
    )
    examples = [(ids, type_ids, length) for ids, type_ids, length in encoded]

    batches = [
        examples[start:start + batch_size]
        for start in range(0, len(examples), batch_size)
    ]

    collate = Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'),  # input_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int64'),  # token_type_ids
        Stack(dtype='int64'),  # seq_len
    )

    all_pred_tags = []
    model.eval()
    for batch in batches:
        # Convert the three collated arrays to tensors in their original order.
        input_ids, token_type_ids, seq_len = map(paddle.to_tensor, collate(batch))
        pred_tags = model(input_ids, token_type_ids, lengths=seq_len)
        all_pred_tags.extend(pred_tags.numpy().tolist())

    return decode(data, all_pred_tags, summary_num, idx_to_tags)
def do_predict(data, model, tokenizer, batch_size=1, max_cls_len=5, summary_num=2):
    """
    Predicts a label and its category for every text in `data`.

    For each text the top-1 token ids at the label positions are decoded into
    a label. If that label is not a key of the name/category dictionary, the
    candidates produced by `search` are tried in descending order of their
    second element; if none of them is in the dictionary either, the first
    result of `bk_tree.search_similar_word(label)` is used instead.

    Args:
        data: Sequence of texts. Each text is wrapped as `{"text": text}` for
            `convert_example`.
        model: Callable model; it is switched to eval mode and invoked as
            `model(input_ids, token_type_ids)`.
        tokenizer: Passed to `convert_example` and `construct_dict_map`.
        batch_size (int, defaults to 1): Number of examples per model call.
        max_cls_len (int, defaults to 5): Not used by this function; kept for
            interface compatibility.
        summary_num (int, defaults to 2): Not used by this function; kept for
            interface compatibility.

    Returns:
        list: One dict per text with the keys 'text', 'label' and 'category'.
    """
    examples = []
    for text in data:
        input_ids, token_type_ids, label_indices = convert_example(
            {"text": text}, tokenizer, max_seq_len=args.max_seq_len, is_test=True)
        examples.append((input_ids, token_type_ids, label_indices))

    batches = [
        examples[start:start + batch_size]
        for start in range(0, len(examples), batch_size)
    ]

    batchify_fn = Tuple(
        Stack(dtype='int64'),  # input_ids
        Stack(dtype='int64'),  # token_type_ids
        Stack(dtype='int64'),  # label_indices
    )

    name_dict, bk_tree, id_vocabs, vocab_ids = construct_dict_map(
        tokenizer, os.path.join(args.data_dir, "name_category_map.json"))

    all_scores_can = []
    all_preds_can = []
    pred_ids = []

    model.eval()
    for batch in batches:
        input_ids, token_type_ids, label_indices = batchify_fn(batch)
        input_ids = paddle.to_tensor(input_ids)
        token_type_ids = paddle.to_tensor(token_type_ids)
        logits = model(input_ids, token_type_ids).numpy()
        for positions, sample_logits in zip(label_indices, logits):
            # Rows from the first to the last label position (inclusive),
            # restricted to the columns listed in vocab_ids.
            score = sample_logits[positions[0]:positions[-1] + 1, vocab_ids]
            # Find topk candidates of scores and predicted indices.
            score_can, pred_id_can = find_topk(score, k=4, axis=-1)
            all_scores_can.append(score_can.tolist())
            all_preds_can.append(pred_id_can.tolist())
            # Column 0 is used as the top-1 prediction for each position.
            pred_ids.append(pred_id_can[:, 0].tolist())

    results = []
    for i, text in enumerate(data):
        label = decode(pred_ids[i], id_vocabs)
        result = {
            'text': text,
            'label': label,
        }

        if label not in name_dict:
            labels_can = search(all_scores_can[i], all_preds_can[i], 0, [], 0)
            # Highest value of element [1] first (presumably the score).
            labels_can.sort(key=lambda candidate: -candidate[1])
            for candidate in labels_can:
                cls_label_can = decode(candidate[0], id_vocabs)
                if cls_label_can in name_dict:
                    result['label'] = cls_label_can
                    break
            else:
                # No candidate is a known name (or there were none): fall
                # back once to the most similar known word.
                similar_words = bk_tree.search_similar_word(label)
                result['label'] = similar_words[0][0]

        result['category'] = name_dict[result['label']]
        results.append(result)
    return results