def main():
    args = parse_args()
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    tokenizer = tokenizer_class.from_pretrained(
        os.path.dirname(args.model_name_or_path))

    if args.predict_file:
        dataset = load_dataset('squad', data_files=args.predict_file)
    elif args.version_2_with_negative:
        dataset = load_dataset('squad', splits='dev_v2')
    else:
        dataset = load_dataset('squad', splits='dev_v1')

    dataset.map(partial(prepare_validation_features,
                        tokenizer=tokenizer,
                        args=args),
                batched=True)

    batchify_fn = lambda samples, fn=Dict({
        "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
        "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id)
    }): fn(samples)

    predictor = Predictor.create_predictor(args)
    predictor.predict(dataset, args=args, collate_fn=batchify_fn)

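# A minimal, self-contained sketch (toy data, illustrative values only) of the
# `lambda samples, fn=Dict(...): fn(samples)` collation idiom used above: `Dict`
# routes each key of every sample dict to its own batchifier, and binding `fn`
# as a default argument freezes the batchifier when the lambda is defined, so
# the lambda itself only forwards `samples`.
from paddlenlp.data import Dict, Pad

toy_batchify = lambda samples, fn=Dict({
    "input_ids": Pad(axis=0, pad_val=0),
    "token_type_ids": Pad(axis=0, pad_val=0),
}): fn(samples)

toy_samples = [
    {"input_ids": [1, 2, 3], "token_type_ids": [0, 0, 0]},
    {"input_ids": [4, 5], "token_type_ids": [0, 0]},
]
# Both arrays come back padded to the longest sequence in the batch (length 3).
input_ids, token_type_ids = toy_batchify(toy_samples)
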
def reader():
    # Create the tokenizer and dataset
    tokenizer = BertTokenizer.from_pretrained(args.model_dir)
    train_ds = load_dataset('glue', args.task, splits="train")

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         label_list=train_ds.label_list,
                         max_seq_length=128,
                         is_test=True)
    train_ds = train_ds.map(trans_func, lazy=True)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),       # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # token_type
    ): fn(samples)
    train_batch_sampler = paddle.io.BatchSampler(train_ds,
                                                 batch_size=32,
                                                 shuffle=True)

    [input_ids, token_type_ids, labels] = create_data_holder(args.task)
    train_data_loader = DataLoader(dataset=train_ds,
                                   feed_list=[input_ids, token_type_ids],
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   return_list=False)

    dev_trans_func = partial(convert_example,
                             tokenizer=tokenizer,
                             label_list=train_ds.label_list,
                             max_seq_length=128)
    dev_batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),       # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # token_type
        Stack(dtype="int64" if train_ds.label_list else "float32")  # label
    ): fn(samples)
    dev_ds = load_dataset('glue', args.task, splits='dev')
    dev_ds = dev_ds.map(dev_trans_func, lazy=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                               batch_size=32,
                                               shuffle=False)
    dev_data_loader = DataLoader(dataset=dev_ds,
                                 batch_sampler=dev_batch_sampler,
                                 collate_fn=dev_batchify_fn,
                                 num_workers=0,
                                 feed_list=[input_ids, token_type_ids, labels],
                                 return_list=False)
    return train_data_loader, dev_data_loader

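# Companion sketch to the `Dict` example above (toy data only): `Tuple` applies
# one batchifier per positional field of each sample, so it fits datasets whose
# examples are tuples rather than dicts.
from paddlenlp.data import Pad, Stack, Tuple

toy_batchify = lambda samples, fn=Tuple(
    Pad(axis=0, pad_val=0),  # input_ids, padded to the longest sequence
    Stack(dtype="int64"),    # labels, stacked into one array
): fn(samples)

input_ids, labels = toy_batchify([([1, 2, 3], 0), ([4, 5], 1)])
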
def create_infer_loader(args):
    if args.test_file is not None:
        dataset = load_dataset('wmt14ende',
                               data_files=[args.test_file],
                               splits=['test'])
    else:
        dataset = load_dataset('wmt14ende', splits='test')

    if args.vocab_file is not None:
        src_vocab = Vocab.load_vocabulary(filepath=args.vocab_file,
                                          unk_token=args.unk_token,
                                          bos_token=args.bos_token,
                                          eos_token=args.eos_token)
    elif not args.benchmark:
        src_vocab = Vocab.load_vocabulary(**dataset.vocab_info["bpe"])
    else:
        src_vocab = Vocab.load_vocabulary(**dataset.vocab_info["benchmark"])
    trg_vocab = src_vocab

    padding_vocab = (
        lambda x: (x + args.pad_factor - 1) // args.pad_factor * args.pad_factor)
    args.src_vocab_size = padding_vocab(len(src_vocab))
    args.trg_vocab_size = padding_vocab(len(trg_vocab))

    def convert_samples(sample):
        source = sample[args.src_lang].split()
        target = sample[args.trg_lang].split()
        source = src_vocab.to_indices(source)
        target = trg_vocab.to_indices(target)
        return source, target

    dataset = dataset.map(convert_samples, lazy=False)

    batch_sampler = SamplerHelper(dataset).batch(
        batch_size=args.infer_batch_size, drop_last=False)
    data_loader = DataLoader(dataset=dataset,
                             batch_sampler=batch_sampler,
                             collate_fn=partial(prepare_infer_input,
                                                bos_idx=args.bos_idx,
                                                eos_idx=args.eos_idx,
                                                pad_idx=args.bos_idx,
                                                pad_seq=args.pad_seq,
                                                dtype=args.input_dtype),
                             num_workers=args.num_workers,
                             return_list=True)
    return data_loader, trg_vocab.to_tokens

def create_infer_loader(args):
    batch_size = args.batch_size
    max_len = args.max_len
    test_ds = load_dataset('iwslt15', splits='test')
    src_vocab = Vocab.load_vocabulary(**test_ds.vocab_info['en'])
    tgt_vocab = Vocab.load_vocabulary(**test_ds.vocab_info['vi'])
    bos_id = src_vocab[src_vocab.bos_token]
    eos_id = src_vocab[src_vocab.eos_token]
    pad_id = eos_id

    def convert_example(example):
        source = example['en'].split()
        target = example['vi'].split()
        source = src_vocab.to_indices(source)
        target = tgt_vocab.to_indices(target)
        return source, target

    test_ds.map(convert_example)
    test_batch_sampler = SamplerHelper(test_ds).batch(batch_size=batch_size)
    test_loader = paddle.io.DataLoader(test_ds,
                                       batch_sampler=test_batch_sampler,
                                       collate_fn=partial(prepare_infer_input,
                                                          bos_id=bos_id,
                                                          eos_id=eos_id,
                                                          pad_id=pad_id))
    return test_loader, len(src_vocab), len(tgt_vocab), bos_id, eos_id

def create_data_loader(batch_size, num_steps, data_path):
    train_ds, valid_ds, test_ds = load_dataset('ptb',
                                               splits=('train', 'valid',
                                                       'test'))

    train_examples = [
        train_ds[i]['sentence'].split() for i in range(len(train_ds))
    ]
    vocab = Vocab.build_vocab(train_examples, eos_token='</eos>')

    # Because the sentences in the PTB dataset might be consecutive, we
    # concatenate all texts from the dataset and fold them into chunks so that
    # the number of rows equals the batch size. For example:
    #
    # Sentence1: we're talking about years ago before anyone heard of asbestos
    #            having any questionable properties.
    # Sentence2: there is no asbestos in our products now.
    # Batch_size: 5
    # Grouped_text: [["we're", "talking", "about", "years"],
    #                ["ago", "before", "anyone", "heard"],
    #                ["of", "asbestos", "having", "any"],
    #                ["questionable", "properties", "there", "is"],
    #                ["no", "asbestos", "in", "our"]]
    def group_texts(examples):
        concat_examples = []
        for example in examples:
            concat_examples += example['sentence'].split() + ['</eos>']

        concat_examples = vocab.to_indices(concat_examples)

        max_seq_len = len(concat_examples) // batch_size
        reshaped_examples = np.asarray(
            concat_examples[0:batch_size * max_seq_len],
            dtype='int64').reshape((batch_size, max_seq_len))

        encoded_examples = []
        for i in range(max_seq_len // num_steps):
            encoded_examples.append(
                (np.copy(reshaped_examples[:, i * num_steps:(i + 1) *
                                           num_steps]),
                 np.copy(reshaped_examples[:, i * num_steps + 1:(i + 1) *
                                           num_steps + 1])))

        return encoded_examples

    train_ds.map(group_texts, batched=True)
    valid_ds.map(group_texts, batched=True)
    test_ds.map(group_texts, batched=True)

    train_loader = paddle.io.DataLoader(train_ds,
                                        return_list=True,
                                        batch_size=None)
    valid_loader = paddle.io.DataLoader(valid_ds,
                                        return_list=True,
                                        batch_size=None)
    test_loader = paddle.io.DataLoader(test_ds,
                                       return_list=True,
                                       batch_size=None)
    return train_loader, valid_loader, test_loader, len(vocab)

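# A minimal usage sketch for the PTB loader above; batch_size and num_steps are
# illustrative, and data_path is unused by the function (the dataset is loaded
# by name). Every batch is an (inputs, labels) pair of shape
# [batch_size, num_steps], with labels shifted one token right of inputs.
train_loader, valid_loader, test_loader, vocab_size = create_data_loader(
    batch_size=20, num_steps=35, data_path=None)
for inputs, labels in train_loader:
    assert inputs.shape == labels.shape
    break
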
def create_infer_loader(args):
    dataset = load_dataset('wmt14ende', splits='test')
    src_vocab = Vocab.load_vocabulary(**dataset.vocab_info["bpe"])
    trg_vocab = src_vocab

    padding_vocab = (
        lambda x: (x + args.pad_factor - 1) // args.pad_factor * args.pad_factor)
    args.src_vocab_size = padding_vocab(len(src_vocab))
    args.trg_vocab_size = padding_vocab(len(trg_vocab))

    def convert_samples(sample):
        source = sample[args.src_lang].split()
        target = sample[args.trg_lang].split()
        source = src_vocab.to_indices(source)
        target = trg_vocab.to_indices(target)
        return source, target

    dataset = dataset.map(convert_samples, lazy=False)

    batch_sampler = SamplerHelper(dataset).batch(
        batch_size=args.infer_batch_size, drop_last=False)
    data_loader = DataLoader(dataset=dataset,
                             batch_sampler=batch_sampler,
                             collate_fn=partial(prepare_infer_input,
                                                bos_idx=args.bos_idx,
                                                eos_idx=args.eos_idx,
                                                pad_idx=args.bos_idx),
                             num_workers=0,
                             return_list=True)
    return data_loader, trg_vocab.to_tokens

def do_client(idx, args):
    dataset = load_dataset('wmt14ende', splits='test')

    headers = {"Content-type": "application/json"}
    url = "http://127.0.0.1:9292/transformer/prediction"
    batch = []
    sample = 0
    f = open(args.output_file, "w")
    if args.profile:
        recorder = Recorder(args.infer_batch_size, args.model_name)
        recorder.tic()
    for sequence in dataset:
        sample += 1
        batch.append(sequence[args.src_lang])
        if len(batch) < args.infer_batch_size and sample != len(dataset):
            continue
        data = {"feed": [{"src_word": batch}], "fetch": ["finished_sequence"]}
        r = requests.post(url=url, headers=headers, data=json.dumps(data))
        if r is not None:
            print("Status: ", r)
            if args.profile:
                recorder.toc(samples=len(batch))
            else:
                for seq in r.json()["result"]["finished_sequence"]:
                    f.write(seq[0] + "\n")
        batch = []
        if args.profile:
            recorder.tic()
    f.close()
    if args.profile:
        recorder.report()
        return [[recorder.infer_time]]

def main():
    args = parse_args()
    predictor = Predictor.create_predictor(args)

    args.task_name = args.task_name.lower()
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    test_ds = load_dataset('glue', args.task_name, splits="test")
    tokenizer = tokenizer_class.from_pretrained(
        os.path.dirname(args.model_path))

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         label_list=test_ds.label_list,
                         max_seq_length=args.max_seq_length,
                         is_test=True)
    test_ds = test_ds.map(trans_func)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),       # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"),  # segment
    ): fn(samples)

    predictor.predict(test_ds,
                      batch_size=args.batch_size,
                      collate_fn=batchify_fn)

def __init__(self, args={}):
    super(TransformerReader, self).__init__()
    dataset = load_dataset('wmt14ende', splits='test')
    if not args.benchmark:
        self.vocab = Vocab.load_vocabulary(**dataset.vocab_info["bpe"])
    else:
        self.vocab = Vocab.load_vocabulary(**dataset.vocab_info["benchmark"])
    self.src_vocab = self.trg_vocab = self.vocab

    def convert_samples(samples):
        source = []
        for sample in samples:
            src = sample.split()
            source.append(self.src_vocab.to_indices(src))
        return source

    self.tokenize = convert_samples
    self.to_tokens = self.trg_vocab.to_tokens
    self.feed_keys = ["src_word"]
    self.bos_idx = args.bos_idx
    self.eos_idx = args.eos_idx
    self.pad_idx = args.bos_idx
    self.pad_seq = args.pad_seq
    self.word_pad = Pad(self.pad_idx)

def do_eval(args):
    paddle.set_device(args.device)

    # Create dataset, tokenizer and dataloader.
    train_ds, eval_ds = load_dataset('msra_ner',
                                     splits=('train', 'test'),
                                     lazy=False)
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    label_list = train_ds.label_list
    label_num = len(label_list)
    no_entity_id = label_num - 1
    trans_func = partial(tokenize_and_align_labels,
                         tokenizer=tokenizer,
                         no_entity_id=no_entity_id,
                         max_seq_len=args.max_seq_length)
    ignore_label = -100
    batchify_fn = lambda samples, fn=Dict({
        'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id,
                         dtype='int32'),  # input
        'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_type_id,
                              dtype='int32'),  # segment
        'seq_len': Stack(dtype='int64'),
        'labels': Pad(axis=0, pad_val=ignore_label, dtype='int64')  # label
    }): fn(samples)
    eval_ds = eval_ds.map(trans_func)
    eval_data_loader = DataLoader(dataset=eval_ds,
                                  collate_fn=batchify_fn,
                                  num_workers=0,
                                  batch_size=args.batch_size,
                                  return_list=True)

    # Define the model network and its loss
    model = BertForTokenClassification.from_pretrained(args.model_name_or_path,
                                                       num_classes=label_num)
    if args.init_checkpoint_path:
        model_dict = paddle.load(args.init_checkpoint_path)
        model.set_dict(model_dict)
    loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)

    metric = ChunkEvaluator(label_list=label_list)

    model.eval()
    metric.reset()
    for step, batch in enumerate(eval_data_loader):
        input_ids, token_type_ids, length, labels = batch
        logits = model(input_ids, token_type_ids)
        loss = loss_fct(logits, labels)
        avg_loss = paddle.mean(loss)
        preds = logits.argmax(axis=2)
        num_infer_chunks, num_label_chunks, num_correct_chunks = metric.compute(
            length, preds, labels)
        metric.update(num_infer_chunks.numpy(), num_label_chunks.numpy(),
                      num_correct_chunks.numpy())
    precision, recall, f1_score = metric.accumulate()
    print("eval loss: %f, precision: %f, recall: %f, f1: %f" %
          (avg_loss, precision, recall, f1_score))

def test_test_set(self):
    expected_len = 25000
    expected_text, expected_label = get_examples(self.config['splits'])
    test_ds = load_dataset(**self.config)
    self.check_output_equal(len(test_ds), expected_len)
    self.check_output_equal(expected_text, test_ds[23]['text'])
    self.check_output_equal(expected_label, test_ds[23]['label'])

def test_tnews_dataset(runner):
    from paddlenlp.datasets import load_dataset
    dev_ds = load_dataset('clue', "tnews", splits='dev')

    batches = []
    labels = []
    idx = 0
    batch_size = 32
    while idx < len(dev_ds):
        datas = []
        label = []
        for i in range(batch_size):
            if idx + i >= len(dev_ds):
                break
            datas.append(dev_ds[idx + i]["sentence"])
            label.append(dev_ds[idx + i]["label"])
        batches.append(datas)
        labels.append(np.array(label))
        idx += batch_size

    accuracy = 0
    for i, data in enumerate(batches):
        ret = runner.Run([data])
        # print("ret:", ret)
        accuracy += np.sum(labels[i] == ret["label"])
    print("acc:", 1.0 * accuracy / len(dev_ds))

def main():
    paddle.seed(42)
    args = parse_args()
    args.task_name = args.task_name.lower()
    args.model_type = args.model_type.lower()
    predictor = Predictor.create_predictor(args)

    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    dev_ds = load_dataset('clue', args.task_name, splits='dev')
    if not args.use_faster_tokenizer:
        tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
    else:
        trans_func = partial(convert_example,
                             label_list=dev_ds.label_list,
                             is_test=False)
        dev_ds = dev_ds.map(trans_func, lazy=True)

    if not args.use_faster_tokenizer:
        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=tokenizer.pad_token_id),       # input
            Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
            Stack(dtype="int64" if dev_ds.label_list else "float32")  # label
        ): fn(samples)
        outputs = predictor.predict(dev_ds, tokenizer, batchify_fn, args)
    else:
        outputs = predictor.faster_predict(dev_ds, args=args)

def create_data_loader_for_small_model(task_name,
                                       vocab_path,
                                       model_name=None,
                                       batch_size=64,
                                       max_seq_length=128,
                                       shuffle=True):
    """Data loader for bi-lstm, not bert."""
    if task_name == 'chnsenticorp':
        train_ds, dev_ds = load_dataset(task_name, splits=["train", "dev"])
    else:
        train_ds, dev_ds = load_dataset('glue',
                                        task_name,
                                        splits=["train", "dev"])
    if task_name == 'chnsenticorp':
        vocab = Vocab.load_vocabulary(
            vocab_path,
            unk_token='[UNK]',
            pad_token='[PAD]',
            bos_token=None,
            eos_token=None,
        )
        pad_val = vocab['[PAD]']
    else:
        vocab = BertTokenizer.from_pretrained(model_name)
        pad_val = vocab.pad_token_id

    trans_fn = partial(convert_small_example,
                       task_name=task_name,
                       vocab=vocab,
                       max_seq_length=max_seq_length,
                       is_test=False)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=pad_val),  # input_ids
        Stack(dtype="int64"),          # seq len
        Stack(dtype="int64")           # label
    ): fn(samples)

    train_ds = train_ds.map(trans_fn, lazy=True)
    dev_ds = dev_ds.map(trans_fn, lazy=True)

    train_data_loader, dev_data_loader = create_dataloader(
        train_ds, dev_ds, batch_size, batchify_fn, shuffle)

    return train_data_loader, dev_data_loader

def get_train_dataloader(model, tokenizer, args):
    logger.info(f'Load data according to {args.multi_task_config} ...')

    dataset = load_dataset(read_training_instance_based_config,
                           tokenizer=tokenizer,
                           config_file=args.multi_task_config,
                           max_source_length=args.max_source_length,
                           lazy=False,
                           negative_keep=args.negative_keep)

    # Merge the schemas of all datasets for pre-tokenization.
    schema_list = list()
    for task_config in TaskConfig.load_list_from_yaml(args.multi_task_config):
        schema_file = os.path.join(task_config.data_path, "record.schema")
        schema_list += [RecordSchema.read_from_file(schema_file)]
    schema = merge_schema(schema_list)

    batch_sampler = DistributedBatchSampler(
        dataset=dataset,
        batch_size=args.per_device_train_batch_size,
        shuffle=True,
    )

    if args.spot_noise > 0 or args.asoc_noise > 0:
        spot_asoc_noiser = SpotAsocNoiser(
            spot_noise_ratio=args.spot_noise,
            asoc_noise_ratio=args.asoc_noise,
            null_span=constants.null_span,
        )
    else:
        spot_asoc_noiser = None

    label_pad_token_id = -100 if args.ignore_pad_token_for_loss else tokenizer.pad_token_id
    collate_fn = DataCollatorForMultiTaskSeq2Seq(
        tokenizer,
        model=model,
        label_pad_token_id=label_pad_token_id,
        max_source_length=args.max_source_length,
        max_prefix_length=args.max_prefix_length,
        max_target_length=args.max_target_length,
        ssi_generator=DynamicSSIGenerator(
            tokenizer=tokenizer,
            schema=schema,
            positive_rate=args.meta_positive_rate,
            negative=args.meta_negative,
            ordered_prompt=args.ordered_prompt,
        ),
        # Keyword spelling follows the collator's parameter name.
        spot_asoc_nosier=spot_asoc_noiser,
    )

    data_loader = DataLoader(dataset=dataset,
                             batch_sampler=batch_sampler,
                             collate_fn=collate_fn,
                             num_workers=args.dataloader_num_workers,
                             return_list=True)
    return data_loader

def do_eval(args):
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args)

    args.task_name = args.task_name.lower()
    metric_class = METRIC_CLASSES[args.task_name]
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

    dev_ds = load_dataset('clue', args.task_name, splits='dev')
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    trans_func = partial(convert_example,
                         label_list=dev_ds.label_list,
                         tokenizer=tokenizer,
                         max_seq_length=args.max_seq_length)
    dev_ds = dev_ds.map(trans_func, lazy=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                               batch_size=args.batch_size,
                                               shuffle=False)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),       # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
        Stack(dtype="int64" if dev_ds.label_list else "float32")  # label
    ): fn(samples)
    dev_data_loader = DataLoader(dataset=dev_ds,
                                 batch_sampler=dev_batch_sampler,
                                 collate_fn=batchify_fn,
                                 num_workers=0,
                                 return_list=True)

    num_classes = 1 if dev_ds.label_list is None else len(dev_ds.label_list)
    model = model_class.from_pretrained(args.model_name_or_path,
                                        num_classes=num_classes)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    metric = metric_class()
    model.eval()
    metric.reset()
    for batch in dev_data_loader:
        input_ids, segment_ids, labels = batch
        logits = model(input_ids, segment_ids)
        correct = metric.compute(logits, labels)
        metric.update(correct)
    res = metric.accumulate()
    print("acc: %s" % res)

def write_csl(task_name, output_file, pred_labels):
    test_ds = load_dataset("fewclue", name=task_name, splits="test")
    test_example = {}
    with open(output_file, 'w', encoding='utf-8') as f:
        for idx, example in enumerate(test_ds):
            test_example["id"] = example["id"]
            test_example["label"] = pred_labels[idx]

            str_test_example = json.dumps(test_example)
            f.write(str_test_example + "\n")

def infer(args):
    paddle.set_device(args.device)
    set_seed(args.seed)

    model = UnifiedTransformerLMHeadModel.from_pretrained(
        args.model_name_or_path)
    tokenizer = UnifiedTransformerTokenizer.from_pretrained(
        args.model_name_or_path)

    test_ds = load_dataset('duconv', splits='test_1')
    test_ds, test_data_loader = create_data_loader(test_ds, tokenizer, args,
                                                   'test')

    model.eval()
    total_time = 0.0
    start_time = time.time()
    pred_responses = []
    for step, inputs in enumerate(test_data_loader, 1):
        input_ids, token_type_ids, position_ids, attention_mask, seq_len = inputs
        output = model.generate(input_ids=input_ids,
                                token_type_ids=token_type_ids,
                                position_ids=position_ids,
                                attention_mask=attention_mask,
                                seq_len=seq_len,
                                max_length=args.max_dec_len,
                                min_length=args.min_dec_len,
                                decode_strategy=args.decode_strategy,
                                temperature=args.temperature,
                                top_k=args.top_k,
                                top_p=args.top_p,
                                num_beams=args.num_beams,
                                length_penalty=args.length_penalty,
                                early_stopping=args.early_stopping,
                                num_return_sequences=args.num_return_sequences,
                                use_fp16_decoding=args.use_fp16_decoding,
                                use_faster=args.faster)

        total_time += (time.time() - start_time)
        if step % args.logging_steps == 0:
            print('step %d - %.3fs/step' %
                  (step, total_time / args.logging_steps))
            total_time = 0.0

        ids, scores = output
        results = select_response(ids, scores, tokenizer, args.max_dec_len,
                                  args.num_return_sequences)
        pred_responses.extend(results)

        start_time = time.time()

    with open(args.output_path, 'w', encoding='utf-8') as fout:
        for response in pred_responses:
            fout.write(response + '\n')
    print('\nSave inference result into: %s' % args.output_path)

    target_responses = [example['response'] for example in test_ds]
    calc_bleu_and_distinct(pred_responses, target_responses)

def write_chid(task_name, output_file, pred_labels):
    test_ds = load_dataset("fewclue", name=task_name, splits="test")
    test_example = {}
    with open(output_file, 'w', encoding='utf-8') as f:
        for idx, example in enumerate(test_ds):
            test_example["id"] = example["id"]
            test_example["answer"] = pred_labels[idx]

            str_test_example = "\"{}\": {}, \"{}\": {}".format(
                "id", test_example['id'], "answer", test_example["answer"])
            f.write("{" + str_test_example + "}\n")

def _create_dataloader(mode, tokenizer, max_encoder_length, pad_val=0):
    # `batch_size` and `_collate_data` must be defined outside this helper.
    dataset = load_dataset("imdb", splits=mode)
    batch_sampler = paddle.io.BatchSampler(dataset,
                                           batch_size=batch_size,
                                           shuffle=(mode == "train"))
    data_loader = paddle.io.DataLoader(dataset=dataset,
                                       batch_sampler=batch_sampler,
                                       collate_fn=_collate_data,
                                       return_list=True)
    return data_loader

def adapt_vocab_size(args):
    dataset = load_dataset('wmt14ende', splits='test')
    src_vocab = Vocab.load_vocabulary(**dataset.vocab_info["bpe"])
    trg_vocab = src_vocab

    padding_vocab = (
        lambda x: (x + args.pad_factor - 1) // args.pad_factor * args.pad_factor)
    args.src_vocab_size = padding_vocab(len(src_vocab))
    args.trg_vocab_size = padding_vocab(len(trg_vocab))

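# Worked example of the `padding_vocab` rounding used above: it is an integer
# ceiling division that pads the vocab size up to the next multiple of
# pad_factor (the numbers below are illustrative only).
pad_factor = 8
padding_vocab = lambda x: (x + pad_factor - 1) // pad_factor * pad_factor
assert padding_vocab(37001) == 37008  # rounded up to a multiple of 8
assert padding_vocab(37008) == 37008  # already a multiple, unchanged
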
def do_predict(args):
    place = paddle.set_device("gpu")
    paddle.seed(args.seed)
    tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0')

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         max_seq_length=args.max_seq_length,
                         pad_to_max_seq_len=args.pad_to_max_seq_len)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),       # query_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # query_segment
        Pad(axis=0, pad_val=tokenizer.pad_token_id),       # title_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # title_segment
    ): [data for data in fn(samples)]

    valid_ds = load_dataset(read_text_pair,
                            data_path=args.text_pair_file,
                            lazy=False)

    valid_data_loader = create_dataloader(valid_ds,
                                          mode="predict",
                                          batch_size=args.batch_size,
                                          batchify_fn=batchify_fn,
                                          trans_fn=trans_func)

    pretrained_model = ErnieModel.from_pretrained("ernie-1.0")

    model = SemanticIndexingPredictor(pretrained_model,
                                      args.output_emb_size,
                                      dropout=args.dropout,
                                      use_fp16=args.use_fp16)
    model.eval()
    model.load(args.params_path)
    model = enable_faster_encoder(model, use_fp16=args.use_fp16)

    cosine_sims = []
    for batch_data in valid_data_loader:
        query_input_ids, query_token_type_ids, title_input_ids, title_token_type_ids = batch_data
        query_input_ids = paddle.to_tensor(query_input_ids)
        query_token_type_ids = paddle.to_tensor(query_token_type_ids)
        title_input_ids = paddle.to_tensor(title_input_ids)
        title_token_type_ids = paddle.to_tensor(title_token_type_ids)
        batch_cosine_sim = model(
            query_input_ids=query_input_ids,
            title_input_ids=title_input_ids,
            query_token_type_ids=query_token_type_ids,
            title_token_type_ids=title_token_type_ids).numpy()
        cosine_sims.append(batch_cosine_sim)

    cosine_sims = np.concatenate(cosine_sims, axis=0)
    for cosine in cosine_sims:
        print('{}'.format(cosine))
    model = disable_faster_encoder(model)

def create_data_loader(args, places=None):
    datasets = load_dataset('wmt14ende', splits=('train', 'dev'))
    if not args.benchmark:
        src_vocab = Vocab.load_vocabulary(**datasets[0].vocab_info["bpe"])
    else:
        src_vocab = Vocab.load_vocabulary(
            **datasets[0].vocab_info["benchmark"])
    trg_vocab = src_vocab

    padding_vocab = (
        lambda x: (x + args.pad_factor - 1) // args.pad_factor * args.pad_factor)
    args.src_vocab_size = padding_vocab(len(src_vocab))
    args.trg_vocab_size = padding_vocab(len(trg_vocab))

    def convert_samples(sample):
        source = sample[args.src_lang].split()
        target = sample[args.trg_lang].split()
        source = src_vocab.to_indices(source)
        target = trg_vocab.to_indices(target)
        return source, target

    data_loaders = [None] * 2
    for i, dataset in enumerate(datasets):
        dataset = dataset.map(convert_samples, lazy=False).filter(
            partial(min_max_filer, max_len=args.max_length))

        batch_sampler = TransformerBatchSampler(
            dataset=dataset,
            batch_size=args.batch_size,
            pool_size=args.pool_size,
            sort_type=args.sort_type,
            shuffle=args.shuffle,
            shuffle_batch=args.shuffle_batch,
            use_token_batch=True,
            max_length=args.max_length,
            distribute_mode=True if i == 0 else False,
            world_size=dist.get_world_size(),
            rank=dist.get_rank(),
            pad_seq=args.pad_seq,
            bsz_multi=args.bsz_multi)

        data_loader = DataLoader(dataset=dataset,
                                 places=places,
                                 batch_sampler=batch_sampler,
                                 collate_fn=partial(prepare_train_input,
                                                    bos_idx=args.bos_idx,
                                                    eos_idx=args.eos_idx,
                                                    pad_idx=args.bos_idx,
                                                    pad_seq=args.pad_seq),
                                 num_workers=0)
        data_loaders[i] = data_loader
    return data_loaders

def write_cluewsc(task_name, output_file, pred_labels):
    test_ds = load_dataset("fewclue", name=task_name, splits="test")
    test_example = {}
    with open(output_file, 'w', encoding='utf-8') as f:
        for idx, example in enumerate(test_ds):
            test_example["id"] = example["id"]
            test_example["label"] = pred_labels[idx]
            # e.g. {"id": 0, "label": "力学"}
            str_test_example = "\"{}\": {}, \"{}\": \"{}\"".format(
                "id", test_example['id'], "label", test_example["label"])
            f.write("{" + str_test_example + "}\n")

def get_test_dataloader(args, language, batchify_fn, trans_func):
    test_ds = load_dataset("xnli", language, splits="test")
    test_ds = test_ds.map(trans_func, lazy=True)
    test_batch_sampler = BatchSampler(test_ds,
                                      batch_size=args.batch_size,
                                      shuffle=False)
    test_data_loader = DataLoader(dataset=test_ds,
                                  batch_sampler=test_batch_sampler,
                                  collate_fn=batchify_fn,
                                  num_workers=0,
                                  return_list=True)
    return test_data_loader

def do_eval(args):
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args)

    args.task_name = args.task_name.lower()
    metric_class = METRIC_CLASSES[args.task_name]

    dev_ds = load_dataset('clue', args.task_name, splits='dev')
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)

    trans_func = partial(convert_example,
                         label_list=dev_ds.label_list,
                         tokenizer=tokenizer,
                         max_seq_length=args.max_seq_length)
    dev_ds = dev_ds.map(trans_func, lazy=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                               batch_size=args.batch_size,
                                               shuffle=False)
    batchify_fn = DataCollatorWithPadding(tokenizer)
    dev_data_loader = DataLoader(dataset=dev_ds,
                                 batch_sampler=dev_batch_sampler,
                                 collate_fn=batchify_fn,
                                 num_workers=0,
                                 return_list=True)

    num_classes = 1 if dev_ds.label_list is None else len(dev_ds.label_list)
    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name_or_path, num_classes=num_classes)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    metric = metric_class()
    model.eval()
    metric.reset()
    for batch in dev_data_loader:
        labels = batch.pop("labels")
        logits = model(**batch)
        correct = metric.compute(logits, labels)
        metric.update(correct)
    res = metric.accumulate()
    print("acc: %s" % res)

def do_predict(args):
    paddle.set_device(args.device)
    args.task_name = args.task_name.lower()

    train_ds, test_ds = load_dataset('clue',
                                     args.task_name,
                                     splits=('train', 'test'))
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         label_list=train_ds.label_list,
                         max_seq_length=args.max_seq_length,
                         is_test=True)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),       # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
    ): fn(samples)

    test_ds = test_ds.map(trans_func, lazy=True)
    test_batch_sampler = paddle.io.BatchSampler(test_ds,
                                                batch_size=args.batch_size,
                                                shuffle=False)
    test_data_loader = DataLoader(dataset=test_ds,
                                  batch_sampler=test_batch_sampler,
                                  collate_fn=batchify_fn,
                                  num_workers=0,
                                  return_list=True)

    num_classes = 1 if train_ds.label_list is None else len(
        train_ds.label_list)
    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name_or_path, num_classes=num_classes)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    if args.task_name == 'ocnli':
        args.task_name = 'ocnli_50k'
    f = open(os.path.join(args.output_dir, args.task_name + "_predict.json"),
             'w')

    # Keep a running example id so ids stay unique across batches.
    example_id = 0
    for step, batch in enumerate(test_data_loader):
        input_ids, segment_ids = batch
        with paddle.no_grad():
            logits = model(input_ids, segment_ids)
        preds = paddle.argmax(logits, axis=1)
        for pred in preds:
            j = json.dumps({
                "id": example_id,
                "label": train_ds.label_list[pred]
            })
            f.write(j + "\n")
            example_id += 1
    f.close()

def create_infer_loader(batch_size=128):
    test_ds = load_dataset('couplet', splits='test')
    vocab = Vocab.load_vocabulary(**test_ds.vocab_info)
    pad_id = vocab[vocab.eos_token]
    trans_func = partial(convert_example, vocab=vocab)
    test_ds = test_ds.map(trans_func, lazy=False)
    test_batch_sampler = SamplerHelper(test_ds).batch(batch_size=batch_size)
    test_loader = paddle.io.DataLoader(test_ds,
                                       batch_sampler=test_batch_sampler,
                                       collate_fn=partial(prepare_input,
                                                          pad_id=pad_id))
    return test_loader, vocab

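# A minimal usage sketch for the couplet infer loader above. The batch layout
# produced by `prepare_input` and the decoding call are assumptions, hence the
# hypothetical, commented-out model step.
test_loader, vocab = create_infer_loader(batch_size=128)
for batch in test_loader:
    # predicted_ids = model(*batch)               # hypothetical seq2seq decode
    # tokens = vocab.to_tokens(predicted_ids[0])  # ids back to text via Vocab
    break
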
def predict_ext(self, args):
    ori_test_ds = load_dataset(read_test_file,
                               data_path=args.test_path,
                               lazy=False)
    trans_func = partial(convert_example_to_feature_ext,
                         tokenizer=self.tokenizer,
                         label2id=self.ext_label2id,
                         max_seq_len=args.ext_max_seq_len,
                         is_test=True)
    test_ds = copy.copy(ori_test_ds).map(trans_func, lazy=False)
    batch_list = [
        test_ds[idx:idx + args.batch_size]
        for idx in range(0, len(test_ds), args.batch_size)
    ]

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=self.tokenizer.pad_token_id, dtype="int64"),
        Pad(axis=0, pad_val=self.tokenizer.pad_token_type_id, dtype="int64"),
        Stack(dtype="int64")): fn(samples)

    results = []
    for bid, batch_data in enumerate(batch_list):
        input_ids, token_type_ids, seq_lens = batchify_fn(batch_data)
        self.ext_input_handles[0].copy_from_cpu(input_ids)
        self.ext_input_handles[1].copy_from_cpu(token_type_ids)
        self.ext_predictor.run()
        logits = self.ext_output_hanle.copy_to_cpu()

        predictions = logits.argmax(axis=2)
        for eid, (seq_len, prediction) in enumerate(zip(seq_lens,
                                                        predictions)):
            idx = bid * args.batch_size + eid
            tag_seq = [
                self.ext_id2label[tag_id]
                for tag_id in prediction[:seq_len][1:-1]
            ]
            text = ori_test_ds[idx]["text"]
            aps = decoding(text[:args.ext_max_seq_len - 2], tag_seq)
            for aid, ap in enumerate(aps):
                aspect, opinions = ap[0], list(set(ap[1:]))
                aspect_text = self._concate_aspect_and_opinion(
                    text, aspect, opinions)
                results.append({
                    "id": str(idx) + "_" + str(aid),
                    "aspect": aspect,
                    "opinions": opinions,
                    "text": text,
                    "aspect_text": aspect_text
                })
    return results

def test_train_set(self):
    expected_ds_num = 2
    expected_len = 25000
    expected_train_text, expected_train_label = get_examples('train')
    expected_test_text, expected_test_label = get_examples('test')
    ds = load_dataset(**self.config)

    self.check_output_equal(len(ds), expected_ds_num)
    self.check_output_equal(len(ds[0]), expected_len)
    self.check_output_equal(len(ds[1]), expected_len)

    self.check_output_equal(expected_train_text, ds[0][36]['text'])
    self.check_output_equal(expected_train_label, ds[0][36]['label'])
    self.check_output_equal(expected_test_text, ds[1][23]['text'])
    self.check_output_equal(expected_test_label, ds[1][23]['label'])