def test(args):
    """Run the trained classifier over the test split and write results.

    Pipeline: read raw test data, tokenize/featurize it, batch it with a
    sequential DataLoader, run prediction, decode the multi-hot outputs
    back to label names, and write a CSV joining the raw test rows with
    the predicted labels and the per-label logits.

    Args:
        args: parsed CLI namespace. Uses test_data_num, do_lower_case,
            pretrain, eval_max_seq_len, train_batch_size, n_gpu.

    Side effects:
        Writes ``output.csv`` under ``config["result"]`` and may create
        cached example/feature files under ``config["data_dir"]``.
    """
    from dataio.task_data import TaskData
    from predict.predictor import Predictor

    # ---- read raw test data ------------------------------------------
    data = TaskData(args.test_data_num)
    labels, sents = data.read_data(
        raw_data_path=config["test_path"],
        data_dir=config["data_dir"],
        preprocessor=Preprocessor(config["preprocessor"])(
            stopwords_path=config["stopwords_path"],
            userdict_path=config["userdict_path"]),
        is_train=False)
    lines = list(zip(sents, labels))

    # ---- tokenize / featurize ----------------------------------------
    processor = Postprocessor(config["postprocessor"])(
        do_lower_case=args.do_lower_case)
    label_list = processor.get_labels(config['data_dir'] / "labels.txt")
    test_data = processor.get_test(lines=lines)
    test_examples = processor.create_examples(
        lines=test_data,
        example_type='test',
        cached_examples_file=config['data_dir'] /
        "cached_test_examples_{}".format(args.pretrain))
    test_features = processor.create_features(
        examples=test_examples,
        max_seq_len=args.eval_max_seq_len,
        cached_features_file=config['data_dir'] /
        "cached_test_features_{}_{}".format(args.eval_max_seq_len,
                                            args.pretrain))
    test_dataset = processor.create_dataset(test_features)
    test_sampler = SequentialSampler(test_dataset)
    # NOTE(review): evaluation batches use args.train_batch_size —
    # presumably a dedicated eval batch size was intended; confirm one
    # exists on args before changing.
    test_dataloader = DataLoader(test_dataset,
                                 sampler=test_sampler,
                                 batch_size=args.train_batch_size)

    # ---- build model -------------------------------------------------
    if config["pretrain"] == "Nopretrain":
        config["vocab_size"] = processor.vocab_size
    model = Classifier(config["classifier"], config["pretrain"],
                       config["checkpoint_dir"])(num_labels=len(label_list))

    # ---- predict -----------------------------------------------------
    logger.info('model predicting....')
    predictor = Predictor(model=model, logger=logger, n_gpu=args.n_gpu)
    logits, y_pred = predictor.predict(data=test_dataloader, thresh=0.5)

    # Decode each multi-hot prediction row into a comma-joined string of
    # the label names whose bit is set.
    pred_labels = [
        ",".join(label_list[i] for i, v in enumerate(row) if v == 1)
        for row in y_pred.tolist()
    ]
    assert len(pred_labels) == y_pred.shape[0]

    # ---- write results -----------------------------------------------
    df_pred_labels = pd.DataFrame(pred_labels, columns=["predict_labels"])
    df_test_raw = pd.read_csv(config["test_path"])
    if args.test_data_num > 0:
        # Mirror the truncation applied when the test set was read.
        df_test_raw = df_test_raw.head(args.test_data_num)
    df_labels = pd.DataFrame(logits, columns=label_list)
    df = pd.concat([df_test_raw, df_pred_labels, df_labels], axis=1)
    df.to_csv(config["result"] / "output.csv", index=False)
"""Smoke-test and micro-benchmark for the exported Predictor.

Runs a single sample prediction, then measures the average latency of the
JSON prediction path over a fixed number of repetitions.
"""
from predict.predictor import Predictor
from time import perf_counter
import json

# Read model definition
model_definition = ModelDefinition()

print("Reading exported model")
predictor = Predictor(model_definition)

# Sample input: first file word (sequence with all pad elements).
# Renamed from `input` so the builtin is not shadowed.
sample_input = predictor.get_empty_element()
print(sample_input)
json_test = json.dumps(sample_input)
print("Prediction:", predictor.predict(sample_input))

# Benchmark the JSON prediction path. perf_counter() is the monotonic,
# high-resolution clock intended for interval timing (time() is wall
# clock and can jump).
n_repetitions = 1000
print("Testing performance, n. repetitions:", n_repetitions)
start = perf_counter()
for _ in range(n_repetitions):
    predictor.predict_json(json_test)
end = perf_counter()
print("Total time:", end - start, "s")
print("Prediction performance:", ((end - start) / n_repetitions) * 1000, "ms")

# RNN: House computer (Linux):
# seq_len = 16, rnn size = 64 -> With GPU: 1.7 ms / With CPU: 0.85 ms
# seq_len= 64, rnn_size = 256 -> With GPU: 2.76 ms / With CPU: 4.24 ms
# RNN: Work computer (Windows 10):