def make_prediction(Description_A, Description_B, model_path, USE_GPU=True):
    """Run a fine-tuned MT-DNN checkpoint on a single piece of text.

    Args:
        Description_A: Raw text to classify.
        Description_B: Accepted for interface compatibility; it is not used
            by this function (only ``Description_A`` is encoded).
        model_path: Path of the checkpoint to load. The loaded object must
            contain a ``'config'`` entry.
        USE_GPU: When True (default) batching, the model and evaluation use
            CUDA. When False everything, including checkpoint loading, is
            kept on the CPU.

    Returns:
        A ``(dev_predictions, scores)`` tuple taken from ``eval_model``:
        the predictions (a list of size 1) and the confidence scores for
        each class.
    """
    max_seq_len = 512

    # The tokenizer is restored from a stored pickle object. To build it
    # from scratch instead:
    #   BertTokenizer.from_pretrained("bert-large-uncased", do_lower_case=True)
    # NOTE(security): pickle.load can execute arbitrary code; only use a
    # tokenizer.pkl that comes from a trusted source.
    with open("tokenizer.pkl", "rb") as pickle_file:
        tokenizer = pickle.load(pickle_file)

    # Word-piece tokenization, e.g. "I am playing" -> ["I","am","play","##ing"]
    hypothesis = tokenizer.tokenize(Description_A)

    # Truncate so the sequence (plus special tokens) fits BERT's max length,
    # then map tokens to vocabulary ids.
    token_budget = max_seq_len - 3
    if len(hypothesis) > token_budget:
        hypothesis = hypothesis[:token_budget]
    input_ids = tokenizer.convert_tokens_to_ids(
        ['[CLS]'] + hypothesis + ['[SEP]'])

    # Segment ids: all zero for a single sentence (they only matter for
    # two-sentence classification).
    type_ids = [0] * (len(hypothesis) + 2)

    # uid and label are placeholders; they carry no meaning at prediction time.
    features = {'uid': 0, 'label': 0,
                'token_id': input_ids, 'type_id': type_ids}

    # BatchGen wraps the single example in the format eval_model expects.
    # The device follows USE_GPU so batches and model live on the same device.
    dev_data = BatchGen([features],
                        batch_size=8,
                        gpu=USE_GPU, is_train=False,
                        task_id=0,
                        maxlen=max_seq_len,
                        pairwise=False,
                        data_type=0,
                        task_type=0)

    # Debug sanity check: convert a fixed list of token ids back to words.
    print(tokenizer.convert_ids_to_tokens(
        [101, 100, 5208, 2024, 17662, 9119, 2096, 3173, 2000, 2175, 14555,
         2044, 2074, 5983, 6265, 1012, 102, 100, 2308, 2024, 23581, 2096,
         3173, 2000, 2175, 14555, 1012, 102]))

    # Hyper-parameters handed to MTDNNModel; entries present in the
    # checkpoint's own config override these below.
    opt = {
        'init_checkpoint': model_path,
        'data_dir': 'data/domain_adaptation',
        'data_sort_on': False,
        'name': 'farmer',
        'train_datasets': ['sst'],
        'test_datasets': ['sst'],
        'pw_tasks': ['qnnli'],
        'update_bert_opt': 0,
        'multi_gpu_on': False,
        'mem_cum_type': 'simple',
        'answer_num_turn': 5,
        'answer_mem_drop_p': 0.1,
        'answer_att_hidden_size': 128,
        'answer_att_type': 'bilinear',
        'answer_rnn_type': 'gru',
        'answer_sum_att_type': 'bilinear',
        'answer_merge_opt': 1,
        'answer_mem_type': 1,
        'answer_dropout_p': 0.1,
        'answer_weight_norm_on': False,
        'dump_state_on': False,
        'answer_opt': [0],
        'label_size': '2',
        'mtl_opt': 0,
        'ratio': 0,
        'mix_opt': 0,
        'max_seq_len': max_seq_len,
        'init_ratio': 1,
        'cuda': USE_GPU,
        'log_per_updates': 500,
        'epochs': 5,
        'batch_size': 32,
        'batch_size_eval': 8,
        'optimizer': 'adamax',
        'grad_clipping': 0.0,
        'global_grad_clipping': 1.0,
        'weight_decay': 0,
        'learning_rate': 5e-05,
        'momentum': 0,
        'warmup': 0.1,
        'warmup_schedule': 'warmup_linear',
        'vb_dropout': True,
        'dropout_p': 0.1,
        'dropout_w': 0.0,
        'bert_dropout_p': 0.1,
        'ema_opt': 0,
        'ema_gamma': 0.995,
        'have_lr_scheduler': True,
        'multi_step_lr': '10,20,30',
        'freeze_layers': -1,
        'embedding_opt': 0,
        'lr_gamma': 0.5,
        'bert_l2norm': 0.0,
        'scheduler_type': 'ms',
        'output_dir': 'checkpoints/scitail_tl_adamax_answer_opt0_gc0_ggc1_7_2_19',
        'seed': 2018,
        'task_config_path': 'configs/tasks_config.json',
        'tasks_dropout_p': [0.1],
    }

    # map_location=None is torch.load's default; on CPU runs the tensors are
    # remapped so a checkpoint saved on a GPU can still be loaded.
    state_dict = torch.load(model_path,
                            map_location=None if USE_GPU else "cpu")
    config = state_dict['config']
    config['attention_probs_dropout_prob'] = 0.1
    config['hidden_dropout_prob'] = 0.1
    opt.update(config)
    model = MTDNNModel(opt, state_dict=state_dict, num_train_step=50)

    # eval_model(model, data, metric_meta, use_cuda=True, with_label=True)
    dev_metrics, dev_predictions, scores, golds, dev_ids = eval_model(
        model, dev_data, 0, use_cuda=USE_GPU, with_label=False)
    return dev_predictions, scores
def main():
    """Train an MT-DNN model and evaluate it after every epoch.

    Reads its configuration from the module-level ``args`` (plus
    ``data_dir``, ``output_dir``, ``encoder_type`` and ``task_defs``):

    1. Builds one training ``BatchGen`` per distinct task prefix in
       ``args.train_datasets`` and the dev/test ``BatchGen`` objects for
       ``args.test_datasets`` (an entry is ``None`` when the file is absent).
    2. Loads the initial encoder checkpoint, if present, and creates the
       ``MTDNNModel``; optionally resumes from ``args.model_ckpt``.
    3. Writes ``config.json`` to ``output_dir``.
    4. For each epoch: draws a per-batch task order, updates the model,
       scores the dev/test sets, dumps the score files and saves
       ``model_<epoch>.pt``.

    Returns:
        None.
    """
    logger.info('Launching the MT-DNN training')
    # NOTE: vars() returns the namespace's own attribute dict, so writes to
    # `opt` and to `args` are visible through both names
    # (assumes `args` is an argparse-style namespace - TODO confirm).
    opt = vars(args)
    # update data dir
    opt['data_dir'] = data_dir
    batch_size = args.batch_size
    train_data_list = []
    tasks = {}
    tasks_class = {}
    nclass_list = []
    decoder_opts = []
    dropout_list = []

    for dataset in args.train_datasets:
        prefix = dataset.split('_')[0]
        # Only the first dataset of each task prefix is loaded.
        if prefix in tasks:
            continue
        assert prefix in task_defs.n_class_map
        assert prefix in task_defs.data_type_map
        data_type = task_defs.data_type_map[prefix]
        nclass = task_defs.n_class_map[prefix]
        task_id = len(tasks)
        if args.mtl_opt > 0:
            # With mtl_opt on, tasks with the same class count share an id.
            task_id = tasks_class.get(nclass, len(tasks_class))

        task_type = task_defs.task_type_map[prefix]
        pw_task = task_type == TaskType.Ranking

        dopt = generate_decoder_opt(task_defs.enable_san_map[prefix],
                                    opt['answer_opt'])
        if task_id < len(decoder_opts):
            decoder_opts[task_id] = min(decoder_opts[task_id], dopt)
        else:
            decoder_opts.append(dopt)

        # `prefix` is guaranteed to be new here (see the `continue` above).
        tasks[prefix] = len(tasks)
        if args.mtl_opt < 1:
            nclass_list.append(nclass)

        if nclass not in tasks_class:
            tasks_class[nclass] = len(tasks_class)
            if args.mtl_opt > 0:
                nclass_list.append(nclass)

        dropout_p = task_defs.dropout_p_map.get(prefix, args.dropout_p)
        dropout_list.append(dropout_p)

        train_path = os.path.join(data_dir, '{}_train.json'.format(dataset))
        logger.info('Loading {} as task {}'.format(train_path, task_id))
        train_data = BatchGen(
            BatchGen.load(train_path, True, pairwise=pw_task,
                          maxlen=args.max_seq_len),
            batch_size=batch_size,
            dropout_w=args.dropout_w,
            gpu=args.cuda,
            task_id=task_id,
            maxlen=args.max_seq_len,
            pairwise=pw_task,
            data_type=data_type,
            task_type=task_type,
            encoder_type=encoder_type)
        train_data_list.append(train_data)

    opt['answer_opt'] = decoder_opts
    opt['tasks_dropout_p'] = dropout_list

    args.label_size = ','.join([str(l) for l in nclass_list])
    logger.info(args.label_size)

    dev_data_list = []
    test_data_list = []
    for dataset in args.test_datasets:
        prefix = dataset.split('_')[0]
        if args.mtl_opt > 0:
            task_id = tasks_class[task_defs.n_class_map[prefix]]
        else:
            task_id = tasks[prefix]
        task_type = task_defs.task_type_map[prefix]
        pw_task = task_type == TaskType.Ranking

        assert prefix in task_defs.data_type_map
        data_type = task_defs.data_type_map[prefix]

        # A missing dev/test file is recorded as None so that both lists
        # stay index-aligned with args.test_datasets.
        dev_path = os.path.join(data_dir, '{}_dev.json'.format(dataset))
        dev_data = None
        if os.path.exists(dev_path):
            dev_data = BatchGen(
                BatchGen.load(dev_path, False, pairwise=pw_task,
                              maxlen=args.max_seq_len),
                batch_size=args.batch_size_eval,
                gpu=args.cuda, is_train=False,
                task_id=task_id,
                maxlen=args.max_seq_len,
                pairwise=pw_task,
                data_type=data_type,
                task_type=task_type,
                encoder_type=encoder_type)
        dev_data_list.append(dev_data)

        test_path = os.path.join(data_dir, '{}_test.json'.format(dataset))
        test_data = None
        if os.path.exists(test_path):
            test_data = BatchGen(
                BatchGen.load(test_path, False, pairwise=pw_task,
                              maxlen=args.max_seq_len),
                batch_size=args.batch_size_eval,
                gpu=args.cuda, is_train=False,
                task_id=task_id,
                maxlen=args.max_seq_len,
                pairwise=pw_task,
                data_type=data_type,
                task_type=task_type,
                encoder_type=encoder_type)
        test_data_list.append(test_data)

    logger.info('#' * 20)
    logger.info(opt)
    logger.info('#' * 20)

    # The iterators are created once; each BatchGen is reset() per epoch.
    all_iters = [iter(item) for item in train_data_list]
    all_lens = [len(bg) for bg in train_data_list]

    # div number of grad accumulation.
    num_all_batches = (args.epochs * sum(all_lens)
                       // args.grad_accumulation_step)
    logger.info('############# Gradient Accumulation Info #############')
    logger.info('number of step: {}'.format(args.epochs * sum(all_lens)))
    logger.info('number of grad grad_accumulation step: {}'.format(
        args.grad_accumulation_step))
    logger.info('adjusted number of step: {}'.format(num_all_batches))
    logger.info('############# Gradient Accumulation Info #############')

    if len(train_data_list) > 1 and args.ratio > 0:
        num_all_batches = int(
            args.epochs * (len(train_data_list[0]) * (1 + args.ratio)))

    bert_model_path = args.init_checkpoint
    state_dict = None
    # map_location=None is torch.load's default; on CPU runs the tensors are
    # remapped so a checkpoint saved on a GPU can still be loaded.
    map_location = None if args.cuda else 'cpu'

    if encoder_type == EncoderModelType.BERT:
        if os.path.exists(bert_model_path):
            state_dict = torch.load(bert_model_path,
                                    map_location=map_location)
            config = state_dict['config']
            config['attention_probs_dropout_prob'] = args.bert_dropout_p
            config['hidden_dropout_prob'] = args.bert_dropout_p
            opt.update(config)
        else:
            logger.error('#' * 20)
            logger.error(
                'Could not find the init model!\n The parameters will be initialized randomly!'
            )
            logger.error('#' * 20)
            config = BertConfig(vocab_size_or_config_json_file=30522).to_dict()
            opt.update(config)
    elif encoder_type == EncoderModelType.ROBERTA:
        bert_model_path = '{}/model.pt'.format(bert_model_path)
        if os.path.exists(bert_model_path):
            state_dict = torch.load(bert_model_path,
                                    map_location=map_location)
            # Keep only encoder / classification-head weights and re-root
            # them under the 'bert.model.' prefix.
            kept_prefixes = ('decoder.sentence_encoder',
                             'classification_heads')
            new_state_dict = {
                'bert.model.{}'.format(key): val
                for key, val in state_dict['model'].items()
                if key.startswith(kept_prefixes)
            }
            state_dict = {'state': new_state_dict}
        else:
            # Mirror the BERT branch: do not continue silently without
            # pretrained weights.
            logger.error('#' * 20)
            logger.error('Could not find the init model at {}!'.format(
                bert_model_path))
            logger.error('#' * 20)

    model = MTDNNModel(opt, state_dict=state_dict,
                       num_train_step=num_all_batches)
    if args.resume and args.model_ckpt:
        logger.info('loading model from {}'.format(args.model_ckpt))
        model.load(args.model_ckpt)

    #### model meta str
    headline = '############# Model Arch of MT-DNN #############'
    ### print network
    logger.info('\n{}\n{}\n'.format(headline, model.network))

    # dump config
    config_file = os.path.join(output_dir, 'config.json')
    with open(config_file, 'w', encoding='utf-8') as writer:
        writer.write('{}\n'.format(json.dumps(opt)))
        writer.write('\n{}\n{}\n'.format(headline, model.network))

    logger.info("Total number of params: {}".format(model.total_param))

    for epoch in range(0, args.epochs):
        logger.warning('At epoch {}'.format(epoch))
        for train_data in train_data_list:
            train_data.reset()
        start = datetime.now()

        # Build the task order for this epoch: one entry (a task index into
        # train_data_list) per batch to draw.
        all_indices = []
        if len(train_data_list) > 1 and args.ratio > 0:
            # Task 0 is the main task; the other tasks are sub-sampled to at
            # most `ratio` times the number of main-task batches.
            main_indices = [0] * len(train_data_list[0])
            extra_indices = []
            for i in range(1, len(train_data_list)):
                extra_indices += [i] * len(train_data_list[i])
            random_picks = int(
                min(len(train_data_list[0]) * args.ratio,
                    len(extra_indices)))
            extra_indices = np.random.choice(extra_indices, random_picks,
                                             replace=False)
            if args.mix_opt > 0:
                extra_indices = extra_indices.tolist()
                random.shuffle(extra_indices)
                all_indices = extra_indices + main_indices
            else:
                all_indices = main_indices + extra_indices.tolist()
        else:
            for i in range(1, len(train_data_list)):
                all_indices += [i] * len(train_data_list[i])
            if args.mix_opt > 0:
                random.shuffle(all_indices)
            all_indices += [0] * len(train_data_list[0])
        if args.mix_opt < 1:
            random.shuffle(all_indices)

        total_steps = len(all_indices)
        for step, task_id in enumerate(all_indices):
            batch_meta, batch_data = next(all_iters[task_id])
            model.update(batch_meta, batch_data)
            if (model.local_updates % (args.log_per_updates
                                       * args.grad_accumulation_step) == 0
                    or model.local_updates == 1):
                # Average time per step so far times the steps left;
                # split('.') drops the fractional seconds.
                remaining_time = str(
                    (datetime.now() - start) / (step + 1)
                    * (total_steps - step - 1)).split('.')[0]
                logger.info(
                    'Task [{0:2}] updates[{1:6}] train loss[{2:.5f}] remaining[{3}]'
                    .format(task_id, model.updates, model.train_loss.avg,
                            remaining_time))

            if args.save_per_updates_on and (
                    model.local_updates % (args.save_per_updates
                                           * args.grad_accumulation_step)
                    == 0):
                model_file = os.path.join(
                    output_dir,
                    'model_{}_{}.pt'.format(epoch, model.updates))
                logger.info('Saving mt-dnn model to {}'.format(model_file))
                model.save(model_file)

        for idx, dataset in enumerate(args.test_datasets):
            prefix = dataset.split('_')[0]
            label_dict = task_defs.global_map.get(prefix, None)
            dev_data = dev_data_list[idx]
            if dev_data is not None:
                dev_metrics, dev_predictions, scores, golds, dev_ids = eval_model(
                    model, dev_data,
                    metric_meta=task_defs.metric_meta_map[prefix],
                    use_cuda=args.cuda)
                for key, val in dev_metrics.items():
                    logger.warning(
                        'Task {0} -- epoch {1} -- Dev {2}: {3:.3f}'.format(
                            dataset, epoch, key, val))
                score_file = os.path.join(
                    output_dir,
                    '{}_dev_scores_{}.json'.format(dataset, epoch))
                results = {
                    'metrics': dev_metrics,
                    'predictions': dev_predictions,
                    'uids': dev_ids,
                    'scores': scores
                }
                dump(score_file, results)
                official_score_file = os.path.join(
                    output_dir,
                    '{}_dev_scores_{}.tsv'.format(dataset, epoch))
                submit(official_score_file, results, label_dict)

            # test eval
            test_data = test_data_list[idx]
            if test_data is not None:
                test_metrics, test_predictions, scores, golds, test_ids = eval_model(
                    model, test_data,
                    metric_meta=task_defs.metric_meta_map[prefix],
                    use_cuda=args.cuda, with_label=False)
                score_file = os.path.join(
                    output_dir,
                    '{}_test_scores_{}.json'.format(dataset, epoch))
                results = {
                    'metrics': test_metrics,
                    'predictions': test_predictions,
                    'uids': test_ids,
                    'scores': scores
                }
                dump(score_file, results)
                official_score_file = os.path.join(
                    output_dir,
                    '{}_test_scores_{}.tsv'.format(dataset, epoch))
                submit(official_score_file, results, label_dict)
                logger.info('[new test scores saved.]')

        # One checkpoint per epoch, in addition to any per-update saves.
        model_file = os.path.join(output_dir, 'model_{}.pt'.format(epoch))
        model.save(model_file)
def get_confidence(words, chars):
    """Return the highest score ``eval_model`` reports for ``words``.

    Args:
        words: Data handed straight to ``BatchGen``.
        chars: Not used by this function.

    Returns:
        ``max()`` of the scores returned by ``eval_model``.

    The model evaluated is the module-level ``model``; batching and
    evaluation are both requested on the GPU.
    """
    batches = BatchGen(words, is_train=False, gpu=True, batch_size=1,
                       maxlen=512)
    # eval_model yields (metrics, predictions, scores, golds, ids); only the
    # scores are needed here.
    _, _, confidences, _, _ = eval_model(
        model, batches, 0, use_cuda=True, with_label=False)
    return max(confidences)
# Build the evaluation batches from the preprocessed input file.
test_data = BatchGen(
    BatchGen.load(
        args.prep_input,
        False,
        pairwise=pw_task,
        maxlen=args.max_seq_len,
    ),
    batch_size=args.batch_size_eval,
    gpu=args.cuda,
    is_train=False,
    task_id=args.task_id,
    maxlen=args.max_seq_len,
    pairwise=pw_task,
    data_type=data_type,
    task_type=task_type,
)

# Restore the checkpoint; on CPU runs the tensors are remapped to the CPU.
checkpoint_path = args.checkpoint
assert os.path.exists(checkpoint_path)
state_dict = (
    torch.load(checkpoint_path)
    if args.cuda
    else torch.load(checkpoint_path, map_location="cpu")
)

# The model is configured from the checkpoint's own config; only the device
# flag is overridden from the command line.
config = state_dict['config']
config["cuda"] = args.cuda
model = MTDNNModel(config, state_dict=state_dict)

# Score the data and persist metrics, predictions, uids and scores.
(test_metrics, test_predictions, scores, golds, test_ids) = eval_model(
    model,
    test_data,
    metric_meta=metric_meta,
    use_cuda=args.cuda,
    with_label=args.with_label,
)
results = dict(
    metrics=test_metrics,
    predictions=test_predictions,
    uids=test_ids,
    scores=scores,
)
dump(args.score, results)

# Metrics are only printed when gold labels were supplied.
if args.with_label:
    print(test_metrics)