def evaluate(args, model, tokenizer, processor, label_list, device, mode="test"):
    num_labels = len(label_list) + 1
    eval_data = load_examples(args, tokenizer, processor, label_list, mode)

    # Run prediction for full data
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

    model.eval()
    y_true = []
    y_pred = []
    label_map = {i: label for i, label in enumerate(label_list, 1)}
    label_map[0] = 'unknown'
    nb_tr_examples, nb_tr_steps = 0, 0
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids, valid_ids, label_mask, b_use_valid_filter, \
            adj_matrix, dep_matrix = batch

        with torch.no_grad():
            logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask,
                           valid_ids=valid_ids, adjacency_matrix=adj_matrix)

        nb_tr_examples += input_ids.size(0)
        nb_tr_steps += 1

        logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2)
        logits = logits.detach().cpu().numpy()
        label_ids = label_ids.detach().cpu().numpy()

        for i, label in enumerate(label_ids):
            temp_1 = []
            temp_2 = []
            for j, m in enumerate(label):
                if j == 0:
                    continue
                elif label_ids[i][j] == num_labels - 1:
                    y_true.append(temp_1)
                    y_pred.append(temp_2)
                    break
                else:
                    temp_1.append(label_map[label_ids[i][j]])
                    temp_2.append(label_map[logits[i][j]])

    logger.info("nb_tr_examples: {}, nb_tr_steps: {}".format(nb_tr_examples, nb_tr_steps))

    result = evaluate_ote(y_true, y_pred)
    logging.info(result)
    return {
        "precision": result[0],
        "recall": result[1],
        "f1": result[2]
    }
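
# Hedged, self-contained sketch (not from the original repo) of the decoding
# convention `evaluate` relies on: position 0 is the [CLS] slot and is skipped,
# label id num_labels - 1 (the last entry of label_list, assumed to be a
# [SEP]-style stop label) terminates a sentence, and everything in between maps
# through `label_map`. The tag names below are hypothetical.
def _demo_decode():
    label_list = ['O', 'B-ASP', 'I-ASP', '[SEP]']   # hypothetical tag set
    num_labels = len(label_list) + 1
    label_map = {i: label for i, label in enumerate(label_list, 1)}
    label_map[0] = 'unknown'
    # one padded row of gold label ids: [CLS], O, B-ASP, I-ASP, [SEP], pad...
    row = [0, 1, 2, 3, num_labels - 1, 0, 0]
    tags = []
    for j, label_id in enumerate(row):
        if j == 0:
            continue                # skip the [CLS] position
        if label_id == num_labels - 1:
            break                   # stop label ends the sentence
        tags.append(label_map[label_id])
    assert tags == ['O', 'B-ASP', 'I-ASP']

_demo_decode()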
def train(data_dir='data/memes/',
          dim_proj=512,
          maxlen=30,
          batch_size=256,
          keep_ratio=1.,
          shuffle_data=True,
          learning_rate=0.001,
          global_steps=50000,
          disp_freq=100,
          save_freq=1000,
          test_freq=1000,
          saveto_file='params.npz',
          weight_decay=0.0005,
          reload_model=False,
          train=True):
    """
    Topo-LSTM model training.
    """
    options = locals().copy()
    saveto = data_dir + saveto_file

    # loads graph.
    G, node_index = data_utils.load_graph(data_dir)
    print nx.info(G)
    options['n_words'] = len(node_index)
    print options

    # creates and initializes shared variables.
    print 'Initializing variables...'
    params = init_params(options)
    if reload_model:
        print 'reusing saved model.'
        load_params(saveto, params)
    tparams = init_tparams(params)

    # builds Topo-LSTM model.
    print 'Building model...'
    model = tprnn_model.build_model(tparams, options)

    print 'Loading test data...'
    test_examples = data_utils.load_examples(data_dir,
                                             dataset='test',
                                             node_index=node_index,
                                             maxlen=maxlen,
                                             G=G)
    test_loader = data_utils.Loader(test_examples, options=options)
    print 'Loaded %d test examples' % len(test_examples)

    if train:
        # prepares training data.
        print 'Loading train data...'
        train_examples = data_utils.load_examples(data_dir,
                                                  dataset='train',
                                                  keep_ratio=options['keep_ratio'],
                                                  node_index=node_index,
                                                  maxlen=maxlen,
                                                  G=G)
        train_loader = data_utils.Loader(train_examples, options=options)
        print 'Loaded %d training examples.' % len(train_examples)

        # compiles updates.
        optimizer = downhill.build(algo='adam',
                                   loss=model['cost'],
                                   params=tparams.values(),
                                   inputs=model['data'])
        updates = optimizer.get_updates(max_gradient_elem=5.,
                                        learning_rate=learning_rate)
        f_update = theano.function(model['data'],
                                   model['cost'],
                                   updates=list(updates))

        # training loop.
        start_time = timeit.default_timer()

        n_examples = len(train_examples)
        batches_per_epoch = n_examples // options['batch_size'] + 1
        n_epochs = global_steps // batches_per_epoch + 1

        global_step = 0
        cost_history = []
        for _ in range(n_epochs):
            for _ in range(batches_per_epoch):
                cost = f_update(*train_loader())
                cost_history += [cost]

                if global_step % disp_freq == 0:
                    print 'global step %d, cost: %f' % (global_step, cost)

                # dump model parameters.
                if global_step % save_freq == 0:
                    params = unzip(tparams)
                    np.savez(saveto, **params)
                    pickle.dump(options, open('%s.pkl' % saveto, 'wb'), -1)

                # evaluate on test data.
                if global_step % test_freq == 0:
                    scores = evaluate(model['f_prob'], test_loader)
                    print 'eval scores: ', scores
                    end_time = timeit.default_timer()
                    print 'time used: %d seconds.' % (end_time - start_time)

                global_step += 1

    scores = evaluate(model['f_prob'], test_loader)
    pprint.pprint(scores)
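
# Hedged sketch (not part of the original file) of the epoch/step arithmetic
# used above: rounding batches_per_epoch and n_epochs up with `// ... + 1`
# guarantees the inner loop body runs at least `global_steps` times.
def _demo_schedule(n_examples=1000, batch_size=256, global_steps=50000):
    batches_per_epoch = n_examples // batch_size + 1     # 4 with these defaults
    n_epochs = global_steps // batches_per_epoch + 1     # 12501 with these defaults
    total_updates = n_epochs * batches_per_epoch
    assert total_updates >= global_steps
    return batches_per_epoch, n_epochs, total_updates

print _demo_schedule()   # (4, 12501, 50004)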
def train_pet(args) -> List:
    # Load configs
    model_config, train_config, eval_config = load_pet_configs(args)

    # Load datasets
    train_data = load_examples(args.task_name, args.data_dir, TRAIN_SET,
                               num_examples=args.train_examples,
                               split_examples_evenly=args.split_examples_evenly)
    eval_data = load_examples(args.task_name, args.data_dir,
                              TEST_SET if args.eval_set == 'test' else DEV_SET,
                              num_examples=args.eval_examples,
                              split_examples_evenly=args.split_examples_evenly)
    dev_data = load_examples(args.task_name, args.data_dir, DEV32_SET,
                             num_examples=args.dev_examples,
                             split_examples_evenly=args.split_examples_evenly)

    set_seed(args.seed)

    # Record all evaluation results on the dev & eval sets
    all_result = []
    dev_result_all = defaultdict(lambda: defaultdict(list))
    eval_result_all = defaultdict(lambda: defaultdict(list))
    # In two-stage training, the stage-1 evaluations should also be recorded
    if args.do_train and args.do_eval and args.two_stage_train:
        dev_stage1_all = defaultdict(lambda: defaultdict(list))
        eval_stage1_all = defaultdict(lambda: defaultdict(list))

    # Iterate over all patterns
    for pattern_id in args.pattern_ids:
        # Repeat training
        for iteration in range(args.pet_repetitions):
            results_dict = {}
            model_config.pattern_id = pattern_id
            pattern_iter_output_dir = "{}/p{}-i{}".format(args.output_dir, pattern_id, iteration)

            if os.path.exists(pattern_iter_output_dir):
                logger.warning(f"Path {pattern_iter_output_dir} already exists, skipping it...")
                continue
            os.makedirs(pattern_iter_output_dir)

            # Init wrapper model
            assert model_config.pattern_id is not None, 'A pattern_id must be set for initializing a new PET model'
            wrapper = TransformerModelWrapper(model_config)

            # Training
            logger.info('--- Start iteration %d ---' % iteration)
            if args.do_train:
                if not args.two_stage_train:
                    # Single-stage training
                    logger.info('=== Start training ===')
                    results_dict.update(train_single_model(train_data, eval_data, dev_data,
                                                           pattern_iter_output_dir, wrapper,
                                                           train_config, eval_config))
                    evaluate_single_model(pattern_id, pattern_iter_output_dir, eval_data, dev_data,
                                          eval_config, results_dict, dev_result_all, eval_result_all)
                    with open(os.path.join(pattern_iter_output_dir, 'results.json'), 'w') as fh:
                        json.dump(results_dict, fh)
                else:
                    # Two-stage training
                    # 1. Only train prompts and label tokens
                    logger.info('=== Start training stage 1 ===')
                    results_dict.update(train_single_model(train_data, eval_data, dev_data,
                                                           pattern_iter_output_dir, wrapper,
                                                           train_config, eval_config, stage=1))
                    evaluate_single_model(pattern_id, pattern_iter_output_dir, eval_data, dev_data,
                                          eval_config, results_dict, dev_stage1_all, eval_stage1_all)
                    with open(os.path.join(pattern_iter_output_dir, 'results_stage1.json'), 'w') as fh:
                        json.dump(results_dict, fh)

                    # 2. Train the full model
                    logger.info('=== Start training stage 2 ===')
                    results_dict.update(train_single_model(train_data, eval_data, dev_data,
                                                           pattern_iter_output_dir, wrapper,
                                                           train_config, eval_config, stage=2))
                    evaluate_single_model(pattern_id, pattern_iter_output_dir, eval_data, dev_data,
                                          eval_config, results_dict, dev_result_all, eval_result_all)
                    with open(os.path.join(pattern_iter_output_dir, 'results.json'), 'w') as fh:
                        json.dump(results_dict, fh)

                # Save configs
                train_config.save(os.path.join(pattern_iter_output_dir, 'train_config.json'))
                eval_config.save(os.path.join(pattern_iter_output_dir, 'eval_config.json'))
                logger.info("Saving complete")

            # Evaluation only
            elif args.do_eval:
                evaluate_single_model(pattern_id, pattern_iter_output_dir, eval_data, dev_data,
                                      eval_config, results_dict, dev_result_all, eval_result_all)
                # Write overall results
                with open(os.path.join(pattern_iter_output_dir, 'results.json'), 'w') as fh:
                    json.dump(results_dict, fh)

            # Clear cache
            wrapper.model = None
            wrapper = None
            torch.cuda.empty_cache()

    # Aggregate results over all patterns and repetitions
    if args.do_eval:
        logger.info("=== OVERALL RESULTS ===")
        if args.do_train and args.do_eval and args.two_stage_train:
            # Store stage-1 results first
            all_result.extend(write_results(os.path.join(args.output_dir, 'result_stage1.txt'),
                                            dev_stage1_all, eval_stage1_all))
        all_result.extend(write_results(os.path.join(args.output_dir, 'result.txt'),
                                        dev_result_all, eval_result_all))

    return all_result
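
# Hedged usage sketch (hypothetical, not from the repo): `train_pet` only reads
# attributes off `args`, so a SimpleNamespace carrying the fields referenced
# above is enough to drive it; a real run would build `args` via argparse. The
# task name and paths below are made up for illustration.
from types import SimpleNamespace

args = SimpleNamespace(
    task_name='boolq', data_dir='data/boolq', output_dir='output/boolq',
    train_examples=32, eval_examples=-1, dev_examples=32,
    split_examples_evenly=False, eval_set='dev', seed=42,
    pattern_ids=[0, 1], pet_repetitions=3,
    do_train=True, do_eval=True, two_stage_train=False,
)
# all_result = train_pet(args)   # one result entry per (pattern, repetition)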
def train(data_dir='data/memes/',
          dim_proj=256,
          dim_att=128,
          maxlen=30,
          batch_size=256,
          keep_ratio=1.,
          shuffle_data=True,
          learning_rate=0.001,
          global_steps=50000,
          disp_freq=100,
          save_freq=100,
          test_freq=100,
          saveto_file='params.npz',
          tmsaveto_file='timeparams.npz',
          weight_decay=0.0005,
          sigmasqr=1,
          tdim=1.,
          reload_model=False,
          train=True):
    """
    Topo-LSTM model training.
    tdim: scale time down by how many times
    """
    options = locals().copy()
    saveto = data_dir + saveto_file
    tmsaveto = data_dir + tmsaveto_file

    # loads graph.
    Gp, node_index = data_utils.load_graph(data_dir)
    options['n_events'] = len(node_index)
    print options

    # creates and initializes shared variables.
    print 'Initializing variables...'
    params = init_params(options)
    if reload_model:
        print 'reusing saved model.'
        load_params(saveto, params)
    tparams = init_tparams(params)

    timeparams = init_timeparams(options)
    if reload_model:
        print 'reusing saved model.'
        load_params(tmsaveto, timeparams)
    timetparams = init_tparams(timeparams)

    # builds the model.
    print 'Building model...'
    model = tpgru_model.build_model(tparams, timetparams, options)

    print 'Loading test data...'
    test_examples = data_utils.load_examples(data_dir,
                                             dataset='test',
                                             node_index=node_index,
                                             maxlen=maxlen,
                                             Gp=Gp)
    test_loader = data_utils.Loader(test_examples, options=options)
    print 'Loaded %d test examples' % len(test_examples)

    if train:
        # prepares training data.
        print 'Loading train data...'
        train_examples = data_utils.load_examples(data_dir,
                                                  dataset='train',
                                                  keep_ratio=options['keep_ratio'],
                                                  node_index=node_index,
                                                  maxlen=maxlen,
                                                  Gp=Gp)
        train_loader = data_utils.Loader(train_examples, options=options)
        print 'Loaded %d training examples.' % len(train_examples)

        # compiles updates for the topology model.
        optimizer = downhill.build(algo='adam',
                                   loss=model['cost'],
                                   params=tparams.values(),
                                   inputs=model['data'])
        updates = optimizer.get_updates(max_gradient_elem=5.,
                                        learning_rate=learning_rate)
        f_update = theano.function(model['data'],
                                   model['cost'],
                                   updates=list(updates))

        # compiles updates for the time model.
        toptimizer = downhill.build(algo='adam',
                                    loss=model['timecost'],
                                    params=timetparams.values(),
                                    inputs=model['timedata'])
        tupdates = toptimizer.get_updates(max_gradient_elem=5.,
                                          learning_rate=0.005)
        f_t_update = theano.function(model['timedata'],
                                     model['timecost'],
                                     updates=list(tupdates))

        # training loop.
        start_time = timeit.default_timer()

        n_examples = len(train_examples)
        batches_per_epoch = n_examples // options['batch_size'] + 1
        n_epochs = global_steps // batches_per_epoch + 1

        global_step = 0
        for _ in range(n_epochs):
            for _ in range(batches_per_epoch):
                batch_data = train_loader()
                cost = f_update(*(batch_data[:-3] + (batch_data[-2], )))
                timecost = f_t_update(*(batch_data[:-2] + (batch_data[-1], )))

                if global_step % disp_freq == 0:
                    print 'global step %d, cost: %f' % (global_step, cost)
                    print 'timecost: %f' % timecost

                # dump model parameters.
                if global_step % save_freq == 0:
                    params = unzip(tparams)
                    np.savez(saveto, **params)
                    pickle.dump(options, open('%s.pkl' % saveto, 'wb'), -1)
                    timeparams = unzip(timetparams)
                    np.savez(tmsaveto, **timeparams)

                # evaluate on test data.
                if global_step % test_freq == 0:
                    scores = evaluate(model['f_prob'], test_loader,
                                      model['f_tprob'], options['tdim'])
                    print 'eval scores: ', scores
                    end_time = timeit.default_timer()
                    print 'time used: %d seconds.' % (end_time - start_time)

                global_step += 1

    scores = evaluate(model['f_prob'], test_loader,
                      model['f_tprob'], options['tdim'])
    pprint.pprint(scores)
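
# Hedged sketch (my reading, not from the source) of the batch-tuple slicing
# above: assuming the loader returns the shared inputs followed by three
# trailing entries, `batch_data[:-3] + (batch_data[-2], )` feeds the topology
# model its own target, while `batch_data[:-2] + (batch_data[-1], )` feeds the
# time model the time target instead. The entry names below are hypothetical.
def _demo_split(batch_data):
    topo_inputs = batch_data[:-3] + (batch_data[-2], )
    time_inputs = batch_data[:-2] + (batch_data[-1], )
    return topo_inputs, time_inputs

topo, tm = _demo_split(('seq', 'mask', 'topo_extra', 'node_label', 'time_label'))
assert topo == ('seq', 'mask', 'node_label')
assert tm == ('seq', 'mask', 'topo_extra', 'time_label')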
def main_train(args, model, tokenizer, processor, label_list, device, n_gpu):
    train_examples = processor.get_train_examples(args.data_dir)
    num_train_optimization_steps = int(
        len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
    if args.local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # Prepare optimizer: no weight decay for biases and LayerNorm parameters
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

    if args.fp16:
        print("using fp16")
        try:
            from apex import amp  # amp is used below for initialize/scale_loss
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False)
        if args.loss_scale == 0:
            model, optimizer = amp.initialize(model, optimizer, opt_level="O2",
                                              keep_batchnorm_fp32=False, loss_scale="dynamic")
        else:
            model, optimizer = amp.initialize(model, optimizer, opt_level="O2",
                                              keep_batchnorm_fp32=False, loss_scale=args.loss_scale)
        scheduler = LinearWarmUpScheduler(optimizer, warmup=args.warmup_proportion,
                                          total_steps=num_train_optimization_steps)
    else:
        print("using fp32")
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    average_loss = 0

    print("data prep")
    train_data = load_examples(args, tokenizer, processor, label_list, "train")
    if args.local_rank == -1:
        train_sampler = RandomSampler(train_data)
    else:
        train_sampler = DistributedSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

    model.train()
    nb_tr_examples = 0
    for epoch_num in trange(int(args.num_train_epochs), desc="Epoch"):
        if args.max_steps > 0 and global_step > args.max_steps:
            break
        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
            if args.max_steps > 0 and global_step > args.max_steps:
                break
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids, valid_ids, label_mask, b_use_valid_filter, \
                adj_matrix, dep_matrix = batch
            loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask,
                         labels=label_ids, valid_ids=valid_ids, adjacency_matrix=adj_matrix)
            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            average_loss += loss.item()  # .item() so the running sum does not keep the graph alive
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), 1.0)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                if args.fp16:
                    # modify learning rate with special warm-up for BERT, which FusedAdam doesn't do itself
                    scheduler.step()

                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

                logging.info("Global Steps:{} Final Loss = {}".format(global_step, average_loss))
                average_loss = 0

        if args.local_rank == -1 or torch.distributed.get_rank() == 0 or args.world_size <= 1:
            # Save model checkpoint
            output_dir = os.path.join(args.output_dir, "epoch-{}".format(epoch_num))
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            save_zen_model(output_dir, model, args)

    loss = tr_loss / nb_tr_steps if args.do_train else None
    return loss, global_step
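
# Hedged, standalone sketch of the weight-decay grouping used in main_train:
# biases and LayerNorm parameters are excluded from decay, everything else
# gets 0.01. Shown with torch.optim.AdamW instead of BertAdam/FusedAdam.
import torch

def build_grouped_optimizer(model, lr=5e-5):
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    params = list(model.named_parameters())
    groups = [
        {'params': [p for n, p in params if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in params if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    return torch.optim.AdamW(groups, lr=lr)

# e.g. on a tiny model: every `bias` tensor lands in the zero-decay group
opt = build_grouped_optimizer(torch.nn.Linear(4, 2))
assert len(opt.param_groups[1]['params']) == 1   # the Linear layer's bias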