def main():
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)

    InitNodes(args)
    snapshot = Snapshot(args.model_save_dir, args.model_load_dir)

    # if args.save_and_break:
    #     print("save model just after init and exit")
    #     snapshot.save("initial_snapshot")
    #     import sys
    #     sys.exit()

    for epoch in range(args.num_epochs):
        metric = Metric(desc='finetune',
                        print_steps=args.loss_print_every_n_iter,
                        batch_size=batch_size,
                        keys=['loss'])
        for step in range(epoch_size):
            BertGlueFinetuneJob().async_get(metric.metric_cb(step, epoch=epoch))
        # if 1:  # step % args.loss_print_every_n_iter == 0:
        run_eval_job(BertGlueEvalTrainJob, epoch_size, desc='train')
        run_eval_job(BertGlueEvalValJob, num_eval_steps, desc='eval')

    if args.save_last_snapshot:
        snapshot.save("last_snapshot")

def main():
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)

    InitNodes(args)
    snapshot = Snapshot(args.model_save_dir, args.model_load_dir)
    summary = Summary(args.log_dir, args)

    if args.do_train:
        print('| Training Start')
        for epoch in range(args.num_epochs):
            metric = Metric(desc='finetune',
                            print_steps=args.loss_print_every_n_iter,
                            summary=summary,
                            batch_size=batch_size,
                            keys=['loss'])
            for step in range(epoch_size):
                BertGlueFinetuneJob().async_get(metric.metric_cb(step, epoch=epoch))
            run_eval_job(BertGlueEvalTrainJob, epoch_size, desc='train')
            run_eval_job(BertGlueEvalValJob, num_eval_steps, desc='eval')
        if args.save_last_snapshot:
            snapshot.save("last_snapshot")

    if args.do_eval:
        print('| Evaluation Start')
        run_eval_job(BertGlueEvalValJob, num_eval_steps, desc='eval')

def main():
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)

    InitNodes(args)
    if args.do_train or args.do_eval:
        snapshot = Snapshot(args.model_save_dir, args.model_load_dir)

    if args.do_train:
        summary = Summary(args.log_dir, args)
        for epoch in range(args.num_epochs):
            metric = Metric(desc='train',
                            print_steps=args.loss_print_every_n_iter,
                            summary=summary,
                            batch_size=batch_size,
                            keys=['total_loss'])
            for step in range(epoch_size):
                SquadFinetuneJob().async_get(metric.metric_cb(step, epoch=epoch))
        if args.save_last_snapshot:
            snapshot.save("last_snapshot")

    if args.do_eval:
        assert os.path.isdir(args.eval_data_dir)
        all_results = []
        for step in range(num_eval_steps):
            unique_ids, start_positions, end_positions = SquadDevJob().get()
            unique_ids = unique_ids.numpy()
            start_positions = start_positions.numpy()
            end_positions = end_positions.numpy()

            for unique_id, start_position, end_position in zip(
                    unique_ids, start_positions, end_positions):
                all_results.append(
                    RawResult(
                        unique_id=int(unique_id[0]),
                        start_logits=start_position.flatten().tolist(),
                        end_logits=end_position.flatten().tolist(),
                    ))

            if step % args.loss_print_every_n_iter == 0:
                print("{}/{}, num of results:{}".format(step, num_eval_steps, len(all_results)))
                print("last uid:", unique_id[0])

        gen_eval_predict_json(args, all_results)

def main():
    InitNodes(args)
    assert args.model_load_dir, "Must have model load dir!"

    flow.env.log_dir(args.log_dir)
    # snapshot = Snapshot(args.model_save_dir, args.model_load_dir)
    print("Restoring model from {}.".format(args.model_load_dir))
    flow.load_variables(flow.checkpoint.get(args.model_load_dir))

    metric = Metric(desc="validation",
                    calculate_batches=num_val_steps,
                    batch_size=val_batch_size)
    for i in range(args.num_epochs):
        for j in range(num_val_steps):
            InferenceNet().async_get(metric.metric_cb(0, j))

def main(self):
    processed_list, alphabet, _, emb_dim = pkl.load(
        open(self.config['res_path'].format(self.config['dataset']), 'rb'))
    if isinstance(processed_list, dict):
        processed_list = [processed_list]

    scores = []
    for data_list in processed_list:
        train_data = MyDatasetLoader(self.config, data_list, 'train').get_data()
        valid_data = MyDatasetLoader(self.config, data_list, 'valid').get_data()
        test_data = MyDatasetLoader(self.config, data_list, 'test').get_data()

        self.model = TextCNN(self.config, alphabet, emb_dim, self.device).to(self.device)
        for w in self.model.parameters():
            print(w.shape, w.requires_grad)
        self.optimizer = Adam(filter(lambda x: x.requires_grad, self.model.parameters()),
                              lr=self.config['lr'],
                              weight_decay=float(self.config['l2']),
                              eps=float(self.config['esp']))
        self.metircs = Metric()

        score = self.forward(train_data, valid_data, test_data)
        scores.append(score)

    print('| valid best | global best |')
    print('| --- | --- |')
    for w in scores:
        print("| {:.4f} | {:.4f} |".format(w[0], w[1]))
    if len(scores) > 1:
        print("valid Avg\tglobal Avg")
        print("| {:.4f} | {:.4f} |".format(np.mean([w[0] for w in scores]),
                                           np.mean([w[1] for w in scores])))

def train(model, loader, criterion, optimizer, epoch, device, opt):
    model.train()

    train_loss = 0.0
    losses = AverageMeter()
    metric = Metric(opt.num_classes)

    for i, (imgs, spatial_locations, word_vectors, targets_predicates,
            targets_confidences) in enumerate(loader):
        imgs = imgs.to(device)
        spatial_locations = spatial_locations.to(device)
        word_vectors = word_vectors.to(device)
        targets_confidences = targets_confidences.to(device)
        targets_predicates = targets_predicates.to(device)

        # compute outputs
        confidences, predicates = model(imgs, spatial_locations, word_vectors)

        # compute loss
        loss1 = criterion(confidences, targets_confidences)
        loss2 = criterion(predicates, targets_predicates)
        tot_loss = loss1 + loss2
        train_loss += tot_loss.item()
        losses.update(tot_loss.item(), imgs.size(0))

        predicates = torch.sigmoid(predicates)
        metric.update(predicates, targets_predicates)

        optimizer.zero_grad()
        tot_loss.backward()
        optimizer.step()

        # show information
        if (i + 1) % opt.log_interval == 0:
            avg_loss = train_loss / opt.log_interval
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, losses.count, len(loader.dataset),
                100. * (i + 1) / len(loader), avg_loss))
            train_loss = 0.0

    # show information
    recall = metric.compute_metrics()
    print('Train set ({:d} samples): Average loss: {:.4f}\tRecall: {:.4f}'.format(
        losses.count, losses.avg, recall))

    return losses.avg, recall

def main():
    InitNodes(args)
    assert args.model_load_dir, 'Must have model load dir!'

    flow.env.log_dir(args.log_dir)
    summary = Summary(args.log_dir, args)
    # snapshot = Snapshot(args.model_save_dir, args.model_load_dir)
    print("Restoring model from {}.".format(args.model_load_dir))
    checkpoint = flow.train.CheckPoint()
    checkpoint.load(args.model_load_dir)

    metric = Metric(desc='validation',
                    calculate_batches=num_val_steps,
                    summary=summary,
                    save_summary_steps=num_val_steps,
                    batch_size=val_batch_size)
    for i in range(args.num_epochs):
        for j in range(num_val_steps):
            InferenceNet().async_get(metric.metric_cb(0, j))

def main():
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)

    InitNodes(args)
    snapshot = Snapshot(args.model_save_dir, args.model_load_dir)

    metric = Metric(desc='train',
                    print_steps=args.loss_print_every_n_iter,
                    batch_size=batch_size,
                    keys=['total_loss', 'mlm_loss', 'nsp_loss'])
    for step in range(args.iter_num):
        PretrainJob().async_get(metric.metric_cb(step))
        # PretrainJob().async_get(metric.metric_cb(step, epoch=3))
        if (step + 1) % args.model_save_every_n_iter == 0:
            snapshot.save("snapshot_%d" % (step + 1))

    if args.save_last_snapshot:
        snapshot.save("last_snapshot")

def main():
    InitNodes(args)
    flow.env.log_dir(args.log_dir)

    snapshot = Snapshot(args.model_save_dir, args.model_load_dir, args.save_init)

    print(" {} iter per epoch...".format(epoch_size))
    for epoch in range(1, args.num_epochs + 1):
        metric = Metric(
            desc="train",
            calculate_batches=args.loss_print_every_n_iter,
            batch_size=train_batch_size,
            loss_key="loss",
        )
        for i in range(epoch_size):
            TrainNet().async_get(metric.metric_cb(epoch, i))

        if args.val_data_dir:
            metric = Metric(
                desc="validation",
                calculate_batches=num_val_steps,
                batch_size=val_batch_size,
            )
            for i in range(num_val_steps):
                InferenceNet().async_get(metric.metric_cb(epoch, i))

        if epoch % args.save_epoch_interval == 0:
            snapshot.save("epoch_{}".format(epoch))

    if args.save_last:
        snapshot.save("epoch_{}".format("last"))

def main():
    InitNodes(args)
    flow.env.grpc_use_no_signal()
    flow.env.log_dir(args.log_dir)

    summary = Summary(args.log_dir, args)
    snapshot = Snapshot(args.model_save_dir, args.model_load_dir)

    for epoch in range(args.num_epochs):
        metric = Metric(desc='train',
                        calculate_batches=args.loss_print_every_n_iter,
                        summary=summary,
                        save_summary_steps=epoch_size,
                        batch_size=train_batch_size,
                        loss_key='loss')
        for i in range(epoch_size):
            TrainNet().async_get(metric.metric_cb(epoch, i))

        if args.val_data_dir:
            metric = Metric(desc='validation',
                            calculate_batches=num_val_steps,
                            summary=summary,
                            save_summary_steps=num_val_steps,
                            batch_size=val_batch_size)
            for i in range(num_val_steps):
                InferenceNet().async_get(metric.metric_cb(epoch, i))

        snapshot.save('epoch_{}'.format(epoch))

def validate(model, loader, criterion, epoch, device, opt):
    model.eval()

    losses = AverageMeter()
    metric = Metric(opt.num_classes)

    with torch.no_grad():
        for i, (imgs, spatial_locations, word_vectors, targets_predicates,
                targets_confidences) in enumerate(loader):
            imgs = imgs.to(device)
            spatial_locations = spatial_locations.to(device)
            word_vectors = word_vectors.to(device)
            targets_confidences = targets_confidences.to(device)
            targets_predicates = targets_predicates.to(device)

            # compute outputs
            confidences, predicates = model(imgs, spatial_locations, word_vectors)

            # compute loss
            loss = criterion(predicates, targets_predicates)
            metric.update(predicates, targets_predicates)
            losses.update(loss.item(), imgs.size(0))

    # show information
    recall = metric.compute_metrics()
    print('Validation set ({:d} samples): Average loss: {:.4f}\tRecall: {:.4f}'.format(
        losses.count, losses.avg, recall))

    return losses.avg, recall

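# Hedged sketch (not part of the original source): one way the train() and validate()
# functions above could be driven from a top-level loop. The loader names
# ("train_loader", "val_loader"), "opt.num_epochs", and the checkpoint filename are
# assumptions made for illustration only; train() and validate() are the functions
# defined above and keep their original signatures and (loss, recall) return values.
import torch


def run_training(model, train_loader, val_loader, criterion, optimizer, device, opt):
    best_recall = 0.0
    for epoch in range(1, opt.num_epochs + 1):
        # one pass over the training data, then one over the validation data
        train_loss, train_recall = train(model, train_loader, criterion,
                                         optimizer, epoch, device, opt)
        val_loss, val_recall = validate(model, val_loader, criterion,
                                        epoch, device, opt)
        # keep the weights with the best validation recall ("best_model.pth" is a placeholder path)
        if val_recall > best_recall:
            best_recall = val_recall
            torch.save(model.state_dict(), "best_model.pth")
    return best_recall
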
def main():
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)

    InitNodes(args)
    snapshot = Snapshot(args.model_save_dir, args.model_load_dir, args.model_save_init)

    print("num_accumulation_steps:", args.num_accumulation_steps)
    metric = Metric(
        desc="train",
        print_steps=args.loss_print_every_n_iter,
        batch_size=batch_size * args.num_accumulation_steps,
        keys=["total_loss", "mlm_loss", "nsp_loss"],
    )
    for step in range(args.iter_num):
        PretrainJob().async_get(metric.metric_cb(step))
        # PretrainJob().async_get(metric.metric_cb(step, epoch=3))
        if (step + 1) % args.model_save_every_n_iter == 0:
            snapshot.save("snapshot_%d" % (step + 1))

    if args.save_last_snapshot:
        snapshot.save("last_snapshot")

def main():
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)

    InitNodes(args)

    if args.do_train:
        snapshot = Snapshot(args.model_save_dir, args.model_load_dir)
        summary = Summary(args.log_dir, args)
        best_dev_acc = 0.0
        best_result = {}
        for epoch in range(args.num_epochs):
            metric = Metric(desc='finetune',
                            print_steps=args.loss_print_every_n_iter,
                            summary=summary,
                            batch_size=batch_size,
                            keys=['loss'])
            for step in range(epoch_size):
                BertGlueFinetuneJob().async_get(metric.metric_cb(step, epoch=epoch))
            # if 1:  # step % args.loss_print_every_n_iter == 0:
            run_eval_job(BertGlueEvalTrainJob, epoch_size, desc='train')
            result = run_eval_job(BertGlueEvalValJob, num_eval_steps, desc='eval')

            save_model = False
            if task_name in acc_tasks and result['accuracy'] > best_dev_acc:
                best_dev_acc = result['accuracy']
                best_result = result
                save_model = True
                print('Best result:', result)

            # if task_name in corr_tasks and result['corr'] > best_dev_acc:
            #     best_dev_acc = result['corr']
            #     best_result = result
            #     save_model = True
            #     print('Best result:', result)

            if task_name in mcc_tasks and result['matthews_corrcoef'] > best_dev_acc:
                best_dev_acc = result['matthews_corrcoef']
                best_result = result
                save_model = True
                print('Best result:', result)

            if save_model:
                if not os.path.exists(args.model_save_dir):
                    os.makedirs(args.model_save_dir)
                # snapshot_save_path = os.path.join(args.model_save_dir)
                # print("Saving best model to {}".format(snapshot_save_path))
                snapshot.save('best')
                flow.sync_default_session()

        print('Best result:', best_result)
        print("Saving best model to " + os.path.join(args.model_save_dir, 'snapshot_best'))

        if args.serve_for_online:
            print('Deleting the optimizer params from model_save_dir...')
            remove_optimizer_params(os.path.join(args.model_save_dir, 'snapshot_best'))

        # if args.save_last_snapshot:
        #     snapshot.save("last_snapshot")

    if args.do_eval:
        print('Loading model...')
        print(args.model_save_dir)
        if not args.do_train:
            check_point = flow.train.CheckPoint()
            check_point.load(args.model_save_dir)
        print('Evaluation...')
        run_eval_job(BertGlueEvalValJob, num_eval_steps, desc='eval')

parser = configs.get_parser()
args = parser.parse_args()
configs.print_args(args)

flow.config.gpu_device_num(args.gpu_num_per_node)
flow.config.enable_debug_mode(True)


@flow.global_function(get_val_config(args))
def IOTest():
    if args.train_data_dir:
        assert os.path.exists(args.train_data_dir)
        print("Loading data from {}".format(args.train_data_dir))
        (labels, images) = load_imagenet_for_training(args)
    else:
        print("Loading synthetic data.")
        (labels, images) = load_synthetic(args)
    outputs = {"images": images, "labels": labels}
    return outputs


total_device_num = args.num_nodes * args.gpu_num_per_node
train_batch_size = total_device_num * args.batch_size_per_device

summary = Summary(args.log_dir, args, filename='io_test.csv')
metric = Metric(desc='io_test',
                calculate_batches=args.loss_print_every_n_iter,
                summary=summary,
                save_summary_steps=args.loss_print_every_n_iter,
                batch_size=train_batch_size,
                prediction_key=None)

for i in range(1000):
    IOTest().async_get(metric.metric_cb(0, i))

from job_function_util import get_val_config

parser = configs.get_parser()
args = parser.parse_args()
configs.print_args(args)

flow.config.gpu_device_num(args.gpu_num_per_node)
# flow.config.enable_debug_mode(True)


@flow.global_function(get_val_config(args))
def IOTest():
    if args.train_data_dir:
        assert os.path.exists(args.train_data_dir)
        print("Loading data from {}".format(args.train_data_dir))
        (labels, images) = load_imagenet_for_training(args)
    else:
        print("Loading synthetic data.")
        (labels, images) = load_synthetic(args)
    outputs = {"images": images, "labels": labels}
    return outputs


total_device_num = args.num_nodes * args.gpu_num_per_node
train_batch_size = total_device_num * args.batch_size_per_device

metric = Metric(
    desc="io_test",
    calculate_batches=args.loss_print_every_n_iter,
    batch_size=train_batch_size,
    prediction_key=None,
)

for i in range(1000):
    IOTest().async_get(metric.metric_cb(0, i))

def getdirsize(dir):
    size = 0
    for root, dirs, files in os.walk(dir):
        for name in files:
            tmp = os.path.getsize(os.path.join(root, name))
            size += tmp
        # size += sum([os.path.getsize(os.path.join(root, name)) for name in files])
    return size


def main():
    InitNodes(args)
    flow.env.log_dir(args.log_dir)

    modelSize = getdirsize(args.model_load_dir)
    summary = Summary(args.log_dir, args, modelSize)
    snapshot = Snapshot(args.model_save_dir, args.model_load_dir)

    for epoch in range(args.num_epochs):
        metric = Metric(desc='train',
                        calculate_batches=args.loss_print_every_n_iter,
                        summary=summary,
                        save_summary_steps=epoch_size,
                        batch_size=train_batch_size,
                        loss_key='loss')
        for i in range(epoch_size):
            TrainNet().async_get(metric.metric_cb(epoch, i))
        # flow.tensorrt.write_int8_calibration("./int8_calibration")  # mkdir int8_calibration

        if args.val_data_dir:
            metric = Metric(desc='validation',
                            calculate_batches=num_val_steps,
                            summary=summary,
                            save_summary_steps=num_val_steps,
                            batch_size=val_batch_size)
            for i in range(num_val_steps):
                # if i <= 10:
                InferenceNet().async_get(metric.metric_cb(epoch, i))

def main():
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)

    InitNodes(args)
    check_point = flow.train.CheckPoint()
    summary = Summary(args.log_dir, args)

    if not os.path.exists(args.model_save_dir):
        os.makedirs(args.model_save_dir)

    if args.do_train:
        print('Combining two models into one dir')
        if not os.path.exists('./tmp'):
            os.makedirs('./tmp')
        args.total_model = tempfile.mkdtemp(dir='./tmp')
        CopyFile(args.student_model, args.total_model)
        CopyFile(args.teacher_model, args.total_model)

        print('Loading model...')
        check_point.load(args.total_model)
        # check_point.load(args.teacher_model)
        # check_point.load(args.student_model)

        print('Start training...')
        global_step = 0
        best_dev_acc = 0.0
        for epoch in range(args.num_epochs):
            metric = Metric(desc='finetune',
                            print_steps=args.loss_print_every_n_iter,
                            summary=summary,
                            batch_size=batch_size,
                            keys=['loss'])
            for step in range(epoch_size):
                DistilJob().async_get(metric.metric_cb(step, epoch=epoch))
                global_step += 1
                # if (global_step + 1) % args.model_save_every_n_iter == 0:
                #     if not os.path.exists(args.model_save_dir):
                #         os.makedirs(args.model_save_dir)
                #     snapshot_save_path = os.path.join(
                #         args.model_save_dir, "snapshot_%d" % (global_step + 1))
                #     print("Saving model to {}.".format(snapshot_save_path))
                #     check_point.save(snapshot_save_path)

            # if args.pred_distill:
            print('EvalTrainJob...')
            run_eval_job(StudentBertGlueEvalTrainJob, epoch_size, desc='train')
            print('EvalValJob...')
            result = run_eval_job(StudentBertGlueEvalValJob, num_eval_steps, desc='eval')

            if not args.pred_distill:
                save_model = True
            else:
                save_model = False
            if task_name in acc_tasks and result['accuracy'] > best_dev_acc:
                best_dev_acc = result['accuracy']
                save_model = True
            # if task_name in corr_tasks and result['corr'] > best_dev_acc:
            #     best_dev_acc = result['corr']
            #     save_model = True
            if task_name in mcc_tasks and result['matthews_corrcoef'] > best_dev_acc:
                best_dev_acc = result['matthews_corrcoef']
                save_model = True
            print('Best result:', result)

            if save_model:
                if os.path.exists(args.model_save_dir):
                    import shutil
                    shutil.rmtree(args.model_save_dir)
                if not os.path.exists(args.model_save_dir):
                    os.makedirs(args.model_save_dir)
                snapshot_save_path = os.path.join(args.model_save_dir)
                print("Saving best model to {}".format(snapshot_save_path))
                check_point.save(snapshot_save_path)
                flow.sync_default_session()

        if args.save_last_snapshot:
            snapshot_save_path = args.model_save_dir
            if os.path.exists(args.model_save_dir):
                import shutil
                shutil.rmtree(args.model_save_dir)
            print("Saving model to {}".format(snapshot_save_path))
            check_point.save(snapshot_save_path)
            flow.sync_default_session()

        if global_step >= 100:
            # remove tmp total models
            print('Removing the tmp models...')
            import shutil
            shutil.rmtree(args.total_model)

        if args.serve_for_online:
            print('Deleting the teacher params and the optimizer params from model_save_dir...')
            remove_teacher_params(args.model_save_dir)

    if args.do_eval:
        print('Loading model...')
        print(args.model_save_dir)
        if not args.do_train:
            check_point.load(args.model_save_dir)
        print('Evaluation...')
        run_eval_job(StudentBertGlueEvalValJob, num_eval_steps, desc='eval')

def main():
    flow.config.enable_debug_mode(True)
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)

    InitNodes(args)
    check_point = flow.train.CheckPoint()
    check_point.init()
    summary = Summary(args.log_dir, args)

    if not os.path.exists(args.model_save_dir):
        os.makedirs(args.model_save_dir)

    if args.do_train:
        print('Start training...')
        global_step = 0
        best_dev_acc = 0.0
        print('epoch_size:', epoch_size)
        print('args.iter_num:', args.iter_num)
        for epoch in range(args.num_epochs):
            metric = Metric(desc='finetune',
                            print_steps=args.loss_print_every_n_iter,
                            summary=summary,
                            batch_size=batch_size,
                            keys=['loss'])
            for step in range(epoch_size):
                loss = DistilJob().get()
                if step % 10 == 0:
                    print('step/epoch_size:{}/{} epoch:{}'.format(step, epoch_size, epoch))
                    print('loss:', loss['loss'].mean())
                # global_step += 1
                # DistilJob().async_get(metric.metric_cb(step, epoch=epoch))

            print('EvalTrainJob...')
            run_eval_job(StudentBertGlueEvalTrainJob, epoch_size, desc='train')
            print('EvalValJob...')
            result = run_eval_job(StudentBertGlueEvalValJob, num_eval_steps, desc='eval')

            save_model = False
            if task_name in acc_tasks and result['accuracy'] > best_dev_acc:
                best_dev_acc = result['accuracy']
                save_model = True
            # if task_name in corr_tasks and result['corr'] > best_dev_acc:
            #     best_dev_acc = result['corr']
            #     save_model = True
            if task_name in mcc_tasks and result['matthews_corrcoef'] > best_dev_acc:
                best_dev_acc = result['matthews_corrcoef']
                save_model = True
            print('Best result:', result)

            if save_model:
                if not os.path.exists(args.model_save_dir):
                    os.makedirs(args.model_save_dir)
                snapshot_save_path = os.path.join(args.model_save_dir)
                print("Saving best model to {}".format(snapshot_save_path))
                check_point.save(snapshot_save_path)

        if args.save_last_snapshot:
            snapshot_save_path = args.model_save_dir
            print("Saving model to {}".format(snapshot_save_path))
            check_point.save(snapshot_save_path)

    if args.do_eval:
        print('Loading model...')
        print(args.model_save_dir)
        if not args.do_train:
            check_point.load(args.model_save_dir)
        print('Evaluation...')
        run_eval_job(StudentBertGlueEvalValJob, num_eval_steps, desc='eval')

def main():
    InitNodes(args)
    flow.env.log_dir(args.log_dir)

    modelSize = getdirsize(args.model_load_dir)
    summary = Summary(args.log_dir, args, modelSize)
    # snapshot = Snapshot(args.model_save_dir, args.model_load_dir)
    print("Restoring model from {}.".format(args.model_load_dir))
    checkpoint = flow.train.CheckPoint()
    checkpoint.load(args.model_load_dir)

    if args.use_int8_online:
        for j in range(10):
            InferenceNet().get()
        flow.tensorrt.cache_int8_calibration()
        flow.tensorrt.write_int8_calibration("./int8_calibration")

    warmup = 2
    for j in range(warmup):
        InferenceNet().get()

    metric = Metric(desc='validation',
                    calculate_batches=num_val_steps,
                    summary=summary,
                    save_summary_steps=num_val_steps,
                    batch_size=val_batch_size)
    for i in range(args.num_epochs):
        for j in range(num_val_steps):
            InferenceNet().async_get(metric.metric_cb(0, j))


if __name__ == "__main__":
    main()

def compute_scores(self, gt, preds):
    ret = {'ndcg': Metric.ndcg(gt, preds), 'auc': Metric.auc(gt, preds)}
    return ret

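# Hedged usage sketch (not from the original source): shows the shape of the dict
# returned by compute_scores above. The "evaluator" variable and the contents of
# "gt"/"preds" (ground-truth labels and model scores) are assumptions for illustration.
scores = evaluator.compute_scores(gt, preds)
print("NDCG: {:.4f}  AUC: {:.4f}".format(scores['ndcg'], scores['auc']))
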
import os

import config as configs
from util import Summary, InitNodes, Metric
from job_function_util import get_val_config

parser = configs.get_parser()
args = parser.parse_args()
configs.print_args(args)

flow.config.gpu_device_num(args.gpu_num_per_node)
# flow.config.enable_debug_mode(True)


@flow.global_function(get_val_config(args))
def IOTest():
    if args.train_data_dir:
        assert os.path.exists(args.train_data_dir)
        print("Loading data from {}".format(args.train_data_dir))
        (labels, images) = load_imagenet_for_training(args)
    else:
        print("Loading synthetic data.")
        (labels, images) = load_synthetic(args)
    outputs = {"images": images, "labels": labels}
    return outputs


total_device_num = args.num_nodes * args.gpu_num_per_node
train_batch_size = total_device_num * args.batch_size_per_device

summary = Summary(args.log_dir, args, filename='io_test.csv')
metric = Metric(desc='io_test',
                calculate_batches=args.loss_print_every_n_iter,
                summary=summary,
                save_summary_steps=args.loss_print_every_n_iter,
                batch_size=train_batch_size,
                prediction_key=None)

for i in range(1000):
    IOTest().async_get(metric.metric_cb(0, i))