import os
import pprint

import numpy as np
from sklearn.metrics import average_precision_score

# Assumes module-level helpers: load_dataset, load_folds, save_model,
# save_features, analyze_results, and RESULTS_PATH.


def summarize_model(model_name, dataset_name, novalidate):
    """
    Trains the model with k-fold cross-validation and then generates the
    summary. Similar to sklearn.model_selection.cross_val_predict.

    :param model_name: filename in the models package holding the pipeline or model object
    :param dataset_name: dataset name to load
    :param novalidate: if True, train on the whole dataset and save the model
    """
    model = __import__("models.%s" % model_name, globals(), locals(),
                       ['model']).model
    X, y = load_dataset(dataset_name)
    y_complete_pred = np.zeros_like(y).astype('float')

    if not novalidate:
        folds = load_folds()
        for i, (train_index, val_index) in enumerate(folds):
            X_train, X_val = X[train_index], X[val_index]
            y_train, y_val = y[train_index], y[val_index]
            model.fit(X_train, y_train)
            y_pred = model.predict_proba(X_val)[:, 1]
            y_pred_train = model.predict_proba(X_train)[:, 1]

            # Copy the fold predictions to build predictions for the complete dataset
            y_complete_pred[val_index] = y_pred

            # Save the per-fold model
            save_model(model, "%s_%s_fold%d" % (dataset_name, model_name, i))

            print("[Fold %d]: " % (i + 1))
            print("Fold Summary: ", end=' ')
            print("Training AUPRC - %8.4f" % average_precision_score(
                y_train, y_pred_train))
            analyze_results(y_val, y_pred)
            print()

        y_complete_pred.dump(
            os.path.join(RESULTS_PATH,
                         '%s_%s.npy' % (dataset_name, model_name)))
        save_features(y_complete_pred,
                      'probs/%s_%s' % (dataset_name, model_name))

        print("Complete Summary: ", end=' ')
        analyze_results(y, y_complete_pred)
        print("\nModel parameters: ")
        pprint.pprint(model.get_params(), indent=4, depth=1)
        print()
    else:
        model.fit(X, y)
        y_pred_train = model.predict_proba(X)[:, 1]
        print("Training AUPRC - %8.4f" % average_precision_score(
            y, y_pred_train))
        save_model(model, '%s_%s' % (dataset_name, model_name))
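# A minimal usage sketch. The model and dataset names below are hypothetical;
# summarize_model expects a models/<name>.py module exposing a `model` object:
if __name__ == '__main__':
    summarize_model('xgb', 'train_set', novalidate=False)  # cross-validated summary
    summarize_model('xgb', 'train_set', novalidate=True)   # refit on all data and save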
import time

import tensorflow as tf

# Assumes module-level helpers: DefaultConfig, GModel, DModel,
# LearningPoolingModel, CSISet, utils, model_utils.


def train(**kwargs):
    # Update the configuration from command-line arguments
    opt = DefaultConfig()
    opt.parse(kwargs)
    print("Configuration complete")

    # Optimizer (Adam by default)
    learning_rate = opt.learning_rate
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate,
                                       beta1=0.5, beta2=0.9)
    if opt.optimizer_type == "SGD":
        optimizer = tf.train.GradientDescentOptimizer(
            learning_rate=learning_rate)
    elif opt.optimizer_type == "Momentum":
        momentum = opt.momentum
        optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                               momentum=momentum)
    elif opt.optimizer_type == "Adam":
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate,
                                           beta1=0.5, beta2=0.9)

    # Build the static graph
    with tf.Graph().as_default():
        with tf.name_scope("inputs"):
            inputs = tf.placeholder("float", [None, 24, 2, 2],
                                    name="model_input")
            labels = tf.placeholder("float", [None, 72, 14, 2], name="labels")

        # Define the models; the trainable parameters are collected and
        # grouped per sub-model below
        model = []
        if opt.model_type == 1:  # deconvolution only
            gmodel = GModel(opt.batch_size, opt.normal_type, True,
                            "generate_model")
            model.append(gmodel)
        elif opt.model_type == 2:  # deconvolution + learnable pooling
            gmodel = GModel(opt.batch_size, opt.normal_type, True,
                            "generate_model")
            model.append(gmodel)
            learningpoolingmodel = LearningPoolingModel(
                opt.batch_size, opt.normal_type, True, opt.model_2_layers,
                "learning_pooling_model")
            model.append(learningpoolingmodel)
        elif opt.model_type == 3:  # deconvolution + GAN
            gmodel = GModel(opt.batch_size, opt.normal_type, True,
                            "generate_model")
            model.append(gmodel)
            dmodel = DModel(opt.batch_size, opt.normal_type, True,
                            opt.GAN_type, "discriminate_model")
            model.append(dmodel)
        # print(model)

        # Collect and group the trainable variables. Because the train ops
        # below depend on tf.GraphKeys.UPDATE_OPS, get_vars must be called
        # after calculate_loss; otherwise all_vars would be empty.
        def get_vars():
            all_vars = tf.trainable_variables()
            gg_vars = [var for var in all_vars
                       if "generate_model" in var.name]
            dd_vars = [var for var in all_vars
                       if "discriminate_model" in var.name]
            ll_pp_vars = [var for var in all_vars
                          if "learning_pooling_model" in var.name]
            return gg_vars, dd_vars, ll_pp_vars

        # Depend on UPDATE_OPS, otherwise batch normalization breaks!
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.device(opt.gpu_num):
            if opt.model_type == 1:  # deconvolution only
                pre_loss, mse, pred = model[0].calculate_loss(inputs, labels)
                g_vars, _, _ = get_vars()
                with tf.control_dependencies(update_ops):
                    train_ops = optimizer.minimize(pre_loss, var_list=g_vars)
            elif opt.model_type == 2:  # deconvolution + learnable pooling
                _, mse, pred = model[0].calculate_loss(inputs, labels)
                l_p_loss = model[1].calculate_loss(pred, labels,
                                                   opt.model_2_scale)
                g_vars, _, l_p_vars = get_vars()
                with tf.control_dependencies(update_ops):
                    train_ops = optimizer.minimize(l_p_loss,
                                                   var_list=g_vars + l_p_vars)
            elif opt.model_type == 3:  # deconvolution + GAN
                pre_loss, mse, pred = model[0].calculate_loss(inputs, labels)
                gen_loss, dis_loss = model[1].calculate_loss(pred, labels)
                g_vars, d_vars, _ = get_vars()
                with tf.control_dependencies(update_ops):
                    # Train D, then G, then the prior network (also the G network)
                    d_train_ops = optimizer.minimize(dis_loss,
                                                     var_list=d_vars)
                    g_train_ops = optimizer.minimize(gen_loss,
                                                     var_list=g_vars)
                    pre_train_ops = optimizer.minimize(pre_loss,
                                                       var_list=g_vars)

        tf.summary.scalar("MSE", mse)
        tf.add_to_collection("input_batch", inputs)
        tf.add_to_collection("predictions", pred)
        saver = tf.train.Saver()
        init = tf.global_variables_initializer()

        # Start training
        config = tf.ConfigProto()
        config.gpu_options.per_process_gpu_memory_fraction = \
            opt.per_process_gpu_memory_fraction
        with tf.Session(config=config) as sess:
            # Initialize the parameters first
            sess.run(init)

            if opt.model_type == 1:
                model_type = "model_1"
            elif opt.model_type == 2:
                model_type = "model_2_" + str(opt.model_2_layers)
            elif opt.model_type == 3:
                model_type = "model_3"

            summary_path = opt.summary_path + model_type + "\\data_SNR_" + \
                str(opt.SNR)
            writer = tf.summary.FileWriter(summary_path, sess.graph)
            merge_ops = tf.summary.merge_all()

            start = time.time()
            data_path = opt.train_data_path + "data_SNR_" + str(opt.SNR)
            # Training and validation datasets
            train_dataset = CSISet(data_path, opt.batch_size, True,
                                   state="train")
            validation_dataset = CSISet(data_path, opt.batch_size, True,
                                        state="validation")

            # Keep intermediate train/validation MSE values for later plotting
            train_mse_for_plot = []
            valid_mse_for_plot = []

            for num in range(opt.num_epoch):
                # Decide whether the learning rate needs to change
                if opt.optimizer_type == "Momentum" and \
                        (num % opt.learning_rate_change_epoch) == 0:
                    learning_rate *= opt.learning_rate_decay
                print("Epoch %i started, current learning rate is %f" %
                      (num, learning_rate))

                for ii, (batch_x, batch_y) in enumerate(
                        train_dataset.get_data()):
                    if opt.model_type == 1 or opt.model_type == 2:
                        _, train_mse, summary = sess.run(
                            [train_ops, mse, merge_ops],
                            feed_dict={inputs: batch_x, labels: batch_y})
                    elif opt.model_type == 3:
                        _, _, _, train_mse, summary = sess.run(
                            [d_train_ops, g_train_ops, pre_train_ops, mse,
                             merge_ops],
                            feed_dict={inputs: batch_x, labels: batch_y})
                    writer.add_summary(summary)

                    if (ii + 1) % 1000 == 0:
                        print("epoch-%d, batch_num-%d: current batch "
                              "training MSE is %f" %
                              (num + 1, ii + 1, train_mse))
                        # Evaluate on the validation set every 1000 batches
                        validate_mse = 0
                        num_batches = 0
                        for (validate_x,
                             validate_y) in validation_dataset.get_data():
                            temp_mse = sess.run(mse,
                                                feed_dict={
                                                    inputs: validate_x,
                                                    labels: validate_y
                                                })
                            validate_mse += temp_mse
                            num_batches += 1
                        # Average the MSE over all validation batches
                        validate_mse /= num_batches
                        print("epoch-%d: current mean validation MSE is %f" %
                              (num + 1, validate_mse))
                        train_mse_for_plot.append(train_mse)
                        valid_mse_for_plot.append(validate_mse)

            end = time.time()
            utils.print_time(start, end,
                             "running " + str(opt.num_epoch) + " epochs")
            plot_path = opt.result_path + model_type + "\\data_SNR_" + \
                str(opt.SNR) + "\\train"
            utils.plot_fig(train_mse_for_plot, valid_mse_for_plot, plot_path)
            print("Minimum validation MSE during training: %f" %
                  min(valid_mse_for_plot))

            # Save the model file
            model_file = opt.model_path + model_type + "\\data_SNR_" + \
                str(opt.SNR) + "\\data_SNR_" + str(opt.SNR)
            model_utils.save_model(saver, sess, model_file)
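# A minimal invocation sketch (hypothetical values; DefaultConfig.parse is
# assumed to overwrite matching config attributes from the keyword arguments):
if __name__ == "__main__":
    train(model_type=3, optimizer_type="Adam", SNR=10, num_epoch=50)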
import math
import os
import re
import sys

import tensorflow as tf

# Assumes module-level helpers: data_utils, model_utils, NoiseModel,
# run_epoch, stop_training.


def train_model(config):
    if config.start_date is not None:
        print("Training start date: ", config.start_date)
    if config.end_date is not None:
        print("Training end date: ", config.end_date)

    print("Loading training data from %s ..." % config.datafile)
    train_data = None
    valid_data = None
    if (config.validation_size > 0.0) or (config.split_date is not None):
        train_data, valid_data = data_utils.load_train_valid_data(config)
    else:  # no validation split: validate on the training data itself
        train_data = data_utils.load_all_data(config, is_training_only=True)
        valid_data = train_data

    tf_config = tf.ConfigProto(allow_soft_placement=True,
                               log_device_placement=False)

    with tf.Graph().as_default(), tf.Session(config=tf_config) as session:
        if config.seed is not None:
            tf.set_random_seed(config.seed)

        print("Constructing model ...")
        model = model_utils.get_model(session, config, verbose=True)
        params = model_utils.get_scaling_params(config, train_data,
                                                verbose=True)
        model.set_scaling_params(session, **params)

        noise_model = None
        if config.training_noise is not None:
            print("Training noise level: %.2f * 1-stdev" %
                  config.training_noise)
            noise_model = NoiseModel(seed=config.seed,
                                     scaling_params=params,
                                     degree=config.training_noise)

        if config.early_stop is not None:
            print("Training will early stop without "
                  "improvement after %d epochs." % config.early_stop)

        sys.stdout.flush()

        train_history = list()
        valid_history = list()
        lr = model.set_learning_rate(session, config.learning_rate)

        train_data.cache(verbose=True)
        valid_data.cache(verbose=True)

        for i in range(config.max_epoch):
            (train_mse, valid_mse) = run_epoch(session, model, train_data,
                                               valid_data,
                                               keep_prob=config.keep_prob,
                                               passes=config.passes,
                                               noise_model=noise_model,
                                               verbose=True)
            print(('Epoch: %d Train MSE: %.6f Valid MSE: %.6f '
                   'Learning rate: %.4f') % (i + 1, train_mse, valid_mse, lr))
            sys.stdout.flush()

            train_history.append(train_mse)
            valid_history.append(valid_mse)

            if re.match("Gradient|Momentum", config.optimizer):
                lr = model_utils.adjust_learning_rate(session, model, lr,
                                                      config.lr_decay,
                                                      train_history)

            if not os.path.exists(config.model_dir):
                print("Creating directory %s" % config.model_dir)
                os.mkdir(config.model_dir)

            if math.isnan(valid_mse):
                print("Training failed due to nan.")
                quit()
            elif stop_training(config, valid_history):
                print("Training stopped.")
                quit()
            else:
                if ((config.early_stop is None) or
                        (valid_history[-1] <= min(valid_history))):
                    model_utils.save_model(session, config, i)
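# A hedged sketch of the stop_training helper used above, inferred from the
# early-stop message (stop once the best validation loss is config.early_stop
# epochs old); the real implementation may differ:
def stop_training(config, valid_history):
    """Stop when the best validation loss is config.early_stop epochs old."""
    if config.early_stop is None or len(valid_history) <= config.early_stop:
        return False
    best_epoch = valid_history.index(min(valid_history))
    return (len(valid_history) - 1 - best_epoch) >= config.early_stop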
import math
import os
import re
import sys
import time

import tensorflow as tf

# Assumes module-level helpers: data_utils, model_utils, run_epoch_mve,
# run_epoch_pie, stop_training.


def train_model(config):
    if config.start_date is not None:
        print("Training start date: ", config.start_date)
    if config.end_date is not None:
        print("Training end date: ", config.end_date)

    print("Loading training data from %s ..." % config.datafile)
    train_data = None
    valid_data = None
    if (config.validation_size > 0.0) or (config.split_date is not None):
        train_data, valid_data = data_utils.load_train_valid_data(config)
    else:
        train_data = data_utils.load_all_data(config, is_training_only=True)
        valid_data = train_data

    tf_config = tf.ConfigProto(allow_soft_placement=True,
                               log_device_placement=False)
    tf_config.gpu_options.allow_growth = True

    with tf.Graph().as_default(), tf.Session(config=tf_config) as session:
        if config.seed is not None:
            tf.set_random_seed(config.seed)

        print("Constructing model ...")
        model = model_utils.get_model(session, config, verbose=True)

        if config.data_scaler is not None:
            start_time = time.time()
            print("Calculating scaling parameters ...", end=' ')
            sys.stdout.flush()
            scaling_params = train_data.get_scaling_params(config.data_scaler)
            model.set_scaling_params(session, **scaling_params)
            print("done in %.2f seconds." % (time.time() - start_time))
            print("%-10s %-6s %-6s" % ('feature', 'mean', 'std'))
            for i in range(len(train_data.feature_names)):
                center = "%.4f" % scaling_params['center'][i]
                scale = "%.4f" % scaling_params['scale'][i]
                print("%-10s %-6s %-6s" % (train_data.feature_names[i],
                                           center, scale))
            sys.stdout.flush()

        if config.early_stop is not None:
            print("Training will early stop without "
                  "improvement after %d epochs." % config.early_stop)

        train_history = list()
        valid_history = list()
        lr = model.set_learning_rate(session, config.learning_rate)

        train_data.cache(verbose=True)
        valid_data.cache(verbose=True)

        for i in range(config.max_epoch):
            if config.UQ_model_type == 'MVE':
                # Mean-variance estimation (MVE) epoch
                (train_mse, train_mse_var, valid_mse,
                 valid_mse_var) = run_epoch_mve(session, model, train_data,
                                                valid_data,
                                                keep_prob=config.keep_prob,
                                                passes=config.passes,
                                                verbose=True)
                # A nan valid metric is used below to stop training
                is_metric_nan = math.isnan(valid_mse)
                print('Epoch: %d Train MSE: %.8f Valid MSE: %.8f '
                      'Learning rate: %.4f' %
                      (i + 1, train_mse, valid_mse, lr))
                print('Epoch: %d Train MSE_w_variance: %.8f '
                      'Valid MSE_w_variance: %.8f Learning rate: %.4f' %
                      (i + 1, train_mse_var, valid_mse_var, lr))
                sys.stdout.flush()
                train_history.append(train_mse_var)
                valid_history.append(valid_mse_var)
            elif config.UQ_model_type == 'PIE':
                # Prediction-interval estimation (PIE) epoch
                (train_mpiw, train_picp, train_picp_loss, valid_mpiw,
                 valid_picp, valid_picp_loss) = \
                    run_epoch_pie(session, model, train_data, valid_data,
                                  keep_prob=config.keep_prob,
                                  passes=config.passes, verbose=True)
                train_loss = train_mpiw + config.picp_lambda * train_picp_loss
                valid_loss = valid_mpiw + config.picp_lambda * valid_picp_loss
                # A nan valid loss is used below to stop training
                is_metric_nan = math.isnan(valid_loss)
                print('Epoch: %d Train MPIW: %.8f Valid MPIW: %.8f '
                      'Learning rate: %.4f' %
                      (i + 1, train_mpiw, valid_mpiw, lr))
                print('Epoch: %d Train PICP: %.8f Valid PICP: %.8f' %
                      (i + 1, train_picp, valid_picp))
                print('Epoch: %d Train LOSS: %.8f Valid LOSS: %.8f' %
                      (i + 1, train_loss, valid_loss))
                sys.stdout.flush()
                train_history.append(train_loss)
                valid_history.append(valid_loss)

            if re.match("Gradient|Momentum", config.optimizer):
                lr = model_utils.adjust_learning_rate(session, model, lr,
                                                      config.lr_decay,
                                                      train_history)

            if not os.path.exists(config.model_dir):
                print("Creating directory %s" % config.model_dir)
                os.mkdir(config.model_dir)

            if is_metric_nan:
                print("Training failed due to nan.")
                quit()
            elif stop_training(config, valid_history):
                print("Training stopped.")
                quit()
            else:
                if ((config.early_stop is None) or
                        (valid_history[-1] <= min(valid_history))):
                    model_utils.save_model(session, config, i)
import numpy as np
import torch
from glmnet import LogitNet
from skorch import NeuralNetClassifier
from skorch.callbacks import EpochScoring, LRScheduler
from sklearn.metrics import average_precision_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_predict

# Assumes module-level helpers: models, save_scores, save_model.


def fit_model(args, X, y):
    print(f'Fitting model for {args.model}:')
    auc = EpochScoring(scoring='roc_auc', lower_is_better=False)
    apr = EpochScoring(scoring='average_precision', lower_is_better=False)
    lrs = LRScheduler(policy='StepLR', step_size=10, gamma=0.5)

    if args.model == 'glm':
        # Select the best lambda first, then fix it for cross-validation
        glm = LogitNet(alpha=0.5, n_lambda=50, n_jobs=-1)
        glm.fit(X, y)
        net = LogitNet(alpha=0.5, n_lambda=1, lambda_path=[glm.lambda_best_])
    elif args.model == 'standard':
        net = NeuralNetClassifier(
            models.MpraDense,
            batch_size=256,
            optimizer=torch.optim.Adam,
            optimizer__weight_decay=2e-6,
            lr=1e-4,
            max_epochs=20,
            module__n_input=1079,
            module__n_units=(400, 250),
            module__dropout=0.3,
            callbacks=[auc, apr],
            iterator_train__shuffle=True,
            train_split=None)
    elif args.model == 'neighbors':
        net = NeuralNetClassifier(
            models.MpraFullCNN,
            batch_size=256,
            optimizer=torch.optim.Adam,
            optimizer__weight_decay=1e-2,
            lr=5e-5,
            max_epochs=20,
            callbacks=[auc, apr],
            iterator_train__shuffle=True,
            train_split=None)

    # Generate cross-validated predictions
    np.random.seed(1000)
    torch.manual_seed(1000)
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1000)
    cv_scores = cross_val_predict(net, X, y, cv=kf, method='predict_proba',
                                  n_jobs=-1)
    AUC = roc_auc_score(y, cv_scores[:, 1])
    APR = average_precision_score(y, cv_scores[:, 1])
    print('\tAUC ', np.round(AUC, 4))
    print('\tAPR ', np.round(APR, 4))
    save_scores(args, cv_scores[:, 1], y)

    # Refit on all data and store the final model
    net.fit(X, y)
    save_model(net, args.project, args.model)
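# A minimal usage sketch with synthetic data (the project name is
# hypothetical; skorch expects float32 features and int64 labels, and the
# feature width matches module__n_input above):
from argparse import Namespace

rng = np.random.default_rng(0)
X_demo = rng.normal(size=(500, 1079)).astype('float32')
y_demo = rng.integers(0, 2, size=500).astype('int64')
fit_model(Namespace(model='standard', project='demo'), X_demo, y_demo)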
                    type=float,
                    default=1.0)
parser.add_argument('--project_id',
                    help='ID (not name) of your project',
                    required=True)
parser.add_argument(
    '--job-dir',
    help='Output directory for model, automatically provided by gcloud',
    required=True)

args = parser.parse_args()
arguments = args.__dict__
print(arguments)

estimator, acc_eval = model.train_and_evaluate(
    arguments['eval_size'], arguments['frac'], arguments['WE_max_df'],
    arguments['WE_min_df'], arguments['FT_norm'], arguments['M_alpha'],
    arguments['max_nb_label'])

if estimator is not None:
    loc = model_utils.save_model(estimator, arguments['job_dir'],
                                 'stackoverlow')
    print("Saved model to {}".format(loc))

# this is for hyperparameter tuning
hpt = hypertune.HyperTune()
hpt.report_hyperparameter_tuning_metric(
    hyperparameter_metric_tag='accuracy',
    metric_value=acc_eval,
    global_step=0)
import argparse
import os
import pathlib
import time

import torch
import torch.backends.cudnn as cudnn
import wandb
from tqdm import trange

# Assumes module-level helpers: parse_cfg, get_label_map,
# get_tokenizer_and_model, get_dataloader, train_loop, eval_loop, save_model,
# logger, _use_apex, _use_native_amp, and apex's amp. AdamW and
# get_linear_schedule_with_warmup are assumed to come from transformers.


def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--config", default=None, type=str, required=True,
                        help="the training config file")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--multi_task", action="store_true",
                        help="training with multi task schema")
    parser.add_argument("--debug", action="store_true",
                        help="in debug mode, will not enable wandb log")
    parser.add_argument("--use_wandb", action="store_true",
                        help="whether or not use wandb")
    args = parser.parse_args()

    cfg = parse_cfg(pathlib.Path(args.config))

    # set CUDA_VISIBLE_DEVICES and get num_gpus
    if args.local_rank == -1:
        # not distributed
        os.environ["CUDA_VISIBLE_DEVICES"] = cfg["system"][
            "cuda_visible_devices"]
        num_gpus = torch.cuda.device_count()
        args.distributed = False
    else:
        # distributed
        torch.cuda.set_device(args.local_rank)
        num_gpus = 1
        args.distributed = True
        # Initialize the distributed backend, which takes care of
        # synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        args.world_size = torch.distributed.get_world_size()

    logger.info(
        "num_gpus: {}, distributed training: {}, 16-bits training: {}".format(
            num_gpus, bool(args.local_rank != -1), cfg["train"]["fp16"]))

    cudnn.benchmark = True

    cfg["train"]["output_dir"] = cfg["train"]["output_dir"] + "/" + \
        cfg["train"]["task_name"] + "_" + \
        cfg["train"]["model_name"] + "_" + \
        cfg["data"]["corpus"]
    output_dir_pl = pathlib.Path(cfg["train"]["output_dir"])
    if output_dir_pl.exists():
        logger.warning(
            "output directory ({}) already exists, continue after 2 "
            "seconds...".format(output_dir_pl))
        time.sleep(2)
    else:
        output_dir_pl.mkdir(parents=True, exist_ok=True)

    if not args.debug and args.use_wandb:
        config_dictionary = dict(yaml=cfg, params=args)
        wandb.init(config=config_dictionary, project="nlp-task",
                   dir=cfg["train"]["output_dir"])
        wandb.run.name = cfg["data"]["corpus"] + '-' + \
            cfg["train"]["pretrained_tag"] + '-' + \
            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        wandb.config.update(args)
        wandb.run.save()

    if cfg["optimizer"]["gradient_accumulation_steps"] < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be "
            ">= 1".format(cfg["optimizer"]["gradient_accumulation_steps"]))

    # true batch_size in training
    cfg["train"]["batch_size"] = cfg["train"]["batch_size"] // \
        cfg["optimizer"]["gradient_accumulation_steps"]

    # the type of label_map is bidict:
    # label_map[x] = xx, label_map.inv[xx] = x
    label_map, num_labels = get_label_map(cfg)
    tokenizer, model = get_tokenizer_and_model(cfg, label_map, num_labels)

    # check model details on wandb
    if not args.debug and args.use_wandb:
        wandb.watch(model)

    num_examples, train_dataloader = get_dataloader(cfg, tokenizer,
                                                    num_labels, "train",
                                                    debug=args.debug)
    _, eval_dataloader = get_dataloader(cfg, tokenizer, num_labels, "dev",
                                        debug=args.debug)

    # total training steps (across all epochs)
    num_training_steps = int(
        len(train_dataloader) //
        cfg["optimizer"]["gradient_accumulation_steps"] *
        cfg["train"]["train_epochs"])

    optimizer = AdamW(params=model.parameters(), lr=cfg["optimizer"]["lr"])
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=cfg["optimizer"]["num_warmup_steps"],
        num_training_steps=num_training_steps)

    scaler = None
    model = model.cuda()
    if cfg["train"]["fp16"] and _use_apex:
        logger.info("using apex amp for fp16...")
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
    elif cfg["train"]["fp16"] and _use_native_amp:
        logger.info("using pytorch native amp for fp16...")
        scaler = torch.cuda.amp.GradScaler()
    elif cfg["train"]["fp16"] and (_use_apex is False
                                   and _use_native_amp is False):
        logger.error("your environment does not support fp16 training...")
        exit()

    if cfg["system"]["distributed"]:
        # TODO distributed debug
        model.cuda(args.local_rank)
        from torch.nn.parallel import DistributedDataParallel as DDP
        model = DDP(model, device_ids=[args.local_rank])
    elif num_gpus > 1:
        model = torch.nn.DataParallel(model)

    # Train
    logger.info("start training on train set")
    epoch = 0
    best_score = -1
    for _ in trange(int(cfg["train"]["train_epochs"]), desc="Epoch"):
        best = False
        # train loop over one epoch
        train_loop(cfg, model, train_dataloader, optimizer, lr_scheduler,
                   num_gpus, epoch, scaler, args.debug, args.use_wandb)
        # evaluate on the dev set
        logger.info("running evaluation on dev set")
        score = eval_loop(cfg, tokenizer, model, eval_dataloader, label_map,
                          args.debug, args.use_wandb)
        if best_score < score:
            best_score = score
            best = True
        # Save the trained model and the associated configuration
        save_model(cfg, tokenizer, model, best)
        epoch += 1

    # Test Eval
    if args.local_rank == -1 or torch.distributed.get_rank() == 0:
        logger.info("running evaluation on final test set")
        # TODO add a stand-alone test set
        _, eval_dataloader = get_dataloader(cfg, tokenizer, num_labels,
                                            "dev", debug=args.debug)
        score = eval_loop(cfg, tokenizer, model, eval_dataloader, label_map,
                          args.debug, args.use_wandb)
import os
import timeit

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm

# Assumes module-level helpers: cfg, model_utils, SoftCrossEntropyLoss,
# adjust_learning_rate, val, _add_weight_history.


def train(args):
    # parameters
    num_gpus = args.num_gpu
    batch_size = args.batch_size * num_gpus
    start_epoch = args.start_epoch
    epochs = args.epochs

    torch.backends.cudnn.benchmark = True
    if num_gpus > 1:
        # net = nn.DataParallel(net)
        os.environ["MASTER_ADDR"] = "127.0.0.1"
        os.environ["MASTER_PORT"] = "6066"
        torch.distributed.init_process_group(backend='nccl', world_size=1,
                                             rank=0, init_method='env://')

    # variables
    device = cfg.device(num_gpus)
    train_loader = cfg.train_loader(num_gpus)
    val_loader = cfg.val_loader()
    net = cfg.model()

    # criterion = nn.CrossEntropyLoss().to(device)
    criterion = SoftCrossEntropyLoss(label_smoothing=0.1,
                                     num_classes=cfg.num_classes).to(device)
    optimizer = optim.SGD(model_utils.split_weights(net), lr=args.lr,
                          momentum=0.9, weight_decay=1e-4)
    # scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
    #                                            milestones=[10, 20, 40],
    #                                            gamma=0.1)
    # scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
    writer = SummaryWriter(log_dir=cfg.log_dir)

    # load weights or init weights
    if start_epoch > 0:
        weight_path = cfg.snapshot_save_path.format(start_epoch)
        print("load weight from file {}".format(weight_path))
        checkpoint = torch.load(weight_path)
        net.load_state_dict(checkpoint)
    else:
        model_utils.init_weights(net)
    net.to(device)

    if num_gpus > 1:
        # net = nn.DataParallel(net)
        net = nn.parallel.DistributedDataParallel(net)
    print("type of net:{}".format(type(net)))

    # training
    for epoch in range(start_epoch, epochs):
        running_loss, running_corrects = 0.0, 0.0
        start_time = timeit.default_timer()
        net.train()
        # scheduler.step(epoch)
        adjust_learning_rate(optimizer, epoch, args)
        _add_weight_history(writer, net, epoch)

        for images, labels in tqdm(train_loader):
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()  # reset gradients
            if args.mixup:
                inputs, targets_a, targets_b, lam = model_utils.mix_up_data(
                    images, labels, args.alpha, True)
                outputs = net(inputs)
                loss_func = model_utils.mix_up_criterion(targets_a,
                                                         targets_b, lam)
                loss = loss_func(criterion, outputs)
            else:
                outputs = net(images)  # [B, class_logits]
                loss = criterion(outputs, labels)
            # backward
            loss.backward()
            optimizer.step()

            _, preds = torch.max(outputs, 1)
            running_loss += loss.item() * images.size(0)
            running_corrects += torch.sum(preds == labels).item()

        train_loss = running_loss / len(train_loader.dataset)
        train_acc = 100. * running_corrects / len(train_loader.dataset)

        # log to tensorboard
        writer.add_scalar('scalar/learning_rate',
                          optimizer.param_groups[0]['lr'], epoch + 1)
        writer.add_scalar('scalar/train_loss', train_loss, epoch + 1)
        writer.add_scalar('scalar/train_acc', train_acc, epoch + 1)

        # print status information
        print("[{}] Epoch: {}/{} Loss: {:03f} Acc: {:03f}".format(
            'train', epoch + 1, epochs, train_loss, train_acc))
        stop_time = timeit.default_timer()
        print("Execution time: " + str(stop_time - start_time) + "\n")

        # validation set
        acc = val(net, val_loader, device)
        writer.add_scalar('scalar/val_acc', acc, epoch + 1)
        print('Epoch: {}/{} Val Acc:{:03f}'.format(epoch + 1, epochs, acc))

        # save an intermediate snapshot
        if (epoch + 1) % cfg.SNAPSHOT == 0:
            # torch.save(net.state_dict(), cfg.snapshot_save_path.format(epoch + 1))
            model_utils.save_model(net,
                                   cfg.snapshot_save_path.format(epoch + 1))

    # save the final model
    # torch.save(net.state_dict(), cfg.save_path)
    model_utils.save_model(net, cfg.save_path)
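# A hedged sketch of the adjust_learning_rate helper called above; its real
# schedule is not shown in this snippet, so a common step decay is assumed
# (decay_every and gamma are hypothetical parameters):
def adjust_learning_rate(optimizer, epoch, args, decay_every=30, gamma=0.1):
    """Step decay: multiply args.lr by gamma every decay_every epochs."""
    lr = args.lr * (gamma ** (epoch // decay_every))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr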
import math
import os
import re
import sys
import time

import tensorflow as tf

# Assumes module-level helpers: data_utils, model_utils, run_epoch,
# stop_training.


def train_model(config):
    print("Loading training data ...")
    train_data = None
    valid_data = None
    if config.early_stop is None:
        train_data = data_utils.load_all_data(config, is_training_only=True)
        valid_data = train_data
    else:
        train_data, valid_data = data_utils.load_train_valid_data(config)

    if config.start_date is not None:
        print("Training start date: ", config.start_date)
    if config.end_date is not None:
        print("Training end date: ", config.end_date)

    tf_config = tf.ConfigProto(allow_soft_placement=True,
                               log_device_placement=False)

    with tf.Graph().as_default(), tf.Session(config=tf_config) as session:
        if config.seed is not None:
            tf.set_random_seed(config.seed)

        print("Constructing model ...")
        model = model_utils.get_model(session, config, verbose=True)

        if config.data_scaler is not None:
            start_time = time.time()
            print("Calculating scaling parameters ...", end=' ')
            sys.stdout.flush()
            scaling_params = train_data.get_scaling_params(config.data_scaler)
            model.set_scaling_params(session, **scaling_params)
            print("done in %.2f seconds." % (time.time() - start_time))

        if config.early_stop is not None:
            print("Training will early stop without "
                  "improvement after %d epochs." % config.early_stop)

        train_history = list()
        valid_history = list()
        lr = model.set_learning_rate(session, config.learning_rate)

        train_data.cache(verbose=True)
        valid_data.cache(verbose=True)

        for i in range(config.max_epoch):
            (train_mse, valid_mse) = run_epoch(session, model, train_data,
                                               valid_data,
                                               keep_prob=config.keep_prob,
                                               passes=config.passes,
                                               verbose=True)
            print(('Epoch: %d Train MSE: %.6f Valid MSE: %.6f '
                   'Learning rate: %.4f') % (i + 1, train_mse, valid_mse, lr))
            sys.stdout.flush()

            train_history.append(train_mse)
            valid_history.append(valid_mse)

            if re.match("Gradient|Momentum", config.optimizer):
                lr = model_utils.adjust_learning_rate(session, model, lr,
                                                      config.lr_decay,
                                                      train_history)

            if not os.path.exists(config.model_dir):
                print("Creating directory %s" % config.model_dir)
                os.mkdir(config.model_dir)

            if math.isnan(valid_mse):
                print("Training failed due to nan.")
                quit()
            elif stop_training(config, valid_history):
                print("Training stopped.")
                quit()
            else:
                if ((config.early_stop is None) or
                        (valid_history[-1] <= min(valid_history))):
                    model_utils.save_model(session, config, i)
import os
from sys import argv

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
from sklearn.linear_model import RidgeCV

# Assumes module-level helpers: config, data_loading, models, validation,
# save_model, load_model, print_metrics, _DEFAULT_CONFIG.


def run(cv_method='loo', anom_type='mean'):
    args = config.parse_arguments(
        argv[1] if len(argv) >= 2 else _DEFAULT_CONFIG)

    # Load the data
    us_maize_regions = [
        'Indiana', 'Illinois', 'Ohio', 'Nebraska', 'Iowa', 'Minnesota'
    ]
    # Growing season: April through to September
    data = data_loading.load_temp_precip_data('Maize', 'Spring', 'USA',
                                              us_maize_regions, range(3, 9),
                                              anom_type=anom_type)

    if args.model.lower() in ('corr_bvg', 'uncorr_bvg'):
        save_path = f'models/saved_models/{args.model}_save'
        load_path = f'{save_path}.pkl'
        if not os.path.exists(load_path):
            model = models.models.fetch_model(args.model)
            save_model(model=model, file_path=save_path)
        else:
            # Load the compiled model to circumvent compile time
            model = load_model(load_path)
        batched = False
        # Fit the model
        fit = model.sampling(data, chains=args.chains, iter=args.iter,
                             verbose=args.verbose, seed=args.seed)
    elif args.model.lower() == 'gp':
        kernel = RBF(length_scale=0.5)
        model = GaussianProcessRegressor(kernel=kernel, normalize_y=True,
                                         random_state=42)
        batched = True
    elif args.model.lower() == 'lr':
        # model = LinearRegression(fit_intercept=True, normalize=True)
        model = RidgeCV()
        batched = True
    else:
        raise ValueError('Invalid model type.')

    if cv_method == 'rolling':
        # Rolling-origin cross-validation
        print("===> Rolling-origin CV")
        cv_results = validation.sliding_window_cv(model, data, args,
                                                  batched=batched)
    elif cv_method == 'time-series':
        # Time-series cross-validation, incrementing by one year each split
        print("===> Time-series CV")
        n_splits = 34
        cv_results = validation.time_series_cv(model, data, args,
                                               n_splits=n_splits,
                                               batched=batched)
    elif cv_method == 'loo':
        # Leave-one-out cross-validation
        print("===> LOO CV")
        cv_results = validation.leave_p_out_cv(model, data, args, p=1,
                                               batched=batched)
    else:
        # Leave-three-out cross-validation
        print("===> LTO CV")
        cv_results = validation.leave_p_out_cv(model, data, args, p=3,
                                               batched=batched)

    print_metrics(cv_results)
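# A minimal invocation sketch (the cv_method values come from the branches
# above; 'mean' anomalies with leave-one-out CV is the default):
if __name__ == '__main__':
    run(cv_method='rolling', anom_type='mean')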
import time

import numpy as np
import tensorflow as tf

# Assumes module-level helpers: setup_workpath, get_pcgn_model_config,
# get_pcgn_training_config, read_data, PCGNModel, load_model, save_model,
# add_summary, get_pcgn_batch, use_tensorboard.


def main(config):
    # set up workspace
    work_space = config["workspace"]
    tf_board = config["tf_board"]
    setup_workpath(work_space)
    name = config["Name"]

    # Construct or load embeddings
    print("Initializing embeddings ...")
    vocab_size = config["embeddings"]["vocab_size"]
    embed_size = config["embeddings"]["embed_size"]

    # Build the model and compute losses
    (encode_num_layers, encode_num_units, encode_cell_type, encode_bidir,
     attn_num_units, decode_num_layers, decode_num_units, decode_cell_type,
     use_user_feat, use_gate_memory, use_user_desc, use_blog_user_coattn,
     use_external_desc_express, use_external_feat_express,
     user_feat_dim, user_feat_unit, user_feat_mem_unit,
     desc_rnn_unit, desc_attn_num_units, user_map_unit,
     ) = get_pcgn_model_config(config)

    (train_file, dev_file, source_max_length, target_max_length,
     desc_max_length, gpu_fraction, gpu_id, train_steps, checkpoint_every,
     print_every, batch_size, is_beam_search, beam_size, infer_max_iter,
     l2_regularize, learning_rate, max_checkpoints, max_gradient_norm,
     ) = get_pcgn_training_config(config)

    train_set = read_data(train_file)
    print(' # train data:', len(train_set))
    dev_set = read_data(dev_file)
    print(' # dev data:', len(dev_set))

    print("Building model architecture ")
    pcg_model = PCGNModel(
        mode='train', model_name=name,
        vocab_size=vocab_size, embedding_size=embed_size,
        encode_num_layers=encode_num_layers,
        encode_num_units=encode_num_units,
        encode_cell_type=encode_cell_type, encode_bidir=encode_bidir,
        attn_num_units=attn_num_units,
        decode_num_layers=decode_num_layers,
        decode_num_units=decode_num_units,
        decode_cell_type=decode_cell_type,
        use_user_feat=use_user_feat,
        use_gate_memory=use_gate_memory,
        use_user_desc=use_user_desc,
        use_blog_user_coattn=use_blog_user_coattn,
        use_external_desc_express=use_external_desc_express,
        use_external_feat_express=use_external_feat_express,
        user_feat_dim=user_feat_dim, user_feat_unit=user_feat_unit,
        user_feat_mem_unit=user_feat_mem_unit,
        desc_rnn_unit=desc_rnn_unit,
        desc_attn_num_units=desc_attn_num_units,
        user_map_unit=user_map_unit,
        batch_size=batch_size, beam_search=is_beam_search,
        beam_size=beam_size, infer_max_iter=infer_max_iter,
        target_max_length=target_max_length,
        l2_regularize=l2_regularize, learning_rate=learning_rate,
        max_to_keep=max_checkpoints, max_gradient_norm=max_gradient_norm,
    )
    print("\tDone.")

    logdir = '%s/nn_models/' % work_space

    # Set up session
    gpu_options = tf.GPUOptions(
        per_process_gpu_memory_fraction=gpu_fraction,
        visible_device_list=gpu_id, allow_growth=True)
    sess = tf.Session(config=tf.ConfigProto(log_device_placement=False,
                                            gpu_options=gpu_options))
    init = tf.global_variables_initializer()
    sess.run(init)

    # tensorboard
    if use_tensorboard:
        train_writer = tf.summary.FileWriter(tf_board + 'train/', sess.graph)
        test_writer = tf.summary.FileWriter(tf_board + 'test/', sess.graph)

    try:
        saved_global_step = load_model(pcg_model.saver, sess, logdir)
        if saved_global_step is None:
            saved_global_step = -1
    except Exception:
        print("Something went wrong while restoring the checkpoint. "
              "Training is terminated to avoid overwriting.")
        raise

    # ##### Training #####
    last_saved_step = saved_global_step
    num_steps = saved_global_step + train_steps
    steps = []
    previous_losses = []
    lr = pcg_model.learning_rate

    print("Start training ...")
    print('steps per epoch:', len(train_set) // batch_size)
    try:
        for step in range(saved_global_step + 1, num_steps):
            start_time = time.time()
            batch = get_pcgn_batch(train_set, 'train', batch_size,
                                   source_max_length, target_max_length,
                                   desc_max_length)
            loss_value = pcg_model.train(sess, batch)
            previous_losses.append(loss_value)

            # Decay the learning rate when the mean of the last 5 losses
            # stops improving over the mean of the preceding window
            lr_decay_step = 10
            if step % 500 == 0 and \
                    len(previous_losses) - 5 > lr_decay_step and \
                    np.mean(previous_losses[-5:]) >= \
                    np.mean(previous_losses[-lr_decay_step - 5:-5]):
                lr = pcg_model.learning_rate
                if lr > 0.00001:
                    pcg_model.learning_rate = lr * 0.9
                    print('learning rate decay:', lr * 0.9)

            duration = time.time() - start_time

            if step % print_every == 0 and step != 0:
                # train perplexity
                t_perp = pcg_model.compute_perplexity(sess, batch)
                if use_tensorboard:
                    add_summary(train_writer, step, 'train perplexity',
                                t_perp)

                # eval perplexity
                dev_str = ""
                if dev_set is not None:
                    eval_batch = get_pcgn_batch(dev_set, 'train', batch_size,
                                                source_max_length,
                                                target_max_length,
                                                desc_max_length)
                    eval_perp = pcg_model.compute_perplexity(sess, eval_batch)
                    with open(logdir + 'eval_perp.txt', 'a',
                              encoding='utf-8') as f:
                        f.write('{}\t{}\n'.format(str(step), str(eval_perp)))
                    if use_tensorboard:
                        add_summary(test_writer, step, 'eval perplexity',
                                    eval_perp)
                    dev_str += "val_prep: {:.3f}\n".format(eval_perp)

                steps.append(step)
                ep = step // (len(train_set) // batch_size)
                info = ('epoch {:d}, step {:d}, lr: {:.5f}, loss = {:.6f}, '
                        'perp: {:.3f}\n{}({:.3f} sec/step)')
                print(info.format(ep, step, lr, loss_value, t_perp, dev_str,
                                  duration))

            if step % checkpoint_every == 0:
                save_model(pcg_model.saver, sess, logdir, step)
                last_saved_step = step
    except KeyboardInterrupt:
        # Introduce a line break after ^C so the save message is on its
        # own line.
        print()
    finally:
        if step > last_saved_step:
            save_model(pcg_model.saver, sess, logdir, step)
import logging
import os

import tensorflow as tf
from keras import __version__ as keras_version
from keras import backend as K

# Assumes module-level helpers: get_args, process_config, create_dirs,
# load_model, compile_model, weights_load, trainer_dir, build_logger,
# save_model, plot_history, evaluator, predictor, predictor_reports,
# plot_confusion_matrix.


def main():
    # Arguments
    ###########################################################################
    try:
        args = get_args()
        config = process_config(args.config)
    except Exception:
        logging.error("Missing or invalid arguments.")
        exit(0)

    # Logging
    ###########################################################################
    logging.basicConfig(
        filename=os.path.join("logs", config.exp_name + ".log"),
        format="[%(asctime)s] - [%(levelname)s]: %(message)s",
        filemode="a",
        level=logging.DEBUG,
    )
    logging.info("Logging started.")
    logging.info("Keras version: {}".format(keras_version))

    # Session
    ###########################################################################
    sess = tf.Session()
    K.set_session(sess)

    # Create experiment-related directories
    ###########################################################################
    create_dirs([config.summary_dir, config.checkpoint_dir])

    # Initialize the model
    ###########################################################################
    model_formicID = load_model(config=config, num_species=97)
    model_formicID = compile_model(model=model_formicID, config=config)
    model_formicID = weights_load(
        model=model_formicID,
        weights="experiments/T97_CaAll_QuM_ShSti_AugM_D05_LR0001_E200_I4_def_clean/checkpoint/weights_55-1.76.hdf5",
    )

    # Training in batches with iterator
    ###########################################################################
    history = trainer_dir(
        model=model_formicID,
        config=config,
        callbacks=build_logger(config=config, model=model_formicID),
    )
    save_model(model=model_formicID, filename="final_weights.hdf5",
               config=config)

    # Evaluation
    ###########################################################################
    plot_history(history=history, config=config, theme="ggplot", save=None)
    evaluator(model=model_formicID, config=config, test_dir=None)

    # Testing
    ###########################################################################
    Y_true, Y_pred, labels, species_dict = predictor(
        model=model_formicID,
        config=config,
        # species_json="data/species_dict.json",
        plot=True,
        n_img=10,
        n_cols=3,
    )
    predictor_reports(
        Y_true=Y_true,
        Y_pred=Y_pred,
        config=config,
        species_dict=species_dict,
        target_names=labels,
        digits=5,
    )
    plot_confusion_matrix(
        Y_pred=Y_pred,
        Y_true=Y_true,
        config=config,
        target_names=labels,
        species_dict=species_dict,
        title=None,
        cmap="viridis",
        normalize=True,
        scores=True,
        score_size=8,
        save="confusion_matrix.png",
    )

    # Footer
    ###########################################################################
    K.clear_session()
    logging.info("Logging ended.")
import os

import torch
from torch.utils.tensorboard import SummaryWriter

# Assumes module-level helpers: load_model, save_model,
# save_clean_best_model, Detection_Dataset_anns, get_transform, collate_fn,
# train_one_epoch, evaluate, lasso_shift, get_curtime.


def train_model(train_anns, eval_anns, model, device, model_save_dir,
                ckpt='model_latest.pth', num_epoches=10, batch_size=4,
                ap='ap_50', ap_thre=0.5, ap_range=3, ap_shift_thre=0.001):
    # optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9,
                                weight_decay=0.0005)

    # load the pretrained best model
    ckpt_path = os.path.join(model_save_dir, ckpt)
    model, optimizer, start_epoch = load_model(model, ckpt_path, optimizer)
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=num_epoches)

    # train/eval datasets
    dataset = Detection_Dataset_anns(train_anns, get_transform(True))
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                             shuffle=True, pin_memory=True,
                                             num_workers=4,
                                             collate_fn=collate_fn)
    dataset_eval = Detection_Dataset_anns(eval_anns, get_transform(False))
    dataloader_eval = torch.utils.data.DataLoader(dataset_eval, batch_size=1,
                                                  shuffle=False,
                                                  pin_memory=True,
                                                  num_workers=4,
                                                  collate_fn=collate_fn)

    # evaluation tracking
    writer = SummaryWriter(log_dir='runs/{}'.format(get_curtime()))
    ap_records = {'ap_50': [], 'ap_75': [], 'ap_shift': []}

    # epoch must start from 0: train_one_epoch warms up internally
    for epoch in range(start_epoch, num_epoches):
        # train
        train_one_epoch(model, optimizer, dataloader, device, epoch,
                        print_freq=10, writer=writer,
                        begin_step=epoch * len(dataloader))
        # log & update lr
        writer.add_scalar('Train/lr', optimizer.param_groups[0]["lr"],
                          global_step=epoch)
        lr_scheduler.step()

        # evaluate after each training epoch
        evals = evaluate(model, dataloader_eval, device, writer, epoch)

        # stats
        ap_records['ap_50'].append(evals['ap_50'])
        ap_records['ap_75'].append(evals['ap_75'])
        if len(ap_records[ap]) >= ap_range:
            ap_shift = lasso_shift(ap_records[ap][-ap_range:])
        else:
            ap_shift = 0
        ap_records['ap_shift'].append(ap_shift)
        writer.add_scalar('Accuracy/AP_shift', ap_shift, global_step=epoch)

        if evals[ap] > ap_thre:
            ckpt_path = os.path.join(model_save_dir,
                                     'model_{}.pth'.format(epoch))
            save_model(ckpt_path, model, epoch, optimizer)

        if 0 < ap_shift < ap_shift_thre:
            # break and save the ap records; the index is taken within the
            # last ap_range epochs
            recent = ap_records[ap][-ap_range:]
            best_idx_in_range = recent.index(max(recent))
            best_epoch = epoch - ap_range + 1 + best_idx_in_range
            print('best epoch:', best_epoch)
            save_clean_best_model(best_epoch, model_save_dir)
            break
import os

import torch

# Assumes module-level helpers: opts, LoadImagesAndLabels, resnet,
# load_model, save_model, CtdetTrainer.


def main(opt):
    path_save_model_ = './model_save/'
    if not os.path.exists(path_save_model_):
        os.mkdir(path_save_model_)

    torch.manual_seed(opt.seed)
    torch.backends.cudnn.benchmark = (not opt.not_cuda_benchmark
                                      and not opt.test)
    opt = opts().update_dataset_info_and_set_heads(opt, LoadImagesAndLabels)

    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    cuda = torch.cuda.is_available()
    device_ = torch.device('cuda' if cuda else 'cpu')
    opt.device = device_
    chunk_sizes_ = [8]
    gpus_ = [0]

    # resnet_18, resnet_34, resnet_50, resnet_101, resnet_152
    model_arch = 'resnet_34'
    print('Creating model...')
    num_layer = int(model_arch.split("_")[1])
    num_classes = 1
    heads_ = {'hm': num_classes, 'wh': 2, 'reg': 2}
    print('heads : {}'.format(heads_))
    model = resnet(num_layers=num_layer, heads=heads_, head_conv=64,
                   pretrained=True)
    # print(model)

    batch_size_ = 16
    num_workers_ = 4
    learning_rate_ = 1.25e-4
    path_load_model_ = './model_save/model_hand_last.pth'
    # path_load_model_ = ''
    lr_step_ = [190, 220]

    optimizer = torch.optim.Adam(model.parameters(), learning_rate_)
    start_epoch = 0
    if os.path.exists(path_load_model_):
        model, optimizer, start_epoch = load_model(model, path_load_model_,
                                                   optimizer, True,
                                                   learning_rate_, lr_step_)

    trainer = CtdetTrainer(opt, model, optimizer)
    trainer.set_device(gpus_, chunk_sizes_, device_)

    print('load train_dataset')
    train_dataset = LoadImagesAndLabels(state='train', path_='../done/')
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size_,
                                               shuffle=True,
                                               num_workers=num_workers_,
                                               pin_memory=False,
                                               drop_last=True)

    print('Starting training...')
    print("using arch : {}".format(model_arch))
    print('num_classes : {}'.format(num_classes))
    print('batch_size : {}'.format(batch_size_))
    print('num_workers : {}'.format(num_workers_))
    print('learning_rate : {}'.format(learning_rate_))
    print('lr_step : {}'.format(lr_step_))
    print('path_load_model : {}'.format(path_load_model_))

    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        log_dict_train, _ = trainer.train(epoch, train_loader)
        save_model(path_save_model_ + 'model_hand_last.pth', epoch, model,
                   optimizer)
        if epoch % 1 == 0:
            save_model(path_save_model_ + 'hand_epoch{}.pth'.format(epoch),
                       epoch, model, optimizer)
        if epoch in lr_step_:
            save_model(path_save_model_ + 'model_hand_{}.pth'.format(epoch),
                       epoch, model, optimizer)
            lr = learning_rate_ * (0.1 ** (lr_step_.index(epoch) + 1))
            print('Drop LR to', lr)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
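# A hedged sketch of a checkpoint helper compatible with the
# save_model(path, epoch, model, optimizer) calls above; the real version may
# store additional state:
def save_model(path, epoch, model, optimizer=None):
    """Save the epoch, weights, and (optionally) optimizer state in one file."""
    state = {'epoch': epoch, 'state_dict': model.state_dict()}
    if optimizer is not None:
        state['optimizer'] = optimizer.state_dict()
    torch.save(state, path)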
import numpy as np
import rospy
from std_msgs.msg import Float32MultiArray

# Assumes module-level helpers: log_utils, log_episode_info, save_model.


def run_episode(agent, env, pub_result, pub_get_action, run_id,
                episode_number, global_step, param_dictionary, start_time,
                scores, episodes, log, log_title, save_model_to_disk):
    """
    Runs an episode in the chosen stage of the Gazebo simulation.

    The episode ends when a goal is reached or after the agent's maximum
    number of episode steps is exceeded.

    :param agent: RL agent acting in the Gazebo environment
    :param env: Gazebo simulation
    :param pub_result: rospy publisher for the latest score and q-values
    :param pub_get_action: rospy publisher for the latest action and reward
    :param run_id: ID for logging purposes
    :param episode_number: current episode number
    :param global_step: total number of steps across all episodes
    :param param_dictionary: dict which contains all model parameters
    :param start_time: start time of the run
    :param scores: list of cumulative rewards per episode
    :param episodes: list containing all completed episodes
    :param log: logging object
    :param log_title: string title of the log
    :param save_model_to_disk: whether the model should be saved to disk
    """
    result = Float32MultiArray()
    get_action = Float32MultiArray()

    state = env.reset()
    score = 0
    for episode_step in range(agent.episode_max_step):
        goal = env.get_goal()
        action = agent.get_action(state)
        next_state, reward, done = env.step(action)

        if episode_step >= agent.episode_max_step - 1:
            rospy.loginfo("Time out!!")
            done = True

        position = env.get_position()
        agent.append_memory(state, action, reward, next_state, done)
        log_utils.make_log_entry(log, log_title, run_id, episode_number,
                                 episode_step, state, next_state, goal,
                                 position, action, agent.q_value, reward,
                                 done)

        if len(agent.memory) >= agent.train_start:
            if global_step <= agent.target_update:
                agent.train_model()
            else:
                agent.train_model(True)

        score += reward
        state = next_state

        get_action.data = [action, score, reward]
        pub_get_action.publish(get_action)

        if save_model_to_disk and episode_step == 0:
            save_model(agent, param_dictionary, episode_number)

        if done:
            result.data = [score, np.max(agent.q_value)]
            pub_result.publish(result)
            agent.update_target_model()
            scores.append(score)
            episodes.append(episode_number)
            log_episode_info(episode_number, score, agent, start_time)
            param_keys = ['epsilon', 'episode']
            param_values = [agent.epsilon, episode_number]
            param_dictionary = dict(zip(param_keys, param_values))
            return run_id, global_step, param_dictionary

        global_step += 1
        if global_step % agent.target_update == 0:
            rospy.loginfo("UPDATE TARGET NETWORK")
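# A hedged sketch of an outer training loop driving run_episode. The agent,
# env, and log objects, the topic names, and num_episodes are hypothetical
# placeholders; only the call signature is taken from the function above.
#
#   pub_result = rospy.Publisher('result', Float32MultiArray, queue_size=5)
#   pub_get_action = rospy.Publisher('get_action', Float32MultiArray, queue_size=5)
#   scores, episodes = [], []
#   global_step, param_dictionary = 0, {}
#   start_time = time.time()
#   for episode_number in range(num_episodes):
#       run_id, global_step, param_dictionary = run_episode(
#           agent, env, pub_result, pub_get_action, run_id, episode_number,
#           global_step, param_dictionary, start_time, scores, episodes,
#           log, log_title, save_model_to_disk=(episode_number % 10 == 0))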