def train(start_epoch=1):
    """Train the LaTeX detection model and checkpoint periodically.

    Args:
        start_epoch: epoch to resume from. 1 (the default, the old
            hard-coded value) starts a fresh run and truncates the CSV
            log; any larger value appends to the log and reloads the
            checkpoint saved for epoch ``start_epoch - 1``.

    Side effects: writes ``train_log.csv`` under ``cfg.LOG_DIR``,
    per-epoch checkpoints under ``logs/weights`` and a final
    ``model_last.pth`` under ``cfg.WEIGHTS_DIR``.
    """
    device = torch.device('cuda' if cfg.GPU[0] >= 0 else 'cpu')
    if start_epoch == 1:
        # Fresh run: truncate the log and write the CSV header.
        train_log = open(os.path.join(cfg.LOG_DIR, "train_log.csv"), 'w')
        train_log.write("epoch,total_loss,classify_loss,angle_loss,iou_loss\n")
        train_log.flush()
    else:
        # Resumed run: keep the existing log and append to it.
        train_log = open(os.path.join(cfg.LOG_DIR, "train_log.csv"), 'a')
    try:
        print('Creating model...')
        model = create_model()
        if start_epoch != 1:
            # Resume from the checkpoint of the previous epoch.
            model = load_model(
                model,
                'logs/weights/model_epoch_{}.pth'.format(start_epoch - 1))
        optimizer = torch.optim.Adam(model.parameters(), cfg.LR)
        trainer = Trainer(model, optimizer)
        trainer.set_device(device)
        print('Setting up data...')
        train_loader = DataLoader(LatexDataset(),
                                  batch_size=cfg.BATCH_SIZE,
                                  shuffle=True,
                                  num_workers=cfg.NUM_WORKERS,
                                  pin_memory=True,
                                  drop_last=True)
        print('Starting training...')
        # Keep `epoch` defined even when cfg.EPOCHS == 0 so the final
        # save below cannot raise NameError.
        epoch = start_epoch
        for epoch in range(start_epoch, start_epoch + cfg.EPOCHS):
            trainer.train(epoch, train_loader, train_log)
            if epoch % 5 == 0:
                save_model('logs/weights/model_epoch_{}.pth'.format(epoch),
                           epoch, model)
        save_model(os.path.join(cfg.WEIGHTS_DIR, 'model_last.pth'),
                   epoch, model)
    finally:
        # BUG FIX: the log file handle was previously never closed.
        train_log.close()
def main(opt):
    """Train (and optionally evaluate) a detector described by `opt`.

    Builds the model/optimizer, optionally resumes from `opt.load_model`,
    then runs the epoch loop: train, log scalars, periodically validate
    and run the dataset-specific evaluator, checkpoint, and step the LR.
    Returns early (after one validation pass) when `opt.eval` is set.
    """
    torch.manual_seed(opt.seed)
    # cudnn autotuning is only safe/useful for fixed-shape training runs.
    torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.eval
    Dataset = get_dataset(opt.dataset)
    opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
    print(opt)
    if not opt.not_set_cuda_env:
        os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
    opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')
    logger = Logger(opt)
    print('Creating model...')
    model = create_model(opt.arch, opt.heads, opt.head_conv, opt=opt)
    optimizer = get_optimizer(opt, model)
    start_epoch = 0
    lr = opt.lr
    if opt.load_model != '':
        # Resuming restores model weights, optimizer state and the epoch.
        model, optimizer, start_epoch = load_model(
            model, opt.load_model, opt, optimizer)
    trainer = Trainer(opt, model, optimizer)
    trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)
    # Validation loader is only needed if we will ever validate.
    if opt.val_intervals < opt.num_epochs or opt.eval:
        print('Setting up validation data...')
        val_loader = torch.utils.data.DataLoader(
            Dataset(opt, opt.val_split), batch_size=1, shuffle=False,
            num_workers=1, pin_memory=True)
        if opt.eval:
            # Evaluation-only mode: one validation pass, then exit.
            _, preds = trainer.val(0, val_loader)
            val_loader.dataset.run_eval(
                preds, opt.save_dir, n_plots=opt.eval_n_plots,
                render_curves=opt.eval_render_curves)
            return
    print('Setting up train data...')
    train_loader = torch.utils.data.DataLoader(
        Dataset(opt, opt.train_split), batch_size=opt.batch_size,
        shuffle=opt.shuffle_train, num_workers=opt.num_workers,
        pin_memory=True, drop_last=True)
    print('Starting training...')
    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        mark = epoch if opt.save_all else 'last'
        # log learning rate (first param group only, hence the break)
        for param_group in optimizer.param_groups:
            lr = param_group['lr']
            logger.scalar_summary('LR', lr, epoch)
            break
        # train one epoch
        log_dict_train, _ = trainer.train(epoch, train_loader)
        logger.write('epoch: {} |'.format(epoch))
        # log train results
        for k, v in log_dict_train.items():
            logger.scalar_summary('train_{}'.format(k), v, epoch)
            logger.write('{} {:8f} | '.format(k, v))
        # evaluate
        if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(mark)),
                       epoch, model, optimizer)
            with torch.no_grad():
                log_dict_val, preds = trainer.val(epoch, val_loader)
            # evaluate val set using dataset-specific evaluator
            if opt.run_dataset_eval:
                out_dir = val_loader.dataset.run_eval(
                    preds, opt.save_dir, n_plots=opt.eval_n_plots,
                    render_curves=opt.eval_render_curves)
                # log dataset-specific evaluation metrics
                # (metric names suggest the nuScenes detection protocol —
                # mean_ap / tp_errors / nd_score; verify against the
                # dataset's run_eval implementation)
                with open('{}/metrics_summary.json'.format(out_dir), 'r') as f:
                    metrics = json.load(f)
                logger.scalar_summary('AP/overall',
                                      metrics['mean_ap'] * 100.0, epoch)
                for k, v in metrics['mean_dist_aps'].items():
                    logger.scalar_summary('AP/{}'.format(k), v * 100.0, epoch)
                for k, v in metrics['tp_errors'].items():
                    logger.scalar_summary('Scores/{}'.format(k), v, epoch)
                logger.scalar_summary('Scores/NDS', metrics['nd_score'], epoch)
            # log eval results
            for k, v in log_dict_val.items():
                logger.scalar_summary('val_{}'.format(k), v, epoch)
                logger.write('{} {:8f} | '.format(k, v))
        else:
            # save this checkpoint
            save_model(os.path.join(opt.save_dir, 'model_last.pth'),
                       epoch, model, optimizer)
        logger.write('\n')
        if epoch in opt.save_point:
            save_model(os.path.join(opt.save_dir,
                                    'model_{}.pth'.format(epoch)),
                       epoch, model, optimizer)
        # update learning rate (step decay: /10 at each milestone)
        if epoch in opt.lr_step:
            lr = opt.lr * (0.1 ** (opt.lr_step.index(epoch) + 1))
            print('Drop LR to', lr)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
    logger.close()
def main(opt):
    """Fine-tune only the BDD output heads of a pretrained detector.

    Loads/creates the model, freezes every parameter, then re-enables
    gradients for the ``hm_bdd``/``wh_bdd``/``reg_bdd`` heads only,
    and runs the standard train/validate/checkpoint epoch loop.
    """
    torch.manual_seed(opt.seed)
    torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test
    Dataset = get_dataset(opt.dataset)
    opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
    print(opt)
    if not opt.not_set_cuda_env:
        os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
    opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')
    logger = Logger(opt)
    print('Creating model...')
    model = create_model(opt.arch, opt.heads, opt.head_conv, opt=opt)
    optimizer = get_optimizer(opt, model)
    start_epoch = 0
    if opt.load_model != '':
        model, optimizer, start_epoch = load_model(
            model, opt.load_model, opt, optimizer)
    # Freeze the whole network, then unfreeze only the BDD heads.
    for param in model.parameters():
        param.requires_grad = False
    # BUG FIX: the original resolved head modules with eval() on strings
    # like "model.hm_bdd"; getattr() does the same lookup without
    # executing arbitrary code and without depending on the local
    # variable name.
    for head_name in ('hm_bdd', 'wh_bdd', 'reg_bdd'):
        for hd in getattr(model, head_name):
            for wt in hd.parameters():
                wt.requires_grad = True
    trainer = Trainer(opt, model, optimizer)
    trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)
    if opt.val_intervals < opt.num_epochs or opt.test:
        print('Setting up validation data...')
        val_loader = torch.utils.data.DataLoader(
            Dataset(opt, 'val'), batch_size=1, shuffle=False,
            num_workers=1, pin_memory=True)
        if opt.test:
            # Test-only mode: single validation pass, then exit.
            _, preds = trainer.val(0, val_loader)
            val_loader.dataset.run_eval(preds, opt.save_dir)
            return
    print('Setting up train data...')
    train_loader = torch.utils.data.DataLoader(
        Dataset(opt, 'train'), batch_size=opt.batch_size, shuffle=True,
        num_workers=opt.num_workers, pin_memory=True, drop_last=True)
    print('Starting training...')
    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        mark = epoch if opt.save_all else 'last'
        log_dict_train, _ = trainer.train(epoch, train_loader)
        logger.write('epoch: {} |'.format(epoch))
        for k, v in log_dict_train.items():
            logger.scalar_summary('train_{}'.format(k), v, epoch)
            logger.write('{} {:8f} | '.format(k, v))
        if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
            save_model(os.path.join(opt.save_dir,
                                    'model_{}.pth'.format(mark)),
                       epoch, model, optimizer)
            with torch.no_grad():
                log_dict_val, preds = trainer.val(epoch, val_loader)
            if opt.eval_val:
                val_loader.dataset.run_eval(preds, opt.save_dir)
            for k, v in log_dict_val.items():
                logger.scalar_summary('val_{}'.format(k), v, epoch)
                logger.write('{} {:8f} | '.format(k, v))
        else:
            save_model(os.path.join(opt.save_dir, 'model_last.pth'),
                       epoch, model, optimizer)
        logger.write('\n')
        # NOTE: this variant treats save_point[0] as a save *period*
        # (every N epochs), unlike the sibling scripts which treat
        # save_point as a list of milestone epochs.
        # if epoch in opt.save_point:
        if epoch % opt.save_point[0] == 0:
            save_model(os.path.join(opt.save_dir,
                                    'model_{}.pth'.format(epoch)),
                       epoch, model, optimizer)
        if epoch in opt.lr_step:
            lr = opt.lr * (0.1 ** (opt.lr_step.index(epoch) + 1))
            print('Drop LR to', lr)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
    logger.close()
def main(opt):
    """Train a detector with all parameters unfrozen.

    Standard epoch loop: train, log, periodically validate, checkpoint,
    step the LR. This variant additionally saves a ``model_{epoch}.pth``
    at the *start* of every epoch and again after it.
    """
    torch.manual_seed(opt.seed)
    torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test
    Dataset = get_dataset(opt.dataset)
    opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
    print(opt)
    if not opt.not_set_cuda_env:
        os.environ["CUDA_VISIBLE_DEVICES"] = opt.gpus_str
    opt.device = torch.device("cuda" if opt.gpus[0] >= 0 else "cpu")
    logger = Logger(opt)
    print("Creating model...")
    model = create_model(opt.arch, opt.heads, opt.head_conv, opt=opt)
    optimizer = get_optimizer(opt, model)
    start_epoch = 0
    if opt.load_model != "":
        model, optimizer, start_epoch = load_model(
            model, opt.load_model, opt, optimizer
        )
    # Explicitly make every parameter trainable (a loaded checkpoint may
    # have been saved with some parameters frozen).
    for i, param in enumerate(model.parameters()):
        param.requires_grad = True
    trainer = Trainer(opt, model, optimizer)
    trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)
    if opt.val_intervals < opt.num_epochs or opt.test:
        print("Setting up validation data...")
        val_loader = torch.utils.data.DataLoader(
            Dataset(opt, "val"),
            batch_size=1,
            shuffle=False,
            num_workers=1,
            pin_memory=True,
        )
        if opt.test:
            # Test-only mode: single validation pass, then exit.
            _, preds = trainer.val(0, val_loader)
            val_loader.dataset.run_eval(preds, opt.save_dir)
            return
    print("Setting up train data...")
    train_loader = torch.utils.data.DataLoader(
        Dataset(opt, "train"),
        batch_size=opt.batch_size,
        shuffle=True,
        num_workers=opt.num_workers,
        pin_memory=True,
        drop_last=True,
    )
    print("Starting training...")
    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        # NOTE(review): this checkpoints *before* training the epoch, and
        # an identical save happens again below after the epoch — the pair
        # looks redundant but may be a deliberate crash-recovery choice;
        # confirm before removing either.
        save_model(
            os.path.join(opt.save_dir, "model_{}.pth".format(epoch)),
            epoch,
            model,
            optimizer,
        )
        mark = epoch if opt.save_all else "last"
        log_dict_train, _ = trainer.train(epoch, train_loader)
        logger.write("epoch: {} |".format(epoch))
        for k, v in log_dict_train.items():
            logger.scalar_summary("train_{}".format(k), v, epoch)
            logger.write("{} {:8f} | ".format(k, v))
        if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
            save_model(
                os.path.join(opt.save_dir, "model_{}.pth".format(mark)),
                epoch,
                model,
                optimizer,
            )
            with torch.no_grad():
                log_dict_val, preds = trainer.val(epoch, val_loader)
            if opt.eval_val:
                val_loader.dataset.run_eval(preds, opt.save_dir)
            for k, v in log_dict_val.items():
                logger.scalar_summary("val_{}".format(k), v, epoch)
                logger.write("{} {:8f} | ".format(k, v))
        else:
            save_model(
                os.path.join(opt.save_dir, "model_last.pth"), epoch, model, optimizer
            )
        logger.write("\n")
        # The save_point guard was disabled on purpose, making this save
        # unconditional every epoch:
        # if epoch in opt.save_point:
        save_model(
            os.path.join(opt.save_dir, "model_{}.pth".format(epoch)),
            epoch,
            model,
            optimizer,
        )
        # Step LR decay: divide by 10 at each milestone in opt.lr_step.
        if epoch in opt.lr_step:
            lr = opt.lr * (0.1 ** (opt.lr_step.index(epoch) + 1))
            print("Drop LR to", lr)
            for param_group in optimizer.param_groups:
                param_group["lr"] = lr
    logger.close()
def run_epoch(self, phase, epoch, data_loader, model, optimizer):
    """Run one epoch of training ('train') or evaluation (any other phase).

    Moves each batch to ``opt.device``, forwards through
    ``self.model_with_loss``, backprops when training, maintains running
    loss averages and a progress bar, and periodically checkpoints
    ``model_last.pth``.

    Returns:
        (ret, results): ``ret`` maps each tracked loss name to its epoch
        average plus ``'time'`` in minutes; ``results`` is an (unused
        here) empty dict.
    """
    model_with_loss = self.model_with_loss
    if phase == 'train':
        model_with_loss.train()
    else:
        # In eval, unwrap DataParallel so .eval() reaches the real module.
        if len(self.opt.gpus) > 1:
            model_with_loss = self.model_with_loss.module
        model_with_loss.eval()
        torch.cuda.empty_cache()
    opt = self.opt
    results = {}
    data_time, batch_time = AverageMeter(), AverageMeter()
    # Track only losses that contribute (weight > 0), plus the total.
    avg_loss_stats = {l: AverageMeter() for l in self.loss_stats
                      if l == 'tot' or opt.weights[l] > 0}
    num_iters = len(data_loader) if opt.num_iters < 0 else opt.num_iters
    bar = Bar('{}/{}'.format(opt.task, opt.exp_id), max=num_iters)
    # BUG FIX: the original used `iter_id % int(num_iters / 10)`, which
    # raises ZeroDivisionError whenever num_iters < 10. Checkpoint every
    # ~10% of the epoch, at least every iteration for tiny epochs.
    save_every = max(1, num_iters // 10)
    end = time.time()
    for iter_id, batch in enumerate(data_loader):
        if iter_id >= num_iters:
            break
        data_time.update(time.time() - end)
        # 'meta' holds non-tensor bookkeeping and stays on CPU.
        for k in batch:
            if k != 'meta':
                batch[k] = batch[k].to(device=opt.device, non_blocking=True)
        output, loss, loss_stats = model_with_loss(batch)
        loss = loss.mean()
        if phase == 'train':
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
        batch_time.update(time.time() - end)
        end = time.time()
        # NOTE(review): this saves in validation phases too — confirm
        # whether checkpointing was meant to be train-only.
        if iter_id % save_every == 0:
            save_model(os.path.join(opt.save_dir, 'model_last.pth'),
                       epoch, model, optimizer)
        Bar.suffix = '{phase}: [{0}][{1}/{2}]|Tot: {total:} |ETA: {eta:} '.format(
            epoch, iter_id, num_iters, phase=phase,
            total=bar.elapsed_td, eta=bar.eta_td)
        for l in avg_loss_stats:
            avg_loss_stats[l].update(
                loss_stats[l].mean().item(), batch['image'].size(0))
            Bar.suffix = Bar.suffix + '|{} {:.4f} '.format(
                l, avg_loss_stats[l].avg)
        Bar.suffix = Bar.suffix + '|Data {dt.val:.3f}s({dt.avg:.3f}s) ' \
            '|Net {bt.avg:.3f}s'.format(dt=data_time, bt=batch_time)
        if opt.print_iter > 0:  # If not using progress bar
            if iter_id % opt.print_iter == 0:
                print('{}/{}| {}'.format(opt.task, opt.exp_id, Bar.suffix))
        else:
            bar.next()
        if opt.debug > 0:
            self.debug(batch, output, iter_id, dataset=data_loader.dataset)
        # Drop references promptly to free GPU memory before next batch.
        del output, loss, loss_stats
    bar.finish()
    ret = {k: v.avg for k, v in avg_loss_stats.items()}
    ret['time'] = bar.elapsed_td.total_seconds() / 60.
    return ret, results
def online_fit(num_timesteps, num_targets, num_tweets=300):
    """Incrementally retrain the price model on data newer than the DB.

    Reads the most recent rows from ``historical.db``, fetches fresh data
    starting one hour after the last stored record, fine-tunes the saved
    model on the combined window, predicts the next ``num_targets``
    steps, logs everything under ``logs/``, and tweets the prediction.

    Returns:
        The de-normalized prediction array for the next steps.
    """
    stabilize_logs()
    dir_path = os.path.dirname(os.path.abspath(__file__))
    conn = sqlite3.connect(os.path.join(dir_path, 'historical.db'))
    try:
        cursor = conn.cursor()
        # for debugging, to simulate a 1 hour pass of time:
        # cursor.execute("DELETE FROM historical ORDER BY date DESC LIMIT 1")
        cursor.execute("SELECT * FROM historical ORDER BY date DESC LIMIT 1")
        last_record = cursor.fetchall()
        # First column is a millisecond timestamp; start one hour later.
        from_date = arrow.get(
            (float(last_record[0][0]) + 3600000) / 1000
        ).format('YYYY-MM-DD HH:mm:ss')
        combined_length = num_timesteps + num_targets
        cursor.execute(
            "SELECT * FROM historical ORDER BY date DESC LIMIT {}".format(
                combined_length - 1)
        )  # need to fit with some data in the db as the model didn't fit itself with said data on the past fit
        precomputed_data = np.asarray(cursor.fetchall(), dtype=np.float32)
        precomputed_data = precomputed_data[::-1]  # back to chronological order
        # IDIOM FIX: use context managers so file handles are closed even
        # if a write fails (the originals leaked on exceptions).
        with open(os.path.join(dir_path, "logs/context_prices.txt"), "a") as file:
            file.write(str(precomputed_data[-1][-1]) + "\n")
        conn.commit()
    finally:
        conn.close()
    unseen_data = get_historical(num_tweets, from_date=from_date,
                                 is_online=True)
    # actual price from last prediction, used for logging with twitter
    actual_price = unseen_data[0][-1]
    with open(os.path.join(dir_path, "logs/actuals.txt"), "a") as file:
        file.write(str(actual_price) + "\n")
    all_data = np.concatenate((precomputed_data, unseen_data), axis=0)
    # store recent data so that we can get a live prediction
    recent_reference = []
    recent_data = all_data[-num_timesteps:, 1:]
    recent_data = normalize_timestep(recent_data, recent_reference)
    timesteps = split_into_timeseries(all_data, combined_length)
    reference = []
    for i in range(0, len(timesteps)):
        timesteps[i] = normalize_timestep(timesteps[i], reference)
    # Inputs are the first num_timesteps columns; targets are the last
    # feature of the remaining num_targets columns.
    split_index = len(timesteps[0]) - num_targets
    X_train = timesteps[:, :split_index]
    y_train = timesteps[:, split_index:, -1]
    model = load_model()
    # train the model
    print("TRAINING")
    model.fit(X_train, y_train, batch_size=512, epochs=10,
              validation_split=0, verbose=2)
    save_model(model)
    # One-sample prediction needs a leading batch dimension.
    recent_data = np.asarray([recent_data.tolist()])
    future = model.predict(recent_data)
    # De-normalize: values were stored as (value / reference) - 1.
    predictions = (future[0] + 1) * recent_reference[0]
    recent_data[0] = (recent_data[0] + 1) * recent_reference[0]
    # document results in file
    print("WRITING TO LOG")
    with open(os.path.join(dir_path, "logs/log_online.txt"), "w") as file:
        for timestep in recent_data:
            file.write(str(timestep) + "\n")
        file.write(str(future[0]) + "\n")
    with open(os.path.join(dir_path, "logs/predictions.txt"), "a") as file:
        file.write(str(predictions[0]) + "\n")
    log_to_twitter(predictions)
    return predictions
def initial_fit(num_timesteps, num_targets, train_percent=.93, num_tweets=300):
    """Train the price model from scratch, evaluate, plot, and email.

    Clears the log files, fetches the full history, trains for 600
    epochs, writes train/test samples and predictions to
    ``logs/log_initial.txt``, de-normalizes, plots actual vs. predicted
    prices, saves the figure under ``graphs/``, and sends an email.
    """
    print("started init fit")
    dir_path = os.path.dirname(os.path.abspath(__file__))
    # clear contents of log files (open in 'w' truncates)
    open(os.path.join(dir_path, 'logs/context_prices.txt'), 'w').close()
    open(os.path.join(dir_path, 'logs/actuals.txt'), 'w').close()
    open(os.path.join(dir_path, 'logs/predictions.txt'), 'w').close()
    open(os.path.join(dir_path, 'logs/history.txt'), 'w').close()
    open(os.path.join(dir_path, 'logs/proxy_log.txt'), 'w').close()
    data = get_historical(num_tweets, from_date="")
    X_train, y_train, X_test, y_test, ref = load_data(
        data, num_timesteps, num_targets=num_targets,
        train_percent=train_percent
    )
    # TODO: make higher percentage of training when this goes into "prod"
    # store recent data so that we can get a live prediction
    recent_reference = []
    recent_data = data[-num_timesteps:, 1:]
    recent_data = normalize_timestep(recent_data, recent_reference)
    print(" X_train", X_train.shape)
    print(" y_train", y_train.shape)
    print(" X_test", X_test.shape)
    print(" y_test", y_test.shape)
    # 9 appears to be the per-timestep feature count — TODO confirm
    # against load_data/build_model.
    model = build_model([9, num_timesteps, num_targets])
    # train the model
    print("TRAINING")
    model.fit(X_train, y_train, batch_size=512, epochs=600,
              validation_split=0.1, verbose=2)
    save_model(model)
    trainScore = model.evaluate(X_train, y_train, verbose=100)
    print('Train Score: %.2f MSE (%.2f RMSE) (%.2f)' %
          (trainScore[0], math.sqrt(trainScore[0]), trainScore[1]))
    testScore = model.evaluate(X_test, y_test, verbose=100)
    print('Test Score: %.2f MSE (%.2f RMSE) (%.2f)' %
          (testScore[0], math.sqrt(testScore[0]), testScore[1]))
    # make predictions
    print("PREDICTING")
    p = model.predict(X_test)
    recent_data = [
        recent_data
    ]  # One-sample predictions need list wrapper. Argument must be 3d.
    recent_data = np.asarray(recent_data)
    future = model.predict(recent_data)
    # document results in file
    print("WRITING TO LOG")
    file = open(os.path.join(dir_path, "logs/log_initial.txt"), "w")
    for i in range(0, len(X_train)):
        for s in range(0, num_timesteps):
            file.write(str(X_train[i][s]) + "\n")
        file.write("Target: " + str(y_train[i]) + "\n")
        file.write("\n")
    for i in range(0, len(X_test)):
        for s in range(0, num_timesteps):
            file.write(str(X_test[i][s]) + "\n")
        file.write("Target: " + str(y_test[i]) + "\n")
        file.write("Prediction: " + str(p[i]) + "\n")
        file.write("\n")
    file.close()
    # de-normalize
    print("DENORMALIZING")
    # NOTE(review): the .9 below is hard-coded while train_percent
    # defaults to .93 — if these are meant to match, the reference index
    # is off whenever train_percent != .9; confirm against load_data.
    for i in range(0, len(p)):
        p[i] = (p[i] + 1) * ref[round(.9 * len(ref) + i)]
        y_test[i] = (y_test[i] + 1) * ref[round(.9 * len(ref) + i)]
    future[0] = (future[0] + 1) * recent_reference[0]
    recent_data[0] = (recent_data[0] + 1) * recent_reference[0]
    file = open(os.path.join(dir_path, "logs/predictions.txt"), "a")
    file.write(str(future[0][0]) + "\n")
    file.close()
    # plot historical predictions
    print("PLOTTING")
    for i in range(0, len(p)):
        # plot every (num_targets * 2)-th prediction window to avoid clutter
        if i % (num_targets * 2) == 0:
            plot_index = i  # for filling plot indexes
            plot_indexes = []
            plot_values = p[i]
            for j in range(0, num_targets):
                plot_indexes.append(plot_index)
                plot_index += 1
            plt.plot(plot_indexes, plot_values, color="red")
    # plot historical actual
    plt.plot(y_test[:, 0], color='blue', label='Actual')  # actual price history
    # plot recent prices (continuing from the last actual point)
    plot_indexes = [len(y_test) - 1]
    plot_values = [y_test[-1, 0]]
    plot_index = None
    for i in range(0, len(recent_data[0])):
        plot_values.append(recent_data[0][i][0])
        plot_index = len(y_test) + i
        plot_indexes.append(len(y_test) + i)
    plt.plot(plot_indexes, plot_values, color='blue')
    # plot future predictions (continuing from the last recent point)
    plot_indexes = [plot_index]
    plot_values = [recent_data[0][-1][0]]
    for i in range(0, len(future[0])):
        plot_index += 1
        plot_values.append(future[0][i])
        plot_indexes.append(plot_index)
    plt.plot(plot_indexes, plot_values, color="red", label="Prediction")
    # show/save plot
    print("SENDING EMAILS")
    plt.legend(loc="upper left")
    plt.title("ETH Price Predictions")
    plt.xlabel("Hours")
    plt.ylabel("Price ($)")
    filename = str(arrow.utcnow().format("YYYY-MM-DD"))
    plt.savefig(os.path.join(dir_path, "graphs/" + filename))
    # plt.show()
    plt.close()
    send_email()
    return
def main(opt):
    """Train a tracker, selecting the best epoch by MOTA on a fixed val set.

    Instead of the usual loss-based validation, each validation interval
    runs ``prefetch_test`` on the hard-coded '17halfval' split and keeps
    track of the best MOTA seen so far.
    """
    torch.manual_seed(opt.seed)
    torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test
    Dataset = get_dataset(opt.dataset)
    opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
    print(opt)
    if not opt.not_set_cuda_env:
        os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
    opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')
    logger = Logger(opt)
    print('Creating model...')
    model = create_model(opt.arch, opt.heads, opt.head_conv, opt=opt)
    optimizer = get_optimizer(opt, model)
    start_epoch = 0
    if opt.load_model != '':
        model, optimizer, start_epoch = load_model(
            model, opt.load_model, opt, optimizer)
    trainer = Trainer(opt, model, optimizer)
    trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)
    print('Setting up train data...')
    train_loader = torch.utils.data.DataLoader(
        Dataset(opt, 'train'), batch_size=opt.batch_size, shuffle=True,
        num_workers=opt.num_workers, pin_memory=True, drop_last=True)
    print('Starting training...')
    # for each epoch, record scale
    # Best-so-far tracking metrics (MOTA is a tracking accuracy score).
    bestmota = 0
    bestepoch = 0
    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        mark = epoch if opt.save_all else 'last'
        log_dict_train, _ = trainer.train(epoch, train_loader)
        logger.write('epoch: {} |'.format(epoch))
        for k, v in log_dict_train.items():
            logger.scalar_summary('train_{}'.format(k), v, epoch)
            logger.write('{} {:8f} | '.format(k, v))
        # Always refresh the rolling checkpoint after each epoch.
        save_model(os.path.join(opt.save_dir, 'model_last.pth'),
                   epoch, model, optimizer)
        if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
            save_model(os.path.join(opt.save_dir,
                                    'model_{}.pth'.format(mark)),
                       epoch, model, optimizer)
            # Loss-based validation is disabled in favor of MOT metrics:
            # with torch.no_grad():
            #     log_dict_val, preds = trainer.val(epoch, val_loader)
            # if opt.eval_val:
            #     val_loader.dataset.run_eval(preds, opt.save_dir)
            # for k, v in log_dict_val.items():
            #     logger.scalar_summary('val_{}'.format(k), v, epoch)
            #     logger.write('{} {:8f} | '.format(k, v))
            # NOTE(review): the validation split is hard-coded here —
            # consider promoting it to an option.
            valset = '17halfval'
            mota, motp = prefetch_test(opt, valset)
            if mota > bestmota:
                bestmota = mota
                bestepoch = epoch
            print('mota = {}, motp = {}, bestmota = {}, bestepoch = {}'.format(
                mota, motp, bestmota, bestepoch))
        logger.write('\n')
        if epoch in opt.save_point:
            save_model(os.path.join(opt.save_dir,
                                    'model_{}.pth'.format(epoch)),
                       epoch, model, optimizer)
        # Step LR decay: divide by 10 at each milestone in opt.lr_step.
        if epoch in opt.lr_step:
            lr = opt.lr * (0.1 ** (opt.lr_step.index(epoch) + 1))
            print('Drop LR to', lr)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
    logger.close()
def main(opt):
    """Train with mlflow experiment tracking, best-model saving and early stopping.

    Logs all options as mlflow params and all train/val scalars as
    mlflow metrics; keeps ``model_best.pth`` for the lowest
    ``opt.metric`` value and stops early when no improvement is seen for
    ``opt.early_stopping`` epochs.
    """
    torch.manual_seed(opt.seed)
    torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test
    Dataset = get_dataset(opt.dataset)
    opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
    print(opt)
    # Log our parameters into mlflow
    for key, value in vars(opt).items():
        mlflow.log_param(key, value)
    if not opt.not_set_cuda_env:
        os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
    opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')
    logger = Logger(opt)
    print('Creating model...')
    model = create_model(opt.arch, opt.heads, opt.head_conv, opt=opt)
    optimizer = get_optimizer(opt, model)
    start_epoch = 0
    if opt.load_model != '':
        model, optimizer, start_epoch = load_model(
            model, opt.load_model, opt, optimizer)
    trainer = Trainer(opt, model, optimizer)
    trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)
    if opt.val_intervals < opt.num_epochs or opt.test:
        print('Setting up validation data...')
        val_loader = torch.utils.data.DataLoader(
            Dataset(opt, 'val', opt.data_name), batch_size=1,
            shuffle=False, num_workers=1, pin_memory=True)
        if opt.test:
            # Test-only mode: single validation pass, then exit.
            _, preds = trainer.val(0, val_loader)
            val_loader.dataset.run_eval(preds, opt.save_dir)
            return
    print('Setting up train data...')
    train_loader = torch.utils.data.DataLoader(
        Dataset(opt, 'train', opt.data_name), batch_size=opt.batch_size,
        shuffle=True, num_workers=opt.num_workers, pin_memory=True,
        drop_last=True)
    print('Starting training...')
    # `best` tracks the lowest opt.metric value seen on validation.
    best = 1e10
    best_epoch = 1e10
    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        mark = epoch if opt.save_all else 'last'
        log_dict_train, _ = trainer.train(epoch, train_loader)
        logger.write('epoch: {} |'.format(epoch))
        for k, v in log_dict_train.items():
            logger.scalar_summary('train_{}'.format(k), v, epoch)
            logger.write('{} {:8f} | '.format(k, v))
            mlflow.log_metric('train_{}'.format(k), v, step=epoch)
        if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
            save_model(os.path.join(opt.save_dir,
                                    'model_{}.pth'.format(mark)),
                       epoch, model, optimizer)
            with torch.no_grad():
                log_dict_val, preds = trainer.val(epoch, val_loader)
            if opt.eval_val:
                val_loader.dataset.run_eval(preds, opt.save_dir)
            for k, v in log_dict_val.items():
                logger.scalar_summary('val_{}'.format(k), v, epoch)
                logger.write('{} {:8f} | '.format(k, v))
                mlflow.log_metric('val_{}'.format(k), v, step=epoch)
            # Lower metric value is better (e.g. a loss).
            if log_dict_val[opt.metric] < best:
                best = log_dict_val[opt.metric]
                best_epoch = epoch
                # NOTE(review): best-model save deliberately(?) omits the
                # optimizer, unlike the other save_model calls — confirm.
                save_model(os.path.join(opt.save_dir, 'model_best.pth'),
                           epoch, model)
        else:
            save_model(os.path.join(opt.save_dir, 'model_last.pth'),
                       epoch, model, optimizer)
        logger.write('\n')
        if epoch in opt.save_point:
            save_model(os.path.join(opt.save_dir,
                                    'model_{}.pth'.format(epoch)),
                       epoch, model, optimizer)
        # early stopping
        if isinstance(opt.early_stopping, int):
            if epoch - best_epoch > opt.early_stopping:
                msg = 'Stopped {} epoch. Best epoch is {}, score is {}.'.format(
                    epoch, best_epoch, best)
                print(msg)
                logger.write(msg)
                break
        # Step LR decay: divide by 10 at each milestone in opt.lr_step.
        if epoch in opt.lr_step:
            lr = opt.lr * (0.1 ** (opt.lr_step.index(epoch) + 1))
            print('Drop LR to', lr)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
    logger.close()
def main(opt):
    """Train the trajectory-prediction DecoderRNN with SmoothL1 loss.

    Uses batch size 1, scales the loss (x100 when small, x10 otherwise),
    and checkpoints both ``model_last.pth`` and a per-epoch file after
    every epoch. No validation loop in this variant.
    """
    torch.manual_seed(opt.seed)
    torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test
    Dataset = get_dataset(opt.dataset, prediction_model=True)
    if not opt.not_set_cuda_env:
        os.environ["CUDA_VISIBLE_DEVICES"] = opt.gpus_str
    opt.device = torch.device("cuda" if opt.gpus[0] >= 0 else "cpu")
    device = opt.device
    logger = Logger(opt)
    print("Creating model...")
    model = DecoderRNN(128, opt)
    optimizer = get_optimizer(opt, model)
    start_epoch = 0
    if opt.load_model_traj != "":
        # BUG FIX: the gate checks opt.load_model_traj but the original
        # loaded opt.load_model (likely '' here), so the trajectory
        # checkpoint was never actually restored.
        model, optimizer, start_epoch = load_model(
            model, opt.load_model_traj, opt, optimizer)
    loss_function = torch.nn.SmoothL1Loss()
    # Ensure everything is trainable even if the checkpoint froze layers.
    for param in model.parameters():
        param.requires_grad = True
    train_loader = torch.utils.data.DataLoader(
        Dataset(opt, "train"),
        batch_size=1,
        shuffle=True,
        num_workers=16,
        pin_memory=True,
        drop_last=True,
    )
    # Move any restored optimizer state tensors to the target device.
    for state in optimizer.state.values():
        for k, v in state.items():
            if isinstance(v, torch.Tensor):
                state[k] = v.to(device=device, non_blocking=True)
    model = model.to(device)
    loss_function = loss_function.to(device)
    print("Starting training...")
    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        for iter_id, (inputs, targets) in enumerate(train_loader):
            inputs = inputs.to(device=device).float()
            targets = targets.to(device=device).view(1, -1).float()
            outputs = model(inputs)
            loss = loss_function(outputs, targets)
            # Heuristic loss scaling: boost small losses more strongly.
            if 100 * loss.item() < 20:
                loss = 100 * loss
            else:
                loss = 10 * loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            del outputs, loss
        save_model(os.path.join(opt.save_dir, "model_last.pth"),
                   epoch, model, optimizer)
        logger.write("\n")
        save_model(
            os.path.join(opt.save_dir, "model_{}.pth".format(epoch)),
            epoch,
            model,
            optimizer,
        )
        # Step LR decay: divide by 10 at each milestone in opt.lr_step.
        if epoch in opt.lr_step:
            lr = opt.lr * (0.1 ** (opt.lr_step.index(epoch) + 1))
            for param_group in optimizer.param_groups:
                param_group["lr"] = lr
    logger.close()
# Top-level experiment driver: build a timestamped result directory,
# persist the hyper-parameters, then load data/models, train, save and test.
time_str = time.strftime("%m%d-%H%M", time.localtime(time.time()))
# Result directory name encodes the run configuration for later lookup.
rootdir = "{}/{}/{}-semi-{}-fixed-{}-ratio-{}-lr-{}/".format(
    "/data/yangy/data_prepare/result", hp['dataname'], time_str,
    str(hp['semi']), str(hp['fixed']), str(hp['ratio']), str(args.lr))
os.makedirs(rootdir, exist_ok=True)
hp['rootdir'] = rootdir
# Save the hyper-parameter dict next to the results for reproducibility.
np.save('{}parameter.npy'.format(rootdir), hp)
# load the models
my_models = load_model(hp)
# load the data
train_data, test_data = load_data(hp)
# pre-train the models (currently disabled)
#my_models = pre_train(hp, my_models, train_data, test_data)
# pre-training evaluation (currently disabled)
#result = test(test_data,hp,my_models,'pretrain')
# train the models
my_models = train(hp, my_models, train_data)
# save the models
save_model(my_models, rootdir)
# evaluate the final models
result = test(test_data, hp, my_models, 'final')
def pre_train(hp, models, train_data, test_data):
    """Pre-train the two view models (image and text) with MSE loss.

    ``train_data`` holds three subsets indexed 0..2: index 0 trains both
    views, index 1 only the text model (models[1]), index 2 only the
    image model (models[0]). After each epoch the models are saved under
    ``hp['modelpath']/<epoch+1>/`` and evaluated via ``test``.

    Returns:
        The (mutated, trained) list of models.
    """
    print("----------start pre-training models----------")
    view_num = len(models)
    par = []
    for i in range(view_num):
        models[i].cuda()
        models[i].train()
        par.append({'params': models[i].parameters()})
    optimizer = optim.Adam(par, lr=hp['pre_lr'])
    # Halve the LR every 10 epochs.
    scheduler = StepLR(optimizer, step_size=10, gamma=0.5)
    batch_size = hp['pre_size']
    loss_func = nn.MSELoss()
    for epoch in range(hp['pre_epoch']):
        # NOTE(review): stepping the scheduler at the *start* of the
        # epoch (before any optimizer.step) matches old PyTorch
        # semantics; modern PyTorch expects it after the epoch — confirm
        # the intended schedule before changing.
        scheduler.step()
        running_loss = 0.0
        data_num = 0
        for i in range(view_num):
            models[i].train()
        for i in range(3):
            data = train_data[i]
            # BUG FIX: was `data == None`, which breaks (or compares
            # elementwise) for array-like subsets; identity check is the
            # correct None test.
            if data is None:
                continue
            bag_num = len(data)
            data_num += bag_num
            # Ceil division: number of batches covering all bags
            # (replaces the original int()/while increment loop).
            max_step = (bag_num + batch_size - 1) // batch_size
            for step in range(max_step):
                # get data
                step_data = get_batch(
                    data,
                    list(range(step * batch_size,
                               min((step + 1) * batch_size, bag_num))),
                    hp)
                x1, x2, bag1, bag2, y = step_data
                b_y = Variable(y).cuda()
                loss = 0
                # Subsets 0 and 2 train the image model (models[0]).
                if i == 0 or i == 2:
                    x_img = Variable(x1).cuda()
                    h1, _, _ = models[0](x_img, bag1)
                    loss += loss_func(h1, b_y)
                # Subsets 0 and 1 train the text model (models[1]).
                if i == 0 or i == 1:
                    x_text = Variable(x2).cuda()
                    h2, _, _ = models[1](x_text, bag2)
                    loss += loss_func(h2, b_y)
                # Weight the running loss by the batch size.
                running_loss += loss.data * x2.size(0)
                # backward
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
        # epoch loss
        epoch_loss = running_loss / data_num
        print('epoch {}/{} | Loss: {:.9f}'.format(epoch, hp['pre_epoch'],
                                                  epoch_loss))
        # Checkpoint and evaluate after every epoch.
        rootpath = "{}{}/".format(hp['modelpath'], str(epoch + 1))
        os.makedirs(rootpath, exist_ok=True)
        save_model(models, rootpath)
        hp['rootdir'] = rootpath
        result = test(test_data, hp, models, 'pretrain')
    print("----------end pre-training models----------")
    return models
def main(opt):
    """Train a detector with optional layer freezing and dataset subsampling.

    Supports freezing the backbone / DLA-up / IDA-up necks, and training
    on a random fraction (`opt.use_percent`) of the training set when
    `opt.using_randomly_half` is set. Otherwise the standard
    train/validate/checkpoint epoch loop.
    """
    torch.manual_seed(opt.seed)
    torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test
    Dataset = get_dataset(opt.dataset)
    print(Dataset)
    opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
    print(opt)
    if not opt.not_set_cuda_env:
        os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
    opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')
    logger = Logger(opt)
    print('Creating model...')
    model = create_model(opt.arch, opt.heads, opt.head_conv, opt=opt)
    # Optionally freeze parts of the network for fine-tuning.
    if opt.fix_backbone:
        for param in model.backbone.parameters():
            param.requires_grad = False
    if opt.fix_dla_up:
        for param in model.neck.dla_up.parameters():
            param.requires_grad = False
    if opt.fix_ida_up:
        for param in model.neck.ida_up.parameters():
            param.requires_grad = False
    optimizer = get_optimizer(opt, model)
    start_epoch = 0
    if opt.load_model != '':
        model, optimizer, start_epoch = load_model(
            model, opt.load_model, opt, optimizer)
    trainer = Trainer(opt, model, optimizer, logger)
    trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)
    if opt.val_intervals < opt.num_epochs or opt.test:
        print('Setting up validation data...')
        val_loader = torch.utils.data.DataLoader(
            Dataset(opt, 'val'), batch_size=1, shuffle=False,
            num_workers=1, pin_memory=True)
        if opt.test:
            # Test-only mode: single validation pass, then exit.
            _, preds = trainer.val(0, val_loader)
            val_loader.dataset.run_eval(preds, opt.save_dir)
            return
    print('Setting up train data...')
    if opt.using_randomly_half:
        # Train on a reproducible random fraction of the training set.
        test_data = Dataset(opt, 'train')
        length = len(test_data)
        # Re-seed so the split is deterministic for a given opt.seed.
        torch.random.manual_seed(opt.seed)
        actual_dataset, _ = torch.utils.data.random_split(
            test_data, [
                int(length * opt.use_percent),
                length - int(length * opt.use_percent)
            ])
    else:
        actual_dataset = Dataset(opt, 'train')
    train_loader = torch.utils.data.DataLoader(
        actual_dataset, batch_size=opt.batch_size, shuffle=True,
        num_workers=opt.num_workers, pin_memory=True, drop_last=True)
    print('Starting training...')
    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        mark = epoch if opt.save_all else 'last'
        log_dict_train, _ = trainer.train(epoch, train_loader)
        logger.write('epoch: {} |'.format(epoch))
        for k, v in log_dict_train.items():
            logger.scalar_summary('train_{}'.format(k), v, epoch)
            logger.write('{} {:8f} | '.format(k, v))
        if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
            save_model(os.path.join(opt.save_dir,
                                    'model_{}.pth'.format(mark)),
                       epoch, model, optimizer)
            with torch.no_grad():
                log_dict_val, preds = trainer.val(epoch, val_loader)
            if opt.eval_val:
                val_loader.dataset.run_eval(preds, opt.save_dir)
            for k, v in log_dict_val.items():
                logger.scalar_summary('val_{}'.format(k), v, epoch)
                logger.write('{} {:8f} | '.format(k, v))
        else:
            save_model(os.path.join(opt.save_dir, 'model_last.pth'),
                       epoch, model, optimizer)
        logger.write('\n')
        if epoch in opt.save_point:
            save_model(os.path.join(opt.save_dir,
                                    'model_{}.pth'.format(epoch)),
                       epoch, model, optimizer)
        # Step LR decay: divide by 10 at each milestone in opt.lr_step.
        if epoch in opt.lr_step:
            lr = opt.lr * (0.1 ** (opt.lr_step.index(epoch) + 1))
            print('Drop LR to', lr)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
    logger.close()