def train():
    evaluate_env_list_path = 'env_list_set1'
    print(evaluate_reject_when_full(evaluate_env_list_path))
    print(evaluate_totally_random(evaluate_env_list_path))
    env = produce_env()
    action_dim = 4
    obs_dim = 24
    rpm = ReplayMemory(MEMORY_SIZE)
    actor = PDActor(obs_dim=obs_dim, action_dim=action_dim)
    critic = PDCritirc(obs_dim=obs_dim, action_dim=action_dim)
    agent = Agent(actor=actor, critic=critic, obs_dim=obs_dim, action_dim=action_dim)
    # preload some data into the replay memory
    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(env, agent, rpm)
    max_episode = 2000
    # start training
    episode = 0
    while episode < max_episode:
        # train part
        for i in range(0, 50):
            total_reward = run_episode(env, agent, rpm)
            episode += 1
        # test part (render=True shows the animation)
        eval_reward = evaluate(evaluate_env_list_path, agent, render=False)
        print('episode:{} Test reward:{}'.format(episode, eval_reward))
def train(lr=0.001, num_iter=1000, num_episode=10, num_epoch=10, batch_size=32,
          evaluate_env_list_path='env_list_set1',
          train_total_time=600, show_baseline=False,
          continue_train=False, model_path='best_actor'):
    if show_baseline:
        print(evaluate_reject_when_full(evaluate_env_list_path))
        print(evaluate_totally_random(evaluate_env_list_path))
    env = produce_env(total_time=train_total_time)
    action_dim = 4
    obs_dim_1 = 45
    request_dim = 17
    obs_dim_2 = 10
    obs_dim = obs_dim_1 + obs_dim_2 * 7
    encoder = Encoder(input_size=request_dim, output_size=obs_dim_2)
    actor = Actor(encoder, obs_size=obs_dim, action_size=action_dim)
    agent = Agent(actor=actor, obs_dim=obs_dim, action_dim=action_dim)
    if continue_train:
        agent.load(model_path)
    for iter in range(num_iter):
        # 2.1 Interact with the env using the baseline policy to collect {s_t, a_t}
        all_obs = []
        all_action = []
        for episode in range(num_episode):
            obs_list, action_list, _ = run_episode_baseline(env)
            all_obs.extend(obs_list)
            all_action.extend(action_list)
        # optimize theta
        for epoch in range(num_epoch):
            num_examples = len(all_obs)
            indices = list(range(num_examples))
            random.shuffle(indices)
            for i in range(0, num_examples, batch_size):
                if i + batch_size < num_examples:
                    batch_obs = [all_obs[x] for x in indices[i:i + batch_size]]
                    batch_action = torch.tensor(
                        [all_action[x] for x in indices[i:i + batch_size]])
                else:
                    batch_obs = [all_obs[x] for x in indices[i:num_examples]]
                    batch_action = torch.tensor(
                        [all_action[x] for x in indices[i:num_examples]])
                agent.learn(batch_obs, batch_action)
        if iter % 10 == 0:
            # render=True shows the animation
            eval_reward = evaluate(evaluate_env_list_path, agent, render=False)
            print('iter:{} Test reward:{}'.format(iter, eval_reward))
            agent.save(model_path)
    agent.save(model_path)
def train(gamma=0.9, base_line=0.5, lr=0.0001, total_time=20,
          num_iter=1000, num_episode=10, num_epoch=10,
          evaluate_env_list_path='env_list_set1', show_base_line=False):
    # num_iter: total number of experiments
    # num_episode: trajectories simulated per experiment
    # num_epoch: optimization passes per experiment
    if show_base_line:
        print(evaluate_reject_when_full(evaluate_env_list_path))
        print(evaluate_totally_random(evaluate_env_list_path))
    env = produce_env(total_time=total_time)
    action_dim = 4
    obs_dim = 45
    PPOactor = Actor(obs_size=obs_dim, action_size=action_dim)
    agent = Agent(actor=PPOactor, obs_dim=obs_dim, action_dim=action_dim, lr=lr)
    for iter in range(num_iter):
        # 2.1 Use theta_k to interact with the env to collect {s_t, a_t}
        # and compute advantage(s_t, a_t) = sum_{t'=t}^{T_n} r_{t'}^n
        all_obs = []
        all_action = []
        all_advantage = []
        for episode in range(num_episode):
            obs_list, action_list, reward_list = run_episode(env, agent)
            advantage_list = calc_advantage(reward_list, gamma=gamma, base_line=base_line)
            all_obs.extend(obs_list)
            all_action.extend(action_list)
            all_advantage.extend(advantage_list)
        dataset = PPODataset(obs_list=all_obs, action_list=all_action,
                             advantage_list=all_advantage)
        dataloader = DataLoader(dataset, batch_size=128, shuffle=True)
        # optimize theta
        for epoch in range(num_epoch):
            for i, (batch_obs, batch_action, batch_adv) in enumerate(dataloader):
                agent.learn(batch_obs, batch_action, batch_adv)
        if iter % 10 == 0:
            # render=True shows the animation
            eval_reward = evaluate(evaluate_env_list_path, agent, render=False)
            print('iter:{} Test reward:{}'.format(iter, eval_reward))
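# Hedged sketch (not from the source): a minimal torch-style Dataset that the
# DataLoader calls above could wrap, assuming observations, actions, and
# advantages are stored as parallel lists of numbers. The project's own
# PPODataset/Dataset classes may be implemented differently.
import torch
from torch.utils.data import Dataset as TorchDataset

class PPODatasetSketch(TorchDataset):
    def __init__(self, obs_list, action_list, advantage_list):
        # convert the parallel lists into tensors once, up front
        self.obs = torch.tensor(obs_list, dtype=torch.float32)
        self.actions = torch.tensor(action_list, dtype=torch.int64)
        self.advantages = torch.tensor(advantage_list, dtype=torch.float32)

    def __len__(self):
        return len(self.obs)

    def __getitem__(self, idx):
        return self.obs[idx], self.actions[idx], self.advantages[idx]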
def train(show_baseline=False, continue_train=False,
          model_save_path='best_model', learn_freq=5, memory_size=20000,
          memory_warmup_size=2000, batch_size=32, learning_rate=0.001,
          gamma=0.9, alpha=0.9, max_episode=1000):
    evaluate_env_list_path = 'env_list_set1'
    if show_baseline:
        print(evaluate_reject_when_full(evaluate_env_list_path))
        print(evaluate_totally_random(evaluate_env_list_path))
    env = produce_env()
    action_dim = 4
    obs_dim_1 = 45
    request_dim = 17
    obs_dim_2 = 10
    obs_dim = obs_dim_1 + obs_dim_2 * 7
    encoder = Encoder(input_size=request_dim, output_size=obs_dim_2,
                      use_rnn=False, use_gru=True, use_lstm=False)
    rpm = ReplayMemory(memory_size)  # DQN experience replay buffer
    critic = Critic(obs_dim=obs_dim, action_dim=action_dim, encoder=encoder)
    agent = Agent(critic=critic, obs_dim=obs_dim, action_dim=action_dim,
                  lr=learning_rate, gamma=gamma, alpha=alpha)
    if continue_train:
        agent.load(model_save_path)
    # preload the replay memory so early training has enough sample diversity
    while len(rpm) < memory_warmup_size:
        run_episode(env, agent, rpm, memory_warmup_size, learn_freq, batch_size)
    # start training
    episode = 0
    # train for max_episode episodes; the test part does not count toward episode
    while episode < max_episode:
        # train part
        for i in range(0, 100):
            total_reward = run_episode(env, agent, rpm, memory_warmup_size,
                                       learn_freq, batch_size)
            episode += 1
        # test part
        eval_reward = evaluate(evaluate_env_list_path, agent, render=False)
        print('episode:{} Test reward:{}'.format(episode, eval_reward))
        agent.save(model_save_path)
def train(show_baseline=False, continue_train=False,
          model_save_path='best_model', learn_freq=5, memory_size=20000,
          memory_warmup_size=2000, batch_size=32, learning_rate=0.001,
          gamma=0.9, alpha=0.9, max_episode=1000):
    evaluate_env_list_path = 'env_list_set1'
    if show_baseline:
        print(evaluate_reject_when_full(evaluate_env_list_path))
        print(evaluate_totally_random(evaluate_env_list_path))
    env = produce_env()
    action_dim = 4
    obs_dim = 45
    rpm = ReplayMemory(memory_size)  # DQN experience replay buffer
    critic = Critic(obs_dim=obs_dim, action_dim=action_dim)
    agent = Agent(critic=critic, obs_dim=obs_dim, action_dim=action_dim,
                  lr=learning_rate, gamma=gamma, alpha=alpha)
    if continue_train:
        agent.load(model_save_path)
    # preload the replay memory so early training has enough sample diversity
    while len(rpm) < memory_warmup_size:
        run_episode(env, agent, rpm, memory_warmup_size, learn_freq, batch_size)
    # start training
    episode = 0
    # train for max_episode episodes; the test part does not count toward episode
    while episode < max_episode:
        # train part
        for i in range(0, 10):
            total_reward = run_episode(env, agent, rpm, memory_warmup_size,
                                       learn_freq, batch_size)
            episode += 1
        # test part
        eval_reward = evaluate(evaluate_env_list_path, agent, render=False)
        print('episode:{} Test reward:{}'.format(episode, eval_reward))
        agent.save(model_save_path)
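# Hedged sketch (not from the source): one plausible run_episode for the DQN-style
# loops above, matching the call signature run_episode(env, agent, rpm,
# memory_warmup_size, learn_freq, batch_size). The env/agent/ReplayMemory APIs
# used here (reset, step, sample, append, learn) are assumptions, not the
# project's actual interfaces.
def run_episode_sketch(env, agent, rpm, memory_warmup_size, learn_freq, batch_size):
    total_reward, step = 0, 0
    obs = env.reset()
    done = False
    while not done:
        step += 1
        action = agent.sample(obs)  # e.g. epsilon-greedy action selection
        next_obs, reward, done, _ = env.step(action)
        rpm.append((obs, action, reward, next_obs, done))
        # learn every learn_freq steps once the warm-up buffer is filled
        if len(rpm) > memory_warmup_size and step % learn_freq == 0:
            batch = rpm.sample(batch_size)
            agent.learn(*batch)
        total_reward += reward
        obs = next_obs
    return total_reward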
def train(self, env, evaluate_env_path, gamma=0.9):
    env = self.env
    action_dim = env.action_space.n
    obs_dim = env.observation_space.shape[0]
    PPOactor = Actor(obs_size=obs_dim, action_size=action_dim)
    agent = Agent(actor=PPOactor, obs_dim=obs_dim, action_dim=action_dim)
    for iter in range(self.num_iter):
        # 2.1 Use theta_k to interact with the env to collect {s_t, a_t}
        # and compute advantage(s_t, a_t) = sum_{t'=t}^{T_n} r_{t'}^n
        all_obs = []
        all_action = []
        all_advantage = []
        for episode in range(self.num_episode):
            obs_list, action_list, reward_list = self.run_episode(env, agent)
            advantage_list = self.calc_advantage(reward_list, gamma=gamma)
            all_obs.extend(obs_list)
            all_action.extend(action_list)
            all_advantage.extend(advantage_list)
        dataset = Dataset(obs_list=all_obs, action_list=all_action,
                          advantage_list=all_advantage)
        dataloader = DataLoader(dataset, batch_size=128, shuffle=True)
        # optimize theta
        for epoch in range(self.num_epoch):
            for i, (batch_obs, batch_action, batch_adv) in enumerate(dataloader):
                agent.learn(batch_obs, batch_action, batch_adv)
        if iter % 10 == 0:
            # render=True shows the animation
            eval_reward = evaluate(evaluate_env_path, agent, render=False)
            print('iter:{} Test reward:{}'.format(iter, eval_reward))
def train(gamma=0.9, base_line=0.5, lr=0.001, epsilon=0.1,
          num_iter=1000, num_episode=10, num_epoch=10, batch_size=32,
          evaluate_env_list_path='env_list_set1',
          train_total_time=600, show_baseline=False):
    if show_baseline:
        print(evaluate_reject_when_full(evaluate_env_list_path))
        print(evaluate_totally_random(evaluate_env_list_path))
    env = produce_env(total_time=train_total_time)
    action_dim = 4
    obs_dim_1 = 45
    request_dim = 17
    obs_dim_2 = 10
    obs_dim = obs_dim_1 + obs_dim_2 * 7
    encoder = Encoder(input_size=request_dim, output_size=obs_dim_2)
    actor = Actor(encoder, obs_size=obs_dim, action_size=action_dim)
    agent = Agent(actor=actor, obs_dim=obs_dim, action_dim=action_dim,
                  lr=lr, epsilon=epsilon, update_target_steps=200)
    for iter in range(num_iter):
        # 2.1 Use theta_k to interact with the env to collect {s_t, a_t}
        # and compute advantage(s_t, a_t) = sum_{t'=t}^{T_n} r_{t'}^n
        all_obs = []
        all_action = []
        all_advantage = []
        for episode in range(num_episode):
            obs_list, action_list, reward_list = run_episode(env, agent)
            advantage_list = calc_advantage(reward_list, gamma, base_line)
            all_obs.extend(obs_list)
            all_action.extend(action_list)
            all_advantage.extend(advantage_list)
        # optimize theta
        for epoch in range(num_epoch):
            num_examples = len(all_obs)
            indices = list(range(num_examples))
            random.shuffle(indices)
            for i in range(0, num_examples, batch_size):
                if i + batch_size < num_examples:
                    batch_obs = [all_obs[x] for x in indices[i:i + batch_size]]
                    batch_action = torch.tensor(
                        [all_action[x] for x in indices[i:i + batch_size]])
                    batch_adv = torch.tensor(
                        [all_advantage[x] for x in indices[i:i + batch_size]])
                else:
                    batch_obs = [all_obs[x] for x in indices[i:num_examples]]
                    batch_action = torch.tensor(
                        [all_action[x] for x in indices[i:num_examples]])
                    batch_adv = torch.tensor(
                        [all_advantage[x] for x in indices[i:num_examples]])
                agent.learn(batch_obs, batch_action, batch_adv)
        if iter % 10 == 0:
            # render=True shows the animation
            eval_reward = evaluate(evaluate_env_list_path, agent, render=False)
            print('iter:{} Test reward:{}'.format(iter, eval_reward))
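# Hedged sketch (not from the source): one plausible implementation of the
# calc_advantage helper called in the training loops above, assuming it returns
# the discounted reward-to-go of each step minus a constant baseline. The
# project's actual helper may compute advantages differently.
def calc_advantage_sketch(reward_list, gamma=0.9, base_line=0.5):
    advantage_list = []
    running_return = 0.0
    # accumulate discounted returns from the end of the trajectory backwards
    for reward in reversed(reward_list):
        running_return = reward + gamma * running_return
        advantage_list.append(running_return - base_line)
    advantage_list.reverse()
    return advantage_list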
            writer.add_scalar('Loss_crossEntropy/train',
                              float(running_loss / opt.print_freq), iter)
            # trainset accuracy
            accuracy = correct * 100.0 / total
            writer.add_scalar('Accuracy/train', accuracy, iter)
            print("iteration: %d, loss: %.4f, accuracy on %d train images: %.3f %%"
                  % (iter, running_loss / opt.print_freq, total, accuracy))
            writer.add_graph(net, inputs)
            running_loss = 0.0
            correct = 0
            total = 0
        if iter % opt.save_latest_freq == 0:
            save_networks(opt, net, 'latest')
            print('saving the latest model (epoch %d, iter %d)' % (epoch, iter))

    # testset accuracy
    test_accuracy = evaluate(net, testloader, device)
    print("Accuracy on testset of epoch %d (iter: %d) is %.3f %%"
          % (epoch, iter, test_accuracy))
    writer.add_scalar('Accuracy/test', test_accuracy, iter)
    if epoch % opt.save_epoch_freq == 0:
        save_networks(opt, net, epoch)
    scheduler.step()
    lr = optimizer.param_groups[0]['lr']
    print('learning rate = %.7f' % lr)

writer.close()
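# Hedged sketch (not from the source): a plausible evaluate(net, testloader, device)
# as used in the loop above, assuming it returns the top-1 accuracy (in %) of a
# classifier over the test loader. The repository's evaluate may differ.
import torch

def evaluate_sketch(net, testloader, device):
    net.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for inputs, labels in testloader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = net(inputs)
            _, predicted = torch.max(outputs, 1)  # predicted class indices
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    net.train()
    return 100.0 * correct / total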
def trainIter(model, optimizer, trn_param, prpty, model_dir, val_data=None,
              batch_size=32, total_steps=3000000, save_steps=5000,
              eval_steps=5000, tol_steps=1000000, decrease_steps=2000000,
              lr_decrease_rate=0.1):
    best_mae = trn_param['best_mae']
    best_iter = trn_param['best_iter']
    iteration = trn_param['iteration']
    log = trn_param['log']
    dur = []
    dur_val = []
    dist_graph_loss, line_graph_loss, combined_loss = 0., 0., 0.
    for it in range(iteration + 1, total_steps + 1):
        t0 = time.time()
        batch_hg, dg_node_feat_discrete, lg_node_feat_continuous, \
            lg_node_feat_discrete, dg_edge_feat, lg_edge_feat, y = \
            trn_data.next_random_batch(batch_size, prpty)
        cuda_hg = batch_hg.to(device)
        dg_node_feat_discrete = dg_node_feat_discrete.to(device)
        lg_node_feat_continuous = lg_node_feat_continuous.to(device)
        lg_node_feat_discrete = lg_node_feat_discrete.to(device)
        dg_edge_feat = dg_edge_feat.to(device)
        lg_edge_feat = lg_edge_feat.to(device)
        y = y.to(device)
        dg_loss, lg_loss, cb_loss = train(cuda_hg, dg_node_feat_discrete,
                                          lg_node_feat_continuous,
                                          lg_node_feat_discrete, dg_edge_feat,
                                          lg_edge_feat, y, model, optimizer)
        dist_graph_loss += dg_loss
        line_graph_loss += lg_loss
        combined_loss += cb_loss
        t1 = time.time()
        dur.append(t1 - t0)
        if it % eval_steps == 0:
            dg_val_mae, lg_val_mae, cb_val_mae, _ = evaluate(model, val_data, prpty, 128, False)
            dur_val.append(time.time() - t1)
            mean_dur = np.mean(dur)
            mean_dur_val = np.mean(dur_val)
            print('-----------------------------------------------------------------------')
            print('Steps: %d / %d, time: %.4f, val_time: %.4f.'
                  % (it, total_steps, mean_dur, mean_dur_val))
            print('Dist graph loss: %.6f, line graph loss: %.6f, combined loss: %.6f.'
                  % (dist_graph_loss / (eval_steps * batch_size),
                     line_graph_loss / (eval_steps * batch_size),
                     combined_loss / (eval_steps * batch_size)))
            print('Val: Dist graph MAE: %.6f, line graph MAE: %.6f, combined MAE: %.6f.'
                  % (dg_val_mae, lg_val_mae, cb_val_mae))
            log += '-----------------------------------------------------------------------\n'
            log += 'Steps: %d / %d, time: %.4f, val_time: %.4f. \n' % (
                it, total_steps, mean_dur, mean_dur_val)
            log += 'Dist graph loss: %.6f, line graph loss: %.6f, combined loss: %.6f. \n' % (
                dist_graph_loss / (eval_steps * batch_size),
                line_graph_loss / (eval_steps * batch_size),
                combined_loss / (eval_steps * batch_size))
            log += 'Val: Dist graph MAE: %.6f, line graph MAE: %.6f, combined MAE: %.6f. \n' % (
                dg_val_mae, lg_val_mae, cb_val_mae)
            if cb_val_mae < best_mae:
                best_mae = cb_val_mae
                best_iter = it
                torch.save(model, os.path.join(model_dir, 'Best_model.pt'))
            start = time.time()
            dist_graph_loss, line_graph_loss, combined_loss = 0., 0., 0.
        if it % decrease_steps == 0:
            optimizer = lr_scheduler(optimizer, lr_decrease_rate)
        # stop training if the MAE on the validation set does not decrease within tol_steps
        if it - best_iter > tol_steps:
            break
        if it % save_steps == 0:
            trn_param['iteration'] = it
            trn_param['best_mae'] = best_mae
            trn_param['best_iter'] = best_iter
            trn_param['log'] = log
            save_model_state(model, optimizer, trn_param,
                             os.path.join(model_dir, 'checkpoint.tar'))
            # write the log
            f = open(os.path.join(model_dir, 'log.txt'), 'w')
            f.write(log)
            f.close()
    # write the log
    log += 'The best iter is %d!, best val MAE is %.6f. \n' % (best_iter, best_mae)
    f = open(os.path.join(model_dir, 'log.txt'), 'w')
    f.write(log)
    f.close()
    return best_mae, best_iter
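# Hedged sketch (not from the source): one way the lr_scheduler helper called in
# trainIter could work, assuming it simply multiplies every parameter group's
# learning rate of a torch optimizer by lr_decrease_rate and returns the optimizer.
def lr_scheduler_sketch(optimizer, lr_decrease_rate):
    for param_group in optimizer.param_groups:
        param_group['lr'] *= lr_decrease_rate
    return optimizer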
              batch_size=args.batch_size,
              decrease_steps=args.decrease_steps,
              lr_decrease_rate=args.lr_decrease_rate)
# test
else:
    print(args)
    tst_data = DataLoader(os.path.join(args.data_dir, 'test.data'))
    trn_data = DataLoader(os.path.join(args.data_dir, 'train.data'))
    if args.prpty != 'gap':
        model = torch.load(os.path.join(args.model_dir, 'Best_model.pt')).to(device)
        train_mae_1, train_mae_2, train_mae_all, train_attn = evaluate(
            model, trn_data, args.prpty, batch_size=args.batch_size,
            return_attn=args.return_attn)
        test_mae_1, test_mae_2, test_mae_all, test_attn = evaluate(
            model, tst_data, args.prpty, batch_size=args.batch_size,
            return_attn=args.return_attn)
        if args.return_attn:
            np.save(
                os.path.join(args.model_dir,
                             'train_attn_score_%s.npy' % (args.prpty)),
                train_attn)
            np.save(
                os.path.join(args.model_dir,