def train():
    evaluate_env_list_path = 'env_list_set1'
    print(evaluate_reject_when_full(evaluate_env_list_path))
    print(evaluate_totally_random(evaluate_env_list_path))
    env = produce_env()
    action_dim = 4
    obs_dim = 24
    rpm = ReplayMemory(MEMORY_SIZE)

    actor = PDActor(obs_dim=obs_dim, action_dim=action_dim)
    critic = PDCritirc(obs_dim=obs_dim, action_dim=action_dim)
    agent = Agent(actor=actor,
                  critic=critic,
                  obs_dim=obs_dim,
                  action_dim=action_dim)

    # pre-fill the replay memory with some data before training starts
    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(env, agent, rpm)

    max_episode = 2000

    # start training
    episode = 0
    while episode < max_episode:
        # train part
        for i in range(0, 50):
            total_reward = run_episode(env, agent, rpm)
            episode += 1

        # test part
        # render=True shows the animation during evaluation
        eval_reward = evaluate(evaluate_env_list_path, agent, render=False)
        print('episode:{}  Test reward:{}'.format(episode, eval_reward))
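The loop above assumes a ReplayMemory buffer exposing __len__ plus append/sample; neither is shown here. A minimal deque-based sketch of such a buffer (the transition layout is an assumption, not the project's actual implementation):

import random
from collections import deque

class ReplayMemory:
    """Minimal FIFO experience buffer (sketch)."""

    def __init__(self, max_size):
        self.buffer = deque(maxlen=max_size)

    def append(self, transition):
        # transition is assumed to be (obs, action, reward, next_obs, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        # uniform random mini-batch, as DQN-style training usually expects
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)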
Example #2
def train(lr=0.001, num_iter=1000, num_episode=10, num_epoch=10, batch_size=32,\
    evaluate_env_list_path='env_list_set1', \
    train_total_time=600, show_baseline=False, \
    continue_train=False, model_path = 'best_actor'):
    if show_baseline:
        print(evaluate_reject_when_full(evaluate_env_list_path))
        print(evaluate_totally_random(evaluate_env_list_path))
    env = produce_env(total_time=train_total_time)
    action_dim = 4
    obs_dim_1 = 45
    request_dim = 17
    obs_dim_2 = 10
    obs_dim = obs_dim_1 + obs_dim_2 * 7
    encoder = Encoder(input_size=request_dim, output_size=obs_dim_2)
    actor = Actor(encoder, obs_size=obs_dim, action_size=action_dim)
    agent = Agent(actor=actor, obs_dim=obs_dim, action_dim=action_dim)
    if continue_train:
        agent.load(model_path)
    for iter in range(num_iter):
        #2.1  Using theta k to interact with the env
        # to collect {s_t, a_t} and compute advantage
        # advantage(s_t, a_t) = sum_{t^prime=t}^{T_n}(r_{t^prime}^{n})

        all_obs = []
        all_action = []
        for episode in range(num_episode):
            obs_list, action_list, _ = run_episode_baseline(env)
            all_obs.extend(obs_list)
            all_action.extend(action_list)

        # optimize theta
        for epoch in range(num_epoch):
            # for i, (batch_obs, batch_action, batch_adv) in enumerate(dataloader):
            # agent.learn(batch_obs, batch_action, batch_adv)
            num_examples = len(all_obs)
            indices = list(range(num_examples))
            random.shuffle(indices)

            for i in range(0, num_examples, batch_size):

                if i + batch_size < len(all_obs):
                    # print(indice[i:i+batch_size])
                    batch_obs = [all_obs[x] for x in indices[i:i + batch_size]]
                    batch_action = torch.tensor(
                        [all_action[x] for x in indices[i:i + batch_size]])
                else:
                    batch_obs = [all_obs[x] for x in indices[i:num_examples]]
                    batch_action = torch.tensor(
                        [all_action[x] for x in indices[i:num_examples]])

                agent.learn(batch_obs, batch_action)
        if iter % 10 == 0:
            eval_reward = evaluate(evaluate_env_list_path, agent,
                                   render=False)  # render=True to visualize the result
            print('iter:{}  Test reward:{}'.format(iter, eval_reward))
            agent.save(model_path)
        agent.save(model_path)
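The dimensions above (obs_dim = 45 + 10 * 7) suggest that Encoder compresses each of seven 17-dimensional request vectors into 10 dimensions before they are concatenated with a 45-dimensional base observation. A minimal sketch of such an encoder, assuming a plain linear projection (example #4 below hints the real one can also be RNN/GRU/LSTM-based):

import torch
import torch.nn as nn

class Encoder(nn.Module):
    """Projects each request vector from input_size down to output_size dims (sketch)."""

    def __init__(self, input_size, output_size):
        super().__init__()
        self.fc = nn.Linear(input_size, output_size)

    def forward(self, requests):
        # requests: (num_requests, input_size) -> (num_requests, output_size)
        return torch.relu(self.fc(requests))

# assumed usage: concatenate the base observation with the flattened encodings
# obs = torch.cat([base_obs, encoder(requests).flatten()])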
Example #3
def train(gamma = 0.9, base_line=0.5, lr=0.0001, total_time=20, \
    num_iter = 1000, num_episode=10, num_epoch=10, \
    evaluate_env_list_path = 'env_list_set1', show_base_line=False):
    # clip epsilon
    # EPSILON = 0.1
    # # total number of experiments
    # num_iter = 1000
    # # for each experiment, simulate num_episode trajectories.
    # num_episode = 10
    # # for each experiment, tuning num_epoch times
    # num_epoch = 10
    # evaluate_env_list_path = 'env_list_set1'
    if show_base_line:
        print(evaluate_reject_when_full(evaluate_env_list_path))
        print(evaluate_totally_random(evaluate_env_list_path))
    env = produce_env(total_time=total_time)
    action_dim = 4
    obs_dim = 45
    PPOactor = Actor(obs_size=obs_dim, action_size=action_dim)
    agent = Agent(actor=PPOactor,
                  obs_dim=obs_dim,
                  action_dim=action_dim,
                  lr=lr)

    for iter in range(num_iter):
        #2.1  Using theta k to interact with the env
        # to collect {s_t, a_t} and compute advantage
        # advantage(s_t, a_t) = sum_{t^prime=t}^{T_n}(r_{t^prime}^{n})

        all_obs = []
        all_action = []
        all_advantage = []
        for episode in range(num_episode):
            obs_list, action_list, reward_list = run_episode(env, agent)
            advantage_list = calc_advantage(reward_list,
                                            gamma=gamma,
                                            base_line=base_line)
            all_obs.extend(obs_list)
            all_action.extend(action_list)
            all_advantage.extend(advantage_list)
        dataset = PPODataset(obs_list=all_obs,
                             action_list=all_action,
                             advantage_list=all_advantage)
        dataloader = DataLoader(dataset, batch_size=128, shuffle=True)

        # optimize theta

        for epoch in range(num_epoch):
            for i, (batch_obs, batch_action,
                    batch_adv) in enumerate(dataloader):
                agent.learn(batch_obs, batch_action, batch_adv)

        if iter % 10 == 0:
            eval_reward = evaluate(evaluate_env_list_path, agent,
                                   render=False)  # render=True to visualize the result
            print('iter:{}  Test reward:{}'.format(iter, eval_reward))
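calc_advantage is not shown; below is a minimal sketch consistent with its signature and the comment above, assuming a discounted reward-to-go minus a constant baseline (the project's actual implementation may differ):

def calc_advantage(reward_list, gamma=0.9, base_line=0.5):
    """Discounted reward-to-go minus a constant baseline (sketch)."""
    advantages = []
    running_return = 0.0
    for reward in reversed(reward_list):
        running_return = reward + gamma * running_return
        advantages.append(running_return - base_line)
    advantages.reverse()
    return advantages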
Example #4
def train(show_baseline=False, continue_train=False, \
    model_save_path='best_model', learn_freq= 5, memory_size = 20000, \
    memory_warmup_size = 2000, batch_size = 32, learning_rate = 0.001, \
    gamma = 0.9, alpha = 0.9, max_episode=1000, ):

    evaluate_env_list_path = 'env_list_set1'
    if show_baseline:
        print(evaluate_reject_when_full(evaluate_env_list_path))
        print(evaluate_totally_random(evaluate_env_list_path))
    env = produce_env()
    action_dim = 4
    obs_dim_1 = 45
    request_dim = 17
    obs_dim_2 = 10
    obs_dim = obs_dim_1 + obs_dim_2 * 7
    encoder = Encoder(input_size=request_dim, output_size=obs_dim_2, \
        use_rnn=False, use_gru=True, use_lstm=False)
    rpm = ReplayMemory(memory_size)  # DQN experience replay buffer
    critic = Critic(obs_dim=obs_dim, action_dim=action_dim, encoder=encoder)
    agent = Agent(critic=critic,
                  obs_dim=obs_dim,
                  action_dim=action_dim,
                  lr=learning_rate,
                  gamma=gamma,
                  alpha=alpha)

    if continue_train:
        agent.load(model_save_path)

    # pre-fill the replay buffer so the earliest training steps have enough sample diversity
    while len(rpm) < memory_warmup_size:
        run_episode(env, agent, rpm, memory_warmup_size, learn_freq,
                    batch_size)

    # start training
    episode = 0
    while episode < max_episode:  # train for max_episode episodes; evaluation does not count toward the episode total
        # train part
        for i in range(0, 100):
            total_reward = run_episode(env, agent, rpm, memory_warmup_size,
                                       learn_freq, batch_size)
            episode += 1
        # for parameter in critic.parameters():
        #     print(parameter)
        #     break
        # test part
        # print(critic.parameters())
        eval_reward = evaluate(evaluate_env_list_path, agent, render=False)
        print('episode:{}  Test reward:{}'.format(episode, eval_reward))
    agent.save(model_save_path)
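In this example run_episode both collects transitions and triggers learning. A minimal sketch of such a function, assuming a gym-style env and an agent with sample/learn methods (every interface below is an assumption, not the project's actual API):

def run_episode(env, agent, rpm, memory_warmup_size, learn_freq, batch_size):
    """One training episode for the DQN-style agent (sketch)."""
    total_reward, step = 0, 0
    obs = env.reset()
    done = False
    while not done:
        step += 1
        action = agent.sample(obs)  # assumed epsilon-greedy action selection
        next_obs, reward, done, _ = env.step(action)
        rpm.append((obs, action, reward, next_obs, done))

        # learn every learn_freq steps once the buffer is warmed up
        if len(rpm) > memory_warmup_size and step % learn_freq == 0:
            batch_obs, batch_action, batch_reward, batch_next_obs, batch_done = \
                zip(*rpm.sample(batch_size))
            agent.learn(batch_obs, batch_action, batch_reward, batch_next_obs, batch_done)

        total_reward += reward
        obs = next_obs
    return total_reward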
Example #5
def train(show_baseline=False, continue_train=False, \
    model_save_path='best_model', learn_freq= 5, memory_size = 20000, \
    memory_warmup_size = 2000, batch_size = 32, learning_rate = 0.001, \
    gamma = 0.9, alpha = 0.9, max_episode=1000, ):

    evaluate_env_list_path = 'env_list_set1'
    if show_baseline:
        print(evaluate_reject_when_full(evaluate_env_list_path))
        print(evaluate_totally_random(evaluate_env_list_path))
    env = produce_env()
    action_dim = 4
    obs_dim = 45
    rpm = ReplayMemory(memory_size)  # DQN experience replay buffer

    critic = Critic(obs_dim=obs_dim, action_dim=action_dim)
    agent = Agent(critic=critic,
                  obs_dim=obs_dim,
                  action_dim=action_dim,
                  lr=learning_rate,
                  gamma=gamma,
                  alpha=alpha)

    if continue_train:
        agent.load(model_save_path)

    # pre-fill the replay buffer so the earliest training steps have enough sample diversity
    while len(rpm) < memory_warmup_size:
        run_episode(env, agent, rpm, memory_warmup_size, learn_freq,
                    batch_size)

    # start training
    episode = 0
    while episode < max_episode:  # train for max_episode episodes; evaluation does not count toward the episode total
        # train part
        for i in range(0, 10):
            total_reward = run_episode(env, agent, rpm, memory_warmup_size,
                                       learn_freq, batch_size)
            episode += 1
        # for name, param in critic.state_dict().items():
        #     # name: str
        #     # param: Tensor
        #     print(param)
        # test part
        eval_reward = evaluate(evaluate_env_list_path, agent, render=False)
        print('episode:{}  Test reward:{}'.format(episode, eval_reward))
    agent.save(model_save_path)
Example #6
    def train(self, env, evaluate_env_path, gamma=0.9):
        env = self.env
        action_dim = env.action_space.n
        obs_dim = env.observation_space.shape[0]
        PPOactor = Actor(obs_size=obs_dim, action_size=action_dim)
        agent = Agent(actor=PPOactor, obs_dim=obs_dim, action_dim=action_dim)

        for iter in range(self.num_iter):
            #2.1  Using theta k to interact with the env
            # to collect {s_t, a_t} and compute advantage
            # advantage(s_t, a_t) = sum_{t^prime=t}^{T_n}(r_{t^prime}^{n})

            all_obs = []
            all_action = []
            all_advantage = []
            for episode in range(self.num_episode):
                obs_list, action_list, reward_list = self.run_episode(
                    env, agent)
                advantage_list = self.calc_advantage(reward_list, gamma=gamma)
                all_obs.extend(obs_list)
                all_action.extend(action_list)
                all_advantage.extend(advantage_list)
            dataset = Dataset(obs_list=all_obs,
                              action_list=all_action,
                              advantage_list=all_advantage)
            dataloader = DataLoader(dataset, batch_size=128, shuffle=True)

            # optimize theta

            for epoch in range(self.num_epoch):
                for i, (batch_obs, batch_action,
                        batch_adv) in enumerate(dataloader):
                    agent.learn(batch_obs, batch_action, batch_adv)

            if iter % 10 == 0:
                eval_reward = evaluate(evaluate_env_path, agent,
                                       render=False)  # render=True to visualize the result
                print('iter:{}  Test reward:{}'.format(iter, eval_reward))
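Actor is only referenced in these examples; a minimal sketch of a policy network matching Actor(obs_size=..., action_size=...), with assumed hidden size and activations:

import torch.nn as nn

class Actor(nn.Module):
    """Small MLP policy that outputs action probabilities (sketch)."""

    def __init__(self, obs_size, action_size, hidden_size=64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_size, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, action_size),
            nn.Softmax(dim=-1),  # probabilities consumed by the PPO agent
        )

    def forward(self, obs):
        return self.net(obs)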
Example #7
def train(gamma = 0.9, base_line=0.5, lr=0.001, epsilon=0.1, \
    num_iter=1000, num_episode=10, num_epoch=10, batch_size=32,\
    evaluate_env_list_path='env_list_set1',\
    train_total_time=600, show_baseline=False):
    if show_baseline:
        print(evaluate_reject_when_full(evaluate_env_list_path))
        print(evaluate_totally_random(evaluate_env_list_path))
    env = produce_env(total_time=train_total_time)
    action_dim = 4  
    obs_dim_1 = 45  
    request_dim = 17
    obs_dim_2 = 10
    obs_dim = obs_dim_1+obs_dim_2*7
    encoder = Encoder(input_size=request_dim, output_size=obs_dim_2)
    actor = Actor(encoder, obs_size=obs_dim, action_size=action_dim)
    agent = Agent(
        actor=actor,
        obs_dim = obs_dim,
        action_dim=action_dim,
        lr=lr,
        epsilon=epsilon,
        update_target_steps=200)

    for iter in range(num_iter):
        #2.1  Using theta k to interact with the env
        # to collect {s_t, a_t} and compute advantage
        # advantage(s_t, a_t) = sum_{t^prime=t}^{T_n}(r_{t^prime}^{n})
        
        all_obs = []
        all_action = []
        all_advantage = []
        for episode in range(num_episode):
            obs_list, action_list, reward_list = run_episode(env, agent)
            advantage_list = calc_advantage(reward_list, gamma, base_line)
            all_obs.extend(obs_list)
            all_action.extend(action_list)
            all_advantage.extend(advantage_list)
        # dataset = PPODataset(obs_list=all_obs, action_list=all_action, advantage_list=all_advantage)
        # dataloader = DataLoader(dataset, batch_size=128, shuffle=True)

        # optimize theta
        for epoch in range(num_epoch):
            # for i, (batch_obs, batch_action, batch_adv) in enumerate(dataloader):
                # agent.learn(batch_obs, batch_action, batch_adv)
            num_examples = len(all_obs) 
            indices = list(range(num_examples)) 
            random.shuffle(indices)
            
            for i in range(0, num_examples, batch_size):
                
                if i+batch_size<len(all_obs):
                    # print(indice[i:i+batch_size])
                    batch_obs = [all_obs[x] for x in indices[i:i+batch_size]]
                    batch_action = torch.tensor([all_action[x] for x in indices[i:i+batch_size]])
                    batch_adv = torch.tensor([all_advantage[x] for x in indices[i:i+batch_size]])
                else:
                    batch_obs = [all_obs[x] for x in indices[i:num_examples]]
                    batch_action = torch.tensor([all_action[x] for x in indices[i:num_examples]])
                    batch_adv = torch.tensor([all_advantage[x] for x in indices[i:num_examples]])

                agent.learn(batch_obs, batch_action, batch_adv)
        if iter%10 == 0:
            eval_reward = evaluate(evaluate_env_list_path, agent, render=False)  # render=True to visualize the result
            print('iter:{}  Test reward:{}'.format(iter, eval_reward))
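The commented-out DataLoader path above expects a PPODataset like the one used in example #3. A minimal sketch, assuming the collected triples are stored as plain Python lists:

from torch.utils.data import Dataset

class PPODataset(Dataset):
    """Wraps (obs, action, advantage) triples for a DataLoader (sketch)."""

    def __init__(self, obs_list, action_list, advantage_list):
        self.obs_list = obs_list
        self.action_list = action_list
        self.advantage_list = advantage_list

    def __len__(self):
        return len(self.obs_list)

    def __getitem__(self, idx):
        return self.obs_list[idx], self.action_list[idx], self.advantage_list[idx]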
Example #8
                writer.add_scalar('Loss_crossEntropy/train',
                                  float(running_loss / opt.print_freq), iter)
                # trainset accuracy
                accuracy = correct * 100.0 / total
                writer.add_scalar('Accuracy/train', accuracy, iter)
                print(
                    "iteration: %d, loss: %.4f, accuracy on %d train images: %.3f %%"
                    % (iter, running_loss / opt.print_freq, total, accuracy))
                writer.add_graph(net, inputs)
                running_loss = 0.0
                correct = 0
                total = 0
            if iter % opt.save_latest_freq == 0:
                save_networks(opt, net, 'latest')
                print('saving the latest model (epoch %d, iter %d)' %
                      (epoch, iter))

        # testset accuracy
        test_accuracy = evaluate(net, testloader, device)
        print("Accuracy on testset of epoch %d (iter: %d )is %.3f %%" %
              (epoch, iter, test_accuracy))
        writer.add_scalar('Accuracy/test', test_accuracy, iter)

        if epoch % opt.save_epoch_freq == 0:
            save_networks(opt, net, epoch)

        scheduler.step()
        lr = optimizer.param_groups[0]['lr']
        print('learning rate = %.7f' % lr)
    writer.close()
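This fragment starts mid-loop; net, optimizer, scheduler, and writer are created earlier. writer is the standard TensorBoard SummaryWriter and would be set up roughly like this (the log directory is an assumption):

from torch.utils.tensorboard import SummaryWriter

# created once before the training loop; writer.close() at the end flushes the events
writer = SummaryWriter(log_dir='runs/experiment_1')  # log_dir is an assumption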
Example #9
File: main.py  Project: mufeili/HMGNN
def trainIter(model,
              optimizer,
              trn_param,
              prpty,
              model_dir,
              val_data=None,
              batch_size=32,
              total_steps=3000000,
              save_steps=5000,
              eval_steps=5000,
              tol_steps=1000000,
              decrease_steps=2000000,
              lr_decrease_rate=0.1):

    best_mae = trn_param['best_mae']
    best_iter = trn_param['best_iter']
    iteration = trn_param['iteration']
    log = trn_param['log']
    dur = []
    dur_val = []

    dist_graph_loss, line_graph_loss, combined_loss = 0., 0., 0.
    for it in range(iteration + 1, total_steps + 1):
        t0 = time.time()
        batch_hg, dg_node_feat_discrete, lg_node_feat_continuous, lg_node_feat_discrete, dg_edge_feat, lg_edge_feat, y = \
            trn_data.next_random_batch(batch_size, prpty)

        cuda_hg = batch_hg.to(device)
        dg_node_feat_discrete = dg_node_feat_discrete.to(device)
        lg_node_feat_continuous = lg_node_feat_continuous.to(device)
        lg_node_feat_discrete = lg_node_feat_discrete.to(device)
        dg_edge_feat = dg_edge_feat.to(device)
        lg_edge_feat = lg_edge_feat.to(device)

        y = y.to(device)

        dg_loss, lg_loss, cb_loss = train(cuda_hg, dg_node_feat_discrete,
                                          lg_node_feat_continuous,
                                          lg_node_feat_discrete, dg_edge_feat,
                                          lg_edge_feat, y, model, optimizer)

        dist_graph_loss += dg_loss
        line_graph_loss += lg_loss
        combined_loss += cb_loss

        t1 = time.time()
        dur.append(t1 - t0)

        if it % eval_steps == 0:
            dg_val_mae, lg_val_mae, cb_val_mae, _ = evaluate(
                model, val_data, prpty, 128, False)
            dur_val.append(time.time() - t1)

            mean_dur = np.mean(dur)
            mean_dur_val = np.mean(dur_val)

            print(
                '-----------------------------------------------------------------------'
            )
            print('Steps: %d / %d, time: %.4f, val_time: %.4f.' %
                  (it, total_steps, mean_dur, mean_dur_val))
            print(
                'Dist graph loss: %.6f, line graph loss: %.6f, combined loss: %.6f.'
                % (dist_graph_loss /
                   (eval_steps * batch_size), line_graph_loss /
                   (eval_steps * batch_size), combined_loss /
                   (eval_steps * batch_size)))
            print(
                'Val: Dist graph MAE: %.6f, line graph MAE: %.6f, combined MAE: %.6f.'
                % (dg_val_mae, lg_val_mae, cb_val_mae))

            log += '-----------------------------------------------------------------------\n'
            log += 'Steps: %d / %d, time: %.4f, val_time: %.4f. \n' % (
                it, total_steps, mean_dur, mean_dur_val)
            log += 'Dist graph loss: %.6f, line graph loss: %.6f, combined loss: %.6f. \n' % (
                dist_graph_loss / (eval_steps * batch_size), line_graph_loss /
                (eval_steps * batch_size), combined_loss /
                (eval_steps * batch_size))
            log += 'Val: Dist graph MAE: %.6f, line graph MAE: %.6f, combined MAE: %.6f. \n' % (
                dg_val_mae, lg_val_mae, cb_val_mae)

            if cb_val_mae < best_mae:
                best_mae = cb_val_mae
                best_iter = it
                torch.save(model, os.path.join(model_dir, 'Best_model.pt'))

            start = time.time()
            dist_graph_loss, line_graph_loss, combined_loss = 0., 0., 0.

        if it % decrease_steps == 0:
            optimizer = lr_scheduler(optimizer, lr_decrease_rate)

        # stop training early if validation MAE has not improved within tol_steps
        if it - best_iter > tol_steps:
            break

        if it % save_steps == 0:
            trn_param['iteration'] = it
            trn_param['best_mae'] = best_mae
            trn_param['best_iter'] = best_iter
            trn_param['log'] = log
            save_model_state(model, optimizer, trn_param,
                             os.path.join(model_dir, 'checkpoint.tar'))

            # write the log
            f = open(os.path.join(model_dir, 'log.txt'), 'w')
            f.write(log)
            f.close()

    # write the log
    log += 'The best iter is %d!, best val MAE is %.6f. \n' % (best_iter,
                                                               best_mae)
    f = open(os.path.join(model_dir, 'log.txt'), 'w')
    f.write(log)
    f.close()

    return best_mae, best_iter
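lr_scheduler is a project helper that is not shown; a minimal sketch consistent with how it is called above (scale every parameter group's learning rate by lr_decrease_rate):

def lr_scheduler(optimizer, lr_decrease_rate):
    """Multiply each param group's learning rate by lr_decrease_rate (sketch)."""
    for param_group in optimizer.param_groups:
        param_group['lr'] *= lr_decrease_rate
    return optimizer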
Example #10
File: main.py  Project: mufeili/HMGNN
                                        batch_size=args.batch_size,
                                        decrease_steps=args.decrease_steps,
                                        lr_decrease_rate=args.lr_decrease_rate)

    # test
    else:
        print(args)

        tst_data = DataLoader(os.path.join(args.data_dir, 'test.data'))
        trn_data = DataLoader(os.path.join(args.data_dir, 'train.data'))
        if args.prpty != 'gap':
            model = torch.load(os.path.join(args.model_dir,
                                            'Best_model.pt')).to(device)
            train_mae_1, train_mae_2, train_mae_all, train_attn = evaluate(
                model,
                trn_data,
                args.prpty,
                batch_size=args.batch_size,
                return_attn=args.return_attn)
            test_mae_1, test_mae_2, test_mae_all, test_attn = evaluate(
                model,
                tst_data,
                args.prpty,
                batch_size=args.batch_size,
                return_attn=args.return_attn)
            if args.return_attn:
                np.save(
                    os.path.join(args.model_dir,
                                 'train_attn_score_%s.npy' % (args.prpty)),
                    train_attn)
                np.save(
                    os.path.join(args.model_dir,