Example #1
def train_dqn(env, args, workdir):
    action_space_size = env.action_space.n
    if not os.path.exists('train_log'):
        os.mkdir('train_log')
    writer = TensorBoard(f'train_log/{args.run_name}')

    dqn_config = dqn_config_default.copy()

    dqn_config.update({
        "batch_size": 4096,
        "min_replay_history": 40960,
        "training_steps": 4000,
        "lr": 0.0001,
        "target_update_period": 500
    })

    policy = DQNTorchPolicy(env.observation_space, env.action_space, env.config, dqn_config)

    dqn_trainer = Trainer(env, policy, dqn_config)

    mean_cost_list = []
    total_cost_list = []

    for i in range(args.iters):
        result = dqn_trainer.train(i)
        now_mean_reward = print_result(action_space_size, result, writer, i)
        mean_cost_list.append(now_mean_reward)
        if (i+1) % 5 == 0:
            _total_value = draw_route(args, dqn_trainer, env, mean_cost_list, workdir)
            total_cost_list.append(_total_value)
    
    list_to_figure([total_cost_list], ['total_cost'], 'total_cost', f'{workdir}/dqn_total_cost_{args.problem}.png')
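
Note: every example here calls a project-internal list_to_figure helper that is never shown. A minimal sketch consistent with its call sites (a list of curves, a list of labels, a title, an output path, plus the optional smoothed flag seen in Example #7) could look like the following; the moving-average smoothing is an assumption, not the original implementation.

import matplotlib
matplotlib.use('Agg')  # headless backend, safe on training servers
import matplotlib.pyplot as plt

def list_to_figure(curves, labels, title, fig_path, smoothed=True):
    """Plot one metric curve per label and save the figure to fig_path."""
    plt.figure()
    for curve, label in zip(curves, labels):
        if smoothed and len(curve) > 1:
            # simple trailing moving average to de-noise training curves
            window = max(1, len(curve) // 10)
            curve = [sum(curve[max(0, j + 1 - window):j + 1]) /
                     len(curve[max(0, j + 1 - window):j + 1])
                     for j in range(len(curve))]
        plt.plot(curve, label=label)
    plt.title(title)
    plt.xlabel('iteration')
    plt.legend()
    plt.savefig(fig_path)
    plt.close()
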
Example #2
def train_dqn(dqn_trainer, env, args, workdir, suffix):
    action_space_size = env.action_space.n
    if not os.path.exists('train_log'):
        os.mkdir('train_log')
    writer = TensorBoard(f'train_log/{args.problem}_{args.run_name}')

    max_mean_reward = -1000
    mean_cost_list = []
    total_cost_list = []
    total_valid_cost_list = []
    # min_route_cost = 10000000

    for i in range(args.iters):
        print(suffix)
        dqn_trainer.switch_mode(eval_mode=False)
        result = dqn_trainer.train(i)
        now_mean_reward = print_result(result, writer, i, dqn_trainer.policies_to_train, action_space_size)
        # if now_mean_reward > max_mean_reward:
        #     dqn_trainer.save(f"{args.problem}_{suffix}", i)
        mean_cost_list.append(now_mean_reward)
        if (i+1) % 5 == 0 or (now_mean_reward > max_mean_reward):
            reset_sequence = ((i+1) % args.sequence_update_freq == 0)
            dqn_trainer.switch_mode(eval_mode=True)
            _total_cost, _valid_route = draw_route(args, dqn_trainer, env, mean_cost_list, workdir, suffix, (now_mean_reward > max_mean_reward), reset_sequence)
            total_cost_list.append(_total_cost)
            if _valid_route:
                total_valid_cost_list.append(_total_cost)
            list_to_figure([total_cost_list], ['total_cost'], 'total_cost', f'{workdir}/dqn_total_cost_{args.problem}_{suffix}.png')
            if len(total_valid_cost_list) > 0:
                list_to_figure([total_valid_cost_list], ['total_valid_cost'], 'total_valid_cost', f'{workdir}/dqn_total_valid_cost_{args.problem}_{suffix}.png')
        max_mean_reward = max(max_mean_reward, now_mean_reward)
    return mean_cost_list
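
The TensorBoard class instantiated in Examples #1, #2 and #5 is also project-internal. A plausible minimal wrapper over PyTorch's SummaryWriter, matching the writer usage in these snippets, is sketched below; the add_scalar pass-through is an assumption.

from torch.utils.tensorboard import SummaryWriter

class TensorBoard:
    """Thin wrapper around SummaryWriter, matching TensorBoard(logdir) call sites."""
    def __init__(self, logdir):
        self.writer = SummaryWriter(log_dir=logdir)

    def add_scalar(self, tag, value, step):
        # forward scalar metrics straight to the underlying writer
        self.writer.add_scalar(tag, value, step)
        self.writer.flush()
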
Example #3
def draw_route(args, trainer, env, mean_cost_list, workdir, suffix, is_render):
    plt.figure(figsize=(30, 30))
    plt.axis("on")
    G, pos, route_edges, total_cost, valid_route = rl_solution_to_graph(
        trainer, env)
    if is_render:
        labels = {node: node for node in G.nodes()}
        nx.draw_networkx_nodes(G, pos, node_size=1000)
        nx.draw_networkx_labels(G,
                                pos,
                                labels,
                                font_size=30,
                                font_color="black")
        # matplotlib.cm.get_cmap is deprecated since matplotlib 3.7;
        # use matplotlib.colormaps['Spectral'] on newer releases
        cmap = matplotlib.cm.get_cmap('Spectral')
        max_vehicle_id = np.max(list(route_edges.keys())) + 1.0
        for vehicle_id in route_edges.keys():
            if len(route_edges[vehicle_id]) <= 0:
                continue
            nx.draw_networkx_edges(G,
                                   pos,
                                   width=2,
                                   arrows=True,
                                   arrowsize=100,
                                   edgelist=route_edges[vehicle_id],
                                   edge_color=cmap(vehicle_id /
                                                   max_vehicle_id))

        # save before show(): plt.show() can clear the current figure,
        # leaving savefig() to write a blank image
        plt.savefig(f'{workdir}/dqn_vrp_{args.problem}_{suffix}.png')
        plt.show()
    plt.close()
    list_to_figure([mean_cost_list], ['mean_cost'], 'mean_cost',
                   f'{workdir}/dqn_cost_{args.problem}_{suffix}.png')
    return total_cost, valid_route
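
rl_solution_to_graph is not shown; from its use here it returns a graph, node positions, per-vehicle edge lists, a total cost, and a validity flag. The self-contained toy below reproduces just the drawing convention (one colormap slot per vehicle) with hand-made data, so it runs without the trainer or the env.

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np

G = nx.DiGraph()
pos = {0: (0, 0), 1: (1, 0), 2: (1, 1), 3: (0, 1)}  # node -> (x, y)
G.add_nodes_from(pos)
# per-vehicle edge lists, the same shape route_edges has above
route_edges = {0: [(0, 1), (1, 0)], 1: [(0, 2), (2, 3), (3, 0)]}

cmap = plt.get_cmap('Spectral')
max_vehicle_id = np.max(list(route_edges.keys())) + 1.0
nx.draw_networkx_nodes(G, pos, node_size=300)
nx.draw_networkx_labels(G, pos, {n: n for n in G.nodes()})
for vehicle_id, edges in route_edges.items():
    if edges:
        nx.draw_networkx_edges(G, pos, edgelist=edges, arrows=True,
                               edge_color=cmap(vehicle_id / max_vehicle_id))
plt.savefig('toy_routes.png')  # save before any show() call
plt.close()
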
Example #4
def draw_route(args, trainer, env, mean_cost_list, workdir):
    items_in_pack = []
    total_value = 0.0
    total_weight = 0.0
    policy = trainer.get_policy()
    state = env.reset()
    for i in range(env.num_items):
        action, _, _ = policy.compute_single_action(state, info={}, explore=False)
        state, _, _, _ = env.step(action)
        if action == 1 and total_weight + env.weights[env.items_in_sequence[i]] <= env.capacity:
            items_in_pack.append(env.items_in_sequence[i])
            total_value += env.values[env.items_in_sequence[i]]
            total_weight += env.weights[env.items_in_sequence[i]]
    print(f'Total value = {total_value}, ortool value = {env.get_ortool_value()}')
    print(f'Total weight = {total_weight}')
    print(f'Capacity = {env.capacity}')
    print(f'Packed items: {items_in_pack}')
    list_to_figure([mean_cost_list], ['mean_cost'], 'mean_cost', f'{workdir}/dqn_cost_{args.problem}.png')
    return total_value
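
env.get_ortool_value() presumably solves the same knapsack instance exactly with OR-Tools. A hypothetical stand-alone equivalent using OR-Tools' knapsack solver is sketched below (values and weights must be integers; in OR-Tools >= 9.6 the module moved to ortools.algorithms.python.knapsack_solver).

from ortools.algorithms import pywrapknapsack_solver

def ortool_knapsack_value(values, weights, capacity):
    """Exact 0/1 knapsack optimum, a baseline for the RL packing above."""
    solver = pywrapknapsack_solver.KnapsackSolver(
        pywrapknapsack_solver.KnapsackSolver
        .KNAPSACK_MULTIDIMENSION_BRANCH_AND_BOUND_SOLVER,
        'knapsack_baseline')
    solver.Init(values, [weights], [capacity])  # one weight dimension
    return solver.Solve()

print(ortool_knapsack_value([60, 100, 120], [10, 20, 30], 50))  # -> 220
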
Example #5
def train_dqn(dqn_trainer, env, args, workdir, suffix):
    action_space_size = env.action_space.n
    if not os.path.exists('train_log'):
        os.mkdir('train_log')
    writer = TensorBoard(f'train_log/{args.run_name}')

    max_mean_reward = -1000
    mean_cost_list = []
    total_cost_list = []
    total_valid_cost_list = []

    for i in range(args.iters):
        print(suffix)
        result = dqn_trainer.train(i)
        now_mean_reward = print_result(action_space_size, result, writer, i)
        if now_mean_reward > max_mean_reward:
            dqn_trainer.policy.save_param(f"{args.problem}_{suffix}_best")
        mean_cost_list.append(now_mean_reward)
        if (i + 1) % 5 == 0 or (now_mean_reward > max_mean_reward):
            _total_cost, _valid_route = draw_route(
                args,
                dqn_trainer,
                env,
                mean_cost_list,
                workdir,
                suffix,
                is_render=(now_mean_reward > max_mean_reward))
            total_cost_list.append(_total_cost)
            if _valid_route:
                total_valid_cost_list.append(_total_cost)
            list_to_figure(
                [total_cost_list], ['total_cost'], 'total_cost',
                f'{workdir}/dqn_total_cost_{args.problem}_{suffix}.png')
            if len(total_valid_cost_list) > 0:
                list_to_figure(
                    [total_valid_cost_list], ['total_valid_cost'],
                    'total_valid_cost',
                    f'{workdir}/dqn_total_valid_cost_{args.problem}_{suffix}.png'
                )
        max_mean_reward = max(max_mean_reward, now_mean_reward)
    return mean_cost_list
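
print_result is another unshown helper, and its signature varies between examples. A sketch matching the four-argument form used in Examples #1 and #5, assuming an RLlib-style result dict with an episode_reward_mean key:

def print_result(action_space_size, result, writer, iteration):
    """Log the iteration's mean episode reward and return it (sketch)."""
    mean_reward = result['episode_reward_mean']
    print(f'iter {iteration}: mean reward {mean_reward:.3f} '
          f'(action space size {action_space_size})')
    if writer is not None:
        writer.add_scalar('train/episode_reward_mean', mean_reward, iteration)
    return mean_reward
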
Example #6
def train_ppo(args, env, knapsack_config, workdir, n_iterations):
    ext_conf = ppo.DEFAULT_CONFIG.copy()
    ext_conf.update({
            "num_workers": 2,
            "num_cpus_per_worker": 1,
            "vf_share_layers": True,
            "vf_loss_coeff": 1.0,      
            "vf_clip_param": 100.0,
            "use_critic": True,
            "use_gae": True,
            "framework": "torch",
            "lambda": 1.0,
            "gamma": 1.0,
            'env_config': knapsack_config,
            'timesteps_per_iteration': knapsack_config['episode_len'],
            'batch_mode': 'complete_episodes',
            # Size of batches collected from each worker
            "rollout_fragment_length": args.rollout,
            # Number of timesteps collected for each SGD round. This defines the size
            # of each SGD epoch.
            "train_batch_size": args.batch_size*args.rollout,
            # Total SGD batch size across all devices for SGD. This defines the
            # minibatch size within each epoch.
            "sgd_minibatch_size": args.min_batch_size*args.rollout,
            # Number of SGD iterations in each outer loop (i.e., number of epochs to
            # execute per train batch).
            "num_sgd_iter": 100,
            "shuffle_sequences": True,
            "lr": 1e-4,
            "_fake_gpus": True,
            "num_gpus": 0,
            "num_gpus_per_worker": 0,
            "model": {"custom_model": "knapsack_model"},
            "explore": True,
            # "exploration_config": {
            #     # The Exploration class to use.
            #     "type": "EpsilonGreedy",
            #     # Config for the Exploration class' constructor:
            #     "initial_epsilon": 1.0,
            #     "final_epsilon": 0.02,
            #     "epsilon_timesteps": args.rollout*args.batch_size*args.iters // 3,  # Timesteps over which to anneal epsilon.
            # },
            "exploration_config": {
                "type": StochasticSampling,
                "random_timesteps": args.rollout*args.batch_size*args.iters // 4,
            },
        })
    
    print(f"Environment: action space {env.action_space}, observation space {env.observation_space}")
    ppo_trainer = ppo.PPOTrainer(
        env=KnapsackEnv,
        config=ext_conf)
    
    # ppo_trainer.restore('/root/ray_results/PPO_CVRPEnv_2020-12-29_11-50-29uylrljyr/checkpoint_100/checkpoint-100')
    
    mean_cost_list = []
    total_cost_list = []
    for i in range(n_iterations):
        print("== Iteration", i, "==")
        trainer_result = ppo_trainer.train()
        print_training_results(trainer_result)
        # cost = env.total_cost - (trainer_result['episode_reward_mean']*env.total_cost) / trainer_result['episode_len_mean']
        # cost = (1.0 - trainer_result['episode_reward_mean']/trainer_result['episode_len_mean']) * env.max_cost * env.num_nodes
        cost = trainer_result['episode_reward_mean']
        mean_cost_list.append(cost)
        print('cost: ', cost)
        if (i+1) % 5 == 0:
            checkpoint = ppo_trainer.save()
            print("checkpoint saved at", checkpoint)
            _total_value = draw_route(args, ppo_trainer, env, mean_cost_list, workdir)
            total_cost_list.append(_total_value)
    list_to_figure([total_cost_list], ['total_cost'], 'total_cost', f'{workdir}/rl_knapsack_total_cost_{args.problem}.png')
    return ppo_trainer, mean_cost_list
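
This snippet targets the pre-2.0 Ray RLlib API (ppo.DEFAULT_CONFIG and PPOTrainer were removed in Ray 2.x). The imports and setup it leaves out would look roughly like this; the KnapsackModel class name is hypothetical and must match the "knapsack_model" key in the config above.

import ray
from ray.rllib.agents import ppo
from ray.rllib.models import ModelCatalog
from ray.rllib.utils.exploration.stochastic_sampling import StochasticSampling

ray.init()
# register the custom model referenced by "custom_model": "knapsack_model"
# ModelCatalog.register_custom_model('knapsack_model', KnapsackModel)
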
Example #7
# python ma_dp_dqn_vrp.py --iters 1000 --problem A-n64-k9 --training-step 512 --num-agents 9 --priori-memory 1 --run-name pm1 --episode 128
# python ma_dp_dqn_vrp.py --iters 3000 --problem A-n32-k5 --training-step 320 --num-agents 5 --run-name tt --episode 31 --mode dp

if __name__ == "__main__":
    args = parser.parse_args()
    vrp_config = env_config.copy()
    vrp_config.update({'problem': args.problem, 
                       "constraint_id": args.constraint_id,
                       "episode_len": args.episode})
    env = CVRPEnv(vrp_config)

    if args.pt:
        workdir = f"{os.environ['PT_OUTPUT_DIR']}/{args.problem}_{args.run_name}/"
    else:
        workdir = f"output/vrp/{args.problem}_{args.run_name}/"
    os.makedirs(workdir, exist_ok=True)
    

    metric_list = []
    metric_labels = []

    env.reset()
    env.is_constraint_imposed = False
    trainer_woc = create_trainer(env, args, workdir, 'woc')
    total_cost_list = train_dqn(trainer_woc, env, args, workdir, 'woc')
    metric_list.append(total_cost_list)
    metric_labels.append('mean_reward_without_constraint')

    list_to_figure(metric_list, metric_labels, 'mean reward of policies', f'{workdir}/dqn_reward_{args.problem}.png', smoothed=False)
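
The parser this entry point reads from is not shown. A minimal argparse sketch covering the flags used in the __main__ block and in the command lines at the top (defaults are illustrative only; argparse maps --constraint-id to args.constraint_id automatically):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--iters', type=int, default=1000)
parser.add_argument('--problem', type=str, default='A-n32-k5')
parser.add_argument('--constraint-id', type=int, default=0)
parser.add_argument('--episode', type=int, default=31)
parser.add_argument('--num-agents', type=int, default=5)
parser.add_argument('--run-name', type=str, default='tt')
parser.add_argument('--mode', type=str, default='dp')
parser.add_argument('--pt', action='store_true',
                    help='write outputs under $PT_OUTPUT_DIR instead of ./output')
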
Example #8
def train_dqn(rank, total_num_process, models, env, args, workdir, dqn_config,
              suffix):
    torch.manual_seed(args.seed + rank)
    agent_policies = {}
    policies_to_train = []
    _env = CVRPEnv(env.config)
    for vehicle_id in [0]:  # a single policy, for vehicle 0 only
        policy = DistDQNDPTorchPolicy(0, env.observation_space,
                                      env.action_space, dqn_config, models)
        agent_policies[vehicle_id] = policy
        policies_to_train.append(vehicle_id)
    dqn_trainer = Trainer(_env, agent_policies, policies_to_train, dqn_config)

    action_space_size = env.action_space.n
    # if not os.path.exists('train_log'):
    #     os.mkdir('train_log')
    # writer = TensorBoard(f'train_log/{args.problem}_{args.run_name}')

    max_mean_reward = -1000
    mean_cost_list = []
    total_cost_list = []
    total_valid_cost_list = []
    ortool_val_list = []
    cost_diff_list = []
    # pool = mp.Pool(8)

    for i in range(args.iters):
        print(f"===={suffix}=======iters: {i}======rank: {rank}===")
        dqn_trainer.switch_mode(eval_mode=False)
        result = dqn_trainer.train(i)
        if rank == total_num_process:
            now_mean_reward, _ = print_result(result, None, i,
                                              dqn_trainer.policies_to_train,
                                              action_space_size)
            # if now_mean_reward > max_mean_reward:
            #     dqn_trainer.save(f"{args.problem}_{suffix}", 'best')
            mean_cost_list.append(now_mean_reward)
            max_mean_reward = max(max_mean_reward, now_mean_reward)
        reset_sequence = False  # ((i+1) % args.sequence_update_freq == 0)
        if (rank == total_num_process) and ((i + 1) % args.render_freq == 0):
            dqn_trainer.switch_mode(eval_mode=True)
            tmp_total_cost_list = []
            tmp_total_valid_cost_list = []
            tmp_ortool_val_list = []
            for _ in range(args.eval_rounds):
                _total_cost, _valid_route, _ortool_val = draw_route(
                    args, dqn_trainer, env, mean_cost_list, workdir, suffix,
                    True, reset_sequence)
                tmp_total_cost_list.append(_total_cost)
                tmp_ortool_val_list.append(_ortool_val)
                if _valid_route:
                    tmp_total_valid_cost_list.append(_total_cost)

            total_cost_list.append(np.mean(tmp_total_cost_list))
            ortool_val_list.append(np.mean(tmp_ortool_val_list))

            if len(tmp_total_valid_cost_list) > 0:
                total_valid_cost_list.append(
                    np.mean(tmp_total_valid_cost_list))
                cost_diff_list.append(total_valid_cost_list[-1] -
                                      ortool_val_list[-1])
            elif len(total_valid_cost_list) > 0:
                total_valid_cost_list.append(total_valid_cost_list[-1])
            else:
                total_valid_cost_list.append(0.0)
            list_to_figure(
                [total_cost_list, ortool_val_list, total_valid_cost_list],
                ['total_cost', 'ortool_cost', 'total_valid_cost'],
                'total_cost',
                f'{workdir}/dqn_total_cost_{args.problem}_{suffix}.png')

            list_to_figure([mean_cost_list], ['mean_cost'], 'mean_cost',
                           f'{workdir}/dqn_cost_{args.problem}_{suffix}.png')
            if len(cost_diff_list) > 0:
                list_to_figure(
                    [cost_diff_list], ['cost_diff'], 'cost_diff',
                    f'{workdir}/dqn_cost_diff_{args.problem}_{suffix}.png')
        sys.stdout.flush()
    return mean_cost_list
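
train_dqn here is parameterized by rank and total_num_process and seeds torch per rank, which suggests it is spawned once per worker. A hypothetical launcher with torch.multiprocessing, following the snippet's apparent convention that ranks are 1-based and the highest rank does the logging and plotting; it assumes models is an iterable of nn.Module instances shared across workers.

import torch.multiprocessing as mp

def launch(total_num_process, models, env, args, workdir, dqn_config, suffix):
    # share model parameters across workers before forking
    for model in models:
        model.share_memory()
    processes = []
    for rank in range(1, total_num_process + 1):
        p = mp.Process(target=train_dqn,
                       args=(rank, total_num_process, models, env, args,
                             workdir, dqn_config, suffix))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
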