Example #1
    def train(self):
        args = self.args
        torch.manual_seed(args.seed)
        env = grid2op.make(args.env_name,
                           test=args.for_test,
                           reward_class=L2RPNReward)
        shared_model = ActorCritic(env.observation_space.size(),
                                   self.action_space, args.hidden_size)
        shared_model.share_memory()

        if args.no_shared:
            optimizer = None
        else:
            optimizer = my_optim.SharedAdam(shared_model.parameters(),
                                            lr=args.lr)
            optimizer.share_memory()

        processes = []

        counter = mp.Value('i', 0)
        lock = mp.Lock()

        p = mp.Process(target=self.do_test,
                       args=(args.num_processes, args, shared_model, counter))
        p.start()
        processes.append(p)

        for rank in range(0, args.num_processes):
            p = mp.Process(target=self.do_train,
                           args=(rank, args, shared_model, counter, lock,
                                 optimizer))
            p.start()
            processes.append(p)
        for p in processes:
            p.join()
Example #2
    def __init__(self, args_, logger_):
        self.args = args_
        self.logger = logger_
        self.env = AtariEnv(gym.make(self.args.game),
                            args_.frame_seq,
                            args_.frame_skip,
                            render=True)
        self.shared_model = A3CLSTMNet(self.env.state_shape,
                                       self.env.action_dim)
        self.shared_model.share_memory()
        self.optim = my_optim.SharedAdam(self.shared_model.parameters(),
                                         lr=self.args.lr)
        self.optim.share_memory()
        # visdom
        self.vis = visdom.Visdom()
        self.main_update_step = Value('d', 0)
        # load model
        if self.args.load_weight != 0:
            self.load_model(self.args.load_weight)

        self.jobs = []
        if self.args.t_flag:
            for process_id in range(self.args.jobs):
                job = A3CSingleProcess(process_id, self, logger_)
                self.jobs.append(job)
        self.test_win = None
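
Every example in this collection builds a `my_optim.SharedAdam` and calls `optimizer.share_memory()` before forking worker processes, but none of them show the class itself. Below is a minimal sketch of the commonly used pattern (in the style of ikostrikov's pytorch-a3c): subclass `torch.optim.Adam`, pre-allocate the per-parameter state as tensors, and move those tensors into shared memory so every process updates the same moment estimates. This is an illustration of the idea, not necessarily the exact `my_optim` implementation used by these repositories.

import math

import torch
import torch.optim as optim


class SharedAdam(optim.Adam):
    """Adam whose per-parameter state lives in shared memory (sketch)."""

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0):
        super(SharedAdam, self).__init__(params, lr=lr, betas=betas, eps=eps,
                                         weight_decay=weight_decay)
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                # Keep the state as tensors so they can be moved to shared memory.
                state['step'] = torch.zeros(1)
                state['exp_avg'] = torch.zeros_like(p.data)
                state['exp_avg_sq'] = torch.zeros_like(p.data)

    def share_memory(self):
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'].share_memory_()
                state['exp_avg'].share_memory_()
                state['exp_avg_sq'].share_memory_()

    def step(self, closure=None):
        loss = closure() if closure is not None else None
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                state = self.state[p]
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']
                state['step'] += 1
                if group['weight_decay'] != 0:
                    grad = grad.add(p.data, alpha=group['weight_decay'])
                # Update the shared first and second moment estimates in place.
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                denom = exp_avg_sq.sqrt().add_(group['eps'])
                bias_correction1 = 1 - beta1 ** state['step'].item()
                bias_correction2 = 1 - beta2 ** state['step'].item()
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1
                p.data.addcdiv_(exp_avg, denom, value=-step_size)
        return loss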
Example #3
def main():
    #env
    args = config()
    mp.set_start_method("spawn")
    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = ""

    env = create_atari_env(args.env_name)
    shared_model = ActorCritic(env.observation_space.shape[0],
                               env.action_space)
    shared_model.share_memory()

    optimizer = my_optim.SharedAdam(shared_model.parameters(), lr=args.lr)
    optimizer.share_memory()

    processes = []

    counter = mp.Value('i', 0)
    lock = mp.Lock()

    p = mp.Process(target=test,
                   args=(args.num_processes, args, shared_model, counter,
                         "./log/"))
    p.start()
    processes.append(p)

    for rank in range(0, args.num_processes):
        p = mp.Process(target=train,
                       args=(rank, args, shared_model, counter, lock,
                             optimizer))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
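
The launcher above starts each training worker with `mp.Process(target=train, ...)`, but the worker function itself is not part of the snippet. The sketch below shows what such a worker typically does in this A3C setup: seed per rank, keep a local copy of the network, sync it from the shared model, and push the local gradients into the shared parameters before calling the shared optimizer. `create_atari_env` and `ActorCritic` are the names used above; `rollout_and_compute_loss` and `args.max_grad_norm` are hypothetical placeholders for this illustration.

import torch


def ensure_shared_grads(model, shared_model):
    # After the first assignment, the shared parameter's .grad aliases this
    # worker's gradient tensor, so later backward() calls reuse it.
    for param, shared_param in zip(model.parameters(), shared_model.parameters()):
        if shared_param.grad is not None:
            return
        shared_param.grad = param.grad


def train(rank, args, shared_model, counter, lock, optimizer):
    torch.manual_seed(args.seed + rank)               # give each worker its own seed
    env = create_atari_env(args.env_name)
    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    model.train()
    while True:
        model.load_state_dict(shared_model.state_dict())   # pull the latest shared weights
        loss = rollout_and_compute_loss(model, env, args)  # hypothetical rollout helper
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        ensure_shared_grads(model, shared_model)  # hand the local gradients to the shared model
        optimizer.step()                          # SharedAdam updates the shared weights
        with lock:
            counter.value += 1                    # global step counter read by the test process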
Example #4
class Params():
    def __init__(self):
        self.lr = 0.0001
        self.gamma = 0.99
        self.tau = 1.
        self.seed = 1
        self.num_processes = 16
        self.num_steps = 20
        self.max_episode_length = 10000
        self.env_name = 'Breakout-v0'


# Main run
os.environ['OMP_NUM_THREADS'] = '1'
params = Params()
torch.manual_seed(params.seed)
env = create_atari_env(params.env_name)
shared_model = ActorCritic(env.observation_space.shape[0], env.action_space)
shared_model.share_memory()
optimizer = my_optim.SharedAdam(shared_model.parameters(), lr=params.lr)
optimizer.share_memory()
processes = []
p = mp.Process(target=test, args=(params.num_processes, params, shared_model))
p.start()
processes.append(p)
for rank in range(0, params.num_processes):
    p = mp.Process(target=train, args=(rank, params, shared_model, optimizer))
    p.start()
    processes.append(p)
for p in processes:
    p.join()
Example #5
    loader = DataLoader(opt)  # not used in training procedure, just used to set vocab_size and seq_length
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length

    model = models.setup(opt)
    model.train()
    num_parameter = get_num_params(model)
    print('number of parameters: ' + str(num_parameter))

    if opt.async_opt:
        if opt.use_cuda:
            model.cuda()
        model.share_memory()
        optimizer = my_optim.SharedAdam(model.parameters(),
                                        lr=opt.optim_lr,
                                        betas=(opt.optim_adam_beta1, opt.optim_adam_beta2),
                                        weight_decay=opt.optim_weight_decay)
        optimizer.share_memory()
        processes = []
        for rank in range(opt.num_processes):
            p = mp.Process(target=train, args=(rank, model, opt, optimizer))
            p.start()
            processes.append(p)

        for p in processes:
            p.join()
    else:
        if opt.use_cuda:
            model.cuda()
        rank = 0
        optimizer = None
Example #6
    shared_model.share_memory()

    if not args.no_curiosity:
        # <---ICM---
        shared_curiosity = IntrinsicCuriosityModule(
            # env.observation_space.shape[0], env.action_space)
            args.num_stack,
            env.action_space)
        shared_curiosity.share_memory()
        # ---ICM--->

    if args.no_shared:
        optimizer = None
    else:
        if args.no_curiosity:
            optimizer = my_optim.SharedAdam(shared_model.parameters(),
                                            lr=args.lr)
        elif not args.no_curiosity:
            if not args.curiosity_only:
                optimizer = my_optim.SharedAdam(  # ICM
                    chain(shared_model.parameters(),
                          shared_curiosity.parameters()),
                    lr=args.lr)
            elif args.curiosity_only:
                optimizer = my_optim.SharedAdam(shared_curiosity.parameters(),
                                                lr=args.lr)
        optimizer.share_memory()

    if (args.model_file is not None) and (args.optimizer_file is not None):
        logging.info("Start with a pretrained model")
        shared_model.load_state_dict(torch.load(args.model_file))
        optimizer.load_state_dict(torch.load(args.optimizer_file))
Example #7
def main(method):
    args = built_parser(method=method)
    env = gym.make(args.env_name)
    state_dim = env.observation_space.shape
    action_dim = env.action_space.shape[0]

    args.state_dim = state_dim
    args.action_dim = action_dim
    action_high = env.action_space.high
    action_low = env.action_space.low
    args.action_high = action_high.tolist()
    args.action_low = action_low.tolist()
    args.seed = np.random.randint(0, 30)
    args.init_time = time.time()

    if args.alpha == 'auto' and args.target_entropy == 'auto':
        delta_a = np.array(args.action_high, dtype=np.float32) - np.array(
            args.action_low, dtype=np.float32)
        args.target_entropy = -1 * args.action_dim  #+ sum(np.log(delta_a/2))

    Q_net1 = QNet(args)
    Q_net1.train()
    Q_net1.share_memory()
    Q_net1_target = QNet(args)
    Q_net1_target.train()
    Q_net1_target.share_memory()
    Q_net2 = QNet(args)
    Q_net2.train()
    Q_net2.share_memory()
    Q_net2_target = QNet(args)
    Q_net2_target.train()
    Q_net2_target.share_memory()
    actor1 = PolicyNet(args)

    actor1.train()
    actor1.share_memory()
    actor1_target = PolicyNet(args)
    actor1_target.train()
    actor1_target.share_memory()
    actor2 = PolicyNet(args)
    actor2.train()
    actor2.share_memory()
    actor2_target = PolicyNet(args)
    actor2_target.train()
    actor2_target.share_memory()

    Q_net1_target.load_state_dict(Q_net1.state_dict())
    Q_net2_target.load_state_dict(Q_net2.state_dict())
    actor1_target.load_state_dict(actor1.state_dict())
    actor2_target.load_state_dict(actor2.state_dict())

    Q_net1_optimizer = my_optim.SharedAdam(Q_net1.parameters(),
                                           lr=args.critic_lr)
    Q_net1_optimizer.share_memory()
    Q_net2_optimizer = my_optim.SharedAdam(Q_net2.parameters(),
                                           lr=args.critic_lr)
    Q_net2_optimizer.share_memory()
    actor1_optimizer = my_optim.SharedAdam(actor1.parameters(),
                                           lr=args.actor_lr)
    actor1_optimizer.share_memory()
    actor2_optimizer = my_optim.SharedAdam(actor2.parameters(),
                                           lr=args.actor_lr)
    actor2_optimizer.share_memory()
    log_alpha = torch.zeros(1, dtype=torch.float32, requires_grad=True)
    log_alpha.share_memory_()
    alpha_optimizer = my_optim.SharedAdam([log_alpha], lr=args.alpha_lr)
    alpha_optimizer.share_memory()

    share_net = [
        Q_net1, Q_net1_target, Q_net2, Q_net2_target, actor1, actor1_target,
        actor2, actor2_target, log_alpha
    ]
    share_optimizer = [
        Q_net1_optimizer, Q_net2_optimizer, actor1_optimizer, actor2_optimizer,
        alpha_optimizer
    ]

    experience_in_queue = []
    experience_out_queue = []
    for i in range(args.num_buffers):
        experience_in_queue.append(Queue(maxsize=10))
        experience_out_queue.append(Queue(maxsize=10))
    shared_queue = [experience_in_queue, experience_out_queue]
    step_counter = mp.Value('i', 0)
    stop_sign = mp.Value('i', 0)
    iteration_counter = mp.Value('i', 0)
    shared_value = [step_counter, stop_sign, iteration_counter]
    lock = mp.Lock()
    procs = []
    if args.code_model == "train":
        for i in range(args.num_actors):
            procs.append(
                Process(target=actor_agent,
                        args=(args, shared_queue, shared_value,
                              [actor1, Q_net1], lock, i)))
        for i in range(args.num_buffers):
            procs.append(
                Process(target=buffer,
                        args=(args, shared_queue, shared_value, i)))
        procs.append(
            Process(target=evaluate_agent,
                    args=(args, shared_value, share_net)))
        for i in range(args.num_learners):
            #device = torch.device("cuda")
            device = torch.device("cpu")
            procs.append(
                Process(target=leaner_agent,
                        args=(args, shared_queue, shared_value, share_net,
                              share_optimizer, device, lock, i)))
    elif args.code_model == "simu":
        procs.append(Process(target=simu_agent, args=(args, shared_value)))

    for p in procs:
        p.start()
    for p in procs:
        p.join()
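
The script above wires actor, buffer, learner, and evaluator processes together through two lists of multiprocessing queues, but the `buffer` target is not shown. A rough sketch of what such a process could look like follows; `args.buffer_size` and `args.batch_size` are assumed attribute names and the uniform-sampling storage policy is only illustrative.

import random
from queue import Empty  # mp.Queue.get(timeout=...) raises queue.Empty


def buffer(args, shared_queue, shared_value, i):
    experience_in_queue, experience_out_queue = shared_queue
    step_counter, stop_sign, iteration_counter = shared_value
    storage = []                                    # simple FIFO replay storage
    while stop_sign.value == 0:
        try:
            transition = experience_in_queue[i].get(timeout=0.1)
            storage.append(transition)
            if len(storage) > args.buffer_size:     # assumed capacity argument
                storage.pop(0)
        except Empty:
            pass
        if len(storage) >= args.batch_size and not experience_out_queue[i].full():
            batch = random.sample(storage, args.batch_size)  # uniform sampling
            experience_out_queue[i].put(batch)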
Example #8
        if args.planning:
            d_module = load_d_module(env.action_space.shape[0], args)

        shared_model = R_Module(env.action_space.shape[0],
                                args.dim,
                                discrete=args.discrete,
                                baseline=args.baseline,
                                state_space=env.observation_space.shape[0])

        # shared reward module for everyone
        shared_model.share_memory()

        if args.no_shared:
            optimizer = None
        else:
            optimizer = my_optim.SharedAdam(shared_model.parameters(),
                                            lr=args.lr)
            optimizer.share_memory()

        processes = []

        train_agent_method = None

        total_args = args
        train_agent_method = train_rewards

        for rank in range(0, args.num_processes):
            if rank == 0:
                p = mp.Process(target=train_agent_method,
                               args=(rank, total_args, shared_model, enc,
                                     optimizer, tb_log_dir, d_module))
            else:
Example #9
def main(method):

    params = {
        'obs_size': (160, 100),  # screen size of cv2 window
        'dt': 0.025,  # time interval between two frames
        'ego_vehicle_filter':
        'vehicle.lincoln*',  # filter for defining ego vehicle
        'port': 2000,  # connection port
        'task_mode':
        'Straight',  # mode of the task, [random, roundabout (only for Town03)]
        'code_mode': 'train',
        'max_time_episode': 100,  # maximum timesteps per episode
        'desired_speed': 15,  # desired speed (m/s)
        'max_ego_spawn_times': 100,  # maximum times to spawn ego vehicle
    }

    args = built_parser(method=method)
    env = gym.make(args.env_name, params=params)
    state_dim = env.state_space.shape
    action_dim = env.action_space.shape[0]

    args.state_dim = state_dim
    args.action_dim = action_dim
    action_high = env.action_space.high
    action_low = env.action_space.low
    args.action_high = action_high.tolist()
    args.action_low = action_low.tolist()
    args.seed = np.random.randint(0, 30)
    args.init_time = time.time()
    num_cpu = mp.cpu_count()
    print(state_dim, action_dim, action_high, num_cpu)

    if args.alpha == 'auto' and args.target_entropy == 'auto':
        delta_a = np.array(args.action_high, dtype=np.float32) - np.array(
            args.action_low, dtype=np.float32)
        args.target_entropy = -1 * args.action_dim  # + sum(np.log(delta_a/2))

    Q_net1 = QNet(args)
    Q_net1.train()
    Q_net1.share_memory()
    Q_net1_target = QNet(args)
    Q_net1_target.train()
    Q_net1_target.share_memory()
    Q_net2 = QNet(args)
    Q_net2.train()
    Q_net2.share_memory()
    Q_net2_target = QNet(args)
    Q_net2_target.train()
    Q_net2_target.share_memory()
    actor1 = PolicyNet(args)

    print("Network inited")

    if args.code_model == "eval":
        actor1.load_state_dict(
            torch.load('./' + args.env_name + '/method_' + str(args.method) +
                       '/model/policy_' + str(args.max_train) + '.pkl'))
    actor1.train()
    actor1.share_memory()
    actor1_target = PolicyNet(args)
    actor1_target.train()
    actor1_target.share_memory()
    actor2 = PolicyNet(args)
    actor2.train()
    actor2.share_memory()
    actor2_target = PolicyNet(args)
    actor2_target.train()
    actor2_target.share_memory()

    print("Network set")

    Q_net1_target.load_state_dict(Q_net1.state_dict())
    Q_net2_target.load_state_dict(Q_net2.state_dict())
    actor1_target.load_state_dict(actor1.state_dict())
    actor2_target.load_state_dict(actor2.state_dict())

    print("Network loaded!")

    Q_net1_optimizer = my_optim.SharedAdam(Q_net1.parameters(),
                                           lr=args.critic_lr)
    Q_net1_optimizer.share_memory()
    Q_net2_optimizer = my_optim.SharedAdam(Q_net2.parameters(),
                                           lr=args.critic_lr)
    Q_net2_optimizer.share_memory()
    actor1_optimizer = my_optim.SharedAdam(actor1.parameters(),
                                           lr=args.actor_lr)
    actor1_optimizer.share_memory()
    actor2_optimizer = my_optim.SharedAdam(actor2.parameters(),
                                           lr=args.actor_lr)
    actor2_optimizer.share_memory()
    log_alpha = torch.zeros(1, dtype=torch.float32, requires_grad=True)
    log_alpha.share_memory_()
    alpha_optimizer = my_optim.SharedAdam([log_alpha], lr=args.alpha_lr)
    alpha_optimizer.share_memory()

    print("Optimizer done")

    share_net = [
        Q_net1, Q_net1_target, Q_net2, Q_net2_target, actor1, actor1_target,
        actor2, actor2_target, log_alpha
    ]
    share_optimizer = [
        Q_net1_optimizer, Q_net2_optimizer, actor1_optimizer, actor2_optimizer,
        alpha_optimizer
    ]

    experience_in_queue = []
    experience_out_queue = []
    for i in range(args.num_buffers):
        experience_in_queue.append(Queue(maxsize=10))
        experience_out_queue.append(Queue(maxsize=10))
    shared_queue = [experience_in_queue, experience_out_queue]
    step_counter = mp.Value('i', 0)
    stop_sign = mp.Value('i', 0)
    iteration_counter = mp.Value('i', 0)
    shared_value = [step_counter, stop_sign, iteration_counter]
    lock = mp.Lock()
    procs = []
    if args.code_model == "train":
        for i in range(args.num_learners):
            if i % 2 == 0:
                device = torch.device("cuda:1")
            else:
                device = torch.device("cuda:0")
            # device = torch.device("cpu")
            procs.append(
                Process(target=leaner_agent,
                        args=(args, shared_queue, shared_value, share_net,
                              share_optimizer, device, lock, i)))
        for i in range(args.num_actors):
            procs.append(
                Process(target=actor_agent,
                        args=(args, shared_queue, shared_value,
                              [actor1, Q_net1], lock, i)))
        for i in range(args.num_buffers):
            procs.append(
                Process(target=buffer,
                        args=(args, shared_queue, shared_value, i)))
        procs.append(
            Process(target=evaluate_agent,
                    args=(args, shared_value, share_net)))
    elif args.code_model == "simu":
        procs.append(Process(target=simu_agent, args=(args, shared_value)))

    for p in procs:
        p.start()
    for p in procs:
        p.join()
Example #10
        self.seed = 1
        self.num_processes = 16
        self.num_steps = 20
        self.max_episode_length = 10000
        self.env_name = 'Breakout-v0'


# Main run
os.environ['OMP_NUM_THREADS'] = '1'  # limit each process to a single OpenMP thread
params = Params()  # hyperparameters defined in the Params class above
torch.manual_seed(params.seed)  # fix the random seed for reproducibility
env = create_atari_env(params.env_name)  # create the preprocessed Atari environment
shared_model = ActorCritic(env.observation_space.shape[0],
                           env.action_space)  # the model shared by all worker processes
shared_model.share_memory()  # place the model's tensors in shared memory so every process sees the same weights
optimizer = my_optim.SharedAdam(shared_model.parameters(),
                                lr=params.lr)  # the optimizer state is shared as well, since it acts on the shared model
optimizer.share_memory()  # place the optimizer's state tensors in shared memory
processes = []
p = mp.Process(target=test,
               args=(params.num_processes, params, shared_model))  # evaluation process: it reads the shared model but never updates it
p.start()
processes.append(p)
for rank in range(0, params.num_processes):  # launch the training workers that update the shared model
    p = mp.Process(target=train, args=(rank, params, shared_model, optimizer))
    p.start()
    processes.append(p)
for p in processes:  # wait for every process to finish so the program exits cleanly
    print('working')
    p.join()
Example #11
parser.add_argument('--test', action='store_true',
                    help='run in test mode')
parser.add_argument('--feature', type=int, default=96,
                    help='number of input features')


if __name__ == '__main__':
    args = parser.parse_args()
    os.environ['OMP_NUM_THREADS'] = '1'
    torch.manual_seed(args.seed)

    num_inputs = args.feature
    num_actions = 9

    ac_net = ActorCritic(num_inputs, num_actions)
    opt_ac = my_optim.SharedAdam(ac_net.parameters(), lr=args.lr)

    if args.resume:
        print("=> loading checkpoint ")
        checkpoint = torch.load('../models/kankan/best.t7')
        #args.start_epoch = checkpoint['epoch']
        #best_prec1 = checkpoint['best_prec1']
        ac_net.load_state_dict(checkpoint['state_dict'])
        #opt_ac.load_state_dict(checkpoint['optimizer'])
        print(ac_net)
        print("=> loaded checkpoint  (epoch {})"
                .format(checkpoint['epoch']))

    ac_net.share_memory()
    #opt_ac = my_optim.SharedAdam(ac_net.parameters(), lr=args.lr)
    opt_ac.share_memory()
Example #12
operative_temp = [all_parameter[8]] + all_parameter[10:21]
cost_flex = all_parameter[2:8] + [all_parameter[9]]
state_num = all_parameter[10:109] + all_parameter[0:6] + [
    all_parameter[7]
] + predictionFlat(params.file_path_prediction, (time_step_update) % 8760)

state = np.array(state_normalization(params.file_path_norm, state_num))
state = torch.from_numpy(state).float()

cx = torch.zeros(1, params.hidden_layer)  # the cell state of the LSTM, reinitialized to zero
hx = torch.zeros(1, params.hidden_layer)  # the hidden state of the LSTM, reinitialized to zero

model = ActorCritic(178, params.output_space)
optimizer = my_optim.SharedAdam(model.parameters(), lr=params.lr)

value, action_values, (hx, cx) = model(
    (state.unsqueeze(0), (hx, cx))
)  # getting from the model the output V(S) of the critic, the output Q(S,A) of the actor, and the new hidden & cell states
prob = F.softmax(
    action_values, dim=1
)  # generating a distribution of probabilities of the Q-values according to the softmax: prob(a) = exp(prob(a))/sum_b(exp(prob(b)))
log_prob = F.log_softmax(
    action_values, dim=1
)  # generating a distribution of log probabilities of the Q-values according to the log softmax: log_prob(a) = log(prob(a))
entropy = -(log_prob * prob).sum(1)  # H(p) = - sum_x p(x).log(p(x))
action = prob.multinomial(
    1
).data  # selecting an action by taking a random draw from the prob distribution
log_prob = log_prob.gather(
Example #13
        self.env_name = 'Pendulum-v0'


if __name__ == '__main__':
    os.environ['OMP_NUM_THREADS'] = '1'
    params = Params()
    torch.manual_seed(params.seed)
    env = gym.make(params.env_name)
    num_inputs = env.observation_space.shape[0]
    num_outputs = env.action_space.shape[0]

    shared_p = Policy(num_inputs, num_outputs)
    shared_v = Value(num_inputs)
    shared_p.share_memory()
    shared_v.share_memory()
    optimizer_p = my_optim.SharedAdam(shared_p.parameters(), lr=params.lr)
    optimizer_v = my_optim.SharedAdam(shared_v.parameters(), lr=params.lr)

    processes = []
    p = mp.Process(target=test, args=(params.num_processes, params, shared_p))
    p.start()
    processes.append(p)
    for rank in range(0, params.num_processes):
        p = mp.Process(target=train,
                       args=(rank, params, shared_p, shared_v, optimizer_p,
                             optimizer_v))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
Example #14
    cx = torch.zeros(1, 256)
    hx = torch.zeros(1, 256)
    state = env.reset()
    state = torch.from_numpy(state)

    # <---ICM---
    shared_curiosity = IntrinsicCuriosityModule2(args.num_stack,
                                                 env.action_space,
                                                 args.epsilon)
    shared_curiosity.share_memory()
    # ---ICM--->

    if args.no_shared:
        optimizer = None
    else:
        optimizer = my_optim.SharedAdam(shared_curiosity.parameters(),
                                        lr=args.lr)
        optimizer.share_memory()

    if args.curiosity_file is not None:
        logging.info("Load curiosity")
        shared_curiosity.load_state_dict(torch.load(args.curiosity_file),
                                         strict=False)

    if args.optimizer_file is not None:
        logging.info("Load optimizer")
        optimizer.load_state_dict(torch.load(args.optimizer_file))

    if args.new_curiosity:
        logging.info("Bayesian curiosity")

    processes = []