import os
import random
import shutil
import time
import timeit
from collections import deque
from shutil import copyfile

import pyglet
import torch
import torch.multiprocessing as _mp
import torch.nn.functional as F
from gym.envs.classic_control import rendering  # legacy gym (< 0.22) image viewer
from gym_super_mario_bros.actions import COMPLEX_MOVEMENT

# The remaining helpers (create_train_env, build_environment, ActorCritic,
# Convolutional_AutoEncoder, SharedAdam, train_a3c, test_a3c) are
# project-local and assumed importable from each project's own modules.


def local_test(index, opt, global_model):
    torch.manual_seed(123 + index)
    env, num_states, num_actions = create_train_env(opt.world, opt.stage,
                                                    opt.action_type)
    local_model = ActorCritic(num_states, num_actions)
    local_model.eval()
    state = torch.from_numpy(env.reset())
    done = True
    curr_step = 0
    actions = deque(maxlen=opt.max_actions)
    while True:
        curr_step += 1
        if done:
            # Sync the local copy with the latest global weights
            local_model.load_state_dict(global_model.state_dict())
        with torch.no_grad():
            if done:
                h_0 = torch.zeros((1, 512), dtype=torch.float)
                c_0 = torch.zeros((1, 512), dtype=torch.float)
            else:
                h_0 = h_0.detach()
                c_0 = c_0.detach()
            logits, value, h_0, c_0 = local_model(state, h_0, c_0)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, _ = env.step(action)
        env.render()
        actions.append(action)
        # End the episode when the step budget is exhausted or the agent
        # keeps repeating the same action (stuck)
        if curr_step > opt.num_global_steps or actions.count(actions[0]) == actions.maxlen:
            done = True
        if done:
            curr_step = 0
            actions.clear()
            state = env.reset()
        state = torch.from_numpy(state)
def test(opt):
    torch.manual_seed(123)
    env, num_states, num_actions = create_train_env(opt.layout)  # , "{}/video_{}.mp4".format(opt.output_path, opt.layout))
    model = ActorCritic(num_states, num_actions)
    if torch.cuda.is_available():
        model.load_state_dict(torch.load("{}/gym-pacman_{}".format(opt.saved_path, opt.layout)))
        model.cuda()
    else:
        model.load_state_dict(torch.load("{}/gym-pacman_{}".format(opt.saved_path, opt.layout),
                                         map_location=lambda storage, loc: storage))
    model.eval()
    state = torch.from_numpy(env.reset())
    done = True
    while True:
        if done:
            h_0 = torch.zeros((1, 512), dtype=torch.float)
            c_0 = torch.zeros((1, 512), dtype=torch.float)
            env.reset()
        else:
            h_0 = h_0.detach()
            c_0 = c_0.detach()
        if torch.cuda.is_available():
            h_0 = h_0.cuda()
            c_0 = c_0.cuda()
            state = state.cuda()
        logits, value, h_0, c_0 = model(state, h_0, c_0)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, info = env.step(action)
        state = torch.from_numpy(state)
        env.render()
def test(opt):
    torch.manual_seed(123)
    env, num_states, num_actions = create_train_env(
        opt.world, opt.stage, opt.action_type,
        f"{opt.output_path}/video_{opt.world}_{opt.stage}.mp4")
    model = ActorCritic(num_states, num_actions)
    model.load_state_dict(torch.load(
        f"{opt.saved_path}/a3c_super_mario_bros_{opt.world}_{opt.stage}",
        map_location=lambda storage, loc: storage))
    model.eval()
    state = torch.from_numpy(env.reset())
    done = True
    while True:
        if done:
            h_0 = torch.zeros((1, 512), dtype=torch.float)
            c_0 = torch.zeros((1, 512), dtype=torch.float)
            env.reset()
        else:
            h_0 = h_0.detach()
            c_0 = c_0.detach()
        logits, value, h_0, c_0 = model(state, h_0, c_0)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, info = env.step(action)
        state = torch.from_numpy(state)
        env.render()
        if info["flag_get"]:
            print(f"World {opt.world} stage {opt.stage} completed")
            break
def test_a3c(index, args, A3C_shared_model, CAE_shared_model):
    # Load pretrained CAE weights into the shared model
    CAE_shared_model.load_state_dict(
        torch.load(args.pretrained_model_weights_path), strict=False)
    CAE_shared_model.eval()
    torch.manual_seed(123 + index)
    # Instantiate the environment
    env, num_states, num_actions = build_environment(args.world, args.stage)
    # Initialize the CAE worker model and load its weights
    CAE_local_model = Convolutional_AutoEncoder()
    CAE_local_model.load_state_dict(
        torch.load(args.pretrained_model_weights_path), strict=False)
    CAE_local_model.eval()
    # Initialize the A3C part of the model
    a3c_local_model = ActorCritic(num_states, num_actions)
    a3c_local_model.eval()
    state = torch.from_numpy(env.reset())
    done = True
    curr_step = 0
    actions = deque(maxlen=args.max_actions)
    # This loop runs until the step budget expires
    while True:
        curr_step += 1
        if done:
            a3c_local_model.load_state_dict(A3C_shared_model.state_dict())
        with torch.no_grad():
            if done:
                hx = torch.zeros((1, 512), dtype=torch.float)
                cx = torch.zeros((1, 512), dtype=torch.float)
            else:
                hx = hx.detach()
                cx = cx.detach()
            # Encode the state with the CAE, then feed the encoding into the
            # A3C model to obtain the policy, value, and LSTM memories
            outputs_cae = CAE_local_model(state)
            logits, value, hx, cx = a3c_local_model(outputs_cae, hx, cx)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        # Take the chosen action
        state, reward, done, _ = env.step(action)
        env.render()
        actions.append(action)
        # If out of steps or stuck repeating one action, finish the test
        if curr_step > args.max_steps or actions.count(actions[0]) == actions.maxlen:
            done = True
        if done:
            curr_step = 0
            actions.clear()
            state = env.reset()
        state = torch.from_numpy(state)
def local_test(index, opt, global_model, start_time, curr_episode):
    info = {"flag_get": False}
    torch.manual_seed(123 + index)
    env, num_states, num_actions = create_train_env(opt.world, opt.stage,
                                                    opt.action_type)
    local_model = ActorCritic(num_states, num_actions)
    local_model.eval()
    state = torch.from_numpy(env.reset())
    done = True
    curr_step = 0
    actions = deque(maxlen=opt.max_actions)
    while not info["flag_get"]:
        curr_step += 1
        if done:
            local_model.load_state_dict(global_model.state_dict())
        with torch.no_grad():
            if done:
                h_0 = torch.zeros((1, 512), dtype=torch.float)
                c_0 = torch.zeros((1, 512), dtype=torch.float)
            else:
                h_0 = h_0.detach()
                c_0 = c_0.detach()
            logits, value, h_0, c_0 = local_model(state, h_0, c_0)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        # Capture info so the loop condition can actually observe flag_get
        state, reward, done, info = env.step(action)
        env.render()
        actions.append(action)
        if curr_step > opt.num_global_steps or actions.count(actions[0]) == actions.maxlen:
            done = True
        if done:
            curr_step = 0
            actions.clear()
            state = env.reset()
            if not info["flag_get"]:
                # Episode ended without reaching the flag; stop this test run
                break
        state = torch.from_numpy(state)
    if info["flag_get"]:
        print("Completed")
        end_time = timeit.default_timer()
        config_state = {
            'net': global_model.state_dict(),
            'curr_episode': curr_episode,
            'time': end_time - start_time,
        }
        torch.save(config_state,
                   "{}/a3c_super_mario_bros_{}_{}".format(opt.saved_path,
                                                          opt.world, opt.stage))
        return True
    else:
        env.close()
        return False
def test(opt):
    torch.manual_seed(123)
    env, num_states, num_actions = create_train_env(
        opt.world, opt.stage, opt.action_type,
        "{}/video_{}_{}.mp4".format(opt.output_path, opt.world, opt.stage))
    model = ActorCritic(num_states, num_actions)
    checkpoint_path = "{}/a3c_super_mario_bros_{}_{}".format(
        opt.saved_path, opt.world, opt.stage)
    if torch.cuda.is_available():
        model_dict = torch.load(checkpoint_path)
        model.load_state_dict(model_dict['net'])
        model.cuda()
    else:
        model_dict = torch.load(checkpoint_path,
                                map_location=lambda storage, loc: storage)
        model.load_state_dict(model_dict['net'])
    print("episode", model_dict['curr_episode'])
    print("time", model_dict['time'])
    model.eval()
    state = torch.from_numpy(env.reset())
    done = True
    while True:
        if done:
            h_0 = torch.zeros((1, 512), dtype=torch.float)
            c_0 = torch.zeros((1, 512), dtype=torch.float)
            env.reset()
        else:
            h_0 = h_0.detach()
            c_0 = c_0.detach()
        if torch.cuda.is_available():
            h_0 = h_0.cuda()
            c_0 = c_0.cuda()
            state = state.cuda()
        logits, value, h_0, c_0 = model(state, h_0, c_0)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, info = env.step(action)
        state = torch.from_numpy(state)
        env.render()
        if info["flag_get"]:
            print("World {} stage {} completed".format(opt.world, opt.stage))
            break
def test(opt):
    # torch.manual_seed(123)
    if not os.path.isdir(opt.output_path):
        os.makedirs(opt.output_path)
    env, num_states, num_actions = create_train_env(
        1, opt, "{}/test.mp4".format(opt.output_path))
    model = ActorCritic(num_states, num_actions)
    # Decide the device once, so the loop below matches the device the
    # model was loaded on (a CPU-only run must not push tensors to the GPU)
    use_gpu = opt.use_gpu and torch.cuda.is_available()
    if use_gpu:
        model.load_state_dict(torch.load("{}/a3c".format(opt.resume_path)))
        model.cuda()
    else:
        model.load_state_dict(torch.load("{}/a3c".format(opt.resume_path),
                                         map_location=lambda storage, loc: storage))
    model.eval()
    state = torch.from_numpy(env.reset(False, False, True))
    round_done, stage_done, game_done = False, False, True
    num_action = 0
    while True:
        if round_done or stage_done or game_done:
            h_0 = torch.zeros((1, 256), dtype=torch.float)
            c_0 = torch.zeros((1, 256), dtype=torch.float)
        else:
            h_0 = h_0.detach()
            c_0 = c_0.detach()
        if use_gpu:
            h_0 = h_0.cuda()
            c_0 = c_0.cuda()
            state = state.cuda()
        logits, value, h_0, c_0 = model(state, h_0, c_0)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        num_action += 1
        state, reward, round_done, stage_done, game_done = env.step(action)
        state = torch.from_numpy(state)
        if round_done or stage_done:
            state = torch.from_numpy(env.reset(round_done, stage_done, game_done))
        if game_done or num_action == opt.max_steps:
            env.make_anim()
            print("Game over")
            break
def test(opt):
    torch.manual_seed(123)
    env, num_states, num_actions = create_train_env(
        1, "{}/video.mp4".format(opt.output_path))
    model = ActorCritic(num_states, num_actions)
    if torch.cuda.is_available():
        model.load_state_dict(
            torch.load("{}/a3c_street_fighter".format(opt.saved_path)))
        model.cuda()
    else:
        model.load_state_dict(
            torch.load("{}/a3c_street_fighter".format(opt.saved_path),
                       map_location=lambda storage, loc: storage))
    model.eval()
    state = torch.from_numpy(env.reset(False, False, True))
    round_done, stage_done, game_done = False, False, True
    while True:
        if round_done or stage_done or game_done:
            h_0 = torch.zeros((1, 1024), dtype=torch.float)
            c_0 = torch.zeros((1, 1024), dtype=torch.float)
        else:
            h_0 = h_0.detach()
            c_0 = c_0.detach()
        if torch.cuda.is_available():
            h_0 = h_0.cuda()
            c_0 = c_0.cuda()
            state = state.cuda()
        logits, value, h_0, c_0 = model(state, h_0, c_0)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, round_done, stage_done, game_done = env.step(action)
        state = torch.from_numpy(state)
        if round_done or stage_done:
            state = torch.from_numpy(
                env.reset(round_done, stage_done, game_done))
        if game_done:
            print("Game over")
            break
def test(opt):
    # Upscaled viewer window for live rendering
    viewer = rendering.SimpleImageViewer()
    viewer.width = 800 * 2
    viewer.height = 600 * 2  # 1920x1080
    viewer.window = pyglet.window.Window(width=viewer.width,
                                         height=viewer.height,
                                         resizable=True)
    torch.manual_seed(123)
    if opt.output_path is not None:
        env, num_states, num_actions = create_train_env(
            opt.world, opt.stage, opt.action_type,
            "{}/video_{}_{}.mp4".format(opt.output_path, opt.world, opt.stage))
    else:
        env, num_states, num_actions = create_train_env(
            opt.world, opt.stage, opt.action_type, None)
    model = ActorCritic(num_states, num_actions)
    if torch.cuda.is_available():
        model.load_state_dict(torch.load("{}/a3c_super_mario_bros_{}_{}".format(
            opt.saved_path, opt.world, opt.stage)))
        model.cuda()
    else:
        model.load_state_dict(torch.load("{}/a3c_super_mario_bros_{}_{}".format(
            opt.saved_path, opt.world, opt.stage),
            map_location=lambda storage, loc: storage))
    model.eval()
    state = torch.from_numpy(env.reset())
    done = True
    max_x_pos = 0
    max_x_pos_counter = 0
    while True:
        if done:
            h_0 = torch.zeros((1, 512), dtype=torch.float)
            c_0 = torch.zeros((1, 512), dtype=torch.float)
            print('done')
            max_x_pos = 0
            max_x_pos_counter = 0
            env.reset()
            done = False
        else:
            h_0 = h_0.detach()
            c_0 = c_0.detach()
        if torch.cuda.is_available():
            h_0 = h_0.cuda()
            c_0 = c_0.cuda()
            state = state.cuda()
        logits, value, h_0, c_0 = model(state, h_0, c_0)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, info = env.step(action)
        rgb = env.render('rgb_array')
        state = torch.from_numpy(state)
        viewer.imshow(rgb)
        # Slow down playback while the agent is still making progress
        if max_x_pos_counter < 50:
            time.sleep(0.06)
        # Count consecutive negative-reward steps as lack of progress
        if reward < 0:
            max_x_pos_counter += 1
        if max_x_pos_counter > 150:
            print('no progress, stopping')
            done = True
        if info["flag_get"]:
            print("World {} stage {} completed".format(opt.world, opt.stage))
            done = True
            # Archive the winning checkpoint under a unique name
            copyfile("{}/a3c_super_mario_bros_{}_{}".format(
                         opt.saved_path, opt.world, opt.stage),
                     "{}/a3c_super_mario_bros_{}_{}_{}".format(
                         opt.saved_path, info["world"], info["stage"],
                         random.random()))
        print(reward, COMPLEX_MOVEMENT[action])
    print('done testing')
def shared_learn(args):
    os.environ['OMP_NUM_THREADS'] = '1'
    torch.manual_seed(123)
    # Create a fresh directory for logs
    if os.path.isdir(args.sum_path):
        shutil.rmtree(args.sum_path)
    os.makedirs(args.sum_path)
    if not os.path.isdir(args.trained_models_path):
        os.makedirs(args.trained_models_path)
    mp = _mp.get_context('spawn')
    # Create the initial Mario environment
    env, num_states, num_actions = build_environment(args.world, args.stage)
    print('Num of states: {}'.format(num_states))    # 4
    print('environment: {}'.format(env))
    print('Num of actions: {}'.format(num_actions))  # 12
    # Use CUDA if requested and available, else CPU
    device = torch.device('cuda' if (args.use_cuda and torch.cuda.is_available()) else 'cpu')
    CAE_shared_model = Convolutional_AutoEncoder()  # .to(device)
    A3C_shared_model = ActorCritic(num_states, num_actions)  # .to(device)
    # When starting a new stage, resume from the previously saved model
    if args.new_stage:
        A3C_shared_model.load_state_dict(
            torch.load('{}/a3c_super_mario_bros_{}_{}_enc2'.format(
                args.trained_models_path, args.world, args.stage)))
        A3C_shared_model.eval()
    # GPU check
    if args.use_cuda and torch.cuda.is_available():
        A3C_shared_model.cuda()
        CAE_shared_model.cuda()
    # Share memory with the worker instances
    CAE_shared_model.share_memory()
    A3C_shared_model.share_memory()
    print('A3C')
    print(A3C_shared_model)
    # Initialize the optimizers
    optimizer_cae = CAE_shared_model.createLossAndOptimizer(
        CAE_shared_model, 0.001)
    optimizer_a3c = SharedAdam(A3C_shared_model.parameters(), lr=args.lr)
    # optimizer.share_memory()
    # Start one training worker per process
    workers = []
    for rank in range(args.num_processes):
        worker = mp.Process(target=train_a3c,
                            args=(rank, args, optimizer_a3c, A3C_shared_model,
                                  CAE_shared_model, optimizer_cae, True))
        worker.start()
        workers.append(worker)
    # Test worker; it gets its own rank so its seed does not collide with a
    # training worker's
    worker = mp.Process(target=test_a3c,
                        args=(args.num_processes, args, A3C_shared_model,
                              CAE_shared_model))
    worker.start()
    workers.append(worker)
    # Join all processes
    for worker in workers:
        worker.join()
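# --------------------------------------------------------------------------
# Minimal usage sketch (an assumption, not part of the original projects):
# how an entry point like shared_learn is typically launched. The flag names
# mirror the attributes the functions above read from args/opt; the default
# values are illustrative only.

import argparse


def get_args():
    parser = argparse.ArgumentParser(description="A3C Super Mario Bros")
    parser.add_argument('--world', type=int, default=1)
    parser.add_argument('--stage', type=int, default=1)
    parser.add_argument('--lr', type=float, default=1e-4)
    parser.add_argument('--num_processes', type=int, default=6)
    parser.add_argument('--max_actions', type=int, default=200)
    parser.add_argument('--max_steps', type=int, default=int(5e6))
    parser.add_argument('--use_cuda', action='store_true')
    parser.add_argument('--new_stage', action='store_true')
    parser.add_argument('--sum_path', type=str, default='tensorboard/a3c')
    parser.add_argument('--trained_models_path', type=str, default='trained_models')
    parser.add_argument('--pretrained_model_weights_path', type=str,
                        default='trained_models/cae_weights')
    return parser.parse_args()


if __name__ == '__main__':
    # The __main__ guard is required: shared_learn uses the 'spawn' start
    # method, which re-imports this module in every worker process
    shared_learn(get_args())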