def main():
    """Train a pair of interacting agents in the maze environment.

    The agent pairing is selected by ``args.exp_type`` (e.g. ``'SS'`` =
    selfish vs. selfish, ``'AN'`` = adaptive vs. naive, ...).  Each
    iteration collects ``episode_per_batch`` rollouts, runs one training
    step per agent, and dumps both agents' weights to ``.npy`` files.
    """
    args = get_config()
    env = maze_env(args)
    # NOTE(review): fixed observation/action sizes — confirm they match maze_env.
    args.input_size = 12
    args.num_actions = 4

    session = tf.Session()

    # Instantiate the agent pair according to the experiment type.
    if args.exp_type == 'SS':
        agent_one = agent.selfish_agent(args, session, name_scope='Selfish_A')
        agent_two = agent.selfish_agent(args, session, name_scope='Selfish_B')
    elif args.exp_type == 'SN':
        agent_one = agent.naive_agent(args, session, name_scope='Naive_A')
        agent_two = agent.selfish_agent(args, session, name_scope='Selfish_B')
    elif args.exp_type == 'SP':
        agent_one = agent.punishment_agent(args, session, name_scope='Punish_A')
        agent_two = agent.selfish_agent(args, session, name_scope='Selfish_B')
    elif args.exp_type == 'AN':
        agent_one = agent.adaptive_agent(args, session, name_scope='Adaptive_A')
        agent_two = agent.naive_agent(args, session, name_scope='Naive_B')
    elif args.exp_type == 'AS':
        agent_one = agent.adaptive_agent(args, session, name_scope='Adaptive_A')
        agent_two = agent.selfish_agent(args, session, name_scope='Selfish_B')
    elif args.exp_type == 'AA':
        agent_one = agent.adaptive_agent(args, session, name_scope='Adaptive_A')
        agent_two = agent.adaptive_agent(args, session, name_scope='Adaptive_B')
    else:
        # Previously an unknown exp_type fell through silently and crashed
        # later with a NameError on agent_one; fail fast instead.
        raise ValueError('Unknown exp_type: {!r}'.format(args.exp_type))

    session.run(tf.global_variables_initializer())

    for i_iteration in range(args.num_iteration):
        logger.info('Current Iteration {}'.format(i_iteration))
        for _ in range(args.episode_per_batch):
            policy_rollout(env, agent_one, agent_two)
        agent_one.train_step()
        agent_two.train_step()

    # Final weight dump, one file per agent name scope.
    # NOTE(review): source formatting was mangled — confirm whether the
    # original saved once after training (as here) or every iteration.
    agent_one.save_npy(args.task, agent_one._name_scope)
    agent_two.save_npy(args.task, agent_two._name_scope)
# Experiment-wide constants for the CRAR maze run.
HIGH_INT_DIM = True        # CRAR flag: high-dimensional internal representation
N_SAMPLES = 200000         # NOTE(review): unused within this chunk — confirm usage elsewhere
samples_transfer = 100     # NOTE(review): unused within this chunk — confirm usage elsewhere

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    # --- Parse parameters ---
    parameters = process_args(sys.argv[1:], Defaults)
    if parameters.deterministic:
        # Fixed seed for reproducible runs.
        rng = np.random.RandomState(123456)
    else:
        rng = np.random.RandomState()

    # --- Instantiate environment ---
    # NOTE(review): HIGHER_DIM_OBS is not defined in this chunk (only
    # HIGH_INT_DIM is); presumably a module-level constant defined
    # elsewhere in the file — verify, otherwise this raises NameError.
    env = maze_env(rng, higher_dim_obs=HIGHER_DIM_OBS)

    # --- Instantiate learning_algo ---
    learning_algo = CRAR(
        env,
        parameters.rms_decay,
        parameters.rms_epsilon,
        parameters.momentum,
        parameters.clip_norm,
        parameters.freeze_interval,
        parameters.batch_size,
        parameters.update_rule,
        rng,
        double_Q=True,
        high_int_dim=HIGH_INT_DIM,
        internal_dim=3)
logging.basicConfig(level=logging.INFO) # --- Parse parameters --- parameters = process_args(sys.argv[1:], Defaults) input_nnet = "normal_lr" + str(parameters.learning_rate) + "_lrd" + str(parameters.learning_rate_decay) if parameters.deterministic: rng = np.random.RandomState(parameters.seed) print(" deterministic, seed: ",parameters.seed) input_nnet = "normal_seed" + str(parameters.seed) + "_lr" + str(parameters.learning_rate) + "_lrd" + str(parameters.learning_rate_decay) else: rng = np.random.RandomState() if parameters.dumpname != "": input_nnet = parameters.dumpname print("input nnet= ", input_nnet) # --- Instantiate environment --- env = maze_env(rng, higher_dim_obs=parameters.high_dim_obs, show_game=False) # --- Instantiate learning_algo --- learning_algo = CRAR( env, rng, double_Q=True, high_int_dim=HIGH_INT_DIM, internal_dim=3, div_entrop_loss=1.) train_policy = EpsilonGreedyPolicy(learning_algo, env.nActions(), rng, 1.) test_policy = EpsilonGreedyPolicy(learning_algo, env.nActions(), rng, 0.1) # --- Instantiate agent --- agent = NeuralAgent(
np.random.seed(0)


def epsilon_greedy(Q, state):
    """Pick an action for ``state`` from the tabular Q-function ``Q``.

    Explores (uniform random action in 0..3) with probability ``EPSILON``
    or whenever the state's Q-row is all zeros (the state has not been
    visited yet); otherwise exploits the greedy action.

    ``EPSILON`` is a module-level constant defined elsewhere in the file.
    """
    # True iff every Q-value for this state is still zero.
    # (Was stored in a local named `all`, shadowing the builtin — renamed.)
    unexplored = (Q[state, :] == 0).all()
    if (np.random.uniform() > 1 - EPSILON) or unexplored:
        action = np.random.randint(0, 4)  # actions 0..3
    else:
        # Index of the largest Q-value in this state's row.
        action = Q[state, :].argmax()
    return action


e = maze_env()
Q = np.zeros((e.state_num, 4))
# print(Q.shape)

# Tabular Q-learning over 200 episodes; ALPHA, GAMMA, MAX_STEP are
# module-level constants defined elsewhere in the file.
for i in range(200):
    e = maze_env()  # fresh episode
    while (e.is_end is False) and (e.step < MAX_STEP):
        action = epsilon_greedy(Q, e.present_state)
        state = e.present_state
        reward = e.interact(action)
        new_state = e.present_state
        # Standard Q-learning update: blend old estimate with the
        # bootstrapped target at rate ALPHA, discount GAMMA.
        Q[state, action] = (1 - ALPHA) * Q[state, action] + \
            ALPHA * (reward + GAMMA * Q[new_state, :].max())
        # NOTE(review): source formatting was mangled — confirm whether
        # this debug print/sleep belongs inside the step loop.
        print(Q)
        time.sleep(0.1)
    print('循环次数:', i, '总步数:', e.step, '总奖励数:', e.total_reward)
samples_transfer=100  # NOTE(review): unused within this chunk — confirm usage elsewhere

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    # --- Parse parameters ---
    parameters = process_args(sys.argv[1:], Defaults)
    if parameters.deterministic:
        # Fixed seed for reproducible runs.
        rng = np.random.RandomState(123456)
    else:
        rng = np.random.RandomState()

    # --- Instantiate environment ---
    # reverse=True selects the reversed maze variant (the commented line
    # kept the original orientation).
    # env = maze_env(rng, higher_dim_obs=HIGHER_DIM_OBS)
    env = maze_env(rng, higher_dim_obs=HIGHER_DIM_OBS, reverse=True)

    # --- Instantiate learning_algo ---
    # NOTE(review): this CRAR(...) call continues beyond this chunk.
    learning_algo = CRAR(
        env,
        parameters.rms_decay,
        parameters.rms_epsilon,
        parameters.momentum,
        parameters.clip_norm,
        parameters.freeze_interval,
        parameters.batch_size,
        parameters.update_rule,
        rng,
        double_Q=True,
        high_int_dim=HIGH_INT_DIM,
        internal_dim=3,
if parameters.deterministic:
    rng = np.random.RandomState(parameters.seed)
    print(" deterministic, seed: ",parameters.seed)
    # Deterministic runs embed the seed in both dump names
    # (transferred network and normally-trained baseline).
    input_transferred = "transferred_seed" + str(parameters.seed) + "_lr" + str(parameters.learning_rate) + "_lrd" + str(parameters.learning_rate_decay)
    input_normal = "normal_seed" + str(parameters.seed) + "_lr" + str(parameters.learning_rate) + "_lrd" + str(parameters.learning_rate_decay)
else:
    rng = np.random.RandomState()
# An explicit dump name overrides both derived names.
# NOTE(review): on the non-deterministic path with an empty dumpname,
# input_transferred/input_normal appear unbound here — confirm they are
# initialised earlier in the file (before this chunk).
if parameters.dumpname != "":
    input_transferred = parameters.dumpname + "_transferred"
    input_normal = parameters.dumpname
print("input transferred nnt=", input_transferred)
print("input normal nnet= ", input_normal)

# --- Instantiate environment ---
env = maze_env(rng, higher_dim_obs=parameters.high_dim_obs)

# --- Instantiate learning_algo ---
# NOTE(review): this CRAR(...) call continues beyond this chunk.
learning_algo = CRAR(
    env,
    parameters.rms_decay,
    parameters.rms_epsilon,
    parameters.momentum,
    parameters.clip_norm,
    parameters.freeze_interval,
    parameters.batch_size,
    parameters.update_rule,
    rng,
    double_Q=True,
    high_int_dim=HIGH_INT_DIM,
    internal_dim=3,
# Extend the run name with the learning-rate settings and the transfer
# mode; used below for saving the network, plot and score files.
fname += "_lr" + str(parameters.learning_rate) + "_lrd" + str(parameters.learning_rate_decay)
if parameters.mode == 1:
    fname += "_resetencoder"          # mode 1: "reset encoder" variant
if parameters.mode == 2:
    fname += "_partialfreezeencoder"  # mode 2: "partially freeze encoder" variant
# if parameters.mode == 3:
# #TODO dont freeze but very small lr for others models
print("saving nnet,plot and score under name=", fname)
print("input nnet= ", input_nnet)

# --- Instantiate environment ---
# reverse=True selects the reversed maze variant (the commented line
# kept the original orientation).
# env = maze_env(rng, higher_dim_obs=HIGHER_DIM_OBS)
env = maze_env(rng, higher_dim_obs=parameters.high_dim_obs, reverse=True)

# --- Instantiate learning_algo ---
# NOTE(review): this CRAR(...) call continues beyond this chunk.
learning_algo = CRAR(
    env,
    parameters.rms_decay,
    parameters.rms_epsilon,
    parameters.momentum,
    parameters.clip_norm,
    parameters.freeze_interval,
    parameters.batch_size,
    parameters.update_rule,
    rng,
    double_Q=True,
    high_int_dim=HIGH_INT_DIM,
    internal_dim=3,