Example #1
import torch
from unityagents import UnityEnvironment  # Udacity Reacher environment wrapper
# (Agent is the project's own class; its import is not shown in this excerpt)


def run(args):
    if args.machine == "Mac":
        env = UnityEnvironment(file_name='./Reacher.app', seed=1)
    else:
        env = UnityEnvironment(file_name='./Reacher_Linux_NoVis/Reacher.x86_64', seed=1)

    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    print("using device", device)
    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    # reset the environment
    env_info = env.reset(train_mode=False)[brain_name]

    # number of agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)

    # size of each action
    action_size = brain.vector_action_space_size
    print('Size of each action:', action_size)

    # examine the state space 
    states = env_info.vector_observations
    state_size = states.shape[1]
    print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
    print('The state for the first agent looks like:', states[0])
    #==========================my version=========================
    agent = Agent(a_dim=4, s_dim=33, clip_value=1, device=device) # continuous action clip
    agent.load("./pretrained/")
    eval(env, agent, brain_name)
    env.close()
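# Note: the eval() helper called above is not shown in this example. A minimal
# sketch of what it could look like, assuming the same unityagents API used in
# run(); the body and the Agent.act() method name are assumptions:
import numpy as np

def eval(env, agent, brain_name):
    env_info = env.reset(train_mode=False)[brain_name]   # reset in inference mode
    states = env_info.vector_observations
    scores = np.zeros(len(env_info.agents))              # one running score per agent
    while True:
        actions = agent.act(states)                      # assumed Agent.act() method
        env_info = env.step(actions)[brain_name]         # advance all agents one step
        scores += env_info.rewards
        states = env_info.vector_observations
        if np.any(env_info.local_done):                  # stop when any agent finishes
            break
    print('Average score across agents:', np.mean(scores))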
Example #2
class MADDPG():
	def __init__(self, state_size, action_size, random_seed):
		super(MADDPG, self).__init__()

		self.state_size = state_size
		self.action_size = action_size
		self.random_seed = random_seed

		# Shared replay buffer sampled by both agents
		self.memory = ReplayBuffer(self.action_size, BUFFER_SIZE, BATCH_SIZE, self.random_seed)

		# One DDPG agent per player, distinguished by the trailing index (0 and 1)
		self.maddpg_agent = [Agent(self.state_size, self.action_size, BATCH_SIZE, self.random_seed, TAU, LR_ACTOR, LR_CRITIC, WEIGHT_DECAY, 0),
							 Agent(self.state_size, self.action_size, BATCH_SIZE, self.random_seed, TAU, LR_ACTOR, LR_CRITIC, WEIGHT_DECAY, 1)]

		self.iter = 0
		self.episode_counter = 0

		# Exploration noise scale and its multiplicative decay
		self.eps = 2
		self.eps_decay = 0.9999
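	# The rest of this MADDPG wrapper is not shown. A minimal sketch of an act()
	# method consistent with the attributes initialized above; the method name
	# and the per-agent noise_scale argument are assumptions:
	def act(self, states, add_noise=True):
		noise_scale = self.eps if add_noise else 0.0
		# Each agent acts on its own observation, with shared exploration noise
		actions = [agent.act(state, noise_scale=noise_scale)
				   for agent, state in zip(self.maddpg_agent, states)]
		self.eps *= self.eps_decay  # decay exploration over time
		return actions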
Example #3
import gym
import numpy as np


def collect_trajectories(env: gym.Env,
                         agent: Agent,
                         n_games: int = 10) -> np.ndarray:

    # Collect states from every game so the stacked result spans all trajectories
    state_history: list[np.ndarray] = []

    for _ in range(n_games):
        state = env.reset()
        done: bool = False

        while not done:
            state_history.append(state)
            action = agent.choose_action(state)
            next_state, _, done, _ = env.step(action)
            state = next_state

    return np.vstack(state_history)
Example #4
from DRLEnv import FedEnv
from DDPG import Agent
from tqdm import tqdm, trange
import torch
import numpy as np
import pandas as pd
from collections import deque

if __name__ == '__main__':
    print(torch.cuda.is_available())
    num_episodes, print_every = 200, 100
    env = FedEnv(Client=5, k=2)  # env
    agent = Agent(state_size=25, action_size=25, random_seed=2)  # agent
    scores_deque = deque(maxlen=print_every)
    scores = []
    episode = []

    for i_episode in range(1, num_episodes + 1):
        X, Y = [], []  # x and y axes for test_data
        start_time = 0
        # Tag=True presumably re-initializes PCA; only meaningful on the first episode
        if i_episode == 1:
            state = env.reset(Tag=True)
        else:
            state = env.reset(Tag=False)

        # initialize agent's noise
        agent.reset()
        score = 0

        reward_y = []
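        # This example is truncated here. A minimal sketch of the step loop that
        # would normally follow, using a standard DDPG act/step cycle; the
        # agent.act/agent.step names and the env.step signature are assumptions:
        while True:
            action = agent.act(state)                    # actor output + exploration noise
            next_state, reward, done = env.step(action)  # assumed FedEnv step signature
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores_deque.append(score)
        scores.append(score)
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(
            i_episode, np.mean(scores_deque)), end="")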
Example #5
if __name__ == '__main__':

    # Init. Environment
    env = gym.make('LunarLanderContinuous-v2')
    env.reset()

    # Init. Datapath
    data_path = os.path.abspath('Vanilla-DDPG/data')

    # Init. Testing
    n_games = 10
    test_data: List[Dict[str, np.ndarray]] = []

    # Init. Agent
    agent = Agent(env=env, n_games=n_games, training=False)
    agent.load_models(data_path)

    score_history: List[np.float32] = []

    for _ in tqdm(range(n_games), desc='Testing', total=n_games):
        score = 0
        done = False

        # Initial reset of environment
        state = env.reset()

        while not done:
            action = agent.choose_action(state)
            next_state, reward, done, _ = env.step(action)
            state = next_state
            score += reward

        score_history.append(score)


if __name__ == "__main__":

    # Init. path
    data_path = os.path.abspath('Vanilla-DDPG/data')

    # Init. Environment and agent
    env = gym.make('LunarLanderContinuous-v2')
    env.reset()

    agent = Agent(env=env, training=False)
    agent.load_models(data_path)

    with open(os.path.join(data_path, 'training_info.json')) as f:
        train_data = json.load(f)

    with open(os.path.join(data_path, 'testing_info.json')) as f:
        test_data = json.load(f)

    # Load all the data frames
    score = [data["Epidosic Summed Rewards"] for data in train_data]
    average = [data["Moving Mean of Episodic Rewards"] for data in train_data]
    test = [data["Test Score"] for data in test_data]

    trajectory = collect_trajectories(env, agent)
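    # The example ends after loading the data. A minimal plotting sketch for the
    # series loaded above; the use of matplotlib and the output filename are
    # assumptions, since the original plotting code is not shown:
    import matplotlib.pyplot as plt

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
    ax1.plot(score, label='Episodic reward')
    ax1.plot(average, label='Moving mean')
    ax1.set_xlabel('Episode')
    ax1.legend()
    ax2.plot(test, label='Test score')
    ax2.set_xlabel('Test run')
    ax2.legend()
    fig.savefig(os.path.join(data_path, 'results.png'))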
Example #7
def main():
    global render_bool
    render_bool = True
    # parl.connect('localhost:8037')
    if dummy_mode:
        render_bool = False
    if not render_bool:
        os.environ["SDL_VIDEODRIVER"] = "dummy"
    # else:
    #     pygame.display.set_mode((800, 600 + 60))
    # Create the environment
    game = GameEnv()
    p = PLE(game, display_screen=render_bool, fps=30, force_fps=True)

    p.init()

    # Build the agent with the PARL framework
    print(p.getActionSet())
    act_dim = len(p.getActionSet())
    width, height = p.getScreenDims()
    rpm = ReplayMemory(MEMORY_SIZE)  # DQN experience replay buffer
    obs_dim = get_env_obs(p).shape
    model = Model(act_dim=act_dim)
    if MODE == "DDPG":
        alg = RL_Alg(model,
                     gamma=GAMMA,
                     tau=0.001,
                     actor_lr=LEARNING_RATE,
                     critic_lr=LEARNING_RATE)
    if MODE == "DQN":
        alg = RL_Alg(model, gamma=GAMMA, lr=LEARNING_RATE, act_dim=act_dim)
    agent = Agent(alg, obs_dim=obs_dim,
                  act_dim=act_dim)  # e_greed: chance of a random action, for exploration

    # Load a saved model if one exists
    best_eval_reward = -1000
    cache_fn = './model_pixelcopter_%s.ckpt' % MODE
    if os.path.exists(cache_fn):
        print("loaded model:", cache_fn)
        agent.restore(cache_fn)
        best_eval_reward = evaluate(p, agent, render=render_bool)
        # run_episode(env, agent, train_or_test='test', render=True)
        # exit()
    # Pre-fill the replay buffer so early training has enough sample diversity
    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(p, agent, rpm)

    max_episode = 200000
    # Start training
    episode = 0

    while episode < max_episode:  # train for max_episode episodes; tests don't count toward the total
        # train part
        for i in range(0, 5):
            total_reward = run_episode(p, agent, rpm)
            episode += 1
        # test part
        eval_reward = evaluate(p, agent,
                               render=render_bool)  # render=True to watch the run
        logger.info('episode:{}    e_greed:{}   test_reward:{}'.format(
            episode, e_greed, eval_reward))

        # Save the model to ./model.ckpt
        agent.save(cache_fn + "." + str(rate_num))
        if best_eval_reward < eval_reward:
            best_eval_reward = eval_reward
            agent.save(cache_fn)
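# Example #7 calls a get_env_obs() helper that is not shown. A minimal sketch
# using PLE's public API; whether the original uses raw pixels or the game
# state dict is unknown, so this version is an assumption:
import numpy as np

def get_env_obs(p):
    # PLE exposes the current frame as an RGB array; normalize to [0, 1]
    return p.getScreenRGB().astype(np.float32) / 255.0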
Example #8
import gym
import gym_pid
from DDPG import Agent
import numpy as np
import utils

env = gym.make('pid-v0')
agent = Agent(alpha=0.00001,
              beta=0.0001,
              input_dims=[3],
              tau=0.0001,
              env=env,
              batch_size=64,
              layer1_size=256,
              layer2_size=128,
              n_actions=3)

#agent.load_models()
# np.random.seed(1)

score_history = []
for i in range(50):
    obs = env.reset()
    done = False
    score = 0
    while not done:
        act = agent.choose_action(obs)
        print(act)
        new_state, reward, done, info = env.step(act)
        agent.remember(obs, act, reward, new_state, int(done))
        agent.learn()
        score += reward   # accumulate the episode return
        obs = new_state   # advance to the next state; the original never updated obs
    score_history.append(score)
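    # A short per-episode summary that typically follows; the exact reporting in
    # the original is not shown, so this line is an assumption:
    print('episode', i, 'score %.2f' % score,
          'trailing-100 avg %.2f' % np.mean(score_history[-100:]))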
Example #9
continued = True
path = "path"
NOISE_C = 1.1
first_ep = 0

with tf.device('/GPU:0'):
    env = Environment("data/u20.txt", SEED)
    # env = gym.wrappers.Monitor(e.env, 'video/', video_callable=lambda episode_id: True,force = True)
    # video = VideoRecorder(env, "video.mp4")
    state_shape = env.state_shape
    action_len = env.action_shape[0]
    action_scale = None
    NOISE = 0.6
    # np.random.seed(SEED)

    agent = Agent(state_shape, action_len, action_scale)
    if continued:
        agent.load(path)
    agent.summary()

    for episode in range(first_ep, EPISODES):
        state = env.reset()
        state = np.reshape(state, state_shape)
        score = 0
        # print(state)
        # done = False
        noise = np.random.normal(NOISE, NOISE / 2,
                                 2) / (1 + pow(NOISE_C, episode + 10))
        for st in range(MAX_STEPS):
            # while not done :
            #     env.render()
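            # The example is truncated here. A minimal sketch of the step body,
            # assuming a TF agent with act/remember/train methods (only load()
            # and summary() appear above, so these names are assumptions):
            action = agent.act(state) + noise            # exploratory action
            next_state, reward, done = env.step(action)  # assumed Environment.step signature
            next_state = np.reshape(next_state, state_shape)
            agent.remember(state, action, reward, next_state, done)
            agent.train()
            state = next_state
            score += reward
            if done:
                break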
Example #10
def main(env, episodes=500, max_steps=500, eps_decay=.99,
         actor_lr=10**-6, critic_lr=10**-3, gamma=.9,
         base_nodes=64, batch_size=128, theta=.4, sigma=.25):

	with tf.Session() as sess:

		# Initialize environment and constants
		input_dim   = env.state_dim   
		output_dim  = env.action_dim  
		action_high = env.action_high 
		action_low  = env.action_low 

		# Create DDPG Agent
		agent = Agent(input_dim, output_dim, action_high, action_low, 
		              actor_lr=actor_lr, critic_lr=critic_lr, gamma=gamma, 
		              base_nodes=base_nodes, eps_decay=eps_decay,
		              batch_size=batch_size, theta=theta, sigma=sigma,
		              sess=sess)

		sess.run(tf.global_variables_initializer())
		agent.actor.update_target_network()
		agent.critic.update_target_network()

		# Prepare for episodes
		c_losses, rewards, actions, Qs, states = [np.array([]) for _ in range(5)]

		for e in tqdm(range(episodes)):

			# Reset episode
			state = env.reset()
			state = np.reshape(state, (-1, len(state)))
			agent.noise.reset()

			done         = False
			step_count   = 0
			total_reward = 0

			while not done and step_count < max_steps:

				# Action
				action = agent.act(state)
				next_state, reward, done = env.step(action)
				next_state = np.reshape(next_state, (-1, len(next_state)))

				# Learn
				c_loss = agent.learn(state, action, reward, done, next_state)
				
				# Save results
				c_losses = np.append(c_losses, c_loss)
				actions  = np.append(actions, action)
				states   = np.append(states, state[0])
				Qs       = np.append(Qs, agent.critic.predict(state, action))
				
				# Loop
				state         = next_state
				step_count   += 1
				total_reward += reward

			# Reduce exploration
			if agent.eps > agent.min_eps:
				agent.eps *= agent.eps_decay

			rewards = np.append(rewards, total_reward)


		return rewards, c_losses, actions, Qs
Example #11
if __name__ == '__main__':
    # Init. Environment
    env = gym.make('LunarLanderContinuous-v2')
    env.reset()

    # Init. Datapath
    data_path = os.path.abspath('Vanilla-DDPG/data')

    # Init. Training
    n_games: int = 1500
    best_score = -np.inf
    score_history: List[float] = []
    avg_history: List[float] = []
    logging_info: List[Dict[str, float]] = []

    # Init. Agent
    agent = Agent(env=env, n_games=n_games)

    for i in range(n_games):
        score: float = 0.0
        done: bool = False

        # Initial Reset of Environment
        state = env.reset()

        while not done:
            action = agent.choose_action(state)

            next_state, reward, done, _ = env.step(action)
            agent.memory.add(state, action, reward, next_state, done)

            state = next_state
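            # The example is truncated here. A sketch of the bookkeeping that
            # typically closes this loop, based on the names initialized above;
            # agent.learn() and agent.save_models() are assumptions:
            agent.learn()
            score += reward

        score_history.append(score)
        avg_score = float(np.mean(score_history[-100:]))
        avg_history.append(avg_score)
        # key spelling matches the training_info.json read back in Example #6
        logging_info.append({'Epidosic Summed Rewards': score,
                             'Moving Mean of Episodic Rewards': avg_score})
        if avg_score > best_score:
            best_score = avg_score
            agent.save_models(data_path)  # assumed counterpart of load_models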