def testing():
    """Exercise the environment loop with uniformly random actions.

    A DQN is built and its weights are loaded, but the policy is NOT used
    for action selection — actions are sampled at random (presumably a
    sanity check of the env/step protocol rather than an evaluation).

    NOTE(review): original indentation was lost; structure reconstructed —
    confirm against the project history.
    """
    full_msg = ''
    model = DQN(K=K, input_shape=2 + 2 * number_enemy, scope="model")
    with tf.Session() as sess:
        model.set_session(sess)
        sess.run(tf.global_variables_initializer())
        model.load()
        state = env.reset()
        # ~10M steps; interrupt manually to stop.
        for _ in tqdm(range(int(10e6))):
            action = np.random.choice(K)
            obs, reward, done, _, full_msg = env.step(action, full_msg)
            # time.sleep(0.8)
            # env reports done as 0/1; normalize to bool.
            done = done == 1
            if done:
                state = env.reset()
            else:
                state = obs
def train_dqn(episode):
    """Train the DQN agent for `episode` episodes.

    Each chosen action is repeated 11 times against the environment
    (action repeat / frame skip); only the final transition is stored.
    Returns the list of per-episode scores.

    NOTE(review): original indentation was lost; assumed only `env.step`
    repeats inside the 11-iteration loop — confirm.
    """
    # Removed unused `action_dict` mapping (0..4 -> direction names);
    # it was never read.
    loss = []
    agent = DQN(5, 10)
    for e in range(episode):
        print("Episode {}".format(e))
        state = env.reset()
        state = np.reshape(state, (1, 10))
        score = 0
        max_steps = 1000
        full_msg = ''
        for _ in tqdm(range(max_steps)):
            action = agent.act(state)
            # Renamed inner loop var: the original reused `i`, shadowing
            # the outer step counter.
            for _ in range(11):
                reward, next_state, done, full_msg = env.step(action, full_msg)
                # time.sleep(2)
            score += reward
            next_state = np.reshape(next_state, (1, 10))
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            agent.replay()
            if done:
                print("")
                print("episode: {}/{}, score: {}".format(e, episode, score))
                time.sleep(2)
                break
        loss.append(score)
    return loss
def train_dqn(episode):
    """Train the DQN agent for `episode` episodes (TF-session variant).

    Returns the list of per-episode scores.

    NOTE(review): original indentation was lost; assumed only `env.step`
    repeats inside the 11-iteration loop — confirm.
    """
    loss = []
    agent = DQN(4, 10)
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        agent.set_session(sess)
        # BUG FIX: `init` was created but never executed, leaving the
        # network variables uninitialized.
        sess.run(init)
        for e in range(episode):
            print("Episode {}".format(e))
            state = env.reset()
            state = np.reshape(state, (1, 10))
            score = 0
            max_steps = 1000
            for _ in tqdm(range(max_steps)):
                action = agent.act(state)
                # Action repeat / frame skip; renamed inner loop var
                # (original reused `i`, shadowing the step counter).
                for _ in range(11):
                    reward, next_state, done = env.step(action)
                score += reward
                next_state = np.reshape(next_state, (1, 10))
                agent.remember(state, action, reward, next_state, done)
                state = next_state
                agent.replay()
                if done:
                    print("")
                    print("episode: {}/{}, score: {}".format(e, episode, score))
                    time.sleep(2)
                    break
            loss.append(score)
    return loss
def run():
    """Run `round` training rounds for the 8 agents, print per-round
    timing, and append summary statistics to ./data.csv."""
    step_of_each_round = []
    for i in range(round):
        t1 = datetime.datetime.now()
        print("round :", i)
        observation = env.reset()
        # env.plt('start')
        step = 0
        while True:
            # Each of the 8 agents chooses its own action.
            action = []
            for j in range(8):
                action.append(RL[j].choose_action(observation))
            # Advance the environment with the joint action.
            observation_, reward, done = env.step(action)
            # After training for a while, refresh the display.
            # if step > 10000:
            #     env.plt('update')
            # Store the transition for every agent.
            for j in range(8):
                RL[j].store_transition(observation, action[j], reward[j],
                                       observation_)
            # Learn after a 200-step warm-up, then every 5 steps.
            if (step > 200) and (step % 5 == 0):
                for j in range(8):
                    RL[j].learn()
            if done:
                break
            observation = observation_
            step = step + 1
        step_of_each_round.append(step)
        t2 = datetime.datetime.now()
        print(t2 - t1)
    end = datetime.datetime.now()
    print(end - start)
    # output data
    # BUG FIX: copy the layer list before appending — the original did
    # `data = RL[0].layers` and then appended the summary statistics onto
    # the model's own layers attribute.
    data = list(RL[0].layers)
    data.append(sum(step_of_each_round) / round)
    # NOTE(review): [-51:-1] skips the most recent round — off-by-one if
    # the intent is "latest 50 rounds"; kept as-is, confirm intent.
    data.append(sum(step_of_each_round[-51:-1]) / 50)
    # Context manager ensures the CSV file is closed even on error.
    with open('./data.csv', "a", newline='') as csvFile:
        writer = csv.writer(csvFile, dialect='excel')
        writer.writerow(data)
    print('average step: ', sum(step_of_each_round) / round)
    print('average step of latest 50 rounds: ',
          sum(step_of_each_round[-51:-1]) / 50)
    plt.plot(step_of_each_round)
    plt.pause(0)
def run():
    """Run `round` training rounds for 8 agents on relative observations.

    Each agent observes the offset to its ring neighbour
    (observation[j+1] - observation[j], wrapping 7 -> 0), acts only while
    its drive/error term env.E[j] is significant, and learns from stored
    transitions.  Plots per-agent cost curves and step counts at the end.
    """
    step_of_each_round = []
    for i in range(round):
        print(i)
        observation = env.reset()
        env.plt('start')
        step = 0
        while True:
            observation_of_agent = []
            observation_of_agent_ = []
            # Relative observation: offset from each agent to the next
            # one around the ring of 8.
            for j in range(7):
                observation_of_agent.append(observation[j + 1] - observation[j])
            observation_of_agent.append(observation[0] - observation[7])
            # Per-agent 2-D velocity commands; default is no motion.
            action_list = np.array([[0., 0.], [0, 0], [0, 0], [0, 0],
                                    [0, 0], [0, 0], [0, 0], [0, 0]])
            # choose action
            action = []
            for j in range(8):
                action.append(RL[j].choose_action(observation_of_agent[j]))
                # Only move agent j while env.E[j] is non-negligible
                # (presumably an error/energy term — confirm semantics).
                if np.linalg.norm(env.E[j]) > 0.1:
                    action_list[j] = vel[action[j]]
            # update environment
            observation_, reward, done = env.step(action_list)
            # Same relative encoding for the successor observation.
            for j in range(7):
                observation_of_agent_.append(observation_[j + 1] - observation_[j])
            observation_of_agent_.append(observation_[0] - observation_[7])
            # Only render after the first 50 rounds.
            if i > 50:
                env.plt('update')
            # restore memory
            for j in range(8):
                RL[j].store_transition(observation_of_agent[j], action[j],
                                       reward[j], observation_of_agent_[j])
            # Learn after a 200-step warm-up, then every 5 steps.
            if (step > 200) and (step % 5 == 0):
                for j in range(8):
                    RL[j].learn()
            if done:
                # env.plt('finish')
                # RL[1].plot_cost()
                env.plt('clean')
                break
            step = step + 1
        step_of_each_round.append(step)
    plt.ioff()
    for i in range(8):
        RL[i].plot_cost()
    plt.pause(5)
    print(sum(step_of_each_round) / round)
    plt.plot(step_of_each_round)
    plt.pause(0)
def testing():
    """Evaluate the loaded DQN with epsilon = 0.1 exploration and print
    the total reward collected in each episode.

    NOTE(review): original indentation was lost; structure reconstructed —
    confirm against the project history.
    """
    model = DQN(K=K, input_shape=4 + 3 * number_enemy, scope="model")
    with tf.Session() as sess:
        model.set_session(sess)
        sess.run(tf.compat.v1.global_variables_initializer())
        model.load()
        state = env.reset()
        sum_reward = 0
        # ~10M steps; interrupt manually to stop.
        for i in range(int(10e6)):
            action = model.sample_action(state, 0.1)
            next_state, reward, done, _ = env.step(action)
            time.sleep(0.8)  # slow down for on-screen viewing
            # env reports done as 0/1; normalize to bool.
            done = done == 1
            sum_reward += reward
            if done:
                state = env.reset()
                print("Reward: {}".format(sum_reward))
                sum_reward = 0
            else:
                state = next_state
from args import args from env import env from exploration import decay_exploration, epsilon, epsilon_greedy from model import Q from replay_buffer import replay_buffer from train import criterion, train optimizer = optim.Adam(Q.parameters(), lr=args.lr) # TODO wrap data in dataset for i in range(args.iterations): done = False s = env.reset() # TODO fold into rollout while not done: epsilon = decay_exploration(i, epsilon) a = epsilon_greedy(s, epsilon=epsilon) succ, r, done, _ = env.step(a) replay_buffer.append([s, a, r, succ, done]) s = succ if i % args.batch_size == 0 and i > 0 and len( replay_buffer) >= args.batch_size:
actions[0][1] = 1 if (keys[pygame.K_UP]): actions[0][2] = 1 if (keys[pygame.K_RCTRL]): actions[0][3] = 1 else: actions[0] = env.randomAction()[0] if (humanPlayer2): if (keys[pygame.K_a]): actions[1][0] = 1 if (keys[pygame.K_d]): actions[1][1] = 1 if (keys[pygame.K_w]): actions[1][2] = 1 if (keys[pygame.K_LCTRL]): actions[1][3] = 1 else: actions[1] = env.randomAction()[1] state = env.Step(actions) if (state[-1]): env.reset()
def trainning():
    """Train a DQN with a target network and an experience-replay buffer.

    Pipeline: pre-fill the buffer with MIN_EXPERIENCES random transitions,
    then run epsilon-greedy episodes, training on every step and syncing
    the target network every TARGET_UPDATE_PERIOD steps.  Stops after
    roughly 5e6 total actions and saves a reward plot to result.png.
    """
    # Per-action usage counter (5 discrete actions).
    num_action_act = [0, 0, 0, 0, 0]
    gamma = 0.99
    batch_sz = 32
    num_episodes = 1000000
    total_t = 0
    experience_replay_buffer = []
    episode_rewards = np.zeros(num_episodes)
    last_100_avgs = []
    # Protocol string threaded through env.step (opaque here — confirm
    # its role against the env implementation).
    full_msg = ''
    # Linear epsilon decay from 1.0 to 0.1 over 1M steps.
    epsilon = 1.0
    epsilon_min = 0.1
    epsilon_change = (epsilon - epsilon_min) / 1000000  #500000
    model = DQN(K=K, input_shape=2 + 2 * number_enemy, scope="model")
    target_model = DQN(K=K, input_shape=2 + 2 * number_enemy,
                       scope="target_model")
    with tf.Session() as sess:
        model.set_session(sess)
        target_model.set_session(sess)
        sess.run(tf.global_variables_initializer())
        model.load()
        print("Filling experience replay buffer...")
        obs = env.reset()
        state = obs
        # Pre-fill the replay buffer with random-policy transitions.
        #for i in range(MIN_EXPERIENCES):
        for i in tqdm(range(MIN_EXPERIENCES)):
            action = np.random.randint(0, K)
            num_action_act[action] += 1
            obs, reward, done, _, full_msg = env.step(action, full_msg)
            #time.sleep(0.5)
            #print(obs)
            # env reports done as 0/1; normalize to bool.
            if done == 1:
                done = True
            else:
                done = False
            next_state = obs
            experience_replay_buffer.append(
                (state, action, reward, next_state, done))
            if done:
                obs = env.reset()
                state = obs
            else:
                state = next_state
        print(num_action_act)
        for i in range(num_episodes):
            t0 = datetime.now()
            obs = env.reset()
            state = obs
            loss = None
            total_time_training = 0
            num_steps_in_episode = 0
            episode_reward = 0
            done = False
            while True:
                #for _ in range(0, MAX_STEP):
                # Periodically sync the target network with the online one.
                if total_t % TARGET_UPDATE_PERIOD == 0:
                    target_model.copy_from(model)
                    print(
                        "Copied model parameters to target network, total_t = %s, period = %s"
                        % (total_t, TARGET_UPDATE_PERIOD))
                action = model.sample_action(state, epsilon)
                num_action_act[action] += 1
                # Time the env step (Java side) vs the training step below.
                time_act = datetime.now()
                obs, reward, done, _, full_msg = env.step(action, full_msg)
                time_act = datetime.now() - time_act
                if done == 1:
                    done = True
                else:
                    done = False
                next_state = obs
                episode_reward += reward
                # Bounded replay buffer: drop the oldest transition.
                if len(experience_replay_buffer) == MAX_EXPERIENCES:
                    experience_replay_buffer.pop(0)
                experience_replay_buffer.append(
                    (state, action, reward, next_state, done))
                t0_2 = datetime.now()
                loss = learn(model, target_model, experience_replay_buffer,
                             gamma, batch_sz)
                dt = datetime.now() - t0_2
                #Confirm
                '''
                if time_act > dt:
                    print("Java timeout")
                else:
                    print("Python timeout")
                '''
                total_time_training += dt.total_seconds()
                num_steps_in_episode += 1
                state = next_state
                total_t += 1
                # Linear epsilon decay, floored at epsilon_min.
                epsilon = max(epsilon - epsilon_change, epsilon_min)
                if done:
                    break
            duration = datetime.now() - t0
            episode_rewards[i] = episode_reward
            time_per_step = total_time_training / num_steps_in_episode
            # Running average of the last 100 episode rewards.
            last_100_avg = episode_rewards[max(0, i - 100):i].mean()
            last_100_avgs.append(last_100_avg)
            #print(i)
            #print("last 100: ",last_100_avg)
            #print("reward ",episode_reward)
            #print("rewards ",episode_rewards)
            #print("")
            print("Episode:", i, "Duration:", duration, "Num steps:",
                  num_steps_in_episode, "Reward:", episode_reward,
                  "Training time per step:", "%.3f" % time_per_step,
                  "Avg Reward (last 100):", "%.3f" % last_100_avg,
                  "Epsilon:", "%.3f" % epsilon)
            # Checkpoint every 50 episodes.
            if i % 50 == 0:
                model.save(i)
            sys.stdout.flush()
            # Stop after ~5M total actions.
            if np.sum(num_action_act) > 5e6:
                break
    plt.plot(last_100_avgs)
    plt.xlabel('episodes')
    plt.ylabel('Average Rewards')
    # plt.show()
    plt.savefig('result.png')
    print(num_action_act)
q_table = defaultdict(lambda: defaultdict(float)) import random # from IPython.display import clear_output # Hyperparameters alpha = 0.1 gamma = 0.7 epsilon = 0.4 # For plotting metrics all_epochs = [] all_penalties = [] starttime = datetime.now() for i in range(1, 90001): state_to_process = env.reset() #print("RESET", state_to_process, env.action_space.spaces[1].n) epochs, penalties, reward, = 0, 0, 0 done = False while not done: def makeState(to_process): prediction = generatePrediction(MODEL_TYPE, curr_model, to_process) return (prediction, to_process[1]) state = makeState(state_to_process) #print(state) def getMaxAction(state_for_action):
def trainning():
    """Train a DQN with a target network and an experience-replay buffer.

    Pre-fills the buffer with MIN_EXPERIENCES random transitions, then runs
    epsilon-greedy episodes of at most MAX_STEP steps until NUM_FRAME total
    steps.  On exit (including interrupt/error) it saves per-episode
    rewards to data_result.csv and a plot to result.png.

    NOTE(review): original indentation was lost; structure reconstructed —
    confirm against the project history.
    """
    gamma = 0.99
    batch_sz = 32
    num_episodes = 1000000
    total_t = 0
    experience_replay_buffer = []
    episode_rewards = []
    last_100_avgs = []
    max_eps = [-sys.maxsize]  # running max of the 100-episode average
    # Linear epsilon decay from 1.0 to 0.01 over 1M steps.
    epsilon = 1.0
    epsilon_min = 0.01
    epsilon_change = (epsilon - epsilon_min) / 1000000  #500000
    model = DQN(K=K, input_shape=4 + 3 * number_enemy, scope="model")
    target_model = DQN(K=K, input_shape=4 + 3 * number_enemy,
                       scope="target_model")
    with tf.compat.v1.Session() as sess:
        model.set_session(sess)
        target_model.set_session(sess)
        sess.run(tf.compat.v1.global_variables_initializer())
        model.load()
        print("Filling experience replay buffer...")
        state = env.reset()
        #for i in range(MIN_EXPERIENCES):
        for i in tqdm(range(MIN_EXPERIENCES)):
            action = np.random.randint(0, K)
            # BUG FIX: the step result was assigned to `state`, so every
            # stored transition had state == next_state.  Assign to
            # `next_state` and keep `state` as the pre-step observation.
            next_state, reward, done, _ = env.step(action)
            done = done == 1
            experience_replay_buffer.append(
                (state, action, reward, next_state, done))
            if done:
                state = env.reset()
            else:
                state = next_state
        try:
            i = 0
            #for i in range(num_episodes):
            while True:
                i += 1
                state = env.reset()
                loss = None
                num_steps_in_episode = 0
                episode_reward = 0
                done = False
                for _ in range(MAX_STEP):
                    #while True:
                    # Periodically sync the target network.
                    if total_t % TARGET_UPDATE_PERIOD == 0:
                        target_model.copy_from(model)
                        print(
                            "Copied model parameters to target network, total_t = %s, period = %s"
                            % (total_t, TARGET_UPDATE_PERIOD))
                    action = model.sample_action(state, epsilon)
                    next_state, reward, done, _ = env.step(action)
                    done = done == 1
                    episode_reward += reward
                    # Bounded replay buffer: drop the oldest transition.
                    if len(experience_replay_buffer) == MAX_EXPERIENCES:
                        experience_replay_buffer.pop(0)
                    experience_replay_buffer.append(
                        (state, action, reward, next_state, done))
                    loss = learn(model, target_model,
                                 experience_replay_buffer, gamma, batch_sz)
                    num_steps_in_episode += 1
                    state = next_state
                    total_t += 1
                    epsilon = max(epsilon - epsilon_change, epsilon_min)
                    if done:
                        break
                #if not done:
                #    episode_reward-=100
                episode_rewards.append(episode_reward)  #Reward every eps
                last_100_avg = np.array(
                    episode_rewards[max(0, i - 100):i]).mean()
                last_100_avgs.append(last_100_avg)  #Avg reward every eps
                max_eps.append(max(max_eps[-1], last_100_avg))  #Max eps
                print(
                    "Episode: {:>6}, Num steps: {:>3}, Reward: {:>8.3f}, Avg reward: {:>5.3f}, Max: {:>5.3f} Eps: {:>5.3f}"
                    .format(i, num_steps_in_episode, episode_reward,
                            last_100_avg, max_eps[-1], epsilon))
                if i % 100 == 0:
                    model.save(i)
                sys.stdout.flush()
                if total_t > NUM_FRAME:
                    break
        # FIX: the original bare `except:` swallowed every exception
        # (including genuine bugs) with an uninformative message.  Keep
        # catching BaseException so Ctrl+C still lands in `finally`, but
        # report what actually happened.
        except BaseException as exc:
            print("Break:", exc)
        finally:
            max_eps.pop(0)  # drop the -sys.maxsize sentinel
            data = pd.DataFrame({
                'Reward': episode_rewards,
                'Avg Reward': last_100_avgs,
                'Max': max_eps
            })
            data.to_csv("./data_result.csv")
            figure(num=None, figsize=(15, 8), dpi=80, facecolor='w',
                   edgecolor='k')
            plt.plot('Reward', '--', color="#999999", data=data,
                     label="Reward")
            plt.plot('Avg Reward', data=data, label="Avg Reward")
            plt.plot('Max', data=data, label="Max")
            plt.legend(loc="upper left")
            plt.xlabel('episodes')
            #plt.show()
            plt.savefig('result.png')
cover_set.append(del_node) G.remove_node(del_node) print("***** Greedy:{} *****".format(len(cover_set))) if __name__ == "__main__": env = env(graph_size=50) num_eposides = 100000 n_step = 5 agent = Agent(num_nodes=env.graph_size) scores = [] for i in range(num_eposides): score = 0 num_nodes, mu, edge_index, edge_w, state, done = env.reset() state_steps = [state] reward_steps = [] action_steps = [] steps_cntr = 0 while not done[0]: action = agent.choose_action(mu, edge_index, edge_w, state) _, _, _, reward, new_state, done = env.step(action) state_steps.append(new_state) reward_steps.append(reward) action_steps.append(action) steps_cntr += 1