# Assumed imports for this script: tempfile, TensorFlow, and matplotlib are
# standard; MLP, DiscreteDeepQ, Planning, and get_noodle_environment come from
# this project's own modules (exact import paths may differ).
import tempfile

import tensorflow as tf
import matplotlib.pyplot as plt


def main(desired_iterations, save_path):
    # Define a log file to use with tensorboard
    # Not that we currently make use of tensorboard at all
    LOG_DIR = tempfile.mkdtemp()
    print "Tensorboard Log: " + LOG_DIR + '\n'

    # The directory to save the animations to
    SAVE_DIR = save_path

    # Define the simulation
    sim = Planning(get_noodle_environment())

    # Tensorflow!
    tf.reset_default_graph()
    session = tf.InteractiveSession()
    journalist = tf.train.SummaryWriter(LOG_DIR)
    brain = MLP([sim.observation_size],
                [200, 200, sim.num_actions],
                [tf.tanh, tf.tanh, tf.identity])
    optimizer = tf.train.RMSPropOptimizer(learning_rate=0.001, decay=0.9)

    # DiscreteDeepQ object
    current_controller = DiscreteDeepQ(sim.observation_size,
                                       sim.num_actions,
                                       brain,
                                       optimizer,
                                       session,
                                       random_action_probability=0.2,
                                       discount_rate=0.9,
                                       exploration_period=1000,
                                       max_experience=10000,
                                       store_every_nth=1,
                                       train_every_nth=1,
                                       summary_writer=journalist)

    # Initialize the session
    session.run(tf.initialize_all_variables())
    session.run(current_controller.target_network_update)
    # journalist.add_graph(session.graph)

    # Run the simulation and let the robot learn
    num_simulations = 0
    iterations_needed = []
    total_rewards = []

    try:
        for game_idx in range(desired_iterations + 1):
            # Every 200 games, decay the exploration probability:
            # by 0.1 while it is above 0.1, then by 0.01 until it reaches 0.01.
            current_random_prob = current_controller.random_action_probability
            update_random_prob = game_idx != 0 and game_idx % 200 == 0
            if update_random_prob and 0.01 < current_random_prob <= 0.1:
                current_controller.random_action_probability = current_random_prob - 0.01
            elif update_random_prob and 0.1 < current_random_prob:
                current_controller.random_action_probability = current_random_prob - 0.1

            game = Planning(get_noodle_environment())
            game_iterations = 0
            observation = game.observe()

            while not game.is_over():
                action = current_controller.action(observation)
                reward = game.collect_reward(action)
                new_observation = game.observe()
                current_controller.store(observation, action, reward, new_observation)
                current_controller.training_step()
                observation = new_observation
                game_iterations += 1

            total_rewards.append(sum(game.collected_rewards))
            iterations_needed.append(game_iterations)

            if game_idx % 50 == 0:
                print "\rGame %d:\nIterations before end: %d." % (game_idx, game_iterations)
                if game.collected_rewards[-1] == 10:
                    print "Hit target!"
                print "Total Rewards: %s\n" % (sum(game.collected_rewards))
                if SAVE_DIR is not None:
                    game.save_path(SAVE_DIR, game_idx)
    except KeyboardInterrupt:
        print "Interrupted"

    # Plot the iterations and reward
    plt.figure(figsize=(12, 8))
    plt.plot(total_rewards, label='Reward')
    # plt.plot(iterations_needed, label='Iterations')
    plt.legend()
    plt.show()
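# A minimal sketch of how main() might be invoked from the command line.
# The flag names (--iterations, --save-dir) and the default of 1000 games are
# assumptions, not part of the original script; adjust them to match however
# this module is actually run.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description='Train a DiscreteDeepQ controller on the noodle environment.')
    parser.add_argument('--iterations', type=int, default=1000,
                        help='number of games to train for (hypothetical default)')
    parser.add_argument('--save-dir', default=None,
                        help='directory to write animations to; omit to skip saving')
    args = parser.parse_args()
    main(args.iterations, args.save_dir)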
performances = []

try:
    for game_idx in range(2000):
        game = DiscreteHill()
        game_iterations = 0
        observation = game.observe()

        while game_iterations < 50 and not game.is_over():
            action = current_controller.action(observation)
            reward = game.collect_reward(action)
            game.perform_action(action)
            new_observation = game.observe()
            current_controller.store(observation, action, reward, new_observation)
            current_controller.training_step()
            observation = new_observation
            game_iterations += 1

        performance = float(game_iterations - game.shortest_path) / game.shortest_path
        performances.append(performance)

        if game_idx % 100 == 0:
            print "\rGame %d: iterations before success %d." % (game_idx, game_iterations),
            print "Pos: %s, Target: %s" % (game.position, game.target),
except KeyboardInterrupt:
    print "Interrupted"


# In[11]:

N = 500
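# A plausible use for N: smooth the noisy per-game performance curve with a
# running mean over the last N games before plotting. This is a sketch, not the
# original plotting cell; it assumes numpy (np) and matplotlib.pyplot (plt) are
# already imported in this namespace.
smoothed = [np.mean(performances[max(0, i - N):i + 1])
            for i in range(len(performances))]
plt.figure(figsize=(12, 8))
plt.plot(smoothed, label='mean performance over last %d games' % N)
plt.xlabel('game')
plt.ylabel('(iterations - shortest path) / shortest path')
plt.legend()
plt.show()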
# The next cell is an excerpt from a variant of the loop above: the state fed
# to the controller ("memory") is the previous (n_prev_frames - 1)
# observation/action pairs concatenated with the current observation. The
# enclosing setup (the performances list, n_prev_frames, and the try / for-loop
# over games) is not part of this excerpt.

        game_iterations = 0
        observation = game.observe()

        # Seed the frame history with (n_prev_frames - 1) copies of the first
        # observation paired with a dummy action of -1.
        prev_frames = [(observation, -1)] * (n_prev_frames - 1)
        memory = np.concatenate([np.concatenate([observation, np.array([-1])])] *
                                (n_prev_frames - 1) + [observation])

        while game_iterations < 50 and not game.is_over():
            action = current_controller.action(memory)
            if n_prev_frames > 1:
                prev_frames = prev_frames[1:] + [(observation, action)]
            reward = game.collect_reward(action)
            game.perform_action(action)
            observation = game.observe()
            new_memory = np.concatenate([np.concatenate([a, np.array([b])])
                                         for (a, b) in prev_frames] + [observation])
            current_controller.store(memory, action, reward, new_memory)
            current_controller.training_step()
            memory = new_memory
            game_iterations += 1

        # cost: Manhattan norm of the target coordinates.
        cost = abs(game.target[0]) + abs(game.target[1])
        performances.append((game_iterations - cost) / float(cost))

        if game_idx % 100 == 0:
            print "\rGame %d: iterations before success %d." % (game_idx, game_iterations),
            print "Pos: %s, Target: %s" % (game.position, game.target),
except KeyboardInterrupt:
    print "Interrupted"


# In[327]:

N = 500
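# Helper sketch that factors out the memory construction used in the loop
# above: the previous (n_prev_frames - 1) observation/action pairs are
# flattened together, followed by the current observation. The name
# build_memory is not in the original code; it just restates the
# np.concatenate expression in one place.
def build_memory(prev_frames, observation):
    """prev_frames: list of (observation, action) pairs, oldest first."""
    return np.concatenate(
        [np.concatenate([obs, np.array([act])]) for (obs, act) in prev_frames]
        + [observation])

# For a length-d observation, the resulting memory vector has length
# (n_prev_frames - 1) * (d + 1) + d.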
# The next cell is an excerpt from the quadrotor training loop: the reward is
# shaped as the decrease in the simulation's cost function minus a constant
# per-step penalty, and each rollout's path is dumped to a .mat file. The
# enclosing per-game loop, the game/controller objects, dt, and the
# copy / scipy.io (sio) / numpy (np) imports are not part of this excerpt.

        observation = game.observe()
        x0 = copy.deepcopy(observation)
        rewards = []
        cost0 = game.cost()
        path = [copy.deepcopy(observation)]

        while game_iterations < 100 and not game.is_over():
            action = current_controller.action(observation)
            game.perform_action(action)
            game.step(dt)

            # Shaped reward: decrease in cost, minus a per-step penalty of 2.
            cost1 = game.cost()
            reward = cost0 - cost1 - 2
            # reward = -reward
            rewards.append(reward)

            new_observation = game.observe()
            current_controller.store(observation, action, reward, new_observation)
            current_controller.training_step()

            observation = new_observation
            cost0 = cost1
            game_iterations += 1
            path.append(copy.deepcopy(observation))

        # NOTE: hardcoded absolute output path.
        sio.savemat('/home/fantaosha/Documents/tensorflow-deepq/results/quadrotor_path/quadrotor_'
                    + str(game_idx) + '.mat', {'path': np.array(path)})

        performance = np.sum(rewards)
        performances.append(performance)
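# Standalone sketch of reading one of the saved rollouts back for inspection.
# scipy.io.loadmat returns a dict keyed by the names passed to savemat, so the
# trajectory saved above comes back under 'path'. The game index 0 here is
# just an example.
import scipy.io as sio  # same alias as used above

data = sio.loadmat('/home/fantaosha/Documents/tensorflow-deepq/results/'
                   'quadrotor_path/quadrotor_0.mat')
print "Loaded path with shape: %s" % (data['path'].shape,)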