def run_episode(environment: gym.Env, agent: DQNAgent, render: bool, max_length: int):
    """
    Run one episode in the given environment with the agent.

    Arguments:
        environment {`gym.Env`} -- Environment representing the Markov Decision Process
        agent {`DQNAgent`} -- Reinforcement Learning agent that acts in the environment
        render {`bool`} -- Whether the frames of the episode should be rendered on the screen
        max_length {`int`} -- Maximum number of steps before the episode is terminated

    Returns:
        `float` -- Cumulative reward that the agent received during the episode
    """
    episode_reward = 0
    state = environment.reset()
    for _ in range(max_length):
        if render:
            environment.render()
        action = agent.act(state)
        next_state, reward, terminal, _ = environment.step(action)
        agent.observe(Transition(state, action, reward, None if terminal else next_state))
        episode_reward += reward
        if terminal:
            break
        else:
            state = next_state
    return episode_reward
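# A minimal sketch, not from the original source, of an outer training loop built on
# run_episode; the DQNAgent interface (act/observe) and Transition are assumed as above.
def train_agent(environment: gym.Env, agent: DQNAgent, n_episodes: int = 500,
                max_length: int = 1000):
    rewards = []
    for episode in range(n_episodes):
        # run_episode handles acting, observing transitions, and reward accumulation
        episode_reward = run_episode(environment, agent, render=False, max_length=max_length)
        rewards.append(episode_reward)
        if (episode + 1) % 50 == 0:
            mean_reward = sum(rewards[-50:]) / 50
            print(f"Episode {episode + 1}: mean reward over last 50 = {mean_reward:.2f}")
    return rewards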
def collect_stats(agent: DQNAgent, n_games=1000):
    MAX_STEPS = 1000
    lengths = []
    looped = 0
    for i in range(1, n_games + 1):
        env = gym.make('snake-v0')
        # env.__init__(human_mode=False)
        observation = env.reset()
        done = False
        steps = 0
        agent.epsilon = 0.0
        state = agent.get_last_observations(observation)
        while not done and steps < MAX_STEPS:
            action = agent.act(state)
            next_observation, _, done, _ = env.step(action)
            state = agent.get_last_observations(next_observation)
            steps += 1
        if steps == MAX_STEPS:
            looped += 1
        else:
            lengths.append(len(env.game.snake.body))
        if i % (n_games // 10) == 0:
            print(f"Avg len: {sum(lengths) / len(lengths):.2f}, looped {looped}/{i}")
def __init__(self, config):
    # Create session to store trained parameters
    self.session = tf.Session()
    self.action_count = config["action_count"]
    # Create agent for training
    self.agent = DQNAgent(self.action_count)
    # Create memory to store observations
    self.memory = ExperienceMemory(config["replay_memory_size"])
    # Tools for saving and loading networks
    self.saver = tf.train.Saver()
    # Last action that the agent performed
    self.last_action_index = None
    # Deque to keep track of average reward and play time
    self.game_history = GameHistory(config["match_memory_size"])
    # Deque to store losses
    self.episode_history = EpisodeHistory(config["replay_memory_size"])
    self.INITIAL_EPSILON = config["initial_epsilon"]
    self.FINAL_EPSILON = config["final_epsilon"]
    self.OBSERVE = config["observe_step_count"]
    self.EXPLORE = config["explore_step_count"]
    self.FRAME_PER_ACTION = config["frame_per_action"]
    self.GAMMA = config["gamma"]
    self.LOG_PERIOD = config["log_period"]
    self.BATCH_SIZE = config["batch_size"]
def __init__(self, model_class, model=None, env=None, exploration=None,
             gamma=0.99, memory_size=10000, batch_size=64,
             target_update_frequency=10, saving_dir=None):
    """
    Base class for an LSTM DQN agent.

    :param model_class: subclass of torch.nn.Module; class reference of the model
    :param model: initial model of the policy net; may be None if loading from a checkpoint
    :param env: environment
    :param exploration: exploration object; must have a method value(step) which returns epsilon
    :param gamma: discount factor
    :param memory_size: size of the replay memory
    :param batch_size: size of the mini batch for a one-step update
    :param target_update_frequency: frequency (in episodes) for updating the target net
    :param saving_dir: directory for saving checkpoints
    """
    DQNAgent.__init__(self, model_class, model, env, exploration, gamma,
                      memory_size, batch_size, target_update_frequency, saving_dir)
    self.memory = EpisodicReplayMemory(memory_size)
    self.hidden_size = 0
    if self.policy_net:
        self.hidden_size = self.policy_net.hidden_size
    self.hidden = None
def predict_dqn(self):
    # get size of state and action from environment
    state_size = 4
    action_size = 2
    agent = DQNAgent(state_size, action_size, load_model=True)

    done = False
    score = 0
    self.reset()
    state, _, _, _ = self.step(-1)
    state = np.reshape(state, [1, state_size])

    while not done:
        # get action for the current state and go one step in the environment
        action = agent.get_action(state)
        next_state, reward, done, info = self.step(action)
        next_state = np.reshape(next_state, [1, state_size])
        score += reward
        state = next_state

        if done or score >= 500:
            print("score:", score)
            break
def play_it():
    # ENV_NAME = 'CartPole-v0'
    # ENV_NAME = 'MountainCar-v0'
    ENV_NAME = 'Single_virtual-v0'

    # Get the environment and extract the number of actions.
    env = make(ENV_NAME)
    env1 = make(ENV_NAME)
    np.random.seed(123)
    env.seed(123)
    nb_actions = env.action_space.n

    model = build_model(nb_actions, env.observation_space)
    # model = build_model1(nb_actions, env.observation_space)

    # Finally, we configure and compile our agent. You can use every built-in Keras optimizer
    # and even the metrics!
    memory = SequentialMemory(limit=50000, window_length=1)
    policy = BoltzmannQPolicy()
    dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory,
                   nb_steps_warmup=10, target_model_update=1e-2, policy=policy)
    dqn.compile(Adam(lr=1e-3), metrics=['mae'])

    # Okay, now it's time to learn something! We visualize the training here for show, but this
    # slows down training quite a lot. You can always safely abort the training prematurely
    # using Ctrl + C.
    dqn.fit(env, nb_steps=50000, visualize=False, verbose=2)

    # After training is done, we save the final weights.
    dqn.save_weights(
        os.path.join(
            'models_weights_logs',
            'dqn_{}_weights.h5f'.format(
                ENV_NAME + datetime.now().strftime("%Y%m%d-%H%M%S"))),
        overwrite=True)
    # dqn.load_weights(os.path.join('models_weights_logs', 'dqn_{}_weights.h5f'.format(ENV_NAME)))

    # Finally, evaluate our algorithm for 5 episodes.
    dqn.test(env1, nb_episodes=5, visualize=True)
class DQNScheduler:
    def __init__(self, simulator):
        self.agent = DQNAgent(25, 6)
        self.agent.load("./save/car-100-dqn.h5")
        self.simulator = simulator
        self.agent.epsilon = 0

    def schedule(self):
        action = self.agent.act(np.reshape(self.simulator.get_state(), [1, 25]))
        return action
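# Hypothetical usage of DQNScheduler (the Simulator name is assumed, not from the original):
# the scheduler wraps a trained greedy agent (epsilon = 0) and maps the simulator's
# 25-dimensional state to one of 6 discrete actions.
#
# simulator = Simulator()              # assumed to expose get_state() -> 25-dim vector
# scheduler = DQNScheduler(simulator)
# action = scheduler.schedule()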
def _run_agent_one_ep(env: BaseEnv, agent: DQNAgent, config: Config, eps: float,
                      behavior_name: str, train: Optional[bool] = True):
    # Get a starting state
    env.reset()
    decision_steps, terminal_steps = env.get_steps(behavior_name)
    state = decision_steps.obs[0]
    agent_id = decision_steps.agent_id[0]

    done = False
    did_win = False
    episode_reward = 0.0
    while not done:
        reward = 0.0

        # Get and perform an action
        action = agent.act(decision_steps.obs[0], eps)
        env.set_actions(behavior_name, np.expand_dims(action, 0).reshape(-1, 1))
        env.step()
        decision_steps, terminal_steps = env.get_steps(behavior_name)

        # Determine S', R, Done
        next_state = None
        if agent_id in decision_steps:
            reward += decision_steps.reward[0]
            next_state = decision_steps.obs[0]
        if agent_id in terminal_steps:
            terminal_reward = terminal_steps.reward[0]
            # Add win/loss
            did_win = math.isclose(terminal_reward, 1.0)
            reward += terminal_reward
            next_state = terminal_steps.obs[0]
            done = True
        assert next_state is not None, \
            f"next_state cannot be None. Agent {agent_id} did not appear in decision or terminal steps"

        if train:
            # Learn from (S, A, R, S')
            experience = Experience(state, action, reward, next_state, done)
            agent.step(experience)

        # Set new state
        state = next_state
        episode_reward += reward

    return (episode_reward, did_win)
def make_bot(un, pw, expected_opponent, team, challenge, trainer,
             epsilon=None, model_path=None, target_model_path=None):
    if trainer:
        if model_path:
            agent = DQNAgent(INPUT_SHAPE, training=False)
        else:
            agent = RandomAgent()
    else:
        agent = DQNAgent(
            INPUT_SHAPE,
            epsilon=epsilon,
            random_moves=True,
            training=False,
            copy_target_model=False
        )
        agent.load_model(model_path)
        if target_model_path is not None:
            agent.target_model = load_model(target_model_path)
        else:
            agent.target_model.set_weights(agent.model.get_weights())

    bot = BotClient(
        name=un,
        password=pw,
        expected_opponent=expected_opponent,
        team=team,
        challenge=challenge,
        runType=RunType.Iterations,
        runTypeData=1,
        agent=agent,
        trainer=trainer,
        save_model=False,
        should_write_replay=(not trainer)
    )
    bot.start()
def get_agent(env, **kwargs):
    replay_capacity = 1e6
    n_episodes = 10e7
    return DQNAgent(env=env or gym.make('CartPole-v0'),
                    n_episodes=n_episodes,
                    replay_capacity=replay_capacity,
                    **kwargs)
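# Usage sketch: because of `env or gym.make('CartPole-v0')`, passing env=None falls back to
# CartPole, and any extra keyword arguments are forwarded to DQNAgent (assumed interface).
#
# agent = get_agent(None, gamma=0.99)            # CartPole fallback
# agent = get_agent(gym.make('MountainCar-v0'))  # explicit environment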
async def on_challenge_update(self, challenge_data):
    incoming = challenge_data.get('challengesFrom', {})
    if self.expected_opponent.lower() in incoming:
        if self.trainer:
            model_paths = [
                os.path.join(self.logs_dir, content)
                for content in os.listdir(self.logs_dir)
                if content.endswith('.model') and content.startswith('Epoch')
            ]
            if len(model_paths) > 0:
                sorted_model_paths = sorted(
                    model_paths,
                    key=lambda x: int(
                        os.path.basename(x).lstrip('Epoch').rstrip('.model')))
                model_to_load = sorted_model_paths[-1]
                self.log(f'Loading model {model_to_load}')
                self.agent = DQNAgent(INPUT_SHAPE, training=False)
                self.agent.load_model(model_to_load)
        await self.accept_challenge(self.expected_opponent, self.team_text)
def __init__(self, player_name=None, letter=None):
    if player_name is None:
        self.player_name = common_utils.get_random_name()
    else:
        self.player_name = player_name

    if letter is not None:
        self.letter = letter
    else:
        pass  # TODO: Handle this

    if letter == 'X':
        self.enemy_letter = 'O'
    else:
        self.enemy_letter = 'X'

    logger.debug("Initializing player {} with letter {} ...".format(
        self.player_name, self.letter))
    self.agent = DQNAgent()
def watch_agent(agent: DQNAgent):
    env = gym.make('snake-v0')
    env.__init__(human_mode=True)
    observation = env.reset()
    renderer = Renderer(env.game)
    try:
        done = False
        steps = 0
        agent.epsilon = 0
        state = agent.get_last_observations(observation)
        while not done:
            # time.sleep(0.001)
            renderer.render_frame()
            action = agent.act(state)
            next_observation, _, done, _ = env.step(action)
            state = agent.get_last_observations(next_observation)
            steps += 1
    finally:
        renderer.close_window()
    print(f"Snake length: {len(env.game.snake.body)}")
    print(f"Simulation ended after {steps} steps.")
def test_dqn():
    args = DQNArgs()
    env = gym.make(args.env_name)
    agent = DQNAgent(env, QNet, SimpleNormalizer, args)
    agent.load(args.save_dir)
    for _ in range(10):
        agent.test_one_episode(True)
def simulateDQNControl(self, hdg0):
    '''
    Plots the control law of the network over a simulation.

    :param hdg0: Initial heading of the boat for the simulation.
    :return: A plot of the angle of attack and velocity during the control.
    '''
    agent = DQNAgent(self.mdp.size, self.action_size)
    agent.load(self.src)
    WH = self.wh.generateWind()
    hdg0 = hdg0 * TORAD * np.ones(self.wh.samples)

    state = self.mdp.initializeMDP(hdg0, WH)
    i = np.ones(0)
    v = np.ones(0)
    wind_heading = np.ones(0)
    for time in range(self.sim_time):
        WH = self.wh.generateWind()
        action = agent.actDeterministically(state)
        next_state, reward = self.mdp.transition(action, WH)
        state = next_state
        i = np.concatenate([i, self.mdp.extractSimulationData()[0, :]])
        v = np.concatenate([v, self.mdp.extractSimulationData()[1, :]])
        wind_heading = np.concatenate([wind_heading, WH[0:10]])

    time_vec = np.linspace(0, self.sim_time, int(self.sim_time / self.mdp.dt))
    f, axarr = plt.subplots(2, sharex=True)
    axarr[0].plot(time_vec, i / TORAD)
    axarr[1].plot(time_vec, v)
    axarr[0].set_ylabel("i [°]")
    axarr[1].set_ylabel("v [m/s]")
    axarr[0].set_xlabel("t [s]")
    axarr[1].set_xlabel("t [s]")
    plt.show()
def train_dqn():
    args = DQNArgs()
    env = gym.make(args.env_name)
    agent = DQNAgent(env, QNet, SimpleNormalizer, args)
    pre_best = -1e9
    for ep in range(args.max_ep):
        agent.train_one_episode()
        if ep % args.test_interval == 0:
            r = agent.test_model()
            if r > pre_best:
                pre_best = r
                agent.save(args.save_dir)
def simulateGustsControl(self):
    '''
    Simulates the response of the controller to gusts.

    :return: A plot of the simulation.
    '''
    self.sim_time = 100
    agent = DQNAgent(self.mdp.size, self.action_size)
    agent.load(self.src)
    WH = self.wh.generateWind()
    hdg0 = 0 * TORAD * np.ones(self.wh.samples)

    state = self.mdp.initializeMDP(hdg0, WH)
    i = np.ones(0)
    v = np.ones(0)
    wind_heading = np.ones(0)
    for time in range(self.sim_time):
        WH = self.wh.generateWind()
        if time == 20:
            WH = self.wh.generateGust(10 * TORAD)
        action = agent.actDeterministically(state)
        next_state, reward = self.mdp.transition(action, WH)
        state = next_state
        i = np.concatenate([i, self.mdp.extractSimulationData()[0, :]])
        v = np.concatenate([v, self.mdp.extractSimulationData()[1, :]])
        wind_heading = np.concatenate([wind_heading, WH[0:10]])

    time_vec = np.linspace(0, self.sim_time, int(self.sim_time / self.mdp.dt))
    f, axarr = plt.subplots(2, sharex=True)
    axarr[0].plot(time_vec, i / TORAD)
    axarr[1].plot(time_vec, v)
    axarr[0].set_ylabel("angle of attack")
    axarr[1].set_ylabel("v")
    plt.show()
def main():
    # parser = argparse.ArgumentParser(description='Run DQN on Atari SpaceInvaders')
    # parser.add_argument('--env', default='SpaceInvaders-v0', help='Atari env name')
    # parser.add_argument(
    #     '-o', '--output', default='SpaceInvaders-v0', help='Directory to save data to')
    # parser.add_argument('--seed', default=0, type=int, help='Random seed')
    # parser.add_argument('--input_shape', default=(84, 84, 4), type=tuple, help='Size of each frame')
    # args = parser.parse_args()
    # args.output = get_output_folder(args.output, args.env)

    # Vehicle network
    veh_network = create_lstm_model(nb_time_steps, nb_input_vector, num_actions=g1)
    # Attacker network
    att_network = create_lstm_model(nb_time_steps, nb_input_vector,
                                    num_actions=gym.make(args.env).action_space.n)

    veh_agent = DQNAgent(q_network=veh_network,
                         preprocessor=core.Preprocessor(),
                         memory=core.ReplayMemory(),
                         policy=1,
                         gamma=0.1,
                         target_update_freq=100,
                         num_burn_in=100,
                         train_freq=20,
                         batch_size=32)
    att_agent = DQNAgent(q_network=att_network,
                         preprocessor=core.Preprocessor(),
                         memory=core.ReplayMemory(),
                         policy=1,
                         gamma=0.1,
                         target_update_freq=100,
                         num_burn_in=100,
                         train_freq=20,
                         batch_size=32)
    veh_agent.compile('Adam', 'mse')
    att_agent.compile('Adam', 'mse')

    env = VehicleFollowingENV()
    for i_episode in range(20):
        veh_agent.fit(env, 10**6)
    # env.close()

    model_json = veh_network.to_json()
    with open("model.json", "w") as json_file:
        json_file.write(model_json)
def build(args):
    # Params
    training = is_training(args)

    # Hack for switching the number of DQN input features (see help)
    n_feats = {'all': 11, 'distance': 1}
    n_actions = 4  # we are ignoring action 0 (for now)

    # Maximum number of steps per episode
    max_steps = 8 * (args.dims[0] + args.dims[1]) - 1

    # Total feature dimension
    total_feats = n_feats[args.feats] * sum(4**i for i in range(args.n_nodes + 1))

    # Flatland environment
    environment = FlatlandEnv(x_dim=args.dims[0],
                              y_dim=args.dims[1],
                              n_cars=args.n_agents,
                              n_acts=n_actions,
                              min_obs=-1.0,
                              max_obs=1.0,
                              n_nodes=args.n_nodes,
                              feats=args.feats)

    # Simple DQN agent
    agent = DQNAgent(alpha=0.0005,
                     gamma=0.99,
                     epsilon=1.0,
                     input_shape=total_feats,
                     sample_size=512,
                     batch_size=32,
                     n_actions=n_actions,
                     training=training)
    if not training:
        agent.load_model()

    return environment, agent, max_steps
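# Worked example of the feature-size arithmetic above (illustrative values, not from the
# original): with feats='all' (11 features per node) and n_nodes=2, the observation tree has
# 4**0 + 4**1 + 4**2 = 21 nodes, so total_feats = 11 * 21 = 231.
#
# environment, agent, max_steps = build(args)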
def main():
    train_data, parameter[1]["episode_length"] = data_prepare(parameter)
    parameter[2]['action_size'], parameter[2]['state_size'], state, env = \
        create_states(parameter, train_data)

    # create model
    agent = DQNAgent(parameter)

    # train model and save
    train(agent, parameter, state, env)
    caculation(agent, env)

    # test model
    parameter[0]["mode"] = 'test'
    test_data = data_prepare(parameter)[0]
    test_env = create_states(parameter, test_data)[3]
    caculation(agent, test_env)
def load_model(MODEL_TYPE):
    curr_model = None
    if MODEL_TYPE == "SVM":
        print("LOADING SVM...")
        curr_model = load("svm.joblib")
    elif MODEL_TYPE == "LR":
        print("LOADING LR...")
        lr = LogReg(74)  # (env.matches.shape[1])
        lr.load_weights("weights/weights-improvement-100-0.31.hdf5")
        curr_model = lr
    elif MODEL_TYPE == "DT":
        print("LOADING DT...")
        curr_model = load("dt.joblib")
    elif MODEL_TYPE == "GB":
        print("LOADING GB...")
        curr_model = load("gb.joblib")
    elif MODEL_TYPE == "RF":
        print("LOADING RF...")
        curr_model = load("rfc.joblib")
    elif MODEL_TYPE == "NB":
        print("LOADING NB...")
        curr_model = load("nb.joblib")
    elif MODEL_TYPE == "AB":
        print("LOADING AB...")
        curr_model = load("ab.joblib")
    elif MODEL_TYPE == "DQN":
        print("LOADING DQN...")
        BetNet = DQNAgent(75)
        BetNet.load("weights/betnet-weights-dqn.h5")
        curr_model = BetNet
    else:
        print("LOADING NN...")
        BetNet = Network(74)  # (env.matches.shape[1])
        BetNet.load_weights(
            'weights/Adadelta/test9_400_Best/weights-improvement-400-0.48.hdf5'
        )
        # PCA("weights/Adadelta/test13_100iter_reglast2/weights-improvement-100-0.52.hdf5")  # Most recent weights
        curr_model = BetNet
    return curr_model
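# Usage sketch (hypothetical call sites): the MODEL_TYPE string selects both the library
# and the weights file to restore.
#
# betnet = load_model("DQN")  # DQNAgent restored from saved weights
# svm = load_model("SVM")     # scikit-learn model restored via joblib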
def main():
    # Vehicle network
    veh_network = create_lstm_model(nb_time_steps, nb_input_vector, num_actions=4)
    # Attacker network
    # att_network = create_lstm_model(nb_time_steps, nb_input_vector, num_actions=4)

    veh_agent = DQNAgent(q_network=veh_network,
                         q_network2=veh_network,
                         preprocessor=core.Preprocessor(),
                         RLmemory=core.ReplayMemory(),
                         SLmemory=core.ReplayMemory(),
                         policy=1,
                         gamma=0.1,
                         target_update_freq=100,
                         num_burn_in=100,
                         train_freq=20,
                         batch_size=32)
    # att_agent = DQNAgent(q_network=att_network,
    #                      q_network2=att_network,
    #                      preprocessor=core.Preprocessor(),
    #                      memory=core.ReplayMemory(),
    #                      policy=1,
    #                      gamma=0.1,
    #                      target_update_freq=100,
    #                      num_burn_in=100,
    #                      train_freq=20,
    #                      batch_size=32)
    veh_agent.compile('Adam', 'mse')
    # att_agent.compile('Adam', 'mse')

    env = VehicleFollowingENV()
    for i_episode in range(20):
        veh_agent.fit(env=env, num_iterations=10 ** 6)
        # att_agent.fit(env, 10 ** 6)
    # env.close()

    model_json = veh_network.to_json()
    with open("model.json", "w") as json_file:
        json_file.write(model_json)
debug_log(f'file content with syntax error\n{s}')
debug_log('')
for i in range(5):
    try:
        os.remove(file_path)
        break
    except PermissionError:
        debug_log('Permission error when removing the file')
        time.sleep(1)

# NOTE: train
# NOTE: create/load DQN and target DQN in main thread
keras.backend.clear_session()
agent = DQNAgent(INPUT_SHAPE,
                 training=True,
                 replay_memory=minibatch,
                 copy_target_model=False)
agent.target_model = load_model(target_model_path)

# NOTE: train newly loaded model on new data
if len(minibatch) > 0:
    minibatch_history = agent.train_only(len(minibatch), len(minibatch))
    if minibatch_history is None:
        debug_log('ERROR: Unable to train on iteration\'s data')
    replay_memory.extend(minibatch)
else:
    debug_log('WARNING: Skipping minibatch training since no new data was found')

# NOTE: train newly loaded model on a random selection of old data
agent.replay_memory = replay_memory
sum_loss = 0
if len(replay_memory) > MIN_REPLAY_MEMORY_SIZE:
# Select a policy. We use eps-greedy action selection, which means that a random action is
# selected with probability eps. We anneal eps from 1.0 to 0.1 over the course of 1M steps.
# This is done so that the agent initially explores the environment (high eps) and then
# gradually sticks to what it knows (low eps). We also set a dedicated eps value that is used
# during testing. Note that we set it to 0.05 so that the agent still performs some random
# actions. This ensures that the agent cannot get stuck.
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1,
                              value_test=.05, nb_steps=1000000)

# The trade-off between exploration and exploitation is difficult and an on-going research
# topic. If you want, you can experiment with the parameters or use a different policy.
# Another popular one is Boltzmann-style exploration:
# policy = BoltzmannQPolicy(tau=1.)
# Feel free to give it a try!

dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, window_length=WINDOW_LENGTH,
               memory=memory, processor=processor, nb_steps_warmup=50000, gamma=.99,
               delta_range=(-1., 1.), target_model_update=10000, train_interval=4)
dqn.compile(Adam(lr=.00025), metrics=['mae'])

if args.mode == 'train':
    # Okay, now it's time to learn something! We capture the interrupt exception so that
    # training can be prematurely aborted. Notice that you can use the built-in Keras callbacks!
    weights_filename = 'dqn_{}_weights.h5f'.format(args.env_name)
    checkpoint_weights_filename = 'dqn_' + args.env_name + '_weights_{step}.h5f'
    log_filename = 'dqn_{}_log.json'.format(args.env_name)
    callbacks = [ModelIntervalCheckpoint(checkpoint_weights_filename, interval=250000)]
    callbacks += [FileLogger(log_filename, interval=100)]
    dqn.fit(env, callbacks=callbacks, nb_steps=1750000, log_interval=10000)

    # After training is done, we save the final weights one more time.
    dqn.save_weights(weights_filename, overwrite=True)
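# A rough sketch of the schedule LinearAnnealedPolicy applies to `eps` (a simplified
# re-derivation for illustration, not keras-rl's actual implementation): linear decay from
# value_max to value_min over nb_steps, held constant afterwards.
def annealed_eps(step, value_max=1.0, value_min=0.1, nb_steps=1000000):
    fraction = min(step / nb_steps, 1.0)  # progress through the annealing window
    return value_max - fraction * (value_max - value_min)

# annealed_eps(0) == 1.0; annealed_eps(500000) == 0.55; annealed_eps(2000000) == 0.1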
def main(argv):
    args = parser.parse_args(argv[1:])
    if args.usage == 'help':
        return parser.print_help()

    if is_environments_gen(args):
        _write_env_file(args)
    elif is_environments_list(args):
        all_registry = registry.all()
        registry_envs_name = [
            trim_env_spec_name(env.__repr__()) for env in all_registry
        ]
        for environment in registry_envs_name:
            print(environment)
    elif is_environments_act(args):
        env = gym.make(args.environment_name)
        if is_action_type('dqn', args):
            if args.pre_defined_state_size == 'nesgym':
                pre_state_size = 172032
            elif args.pre_defined_state_size == 'gym':
                pre_state_size = env.observation_space.shape[0]
            elif args.pre_defined_state_size == 'gym-atari':
                pre_state_size = 100800
            elif args.pre_defined_state_size == 'gym-atari-extend':
                pre_state_size = 120000
            elif args.pre_defined_state_size == 'gym-atari-small':
                pre_state_size = 100800
            elif args.pre_defined_state_size == 'gym-gomoku':
                pre_state_size = 361
            # state_size = (1,) + env.observation_space.shape
            state_size = pre_state_size
            action_size = env.action_space.n
            agent = DQNAgent(state_size, action_size)
            # try:
            #     agent.load('./weights/dqn_{}_{}_{}.h5'.format(
            #         args.environment_name.lower(), args.timesteps, args.i_episodes))
            # except Exception:
            #     pass
        done = False
        batch_size = 64
        i_episodes = args.i_episodes
        timesteps = args.timesteps
        factor = args.seed_factor
        for i_episode in range(i_episodes):
            state = env.reset()
            if is_action_type('dqn', args):
                state = np.reshape(state, [1, pre_state_size])
            for t in range(timesteps):
                try:
                    if args.render == 'present':
                        env.render()
                    if args.render == 'presented':
                        env.render(args.render)
                    if args.action_type == 'alternate':
                        action_choice = i_episodes * 2
                        action = random_action_space_sample_choice(
                            action_choice, env, factor)
                    elif args.action_type == 'specific':
                        action = env.action_space.sample()
                    elif args.action_type == 'conditional':
                        action_choice = i_episodes
                        action = random_action_space_sample_choice(
                            action_choice, env, factor)
                    elif args.action_type == 'numerical':
                        action = env.action_space.n
                    elif is_action_type('dqn', args) and len(state) == 5:
                        action = agent.act(state)
                    elif is_action_type('dqn', args) and len(state) != 5:
                        action = env.action_space.sample()
                    collect_stat(action, ['input', 'actions'], stats)
                    observation, reward, done, info = env.step(action)
                    if is_action_type('dqn', args):
                        reward = reward if not done else -10
                        observation = np.reshape(observation, [1, pre_state_size])
                        agent.remember(state, action, reward, observation, done)
                        state = observation
                    # collect_stat(observation, ['observation'], stats)
                    collect_stat(reward, ['rewards'], stats)
                    # collect_stat(done, ['output', 'done'], stats)
                    # collect_stat(info, ['output', 'info'], stats)
                    if done:
                        max_episodes_range = (i_episodes - 1)
                        episode_timesteps_iteration_limit = max_episodes_range - 1
                        is_latest_episode = is_filled_latest_episode_with_iteration(
                            i_episode, episode_timesteps_iteration_limit)
                        increased_timestep = increase_timestep(t)
                        print('i_episode {}'.format(i_episode))
                        print('Episode finished after {} timesteps'.format(
                            increased_timestep))
                        if is_action_type('dqn', args):
                            print('Episode: {}/{}, score: {}, e: {:.2}'.format(
                                i_episode, i_episodes, t, agent.epsilon))
                        collect_stat(t, ['output', 'timestep', 'iteration'], stats)
                        collect_stat(increased_timestep,
                                     ['output', 'timestep', 'increased'], stats)
                        is_latest_episode_to_save_state = lambda args_cached: (
                            is_latest_episode and args_cached.output_stats_filename)
                        if is_latest_episode_to_save_state(args):
                            filename = args.output_stats_filename
                            pre_df = {
                                # 'observations': stats['observations'],
                                'rewards': stats['rewards'],
                                # 'done-output': stats['output']['done'],
                                # 'info-output': stats['output']['info'],
                                # 'iteration-timestep': stats['output']['timestep']['iteration'],
                                # 'increased-timestep': stats['output']['timestep']['increased'],
                                'actions-input': stats['input']['actions']
                            }
                            df = pd.DataFrame(pre_df)
                            stamp = lambda: '%s' % int(datetime.now().timestamp())
                            with open('data/{}-{}.csv'.format(stamp(), filename),
                                      'w') as f:
                                f.write(df.to_csv())
                            print('Statistics file saved ({}.csv)!'.format(filename))
                            del df
                            del filename
                        print(check_output_env_label())
                        del is_latest_episode_to_save_state
                        del increased_timestep
                        del is_latest_episode
                        del episode_timesteps_iteration_limit
                        del max_episodes_range
                        break
                except Exception as e:
                    print('Rendering execution ({})'.format(e))
                finally:
                    print('Execution of timestep done')
            if is_action_type('dqn', args) and (len(agent.memory) > batch_size):
                agent.replay(batch_size)
        # agent.save('./weights/dqn_{}_{}_{}.h5'.format(
        #     args.environment_name.lower(), args.timesteps, args.i_episodes))
        # env.close()
    else:
        parser.print_help()
def run(self):
    ### create TORCS environment
    env = TorcsEnv(vision=False, throttle=True)

    ### start run according to supplied arguments
    if self.algorithm == "dqn" and self.modus == "train":
        agent = DQNAgent(env, self.track, self.numOfEpisodes)
        agent.trainAgent()
    elif self.algorithm == "dqn" and self.modus == "test":
        agent = DQNAgent(env, self.track, self.numOfEpisodes)
        agent.testAgent()
    elif self.algorithm == "ddpg" and self.modus == "train":
        agent = DDPGAgent(env, self.track, self.numOfEpisodes)
        agent.trainAgent()
    elif self.algorithm == "ddpg" and self.modus == "test":
        agent = DDPGAgent(env, self.track, self.numOfEpisodes)
        agent.testAgent()
def main(): print "Creating DQN agent..." # env = gym.make("codegen-v0") set_debugger_org_frc() iters = 6300 n_goal = 0 n_goal_all = 0 time_stamp = 0 max_steps = 5 agent = DQNAgent(max_steps) agent.dqn.initial_exploration = 6000 * max_steps for iter in range(iters): print "\n********Iteration # ", iter, "***********\n" # 1 iteration env = gym.make("codegen-v0") num = random.randrange(1, 100) print "Goal Number : ", num + 1 env.my_input = num #env.goal = "['" + env.my_input + "']" env.goal = str(num + 1) code = env._reset() step_in_episode = 0 total_score = 0.0 reward = 0.0 mystate = [] my_state_new = [] # debug : the sys # sss = [] # for arg in sys.argv[1:]: # sss.append(arg) # print "sss = " , sss # while True: while step_in_episode < max_steps: # state = env.code_index_list + [-1]*(max_steps-len(env.code_index_list state = env.code_index_list[:] state += np.zeros([ max_steps - len(env.code_index_list), agent.dqn.code_idx_size ], dtype=int).tolist() # state = state.tolist() # state = 1; # print "env = ",env.code_index_list # print "state = ",state # raw_input() if step_in_episode == 0: action_idx = agent.start(code, state) else: action_idx = agent.act(code, state, reward) code, reward, terminal, info = env._step(action_idx, agent.dqn.actions) state_prime = env.code_index_list[:] state_prime += np.zeros([ max_steps - len(env.code_index_list), agent.dqn.code_idx_size ], dtype=int).tolist() # debug : the sys # sss = [] # for arg in sys.argv[1:]: # sss.append(arg) # print "sss = " , sss print "state : " print state print "state' : " print state_prime if step_in_episode == max_steps - 1: agent.dqn.stock_experience(agent.dqn.time_stamp, state, action_idx, reward, state_prime, 1) else: agent.dqn.stock_experience(agent.dqn.time_stamp, state, action_idx, reward, state_prime, 0) agent.dqn.experience_replay(agent.dqn.time_stamp) agent.dqn.target_model_update(agent.dqn.time_stamp, soft_update=False) total_score += reward if terminal: agent.dqn.goal_idx.append(agent.dqn.time_stamp) agent.end(reward) agent.dqn.stock_experience(agent.dqn.time_stamp, state, action_idx, reward, state_prime, 1) n_goal_all += 1 step_in_episode += 1 agent.dqn.time_stamp += 1 if iters - iter <= 100: n_goal += 1 break step_in_episode += 1 agent.dqn.time_stamp += 1 if iter == 1 + (agent.dqn.initial_exploration / max_steps): print "n_goal_all = ", n_goal_all print agent.dqn.goal_idx raw_input() print "n_goal : ", n_goal print "epsilon : ", agent.epsilon
class VideoStreamingTest(object):
    def __init__(self, host, port):
        self.state_size = 3
        self.action_size = 7
        self.done = False
        self.batch_size = 32
        self.agent = DQNAgent(self.state_size, self.action_size)
        self.state_now = np.reshape([0.10606659, -0.52737298, 0.47917915],
                                    [1, self.state_size])
        self.state_last = np.reshape([0.10606659, -0.52737298, 0.47917915],
                                     [1, self.state_size])
        self.action_for_next = 0
        self.action_for_now = 0
        self.reward = 0
        self.forward = "T394"
        self.left = "S450"
        self.right = "S270"
        self.backward = "T330"
        self.stop = "T370"
        self.middle = "S360"
        # DQN parameters
        self.server_socket = socket.socket()
        self.server_socket.bind((host, port))
        self.server_socket.listen(0)
        self.connection, self.client_address = self.server_socket.accept()
        self.connection = self.connection.makefile("rb")
        self.host_name = socket.gethostname()
        self.host_ip = socket.gethostbyname(self.host_name)
        self.temp_result = None
        self.finnal_result = None
        self.RANGE = 350
        self.WIDTH = 720
        self.time_now = 0
        self.count = 0
        self.streaming()

    def dqn_loop(self):
        if self.finnal_result['me']['r'] > 1:
            self.done = True
        else:
            self.done = False
        if True:
            self.prepare_state()   # update the previous state and fetch the current one
            self.prepare_action()  # update the previous action and fetch this step's action
            if self.count == 1:
                self.prepare_reward()  # fetch the reward for the previous action
            else:
                self.count += 1
            self.act_move()  # update the car's motion
            if self.count == 1:
                self.remember_step()  # store this step's transition
        if len(self.agent.memory) > self.batch_size:
            self.agent.replay(self.batch_size)

    def prepare_state(self):
        self.state_last = self.state_now
        state_now_ = [self.finnal_result['me']['alpha_big'],
                      self.finnal_result['me']['alpha_small'],
                      self.finnal_result['me']['r']]
        self.state_now = np.reshape(state_now_, [1, self.state_size])
        # self.state_now = state_now_

    def prepare_action(self):
        self.action_for_now = self.action_for_next
        self.action_for_next = self.agent.act(self.state_now)

    def prepare_reward(self):
        # Precondition: state_last must not be empty
        if self.done:
            self.reward = -10
        else:
            self.reward = (self.state_last[0][2] - self.state_now[0][2]) * 100
            # self.reward = (self.state_last[2] - self.state_now[2]) * 100

    def remember_step(self):
        self.agent.remember(self.state_last, self.action_for_now, self.reward,
                            self.state_now, self.done)

    def act_move(self):
        if self.done:
            self.action_for_next = 0
        if self.action_for_next == 0:  # stop
            str_S = self.middle
            str_T = self.stop
            str_S = str_S.encode("utf-8")
            str_T = str_T.encode("utf-8")
            socket_tcp.send(str_S)
            socket_tcp.send(str_T)
        elif self.action_for_next == 1:  # forward
            str_S = self.middle
            str_T = self.forward
            str_S = str_S.encode("utf-8")
            str_T = str_T.encode("utf-8")
            socket_tcp.send(str_S)
            socket_tcp.send(str_T)
        elif self.action_for_next == 2:  # turn left, forward
            str_S = self.left
            str_T = self.forward
            str_S = str_S.encode("utf-8")
            str_T = str_T.encode("utf-8")
            socket_tcp.send(str_S)
            socket_tcp.send(str_T)
        elif self.action_for_next == 3:  # turn right, forward
            str_S = self.right
            str_T = self.forward
            str_S = str_S.encode("utf-8")
            str_T = str_T.encode("utf-8")
            socket_tcp.send(str_S)
            socket_tcp.send(str_T)
        elif self.action_for_next == 4:  # backward
            str_S = self.middle
            str_T = self.backward
            str_S = str_S.encode("utf-8")
            str_T = str_T.encode("utf-8")
            socket_tcp.send(str_S)
            socket_tcp.send(str_T)
            str_S = self.middle
            str_T = self.stop
            str_S = str_S.encode("utf-8")
            str_T = str_T.encode("utf-8")
            socket_tcp.send(str_S)
            socket_tcp.send(str_T)
            str_S = self.middle
            str_T = self.backward
            str_S = str_S.encode("utf-8")
            str_T = str_T.encode("utf-8")
            socket_tcp.send(str_S)
            socket_tcp.send(str_T)
        elif self.action_for_next == 5:  # turn left, backward
            str_S = self.left
            str_T = self.backward
            str_S = str_S.encode("utf-8")
            str_T = str_T.encode("utf-8")
            socket_tcp.send(str_S)
            socket_tcp.send(str_T)
            str_S = self.left
            str_T = self.stop
            str_S = str_S.encode("utf-8")
            str_T = str_T.encode("utf-8")
            str_S = self.left
            str_T = self.backward
            str_S = str_S.encode("utf-8")
            str_T = str_T.encode("utf-8")
        elif self.action_for_next == 6:  # turn right, backward
            str_S = self.right
            str_T = self.backward
            str_S = str_S.encode("utf-8")
            str_T = str_T.encode("utf-8")
            socket_tcp.send(str_S)
            socket_tcp.send(str_T)
            str_S = self.right
            str_T = self.stop
            str_S = str_S.encode("utf-8")
            str_T = str_T.encode("utf-8")
            socket_tcp.send(str_S)
            socket_tcp.send(str_T)
            str_S = self.right
            str_T = self.backward
            str_S = str_S.encode("utf-8")
            str_T = str_T.encode("utf-8")
            socket_tcp.send(str_S)
            socket_tcp.send(str_T)

    def get_one_car(self, x1, y1, x2, y2):
        x0 = (x1 + x2) / 2
        y0 = (y1 + y2) / 2
        detx = x1 - x2
        dety = y1 - y2
        temp_x0 = x0 - self.WIDTH / 2
        temp_y0 = y0 - self.WIDTH / 2
        if detx > 0:
            alpha_small = math.atan(dety / detx)
        elif detx < 0:
            alpha_small = math.atan(dety / detx) + math.pi
        else:
            if dety > 0:
                alpha_small = math.pi / 2
            else:
                alpha_small = 0 - math.pi / 2
        if temp_x0 > 0:
            alpha_big = math.atan(temp_y0 / temp_x0)
        elif temp_x0 < 0:
            alpha_big = math.atan(temp_y0 / temp_x0) + math.pi
        else:
            if temp_y0 > 0:
                alpha_big = math.pi / 2
            else:
                alpha_big = 0 - math.pi / 2
        alpha_small = alpha_small / math.pi - 0.5
        alpha_big = alpha_big / math.pi - 0.5
        r = math.sqrt(temp_x0**2 + temp_y0**2) / self.RANGE
        return {
            "alpha_big": alpha_big,
            "alpha_small": alpha_small,
            "r": r,
            "x0": x0,
            "y0": y0
        }

    def get_finnal_result(self):
        red_x = self.temp_result["red"]["x"]
        red_y = self.temp_result["red"]["y"]
        green_x = self.temp_result["green"]["x"]
        green_y = self.temp_result["green"]["y"]
        blue_x = self.temp_result["blue"]["x"]
        blue_y = self.temp_result["blue"]["y"]
        yellow_x = self.temp_result["yellow"]["x"]
        yellow_y = self.temp_result["yellow"]["y"]
        finnal_temp = {}
        me_temp = self.get_one_car(red_x, red_y, green_x, green_y)
        enemy_temp = self.get_one_car(blue_x, blue_y, yellow_x, yellow_y)
        finnal_temp["me"] = me_temp
        finnal_temp["enemy"] = enemy_temp
        self.finnal_result = finnal_temp

    def draw(self, frame, lowerRGB, upperRGB, word):
        hsv = cv2.cvtColor(frame, cv2.COLOR_BGR2HSV)
        # Build a mask from the thresholds
        mask = cv2.inRange(hsv, lowerRGB, upperRGB)
        # Erode
        mask = cv2.erode(mask, None, iterations=2)
        # Dilate; erosion followed by dilation is an opening operation, which removes noise
        mask = cv2.dilate(mask, None, iterations=2)
        cnts = cv2.findContours(mask.copy(), cv2.RETR_EXTERNAL,
                                cv2.CHAIN_APPROX_SIMPLE)[-2]
        # Initialize the centroid of the circular contour
        center = None
        # If any contours exist
        if len(cnts) > 0:
            # Find the contour with the largest area
            c = max(cnts, key=cv2.contourArea)
            # Find the minimum enclosing circle of that contour
            ((x, y), radius) = cv2.minEnclosingCircle(c)
            # Compute the moments of the contour
            M = cv2.moments(c)
            # Compute the centroid
            center = (int(M["m10"] / M["m00"]), int(M["m01"] / M["m00"]))
            # Only draw when the radius is greater than 10
            if radius > 10:
                cv2.circle(frame, (int(x), int(y)), int(radius), (0, 255, 255), 2)
                cv2.circle(frame, center, 5, (0, 0, 255), -1)
                font = cv2.FONT_HERSHEY_SIMPLEX
                cv2.putText(frame, word, (int(x), int(y)), font, 1.2,
                            (255, 255, 255), 2)
            result = {}
            result["x"] = x
            result["y"] = y
            return result

    def streaming(self):
        try:
            print("Host: ", self.host_name + " " + self.host_ip)
            print("Connection from: ", self.client_address)
            print("Streaming...")
            print("Press 'q' to exit")
            redLower = np.array([170, 100, 200])
            redUpper = np.array([179, 255, 255])
            greenLower = np.array([65, 100, 100])
            greenUpper = np.array([85, 255, 255])
            # blueLower = np.array([0, 0, 150])
            # blueUpper = np.array([100, 100, 255])
            blueLower = np.array([95, 100, 100])
            blueUpper = np.array([115, 255, 255])
            yellowLower = np.array([5, 100, 100])
            yellowUpper = np.array([20, 255, 255])
            # need bytes here
            stream_bytes = b" "
            while True:
                stream_bytes += self.connection.read(1024)
                first = stream_bytes.find(b"\xff\xd8")
                last = stream_bytes.find(b"\xff\xd9")
                # str_ = 'S270'
                # str_ = str_.encode("utf-8")
                # socket_tcp.send(str_)
                # f = open('record_' + str(self.count) + '.json', 'w')
                # json.dump(dic_dump, f)
                # f.close()
                if first != -1 and last != -1:
                    jpg = stream_bytes[first:last + 2]
                    stream_bytes = stream_bytes[last + 2:]
                    image = cv2.imdecode(np.frombuffer(jpg, dtype=np.uint8),
                                         cv2.IMREAD_COLOR)
                    frame = image
                    result_red = self.draw(frame, redLower, redUpper, "RED")
                    result_green = self.draw(frame, greenLower, greenUpper, "GREEN")
                    result_blue = self.draw(frame, blueLower, blueUpper, "blue")
                    result_yellow = self.draw(frame, yellowLower, yellowUpper, "YELLOW")
                    result = {}
                    result["red"] = result_red
                    result["green"] = result_green
                    result["blue"] = result_blue
                    result["yellow"] = result_yellow
                    self.temp_result = result
                    flag = True
                    if not result_red:
                        flag = False
                    if not result_green:
                        flag = False
                    if not result_blue:
                        flag = False
                    if not result_yellow:
                        flag = False
                    if flag:
                        self.get_finnal_result()
                        self.time_now = int((time.time() - start_time) * 1000)
                        self.dqn_loop()
                        '''
                        dic_dump = {'data': self.finnal_result, 'time': self.time_now}
                        f = open('./test_1/record_' + str(self.count) + '.json', 'w')
                        json.dump(dic_dump, f)
                        f.close()
                        self.count += 1
                        '''
                        cv2.line(frame,
                                 (int(self.temp_result["red"]["x"]),
                                  int(self.temp_result["red"]["y"])),
                                 (int(self.temp_result["green"]["x"]),
                                  int(self.temp_result["green"]["y"])),
                                 (0, 255, 0), 1, 4)
                        cv2.line(frame,
                                 (int(self.temp_result["blue"]["x"]),
                                  int(self.temp_result["blue"]["y"])),
                                 (int(self.temp_result["yellow"]["x"]),
                                  int(self.temp_result["yellow"]["y"])),
                                 (0, 255, 0), 1, 4)
                        cv2.line(frame,
                                 (int(self.finnal_result["me"]["x0"]),
                                  int(self.finnal_result["me"]["y0"])),
                                 (int(self.WIDTH / 2), int(self.WIDTH / 2)),
                                 (0, 0, 255), 4, 4)
                        cv2.line(frame,
                                 (int(self.finnal_result["enemy"]["x0"]),
                                  int(self.finnal_result["enemy"]["y0"])),
                                 (int(self.WIDTH / 2), int(self.WIDTH / 2)),
                                 (255, 0, 0), 4, 4)
                        font = cv2.FONT_HERSHEY_SIMPLEX
                        cv2.putText(frame,
                                    str(self.finnal_result["me"]["alpha_big"]),
                                    (int(self.finnal_result["me"]["x0"]),
                                     int(self.finnal_result["me"]["y0"])),
                                    font, 1, (0, 255, 0), 1)
                        cv2.putText(frame,
                                    str(self.finnal_result["enemy"]["alpha_small"]),
                                    (int(self.finnal_result["enemy"]["x0"]),
                                     int(self.finnal_result["enemy"]["y0"])),
                                    font, 1, (0, 255, 0), 1)
                    else:
                        str_S = "S360"
                        str_T = "T370"
                        str_S = str_S.encode("utf-8")
                        str_T = str_T.encode("utf-8")
                        socket_tcp.send(str_S)
                        socket_tcp.send(str_T)
                    # print(self.finnal_result)
                    cv2.imshow("Frame", frame)
                    if cv2.waitKey(1) & 0xFF == ord("q"):
                        break
        finally:
            self.connection.close()
            self.server_socket.close()
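# Worked example of get_one_car's normalization (illustrative values, not from the original):
# for markers at (400, 360) and (300, 360) with WIDTH=720 and RANGE=350, the midpoint is
# (350, 360), so detx=100, dety=0 gives alpha_small = atan(0)/pi - 0.5 = -0.5, while
# temp_x0=-10, temp_y0=0 gives alpha_big = (atan(0) + pi)/pi - 0.5 = 0.5, and
# r = sqrt(10**2 + 0**2) / 350 ≈ 0.029. Both angles land roughly in [-1, 1].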
curr_model = load("dt.joblib") elif MODEL_TYPE == "GB": print("LOADING GB...") curr_model = load("gb.joblib") elif MODEL_TYPE == "RF": print("LOADING RF...") curr_model = load("rfc.joblib") elif MODEL_TYPE == "NB": print("LOADING NB...") curr_model = load("nb.joblib") elif MODEL_TYPE == "AB": print("LOADING AB...") curr_model = load("ab.joblib") elif MODEL_TYPE == "DQN": print("LOADING DQN...") BetNet = DQNAgent(75) BetNet.load("weights/betnet-weights-dqn.h5") curr_model = BetNet else: print("LOADING NN...") BetNet = Network(env.matches.shape[1]) BetNet.load_weights( "weights/Adadelta/test13_100iter_reglast2/weights-improvement-100-0.52.hdf5" ) # Most recent weights curr_model = BetNet ############################################################################### #GETS THE PREDICTION VEC GIVEN MODEL def generatePrediction(mt, curr_model, to_process):
from mdp import MDP
import random

'''
MDP Parameters
'''
mdp = MDP(duration_history=3, duration_simulation=1, delta_t=0.1)

'''
Environment Parameters
'''
w = wind(mean=45 * TORAD, std=0 * TORAD, samples=10)
WH = w.generateWind()
hdg0 = 0 * np.ones(10)

mdp.initializeMDP(hdg0, WH)

agent = DQNAgent(mdp.size, action_size=2)
# agent.load("../Networks/lighter_archi")
batch_size = 50
EPISODES = 500

hdg0_rand_vec = [-3, 0, 3, 6, 9, 12, 15, 18, 21]

loss_of_episode = []
i = []
v = []
r = []
for e in range(EPISODES):
    WH = w.generateWind()
    hdg0_rand = random.choice(hdg0_rand_vec) * TORAD
    hdg0 = hdg0_rand * np.ones(10)
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=50000, window_length=1)
policy = BoltzmannQPolicy()
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10,
               target_model_update=1e-2, policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
dqn.fit(env, nb_steps=50000, visualize=True, verbose=2)

# After training is done, we save the final weights.
dqn.save_weights('dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
dqn.test(env, nb_episodes=5, visualize=True)
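# To evaluate later without retraining, the saved weights can be restored first (standard
# keras-rl calls, sketched with the same names as above):
#
# dqn.load_weights('dqn_{}_weights.h5f'.format(ENV_NAME))
# dqn.test(env, nb_episodes=5, visualize=True)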