def main(game, episodes, training_mode=False, log=False, no_ops=30):
    env = gym.make(game)
    num_actions = env.action_space.n
    dqn = DeepQNetwork(num_actions, (4, 84, 84))
    replay = ReplayMemory(100000)
    obs = env.reset()
    h, w, c = obs.shape
    phi = Phi(4, 84, 84, c, h, w)
    agent = Agent(replay, dqn, training_mode=training_mode)
    stats = Stats('results/results.csv')

    for i_episode in range(episodes):
        env.reset()
        # perform a random number of no-op actions to randomize the start state
        for i in range(random.randint(1, no_ops)):
            observation, _, _, _ = env.step(0)
        pre_state = phi.add(observation)
        game_score = 0
        done = False
        t = 0
        while not done:
            t += 1
            env.render()
            action = agent.get_action(pre_state)
            observation, reward, done, _ = env.step(action)
            post_state = phi.add(observation)
            if training_mode:
                agent.update_replay_memory(pre_state, action, reward, post_state, done)
                if agent.time_step > agent.replay_start_size:
                    stats.log_time_step(agent.get_loss())
            pre_state = post_state
            game_score += reward
        print("Episode {} finished after {} time steps with score {}".format(
            i_episode, t, game_score))
        phi.reset()
        if agent.time_step > agent.replay_start_size:
            stats.log_game(game_score, t)
    stats.close()
    if log:
        dqn.save_model('results/model_weights.hdf5')
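# The Phi object above is used only through its add()/reset() interface. Below
# is a minimal sketch of such a frame-stacking preprocessor (grayscale, resize
# to 84x84, keep the last `length` frames); the constructor signature matches
# the Phi(4, 84, 84, c, h, w) call above, but everything else is an assumption,
# not the original implementation.
import cv2
import numpy as np

class FrameStack(object):
    def __init__(self, length, height, width, channels, src_h, src_w):
        self.length = length
        self.height = height
        self.width = width
        self.frames = np.zeros((length, height, width), dtype=np.float32)

    def add(self, observation):
        # grayscale + resize, then shift the stack left and append the new frame
        gray = cv2.cvtColor(observation, cv2.COLOR_RGB2GRAY)
        small = cv2.resize(gray, (self.width, self.height))
        self.frames[:-1] = self.frames[1:]
        self.frames[-1] = small / 255.0
        return self.frames.copy()

    def reset(self):
        self.frames.fill(0)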
def create_function_nn(sim_config):
    func_config = dict()
    function_path = sim_config['Function_path']
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), function_path)
    # load the agent configuration from CSV (one key,value pair per row)
    with open(path + '/agent_config.csv', 'r') as f:
        reader = csv.reader(f)
        for row in reader:
            func_config[row[0]] = row[1]
    # these settings are irrelevant when only restoring a trained network
    func_config['min_epsilon'] = 0
    func_config['epsilon_decaying_states'] = 0
    func_config['min_D_size'] = 0
    input_data = func_config['input_data']
    num_history = int(func_config['agent_history'])
    num_peer = int(sim_config['num_peer'])
    if input_data == 'upload' or input_data == 'download':
        width = num_history
        height = num_peer - 1
    elif input_data == 'upload_and_download':
        width = num_history
        height = (num_peer - 1) * 2
    else:
        raise ValueError("unknown input_data: %s" % input_data)
    func_config['width'] = width
    func_config['height'] = height
    neural_network = DeepQNetwork(width, height, int(func_config['fc2_outputs']), func_config)
    sess = tf.InteractiveSession()
    sess.run(tf.global_variables_initializer())
    neural_network.restore_parameters(
        sess,
        os.path.join(path, 'model', 'train_network',
                     'train_network-' + str(sim_config['Function_restore_checkpoint'])))
    return neural_network, sess, func_config
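# A hypothetical caller of create_function_nn, mirroring how the agent code
# below queries Q-values through q_values(); the sim_config keys are the ones
# read above, all other values and names are illustrative assumptions.
import numpy as np
import tensorflow as tf

sim_config = {'Function_path': 'functions/dqn_seeder',   # assumed path
              'Function_restore_checkpoint': 1000,       # assumed checkpoint id
              'num_peer': 10}
network, sess, cfg = create_function_nn(sim_config)
channels = int(cfg.get('num_channels', 1))  # assumed default if absent from the CSV
state_ph = tf.placeholder(tf.float32, shape=(1, cfg['height'], cfg['width'], channels))
q_op = network.q_values(state_ph)  # same q_values() usage as in DQNAgent below
state = np.zeros((1, cfg['height'], cfg['width'], channels), dtype=np.float32)
print(sess.run(q_op, feed_dict={state_ph: state}))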
antarg.add_argument("--exploration_decay_steps", type=float, default=1000000, help="How many steps to decay the exploration rate.") antarg.add_argument("--exploration_rate_test", type=float, default=0.05, help="Exploration rate used during testing.") antarg.add_argument("--train_frequency", type=int, default=4, help="Perform training after this many game steps.") antarg.add_argument("--train_repeat", type=int, default=1, help="Number of times to sample minibatch during training.") antarg.add_argument("--random_starts", type=int, default=30, help="Perform max this number of dummy actions after game restart, to produce more random game dynamics.") mainarg = parser.add_argument_group('Main loop') mainarg.add_argument("--load_weights", help="Load network from file.") mainarg.add_argument("--save_weights_prefix", help="Save network to given file. Epoch and extension will be appended.") comarg = parser.add_argument_group('Common') comarg.add_argument("output_folder", help="Where to write results to.") comarg.add_argument("--num_episodes", type=int, default=100, help="Number of episodes to test.") comarg.add_argument("--random_seed", type=int, help="Random seed for repeatable experiments.") args = parser.parse_args() if args.random_seed: random.seed(args.random_seed) env = GymEnvironment(args.env_id, args) net = DeepQNetwork(env.numActions(), args) mem = None agent = Agent(env, mem, net, args) if args.load_weights: print "Loading weights from %s" % args.load_weights net.load_weights(args.load_weights) env.gym.monitor.start(args.output_folder, force=True) agent.play(args.num_episodes) env.gym.monitor.close()
random.seed(args.random_seed)

# instantiate classes
if args.environment == 'ale':
    env = ALEEnvironment(args.game, args)
    logger.info("Using ALE Environment")
elif args.environment == 'gym':
    # logger does not work with this line
    #logger.handlers.pop()
    env = GymEnvironment(args.game, args)
    logger.info("Using Gym Environment")
else:
    assert False, "Unknown environment " + args.environment

mem = ReplayMemory(args.replay_size, args)
net = DeepQNetwork(env.numActions(), args)
agent = Agent(env, mem, net, args)
stats = Statistics(agent, net, mem, env, args)

if args.load_weights:
    logger.info("Loading weights from %s" % args.load_weights)
    net.load_weights(args.load_weights)

if args.play_games:
    logger.info("Playing for %d game(s)" % args.play_games)
    # set env mode to 'test' so that loss of life is not considered terminal
    env.setMode('test')
    stats.reset()
    agent.play(args.play_games, args)
    stats.write(0, "play")
    if args.visualization_file:
if args.random_seed:
    random.seed(args.random_seed)

# instantiate classes
if args.environment == 'ale':
    env = ALEEnvironment(args.game, args)
    logger.info("Using ALE Environment")
elif args.environment == 'gym':
    logger.handlers.pop()
    env = GymEnvironment(args.game, args)
    logger.info("Using Gym Environment")
else:
    assert False, "Unknown environment " + args.environment

mem = ReplayMemory(args.replay_size, args)
net = DeepQNetwork(env.numActions(), args)
agent = Agent(env, mem, net, args)
stats = Statistics(agent, net, mem, env, args)

if args.load_weights:
    logger.info("Loading weights from %s" % args.load_weights)
    net.load_weights(args.load_weights)

if args.play_games:
    logger.info("Playing for %d game(s)" % args.play_games)
    stats.reset()
    agent.play(args.play_games)
    stats.write(0, "play")
    if args.visualization_file:
        from visualization import visualize
        # use states recorded during gameplay. NB! Check buffer size, that it can accommodate one game!
mainarg = parser.add_argument_group('Main loop')
mainarg.add_argument("--load_weights", help="Load network from file.")
mainarg.add_argument("--save_weights_prefix",
    help="Save network to given file. Epoch and extension will be appended.")

comarg = parser.add_argument_group('Common')
comarg.add_argument("output_folder", help="Where to write results to.")
comarg.add_argument("--num_episodes", type=int, default=10, help="Number of episodes to test.")
comarg.add_argument("--random_seed", type=int, help="Random seed for repeatable experiments.")
args = parser.parse_args()

if args.random_seed:
    random.seed(args.random_seed)

env = GymEnvironment(args.env_id, args)
net = DeepQNetwork(env.numActions(), args)
buf = MemoryBuffer(args)

if args.load_weights:
    print "Loading weights from %s" % args.load_weights
    net.load_weights(args.load_weights)

env.gym.monitor.start(args.output_folder, force=True)

avg_reward = 0
num_episodes = args.num_episodes
for i_episode in xrange(num_episodes):
    env.restart()
    observation = env.getScreen()
    buf.reset()
    i_total_reward = 0
    for t in xrange(10000):
class Driver(object):
    '''
    A driver object for the SCRC
    '''

    def __init__(self, args):
        '''Constructor'''
        self.WARM_UP = 0
        self.QUALIFYING = 1
        self.RACE = 2
        self.UNKNOWN = 3
        self.stage = args.stage

        self.parser = msgParser.MsgParser()
        self.state = carState.CarState()
        self.control = carControl.CarControl()

        # discrete action sets for steering and acceleration/braking
        self.steers = [-1.0, -0.8, -0.6, -0.5, -0.4, -0.3, -0.2, -0.15, -0.1, -0.05, 0.0, 0.05, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6, 0.8, 1.0]
        self.speeds = [-1.0, -0.5, 0.0, 0.5, 1.0]
        self.num_inputs = 19
        self.num_steers = len(self.steers)
        self.num_speeds = len(self.speeds)
        self.num_actions = self.num_steers + self.num_speeds

        self.net = DeepQNetwork(self.num_inputs, self.num_steers, self.num_speeds, args)
        self.mem = ReplayMemory(args.replay_size, self.num_inputs, args)
        self.minibatch_size = args.batch_size

        if args.load_replay:
            self.mem.load(args.load_replay)
        if args.load_weights:
            self.net.load_weights(args.load_weights)
        self.save_weights_prefix = args.save_weights_prefix
        self.save_interval = args.save_interval
        self.save_replay = args.save_replay

        self.enable_training = args.enable_training
        self.enable_exploration = args.enable_exploration

        self.save_csv = args.save_csv
        if self.save_csv:
            self.csv_file = open(args.save_csv, "wb")
            self.csv_writer = csv.writer(self.csv_file)
            self.csv_writer.writerow(['episode', 'distFromStart', 'distRaced', 'curLapTime', 'lastLapTime', 'racePos', 'epsilon', 'replay_memory', 'train_steps'])

        self.total_train_steps = 0
        self.exploration_decay_steps = args.exploration_decay_steps
        self.exploration_rate_start = args.exploration_rate_start
        self.exploration_rate_end = args.exploration_rate_end
        self.skip = args.skip

        self.show_sensors = args.show_sensors
        self.show_qvalues = args.show_qvalues

        self.episode = 0
        self.distances = []
        self.onRestart()

        if self.show_sensors:
            from sensorstats import Stats
            self.stats = Stats(inevery=8)

        if self.show_qvalues:
            from plotq import PlotQ
            self.plotq = PlotQ(self.num_steers, self.num_speeds)

    def init(self):
        '''Return init string with rangefinder angles'''
        self.angles = [0 for x in range(19)]

        for i in range(5):
            self.angles[i] = -90 + i * 15
            self.angles[18 - i] = 90 - i * 15

        for i in range(5, 9):
            self.angles[i] = -20 + (i - 5) * 5
            self.angles[18 - i] = 20 - (i - 5) * 5

        return self.parser.stringify({'init': self.angles})

    def getState(self):
        #state = np.array([self.state.getSpeedX() / 200.0, self.state.getAngle(), self.state.getTrackPos()])
        #state = np.array(self.state.getTrack() + [self.state.getSpeedX()]) / 200.0
        state = np.array(self.state.getTrack()) / 200.0
        assert state.shape == (self.num_inputs,)
        return state

    def getReward(self, terminal):
        if terminal:
            reward = -1000
        else:
            dist = self.state.getDistFromStart()
            if self.prev_dist is not None:
                reward = max(0, dist - self.prev_dist) * 10
                assert reward >= 0, "reward: %f" % reward
            else:
                reward = 0
            self.prev_dist = dist
            #reward -= self.state.getTrackPos()
            #print "reward:", reward
        return reward

    def getTerminal(self):
        # all rangefinders return -1 when the car is off the track
        return np.all(np.array(self.state.getTrack()) == -1)

    def getEpsilon(self):
        # calculate decaying exploration rate
        if self.total_train_steps < self.exploration_decay_steps:
            return self.exploration_rate_start - self.total_train_steps * (self.exploration_rate_start - self.exploration_rate_end) / self.exploration_decay_steps
        else:
            return self.exploration_rate_end

    def drive(self, msg):
        # parse incoming message
        self.state.setFromMsg(msg)

        # show sensors
        if self.show_sensors:
            self.stats.update(self.state)

        # training
        if self.enable_training and self.mem.count >= self.minibatch_size:
            minibatch = self.mem.getMinibatch()
            self.net.train(minibatch)
            self.total_train_steps += 1
            #print "total_train_steps:", self.total_train_steps

        # skip frame and use the same action as previously
        if self.skip > 0:
            self.frame = (self.frame + 1) % self.skip
            if self.frame != 0:
                return self.control.toMsg()

        # fetch state, calculate reward and terminal indicator
        state = self.getState()
        terminal = self.getTerminal()
        reward = self.getReward(terminal)
        #print "reward:", reward

        # store new experience in replay memory
        if self.enable_training and self.prev_state is not None and self.prev_steer is not None and self.prev_speed is not None:
            self.mem.add(self.prev_state, self.prev_steer, self.prev_speed, reward, state, terminal)

        # if terminal state (out of track), then restart game
        if terminal:
            #print "terminal state, restarting"
            self.control.setMeta(1)
            return self.control.toMsg()
        else:
            self.control.setMeta(0)

        # choose actions for wheel and speed
        epsilon = self.getEpsilon()
        if self.enable_exploration and random.random() < epsilon:
            #print "random move"
            steer = random.randrange(self.num_steers)
            #speed = random.randrange(self.num_speeds)
            speed = random.randint(2, self.num_speeds - 1)
        else:
            # use broadcasting to efficiently produce minibatch of desired size
            minibatch = state + np.zeros((self.minibatch_size, 1))
            Q = self.net.predict(minibatch)
            assert Q.shape == (self.minibatch_size, self.num_actions), "Q.shape: %s" % str(Q.shape)
            #print "steer Q: ", Q[0, :self.num_steers]
            #print "speed Q:", Q[0, -self.num_speeds:]
            steer = np.argmax(Q[0, :self.num_steers])
            speed = np.argmax(Q[0, -self.num_speeds:])
            if self.show_qvalues:
                self.plotq.update(Q[0])
        #print "steer:", steer, "speed:", speed

        # gears are always automatic
        gear = self.gear()

        # set actions
        self.setSteerAction(steer)
        self.setGearAction(gear)
        self.setSpeedAction(speed)

        # remember state and actions
        self.prev_state = state
        self.prev_steer = steer
        self.prev_speed = speed

        #print "total_train_steps:", self.total_train_steps, "mem_count:", self.mem.count
        #print "reward:", reward, "epsilon:", epsilon

        return self.control.toMsg()

    def gear(self):
        rpm = self.state.getRpm()
        gear = self.state.getGear()

        if self.prev_rpm is None:
            up = True
        else:
            up = (self.prev_rpm - rpm) < 0

        if up and rpm > 7000 and gear < 6:
            gear += 1
        if not up and rpm < 3000 and gear > 0:
            gear -= 1

        return gear

    def setSteerAction(self, steer):
        # steer is an index into self.steers, so it must be strictly below num_steers
        assert 0 <= steer < self.num_steers
        self.control.setSteer(self.steers[steer])

    def setGearAction(self, gear):
        assert -1 <= gear <= 6
        self.control.setGear(gear)

    def setSpeedAction(self, speed):
        # speed is an index into self.speeds, so it must be strictly below num_speeds
        assert 0 <= speed < self.num_speeds
        accel = self.speeds[speed]
        if accel >= 0:
            #print "accel", accel
            self.control.setAccel(accel)
            self.control.setBrake(0)
        else:
            #print "brake", -accel
            self.control.setAccel(0)
            self.control.setBrake(-accel)

    def onShutDown(self):
        if self.save_weights_prefix:
            self.net.save_weights(self.save_weights_prefix + "_" + str(self.episode) + ".pkl")

        if self.save_replay:
            self.mem.save(self.save_replay)

        if self.save_csv:
            self.csv_file.close()

    def onRestart(self):
        self.prev_rpm = None
        self.prev_dist = None
        self.prev_state = None
        self.prev_steer = None
        self.prev_speed = None
        self.frame = -1

        if self.episode > 0:
            dist = self.state.getDistRaced()
            self.distances.append(dist)
            epsilon = self.getEpsilon()
            print "Episode:", self.episode, "\tDistance:", dist, "\tMax:", max(self.distances), "\tMedian10:", np.median(self.distances[-10:]), \
                "\tEpsilon:", epsilon, "\tReplay memory:", self.mem.count

            if self.save_weights_prefix and self.save_interval > 0 and self.episode % self.save_interval == 0:
                self.net.save_weights(self.save_weights_prefix + "_" + str(self.episode) + ".pkl")
                #self.mem.save(self.save_weights_prefix + "_" + str(self.episode) + "_replay.pkl")

            if self.save_csv:
                self.csv_writer.writerow([
                    self.episode,
                    self.state.getDistFromStart(),
                    self.state.getDistRaced(),
                    self.state.getCurLapTime(),
                    self.state.getLastLapTime(),
                    self.state.getRacePos(),
                    epsilon,
                    self.mem.count,
                    self.total_train_steps
                ])
                self.csv_file.flush()

        self.episode += 1
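# The linear exploration schedule in Driver.getEpsilon above is easiest to see
# in isolation. A standalone sketch of the same formula, with made-up numbers
# (start 1.0, end 0.1 over 1000 steps) purely for illustration:
def linear_epsilon(step, start=1.0, end=0.1, decay_steps=1000):
    # decays linearly from `start` to `end`, then stays at `end`
    if step < decay_steps:
        return start - step * (start - end) / decay_steps
    return end

for step in (0, 250, 500, 1000, 2000):
    print("step %d -> epsilon %.3f" % (step, linear_epsilon(step)))  # 1.0, 0.775, 0.55, 0.1, 0.1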
class DQNAgent(Peer):
    def __init__(self, sim_config, agent_config, ID=0, strategy='Agent', training_flag=True):
        # parameters
        super().__init__(sim_config, ID, strategy)
        self.num_history = agent_config['agent_history']
        self.training_flag = training_flag

        ## the agent's set of possible actions
        self.enable_actions = list([i] for i in range(self.num_peer) if i != self.ID)
        self.enable_actions.insert(0, [])
        self.num_actions = self.num_peer

        ## store the parameter values (see training_config.py for details)
        self.minibatch_size = int(agent_config['minibatch_size'])
        self.learning_rate = float(agent_config['learning_rate'])
        self.discount_factor = float(agent_config['discount_factor'])
        self.max_D_size = int(agent_config['max_D_size'])
        self.min_D_size = int(agent_config['min_D_size'])
        self.network_update_frequency = int(agent_config['network_update_frequency'])
        self.epsilon_decaying_states = int(agent_config['epsilon_decaying_states'])
        self.min_epsilon = float(agent_config['min_epsilon'])
        self.reward_config = str(agent_config['reward_config'])
        self.momentum = float(agent_config['momentum'])
        self.opt_epsilon = float(agent_config['opt_epsilon'])

        # replay memory
        self.D = deque(maxlen=self.max_D_size)

        # variables
        self.current_loss = 0.0
        self.current_Q_max = 0.0
        self.num_total_states = 0
        self.Q_max = 0
        self.action_t = []
        self.reward_t = [0]
        self.action_t_past = 0

        # model
        self.graph = tf.Graph()
        with self.graph.as_default():
            # input layer
            self.input_data = agent_config['input_data']
            if self.input_data == 'upload' or self.input_data == 'download':
                self.width = self.num_history
                self.height = self.num_peer - 1
            elif self.input_data == 'upload_and_download':
                self.width = self.num_history
                self.height = (self.num_peer - 1) * 2
            self.num_channels = int(agent_config['num_channels'])

            # train_network
            self.tf_train_input = tf.placeholder(
                tf.float32,
                shape=(self.minibatch_size, self.height, self.width, self.num_channels))
            self.tf_train_target = tf.placeholder(
                tf.float32, shape=(self.minibatch_size, self.num_actions))
            self.tf_filter_input = tf.placeholder(
                tf.float32, shape=(self.minibatch_size, self.num_actions))
            self.train_network = DeepQNetwork(self.width, self.height, self.num_actions, agent_config)
            #self.train_q_values = self.train_network.q_values(self.tf_train_input)

            # target_network
            self.tf_target_input = tf.placeholder(
                tf.float32,
                shape=(self.minibatch_size, self.height, self.width, self.num_channels))
            #self.tf_target_input = tf.placeholder(tf.float32, shape=(1, self.height, self.width, self.num_channels))
            #self.tf_target_input = tf.placeholder(tf.float32, [self.minibatch_size, width, height])
            self.target_network = DeepQNetwork(self.width, self.height, self.num_actions, agent_config)
            self.target_q_values = self.target_network.q_values(self.tf_target_input)

            # placeholder for action selection
            self.tf_action_selection_input = tf.placeholder(
                tf.float32, shape=(1, self.height, self.width, self.num_channels))
            self.action_q_values = self.train_network.q_values(self.tf_action_selection_input)

            # loss function
            self.loss = self.train_network.clipped_loss(
                self.tf_train_input, self.tf_train_target, self.tf_filter_input)

            # optimizer
            self.optimizer = tf.train.RMSPropOptimizer(
                self.learning_rate, momentum=self.momentum,
                epsilon=self.opt_epsilon, name='RMSProp')
            self.training = self.optimizer.minimize(self.loss, name='training')

            self.sess = tf.InteractiveSession()
            self.sess.run(tf.global_variables_initializer())
            self.update_target_network()

    def update_target_network(self):
        self.train_network.copy_network_to(self.target_network, self.sess)

    def select_random_action(self, neighbor_leecher_list, unchoke_num):
        unchoke_num = self.calculate_unchoke_num(unchoke_num)
        return random.sample(neighbor_leecher_list, unchoke_num)

    def get_q_max(self):
        if self.num_history > len(self.download_history):
            return 0
        else:
            state = self.get_cur_history()
            state = np.reshape(state, [1, self.height, self.width, self.num_channels])
            q_values = self.action_q_values.eval(
                session=self.sess,
                feed_dict={self.tf_action_selection_input: state})[0]
            return np.max(q_values)

    def select_greedy_action(self, state, neighbor_leecher_list):
        q_values = self.action_q_values.eval(
            session=self.sess,
            feed_dict={self.tf_action_selection_input: state})[0]
        # walk the actions in order of decreasing Q-value and return the first valid one
        q_index = np.argsort(q_values)[::-1]
        for q in q_index:
            if self.enable_actions[q] == []:
                return []
            elif self.enable_actions[q][0] in neighbor_leecher_list:
                return self.enable_actions[q]
        #self.action_t = self.enable_actions[q_index[0]]
        # action_t = []
        # for i in q_index:
        #     if i in neighbor_leecher_list:
        #         action_t.append(self.enable_actions[i])
        #         if len(action_t) == self.num_unchoke:
        #             return action_t
        #     elif i == self.ID:
        #         action_t.append(self.enable_actions[i])
        #         return action_t

    def upload(self, status_list):
        self.enable_actions = list([i] for i in range(self.num_actions) if i != self.ID)
        self.enable_actions.insert(0, [])
        #print(self.enable_actions)
        epsilon = self.calculate_epsilon()
        neighbor_leecher_list = [
            i for i in range(len(status_list))
            if status_list[i] != 'Seeder' and i != self.ID
        ]
        self.unchoke_probability = 1 - (1 / (len(neighbor_leecher_list) + 1))
        unchoke_num = min(self.num_unchoke, len(neighbor_leecher_list))
        ### when there are no neighboring leechers
        if neighbor_leecher_list == []:
            self.action_t = []
            return self.action_t
        ### when not enough data has been collected yet
        elif len(self.download_history) <= self.num_history or not self.has_enough_memory():
            self.action_t = self.select_random_action(neighbor_leecher_list, unchoke_num)
            return self.action_t
        else:
            ### with probability epsilon, act randomly
            if random.random() <= epsilon:
                self.action_t = self.select_random_action(neighbor_leecher_list, unchoke_num)
                return self.action_t
            else:
                ### build the state from the most recent history
                state = self.get_cur_history()
                state = np.reshape(state, [1, self.height, self.width, self.num_channels])
                ### select by Q-values
                self.action_t = self.select_greedy_action(state, neighbor_leecher_list)
                return self.action_t

    def store_experience(self, state, action, reward, state_1, terminal):
        self.D.append((state, action, reward, state_1, terminal))

    def experience_replay(self):
        state_minibatch = []
        action_minibatch = []
        reward_minibatch = []
        state_1_minibatch = []
        terminal_minibatch = []

        # sample random minibatch
        minibatch_size = min(len(self.D), self.minibatch_size)
        minibatch_indexes = np.random.randint(0, len(self.D), minibatch_size)
        for j in minibatch_indexes:
            state_j, action_j, reward_j, state_j_1, terminal = self.D[j]
            state_minibatch.append(state_j)
            action_minibatch.append(action_j)
            reward_minibatch.append(reward_j)
            state_1_minibatch.append(state_j_1)
            terminal_minibatch.append(terminal)

        state_minibatch = np.reshape(
            state_minibatch,
            [self.minibatch_size, self.height, self.width, self.num_channels])
        state_1_minibatch = np.reshape(
            state_1_minibatch,
            [self.minibatch_size, self.height, self.width, self.num_channels])

        ### compute Q-values with target_network (training targets)
        ### train_q_values would learn without a separate target network (2013-version DQN)
        #target_qs = self.train_q_values.eval(feed_dict={self.tf_train_input: state_1_minibatch})
        target_qs = self.target_q_values.eval(
            session=self.sess,
            feed_dict={self.tf_target_input: state_1_minibatch})

        target = np.zeros(shape=(self.minibatch_size, self.num_actions), dtype=np.float32)
        q_value_filter = np.zeros(shape=(self.minibatch_size, self.num_actions), dtype=np.float32)
        for i in range(self.minibatch_size):
            terminal = terminal_minibatch[i]
            action_index = action_minibatch[i]
            reward = reward_minibatch[i]
            # Bellman target: r for terminal states, r + gamma * max_a' Q_target(s', a') otherwise
            # (the original multiplied by learning_rate here, which looks like a bug;
            # the discount factor is what belongs in the target)
            target[i][action_index] = reward if terminal else reward + self.discount_factor * np.max(target_qs[i])
            q_value_filter[i][action_index] = 1.0

        _, self.current_loss = self.sess.run(
            [self.training, self.loss],
            feed_dict={
                self.tf_train_input: state_minibatch,
                self.tf_train_target: target,
                self.tf_filter_input: q_value_filter
            })

    def load_model(self, file_path):
        self.train_network.restore_parameters(self.sess, file_path)

    ### save the CNN model
    def save_model(self, save_path, num_episode):
        if not os.path.exists(os.path.join(save_path, 'model', 'train_network')):
            os.makedirs(os.path.join(save_path, 'model', 'train_network'))
        if not os.path.exists(os.path.join(save_path, 'model', 'target_network')):
            os.makedirs(os.path.join(save_path, 'model', 'target_network'))
        self.train_network.save_parameters(
            self.sess, save_path + '/model/train_network/train_network', num_episode)
        self.target_network.save_parameters(
            self.sess, save_path + '/model/target_network/train_network', num_episode)

    def get_cur_history(self):
        len_his = len(self.download_history)
        cur_up = copy.deepcopy(self.upload_history[len_his - self.num_history:])
        cur_up = delete_row(cur_up, self.ID)
        cur_down = copy.deepcopy(self.download_history[len_his - self.num_history:])
        cur_down = delete_row(cur_down, self.ID)
        if self.input_data == 'upload':
            return np.transpose(cur_up)
        elif self.input_data == 'download':
            return np.transpose(cur_down)
        elif self.input_data == 'upload_and_download':
            up_down = np.r_[cur_up, cur_down]
            up_down = up_down.transpose()
            if np.shape(up_down) == (self.num_peer - 1, self.num_history * 2):
                #print('ok ', np.shape(up_down))
                up_down = up_down.reshape((self.num_peer - 1) * 2, self.num_history)
            else:
                #print('no ', np.shape(up_down))
                up_down = None
            return up_down

    def calculate_epsilon(self):
        if self.epsilon_decaying_states == 0:
            return self.min_epsilon
        else:
            return max(
                self.min_epsilon,
                1.0 - (self.num_total_states / self.epsilon_decaying_states))

    def has_enough_memory(self):
        return len(self.D) >= self.min_D_size

    def next_step(self):
        if self.num_history < len(self.download_history) and self.training_flag:
            self.num_total_states += 1
            # state, action and reward at time t
            state_t = self.state_t_past
            action_t = self.action_t_past
            state_t_1 = self.get_cur_history()
            # save the current state and action for use in the next step
            self.state_t_past = state_t_1
            self.action_t_past = self.action_t

            self.reward_t = []
            if self.reward_config == 'all_download':
                self.reward_t.append(sum(self.current_download))
            elif self.reward_config == 'each_download':
                for a_t in action_t:
                    self.reward_t.append(self.current_download[a_t])
            elif self.reward_config == 'each_download_penalty':
                for a_t in action_t:
                    each_download = self.current_download[a_t]
                    if a_t == self.ID:
                        self.reward_t.append(0)
                    elif each_download >= 1:
                        self.reward_t.append(1)
                    else:
                        self.reward_t.append(-1)
            reward_t = self.reward_t

            # update the state
            super().next_step()
            terminal = self.is_seeder()

            ## store the experience in replay memory
            self.store_experience(state_t, [self.enable_actions.index(action_t)],
                                  reward_t, state_t_1, terminal)

            ## start learning once enough memory has accumulated
            if self.has_enough_memory():
                if (self.num_total_states % self.network_update_frequency) == 0:
                    print('-----update_network------')
                    self.update_target_network()
                self.experience_replay()
        else:
            self.state_t_past = self.get_cur_history()
            self.action_t_past = self.action_t
            super().next_step()

    ### since the object itself is not reset, call this method to move on to the next episode
    def reset_history(self):
        # pieces held; default is 0
        self.have_piece = 0
        # upload amounts to neighboring peers this round (probably only ever 0 or 1)
        self.current_upload = [0 for i in range(self.num_peer)]
        # most recent download amounts from neighboring peers
        # initially filled with 1s so behavior starts random
        # (filling with 0 and setting the oldest to 1 would also work)
        self.current_download = [0 for i in range(self.num_peer)]
        # upload history to neighboring peers
        self.upload_history = [[0 for i in range(self.num_peer)]]
        # download history from neighboring peers
        self.download_history = [[1 for i in range(self.num_peer)]]
        self.strategy = 'Agent'
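# The target construction in experience_replay above is the core of the DQN
# update. The same Bellman target in isolation, as a minimal NumPy sketch
# (batch contents are made up for illustration):
import numpy as np

def build_targets(rewards, actions, terminals, target_qs, num_actions, gamma=0.99):
    # only the taken action's entry is set, and the matching filter mask selects
    # it in the loss; terminal transitions use the raw reward, the rest add the
    # discounted max of the target network's Q-values
    target = np.zeros((len(rewards), num_actions), dtype=np.float32)
    q_filter = np.zeros_like(target)
    for i, (r, a, done) in enumerate(zip(rewards, actions, terminals)):
        target[i, a] = r if done else r + gamma * np.max(target_qs[i])
        q_filter[i, a] = 1.0
    return target, q_filter

# toy batch: two transitions, three actions
t, f = build_targets(rewards=[1.0, 0.0], actions=[2, 0], terminals=[False, True],
                     target_qs=np.array([[0.1, 0.5, 0.2], [0.3, 0.1, 0.0]]),
                     num_actions=3)
print(t)  # [[0. 0. 1.495] [0. 0. 0.]]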
class Driver(object):
    '''
    A driver object for the SCRC
    '''

    def __init__(self, args):
        '''Constructor'''
        self.WARM_UP = 0
        self.QUALIFYING = 1
        self.RACE = 2
        self.UNKNOWN = 3
        self.stage = args.stage

        self.parser = msgParser.MsgParser()
        self.state = carState.CarState()
        self.control = carControl.CarControl()

        self.steers = [-1.0, -0.8, -0.6, -0.5, -0.4, -0.3, -0.2, -0.15, -0.1, -0.05, 0.0, 0.05, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6, 0.8, 1.0]
        self.speeds = [-1.0, -0.5, 0.0, 0.5, 1.0]
        self.num_inputs = 19
        self.num_steers = len(self.steers)
        self.num_speeds = len(self.speeds)
        self.num_actions = self.num_steers + self.num_speeds

        self.net = DeepQNetwork(self.num_inputs, self.num_steers, self.num_speeds, args)
        self.mem = ReplayMemory(args.replay_size, self.num_inputs, args)
        self.minibatch_size = args.batch_size

        if args.load_weights:
            self.net.load_weights(args.load_weights)
        self.save_weights_prefix = args.save_weights_prefix
        self.pretrained_network = args.pretrained_network

        self.steer_lock = 0.785398
        self.max_speed = 100

        self.algorithm = args.algorithm
        self.device = args.device
        self.mode = args.mode
        self.maxwheelsteps = args.maxwheelsteps

        self.enable_training = args.enable_training
        self.enable_exploration = args.enable_exploration

        self.total_train_steps = 0
        self.exploration_decay_steps = args.exploration_decay_steps
        self.exploration_rate_start = args.exploration_rate_start
        self.exploration_rate_end = args.exploration_rate_end

        self.show_sensors = args.show_sensors
        self.show_qvalues = args.show_qvalues

        self.episode = 0
        self.onRestart()

        if self.show_sensors:
            from sensorstats import Stats
            self.stats = Stats(inevery=8)

        if self.show_qvalues:
            from plotq import PlotQ
            self.plotq = PlotQ(self.num_steers, self.num_speeds)

        if self.device == 'wheel':
            from wheel import Wheel
            self.wheel = Wheel(args.joystick_nr, args.autocenter, args.gain, args.min_force, args.max_force)

    def init(self):
        '''Return init string with rangefinder angles'''
        self.angles = [0 for x in range(19)]

        for i in range(5):
            self.angles[i] = -90 + i * 15
            self.angles[18 - i] = 90 - i * 15

        for i in range(5, 9):
            self.angles[i] = -20 + (i - 5) * 5
            self.angles[18 - i] = 20 - (i - 5) * 5

        return self.parser.stringify({'init': self.angles})

    def getState(self):
        #state = np.array([self.state.getSpeedX() / 200.0, self.state.getAngle(), self.state.getTrackPos()])
        #state = np.array(self.state.getTrack() + [self.state.getSpeedX()]) / 200.0
        state = np.array(self.state.getTrack()) / 200.0
        assert state.shape == (self.num_inputs,)
        return state

    def getReward(self, terminal):
        if terminal:
            reward = -1000
        else:
            dist = self.state.getDistFromStart()
            if self.prev_dist is not None:
                reward = max(0, dist - self.prev_dist) * 10
                assert reward >= 0, "reward: %f" % reward
            else:
                reward = 0
            self.prev_dist = dist
            #reward -= self.state.getTrackPos()
            #print "reward:", reward
        return reward

    def getTerminal(self):
        return np.all(np.array(self.state.getTrack()) == -1)

    def getEpsilon(self):
        # calculate decaying exploration rate
        if self.total_train_steps < self.exploration_decay_steps:
            return self.exploration_rate_start - self.total_train_steps * (self.exploration_rate_start - self.exploration_rate_end) / self.exploration_decay_steps
        else:
            return self.exploration_rate_end

    def drive(self, msg):
        # parse incoming message
        self.state.setFromMsg(msg)

        # show sensors
        if self.show_sensors:
            self.stats.update(self.state)

        # fetch state, calculate reward and terminal indicator
        state = self.getState()
        terminal = self.getTerminal()
        reward = self.getReward(terminal)
        #print "reward:", reward

        # store new experience in replay memory
        if self.enable_training and self.prev_state is not None and self.prev_steer is not None and self.prev_speed is not None:
            self.mem.add(self.prev_state, self.prev_steer, self.prev_speed, reward, state, terminal)

        # if terminal state (out of track), then restart game
        if terminal:
            print "terminal state, restarting"
            self.control.setMeta(1)
            return self.control.toMsg()
        else:
            self.control.setMeta(0)

        # choose actions for wheel and speed
        if self.enable_exploration and random.random() < self.getEpsilon():
            #print "random move"
            steer = random.randrange(self.num_steers)
            #speed = random.randrange(self.num_speeds)
            speed = random.randint(2, self.num_speeds - 1)
        elif self.algorithm == 'network':
            # use broadcasting to efficiently produce minibatch of desired size
            minibatch = state + np.zeros((self.minibatch_size, 1))
            Q = self.net.predict(minibatch)
            assert Q.shape == (self.minibatch_size, self.num_actions), "Q.shape: %s" % str(Q.shape)
            #print "steer Q: ", Q[0, :self.num_steers]
            #print "speed Q:", Q[0, -self.num_speeds:]
            steer = np.argmax(Q[0, :self.num_steers])
            speed = np.argmax(Q[0, -self.num_speeds:])
            if self.show_qvalues:
                self.plotq.update(Q[0])
        elif self.algorithm == 'hardcoded':
            steer = self.getSteerAction(self.steer())
            speed = self.getSpeedActionAccel(self.speed())
        else:
            assert False, "Unknown algorithm"
        #print "steer:", steer, "speed:", speed

        # gears are always automatic
        gear = self.gear()

        # check for manual override
        # might be partial, so we always need to choose algorithmic actions first
        events = self.wheel.getEvents()
        if self.mode == 'override' and self.wheel.supportsDrive():
            # wheel
            for event in events:
                if self.wheel.isWheelMotion(event):
                    self.wheelsteps = self.maxwheelsteps

            if self.wheelsteps > 0:
                wheel = self.wheel.getWheel()
                steer = self.getSteerAction(wheel)
                self.wheelsteps -= 1

            # gas pedal
            accel = self.wheel.getAccel()
            if accel > 0:
                speed = self.getSpeedActionAccel(accel)

            # brake pedal
            brake = self.wheel.getBrake()
            if brake > 0:
                speed = self.getSpeedActionBrake(brake)

        # check for wheel buttons always, not only in override mode
        for event in events:
            if self.wheel.isButtonDown(event, 2):
                self.algorithm = 'network'
                self.mode = 'override'
                self.wheel.generateForce(0)
                print "Switched to network algorithm"
            elif self.wheel.isButtonDown(event, 3):
                self.net.load_weights(self.pretrained_network)
                self.algorithm = 'network'
                self.mode = 'ff'
                self.enable_training = False
                print "Switched to pretrained network"
            elif self.wheel.isButtonDown(event, 4):
                self.enable_training = not self.enable_training
                print "Switched training", "ON" if self.enable_training else "OFF"
            elif self.wheel.isButtonDown(event, 5):
                self.algorithm = 'hardcoded'
                self.mode = 'ff'
                print "Switched to hardcoded algorithm"
            elif self.wheel.isButtonDown(event, 6):
                self.enable_exploration = not self.enable_exploration
                self.mode = 'override'
                self.wheel.generateForce(0)
                print "Switched exploration", "ON" if self.enable_exploration else "OFF"
            elif self.wheel.isButtonDown(event, 7):
                self.mode = 'ff' if self.mode == 'override' else 'override'
                if self.mode == 'override':
                    self.wheel.generateForce(0)
                print "Switched force feedback", "ON" if self.mode == 'ff' else "OFF"
            elif self.wheel.isButtonDown(event, 0) or self.wheel.isButtonDown(event, 8):
                gear = max(-1, gear - 1)
            elif self.wheel.isButtonDown(event, 1) or self.wheel.isButtonDown(event, 9):
                gear = min(6, gear + 1)

        # set actions
        self.setSteerAction(steer)
        self.setGearAction(gear)
        self.setSpeedAction(speed)

        # turn wheel using force feedback
        if self.mode == 'ff' and self.wheel.supportsForceFeedback():
            wheel = self.wheel.getWheel()
            self.wheel.generateForce(self.control.getSteer() - wheel)

        # remember state and actions
        self.prev_state = state
        self.prev_steer = steer
        self.prev_speed = speed

        # training
        if self.enable_training and self.mem.count >= self.minibatch_size:
            minibatch = self.mem.getMinibatch()
            self.net.train(minibatch)
            self.total_train_steps += 1
            #print "total_train_steps:", self.total_train_steps

        #print "total_train_steps:", self.total_train_steps, "mem_count:", self.mem.count

        return self.control.toMsg()

    def setSteerAction(self, steer):
        self.control.setSteer(self.steers[steer])

    def setGearAction(self, gear):
        assert -1 <= gear <= 6
        self.control.setGear(gear)

    def setSpeedAction(self, speed):
        accel = self.speeds[speed]
        if accel >= 0:
            #print "accel", accel
            self.control.setAccel(accel)
            self.control.setBrake(0)
        else:
            #print "brake", -accel
            self.control.setAccel(0)
            self.control.setBrake(-accel)

    def getSteerAction(self, wheel):
        # map a continuous wheel position to the index of the nearest discrete steer
        steer = np.argmin(np.abs(np.array(self.steers) - wheel))
        return steer

    def getSpeedActionAccel(self, accel):
        speed = np.argmin(np.abs(np.array(self.speeds) - accel))
        return speed

    def getSpeedActionBrake(self, brake):
        speed = np.argmin(np.abs(np.array(self.speeds) + brake))
        return speed

    def steer(self):
        angle = self.state.angle
        dist = self.state.trackPos
        steer = (angle - dist * 0.5) / self.steer_lock
        return steer

    def gear(self):
        rpm = self.state.getRpm()
        gear = self.state.getGear()

        if self.prev_rpm is None:
            up = True
        else:
            up = (self.prev_rpm - rpm) < 0

        if up and rpm > 7000:
            gear += 1
        if not up and rpm < 3000:
            gear -= 1

        return gear

    def speed(self):
        speed = self.state.getSpeedX()
        accel = self.prev_accel

        if speed < self.max_speed:
            accel += 0.1
            if accel > 1:
                accel = 1.0
        else:
            accel -= 0.1
            if accel < 0:
                accel = 0.0

        self.prev_accel = accel
        return accel

    def onShutDown(self):
        pass

    def onRestart(self):
        if self.mode == 'ff':
            self.wheel.generateForce(0)

        self.prev_rpm = None
        self.prev_accel = 0
        self.prev_dist = None
        self.prev_state = None
        self.prev_steer = None
        self.prev_speed = None
        self.wheelsteps = 0

        if self.save_weights_prefix and self.episode > 0:
            self.net.save_weights(self.save_weights_prefix + "_" + str(self.episode) + ".pkl")

        self.episode += 1
        print "Episode", self.episode
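# getSteerAction/getSpeedActionAccel above snap continuous wheel and pedal
# readings onto the discrete action sets via argmin. The same nearest-neighbour
# quantization in isolation (the shortened steer set is made up for brevity):
import numpy as np

steers = np.array([-1.0, -0.5, -0.1, 0.0, 0.1, 0.5, 1.0])

def nearest_action(value, actions):
    # index of the discrete action closest to the continuous input
    return int(np.argmin(np.abs(actions - value)))

print(nearest_action(0.07, steers))  # 4, i.e. steer 0.1
print(nearest_action(-0.3, steers))  # 1 (tie with index 2; argmin takes the first)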
class GymAgent(object):
    # evaluation agent (the constructor is not shown in this snippet)
    def add(self, observation):
        observation = cv2.resize(cv2.cvtColor(observation, cv2.COLOR_RGB2GRAY), self.dims)
        # shift the frame buffer left and append the new frame
        self.memory[0, :-1] = self.memory[0, 1:]
        self.memory[0, -1] = np.array(observation)

    def get_action(self, t, observation):
        self.add(observation)
        if t < self.history_length or random.random() < self.exploration_rate_test:
            action = env.action_space.sample()
        else:
            qvalues = net.predict(memory)
            action = np.argmax(qvalues[0])
        return action

env = gym.make(args.env_id)
net = DeepQNetwork(env.action_space.n, args)
memory = np.empty((args.batch_size, args.history_length, args.screen_height, args.screen_width))

if args.load_weights:
    print "Loading weights from %s" % args.load_weights
    net.load_weights(args.load_weights)

agent = GymAgent(env, net, memory, args)
env.monitor.start(args.output_folder, force=True)

avg_reward = 0
num_episodes = 100
for i_episode in xrange(num_episodes):
    observation = env.reset()
    i_total_reward = 0
    for t in xrange(10000):