def main():
    sess = get_session()
    env = atari_env(FLAGS.seed, FLAGS.game)
    action_space = env.action_space.n
    logdir = './results/' + FLAGS.game + '_' + FLAGS.arch + '_seed' + \
        str(FLAGS.seed) + '_' + FLAGS.exp + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
    flags.DEFINE_string('logdir', logdir + '/', 'logdir')
    if not os.path.exists(logdir):
        os.makedirs(logdir)
    if FLAGS.arch == 'DQN':
        algo = model.DQN(num_actions=action_space, lr=FLAGS.lr, opt=FLAGS.opt,
                         gamma=FLAGS.gamma, arch=FLAGS.arch)
    elif FLAGS.arch == 'C51':
        algo = model.C51(num_actions=action_space, lr=FLAGS.lr,
                         num_heads=FLAGS.num_heads)
    elif FLAGS.arch == 'QR_DQN':
        algo = model.QR_DQN(num_actions=action_space, lr=FLAGS.lr,
                            num_heads=FLAGS.num_heads)
    elif FLAGS.arch == 'ENS_DQN':
        algo = model.ENS_DQN(num_actions=action_space, lr=FLAGS.lr,
                             num_heads=FLAGS.num_heads)
    elif FLAGS.arch == 'REM_DQN':
        algo = model.REM_DQN(num_actions=action_space, lr=FLAGS.lr,
                             num_heads=FLAGS.num_heads)
    if FLAGS.online:
        dqnsolver = solver.DQNsolver(env, sess, algo, FLAGS)
    else:
        dqnsolver = solver.offlineDQNsolver(env, sess, algo, FLAGS)
    dqnsolver.train()
def __init__(self):
    available_models = ["dqn", "double_dqn", "dueling_double_dqn"]
    if config.rl_model not in available_models:
        raise Exception("specified model is not available.")
    if config.rl_model == "dqn":
        self.model = model.DQN()
    elif config.rl_model == "double_dqn":
        self.model = model.DoubleDQN()
    elif config.rl_model == "dueling_double_dqn":
        self.model = model.DuelingDoubleDQN()
    self.exploration_rate = config.rl_initial_exploration
    self.total_steps = 0
    self.total_steps_overall = 0
    self.total_time = 0
    self.start_time = time.time()
    gui.controller.glue = self
    gui.canvas.glue = self
    self.state = np.zeros(
        (config.initial_num_cars, config.rl_history_length, 34),
        dtype=np.float32)
    self.prev_state = self.state.copy()
    self.last_action = np.zeros((config.initial_num_cars,), dtype=np.uint8)
    self.sum_loss = 0
    self.sum_reward = 0
    self.evaluation_phase = False
    self.population_phase = True
def __init__(self, memory_capacity=1000000, gamma=0.99,
             input_dims=(4, 84, 84), output_dim=6, lr=0.000025):
    self.device = torch.device(
        'cuda' if torch.cuda.is_available() else 'cpu')
    # DQN model
    self.model = model.DQN(input_dims, output_dim, lr).to(self.device)
    # DQN target
    self.target = model.DQN(input_dims, output_dim, lr).to(self.device)
    self.target.load_state_dict(self.model.state_dict())
    # Loss function is Huber loss, https://en.wikipedia.org/wiki/Huber_loss;
    # feel free to change to MSE
    self.loss = lambda expected, target: F.smooth_l1_loss(expected, target)
    # Agent's experience replay memory
    self.memory = memory.ExperienceReplay(memory_capacity)
    # gamma hyperparameter for calculating the loss
    self.gamma = gamma
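# Hedged sketch (not from the source): one plausible update step wiring the
# pieces above together -- sample a batch, form the TD target from the target
# network, and minimize the Huber loss. The memory.sample(batch_size) signature
# and the model-owned optimizer are assumptions, not this repo's confirmed API.
import torch


def learn_step(agent, batch_size=32):
    states, actions, rewards, next_states, dones = agent.memory.sample(batch_size)
    # Q(s, a) for the actions actually taken
    q_pred = agent.model(states).gather(1, actions.unsqueeze(1)).squeeze(1)
    # TD target: r + gamma * max_a' Q_target(s', a'), cut off at terminal states
    with torch.no_grad():
        q_next = agent.target(next_states).max(dim=1)[0]
        q_target = rewards + agent.gamma * q_next * (1.0 - dones.float())
    loss = agent.loss(q_pred, q_target)
    agent.model.optimizer.zero_grad()  # assumes DQN builds its own optimizer from lr
    loss.backward()
    agent.model.optimizer.step()
    return loss.item()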
def train():
    # Initialize the network
    dqn = model.DQN()
    print(dqn)
    # # Get the compute device
    # if torch.cuda.is_available():
    #     device = torch.device('cuda:0')
    #     num_gpu = torch.cuda.device_count()
    #     if num_gpu > 1:
    #         dqn = nn.DataParallel(dqn)
    #         print('Using %d GPU...' % num_gpu)
    # else:
    #     device = torch.device('cpu')
    #     print('Using CPU...')
    # # Move the network to the device
    # dqn.to(device)
    print('\nCollecting experience...')
    for epoch in range(400):
        s0 = dataset.env.reset()
        # s0 = s0.to(device)
        ep_r = 0
        while True:
            dataset.env.render()
            a = dqn.choose_action(s0)
            # take action
            s1, r, done, info = dataset.env.step(a)
            # Reshape the reward, since the raw env reward is always 1
            x, x_dot, theta, theta_dot = s1
            r1 = (dataset.env.x_threshold - abs(x)) / dataset.env.x_threshold - 0.8
            r2 = (dataset.env.theta_threshold_radians - abs(theta)) / \
                dataset.env.theta_threshold_radians - 0.5
            r = r1 + r2
            # s0, a, r, s1 = s0.to(device), a.to(device), r.to(device), s1.to(device)
            dqn.store_transition(s0, a, r, s1)
            ep_r += r
            if dqn.memory_counter > dataset.MEMORY_CAPACITY:
                dqn.learn()
                if done:
                    print('Ep:', epoch, '\tEp_r:', round(ep_r, 2))
            if done:
                break
            s0 = s1
def train_hierarchical_dqn(episodes, agent):
    '''For comparison with an RL agent that learns the overall task'''
    # Create model
    loss = []
    hierarchical_model = md.DQN(2, 4, 'Hierarchical')
    for epi in range(episodes):
        try:
            # agent.reset(False)
            subgoal_reached = False
            target_count = 0
            agent.reset(True)
            print('spawn succeeded----------')
            state = [0, 100, 90, 0]
            state = np.reshape(state, (1, 4))
            score = 0
            max_step = 250
            for i in range(max_step):
                choice = hierarchical_model.act(state)
                action = choose_action_hierarchical(choice)
                print(f"action ------------------> {action}")
                next_state, reward, done, subgoal_reached, _ = \
                    agent.step_hierarchical(action, target_count)
                if subgoal_reached:
                    target_count = 1
                print(f'obs----------->{next_state}-----reward---{reward}-----done--{done}----{target_count}-----{subgoal_reached}')
                time.sleep(0.3)
                score += reward
                next_state = np.reshape(next_state, (1, 4))
                hierarchical_model.remember(state, choice, reward, next_state, done)
                state = next_state
                hierarchical_model.replay(done, epi, loss)
                if done:
                    print("episode: {}/{}, score: {}".format(epi, episodes, score))
                    break
            loss.append(score)
            # Average score over the last 100 episodes
            is_solved = 0
            if len(loss) >= 100:
                is_solved = np.mean(loss[-100:])
            if is_solved > 150:
                print('\n Task Completed! \n')
                break
            print("Average over last 100 episodes: {0:.2f} \n".format(is_solved))
        finally:
            hierarchical_model.save_model()
            if agent is not None:
                agent.destroy()
            time.sleep(5)
    return loss
def train_right_lane_change(episodes, agent):
    # Create model
    loss = []
    right_lane_model = md.DQN(4, 4, 'Right_Lane')
    for epi in range(episodes):
        try:
            agent.reset(True)
            # State space: normal distance, y difference and x difference
            state = [5, 5, 5, 5]
            state = np.reshape(state, (1, 4))
            score = 0
            max_step = 1000
            for i in range(max_step):
                choice = right_lane_model.act(state)
                action = choose_action_rightlanechange(choice)
                if epi >= epi_count and epi_count < 210:
                    if agent.lane_id_ego != agent.lane_id_target and action[1] < 0:
                        action[1] = 0.21
                        choice = 0
                    elif agent.lane_id_ego == agent.lane_id_target:
                        if agent.yaw_vehicle > 1:
                            action[1] = -0.14
                            choice = 1
                print(f"action 222------------------> {action}")
                next_state, reward, done, _ = agent.step_rightlanechange(action)
                time.sleep(0.1)
                score += reward
                next_state = np.reshape(next_state, (1, 4))
                right_lane_model.remember(state, choice, reward, next_state, done)
                state = next_state
                right_lane_model.replay(done, epi, loss)
                if done:
                    print("episode: {}/{}, score: {}".format(epi, episodes, score))
                    break
            loss.append(score)
            # Average score over the last 100 episodes
            is_solved = 0
            if len(loss) >= 100:
                is_solved = np.mean(loss[-100:])
            if is_solved > 1050:
                print('\n Task Completed! \n')
                break
            print("Average over last 100 episodes: {0:.2f} \n".format(is_solved))
        finally:
            right_lane_model.save_model()
            if agent is not None:
                agent.destroy()
            time.sleep(1)
    return loss
def train_overall_dqn(episodes, agent):
    '''For comparison with an RL agent that learns the overall task'''
    # Create model
    loss = []
    overall_model = md.DQN(4, 3, 'Overall')
    for epi in range(episodes):
        try:
            # agent.reset(False)
            agent.reset(True)
            print('spawn succeeded----------')
            state = [0, 50, 0]
            state = np.reshape(state, (1, 3))
            score = 0
            max_step = 100
            for i in range(max_step):
                choice = overall_model.act(state)
                action = choose_action_overall(choice)
                print(f"action ------------------> {action}")
                next_state, reward, done, _ = agent.step_overall(action)
                print(f'obs----------->{next_state}-----reward---{reward}-----done--{done}')
                time.sleep(0.5)
                score += reward
                next_state = np.reshape(next_state, (1, 3))
                overall_model.remember(state, choice, reward, next_state, done)
                state = next_state
                overall_model.replay(done, epi, loss)
                if done:
                    print("episode: {}/{}, score: {}".format(epi, episodes, score))
                    break
            loss.append(score)
            # Average score over the last 100 episodes
            is_solved = 0
            if len(loss) >= 100:
                is_solved = np.mean(loss[-100:])
            if is_solved > 500:
                print('\n Task Completed! \n')
                break
            print("Average over last 100 episodes: {0:.2f} \n".format(is_solved))
        finally:
            overall_model.save_model()
            if agent is not None:
                agent.destroy()
            time.sleep(5)
    return loss
def train_left_DQN(episodes, agent):
    # Create model
    loss = []
    left_turn_model = md.DQN(4, 4, 'Left_Turn')  # corrected from 2 to 3
    for epi in range(episodes):
        try:
            # agent.reset(False)
            agent.reset(True)
            traffic_light = None
            print('spawn succeeded----------')
            # Get the first state
            state = [50, 90, 0, 0]
            state = np.reshape(state, (1, 4))
            score = 0
            max_step = 200
            for i in range(max_step):
                choice = left_turn_model.act(state)
                action = choose_action_leftturn(choice)
                print(f"action ------------------> {action}")
                next_state, reward, done, _ = agent.step_leftturn(action)
                print(f'obs----------->{next_state}-----reward---{reward}-----done--{done}')
                time.sleep(0.5)
                score += reward
                next_state = np.reshape(next_state, (1, 4))
                left_turn_model.remember(state, choice, reward, next_state, done)
                state = next_state
                left_turn_model.replay(done, epi, loss)
                if done:
                    print("episode: {}/{}, score: {}".format(epi, episodes, score))
                    break
            loss.append(score)
            # Average score over the last 100 episodes
            is_solved = 0
            if len(loss) >= 100:
                is_solved = np.mean(loss[-100:])
            if is_solved > 800:
                print('\n Task Completed! \n')
                break
            print("Average over last 100 episodes: {0:.2f} \n".format(is_solved))
        finally:
            left_turn_model.save_model()
            if agent is not None:
                agent.destroy()
            time.sleep(1)
    return loss
def demo(num_episode=1):
    eps = 0.01
    env_raw = make_atari(args.env_name)
    env = wrap_deepmind(env_raw)
    c, h, w = m.fp(env.reset()).shape
    n_actions = env.action_space.n
    policy_net = m.DQN(h, w, n_actions, device).to(device)
    if device == "cuda":
        policy_net.load_state_dict(
            torch.load("models/" + args.env_name.replace("NoFrameskip-v4", "") + "_best.pth"))
    else:
        policy_net.load_state_dict(
            torch.load("models/" + args.env_name.replace("NoFrameskip-v4", "") + "_best.pth",
                       map_location=torch.device('cpu')))
    policy_net.eval()
    sa = m.ActionSelector(eps, eps, policy_net, 100, n_actions, device)
    q = deque(maxlen=5)
    e_rewards = []
    for eee in range(num_episode):
        print("Demo episode %d/%d" % (eee + 1, num_episode) + "...")
        env.reset()
        e_reward = 0
        for _ in range(5):  # no-op
            n_frame, _, done, _ = env.step(0)
            n_frame = m.fp(n_frame)
            q.append(n_frame)
        while not done:
            if num_episode <= 1:
                env.render()
                time.sleep(0.02)
            state = torch.cat(list(q))[1:].unsqueeze(0)
            action, eps = sa.select_action(state, False)
            n_frame, reward, done, _ = env.step(action)
            n_frame = m.fp(n_frame)
            q.append(n_frame)
            e_reward += reward
        e_rewards.append(e_reward)
    avg_reward = float(sum(e_rewards)) / float(num_episode)
    env.close()
    print("Average reward of " + args.env_name + " is %.1f" % (avg_reward))
    print("Reward std of " + args.env_name + " is %.1f" % (np.std(e_rewards)))
def train_straight_DQN(episodes, agent):
    # Create model
    loss = []
    straight_model = md.DQN(5, 4, 'Straight_Model')
    for epi in range(episodes):
        try:
            agent.reset(True)
            # Get the first state (speed, distance from junction)
            state = [0, 100, 0, 15]
            state = np.reshape(state, (1, 4))
            score = 0
            max_step = 500
            for i in range(max_step):
                choice = straight_model.act(state)
                action = choose_action_straight(choice)
                print(f"action ------------------> {action}")
                next_state, reward, done, _ = agent.step_straight(action)
                print(f'obs----------->{next_state}-----reward---{reward}-----done--{done}')
                time.sleep(0.5)
                score += reward
                next_state = np.reshape(next_state, (1, 4))
                straight_model.remember(state, choice, reward, next_state, done)
                state = next_state
                straight_model.replay(done, epi, loss)
                if done:
                    print("episode: {}/{}, score: {}".format(epi, episodes, score))
                    break
            loss.append(score)
            # Average score over the last 100 episodes
            is_solved = np.mean(loss[-100:])
            if is_solved > 1000:
                print('\n Task Completed! \n')
                break
            print("Average over last 100 episodes: {0:.2f} \n".format(is_solved))
        finally:
            straight_model.save_model()
            if agent is not None:
                agent.destroy()
            time.sleep(1)
    return loss
def train_left_lane_change(episodes, agent):
    # Create model
    loss = []
    left_lane_model = md.DQN(4, 4, 'Left_Lane')
    for epi in range(episodes):
        try:
            agent.reset(True)
            # State space: normal distance, y difference and x difference
            state = [5, 5, 5, 5]
            state = np.reshape(state, (1, 4))
            score = 0
            max_step = 1000
            for i in range(max_step):
                choice = left_lane_model.act(state)
                action = choose_action_leftlanechange(choice)
                next_state, reward, done, _ = agent.step_leftlanechange(action)
                time.sleep(0.1)
                score += reward
                next_state = np.reshape(next_state, (1, 4))
                left_lane_model.remember(state, choice, reward, next_state, done)
                state = next_state
                left_lane_model.replay(done, epi, loss)
                if done:
                    print("episode: {}/{}, score: {}".format(epi, episodes, score))
                    break
            loss.append(score)
            # Average score over the last 100 episodes
            is_solved = 0
            if len(loss) >= 100:
                is_solved = np.mean(loss[-100:])
            if is_solved > 1200:
                print('\n Task Completed! \n')
                break
            print("Average over last 100 episodes: {0:.2f} \n".format(is_solved))
        finally:
            left_lane_model.save_model()
            if agent is not None:
                agent.destroy()
            time.sleep(10)
    return loss
def train(params):
    global_step = tf.contrib.framework.get_or_create_global_step()
    # Initialize model, environment and experience ---------
    env = md.Environment(params['frame_skip'], params['game_name'])
    mod = md.DQN(params)
    args = [
        params['load_prev'], params['input_size'], params['frame_stack'],
        params['max_epi'], params['replay_start'], params['exp_file']
    ]
    exp = md.Experience(*args)  # get all arguments from the args list
    # -----------------------------------------------------
    # ----- Part 1 ---------
    frame_stack_ph = tf.placeholder(
        tf.uint8,
        [params['frame_stack']] + params['orig_inp'])  # frame stack placeholder
    preprocess = mod.preprocess(frame_stack_ph)  # preprocessed input
    israndom_ph = tf.placeholder(tf.bool)  # placeholder for requesting a random action
    action = mod.get_action([preprocess], israndom=israndom_ph)
    # keep in mind that action is of size [1];
    # here we run the action and store the experience in the Experience data
    # ---- Part 2 ----------
    # get a batch of (state, action, reward, new state, done)
    state_shape = [params['batch_size']] + params['input_size'] + \
        [params['frame_stack']]  # shape of state (and new state)
    state_ph = tf.placeholder(tf.uint8, shape=state_shape)  # state placeholder
    action_ph = tf.placeholder(tf.int64, shape=[params['batch_size']])
    reward_ph = tf.placeholder(tf.float32, shape=[params['batch_size']])
    new_state_ph = tf.placeholder(tf.uint8, shape=state_shape)
    done_ph = tf.placeholder(tf.bool, shape=[params['batch_size']])
    batch_ph = [state_ph, action_ph, reward_ph, new_state_ph, done_ph]
    # batch_ph is not a placeholder itself, but a collection of placeholders
    train_opt = mod.train(global_step, batch_ph)
    assign_list = mod.switch_params()
    # ------ training session -----------
    if params['load_prev']:
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(params['checkpoint_dir'])
    with tf.train.MonitoredTrainingSession(
            checkpoint_dir=params['checkpoint_dir']) as sess:
        if params['load_prev']:
            saver.restore(sess, ckpt.model_checkpoint_path)
        # bookkeeping
        eps_step = 0  # number of episodes that have passed
        time_step = 0  # steps after experience replay has started
        total_start_time = time.time()
        total_step = 0  # steps in total
        while eps_step <= params['step_cap']:
            mod.init_frame_stack()  # initialize the frame stack
            x1 = env.reset()  # start the environment and get the initial observation
            eps_run_time = time.time()  # start of the episode's runtime
            step_in_ep = 0  # steps taken in the current episode
            mod.add_frame(x1)  # add the initial observation to the stack
            total_r = 0
            while True:
                # part 1 ---------
                experience_dict = {}
                israndom_val = random.random() <= mod.rand_act_prob[0]  # random-action coin flip
                experience_dict['state'], [experience_dict['action']] = sess.run(
                    [preprocess, action],
                    feed_dict={
                        frame_stack_ph: mod.get_stack(),
                        israndom_ph: israndom_val
                    })  # get state and action values
                # anneal the exploration probability down to its floor
                if mod.rand_act_prob[0] > params['rand_action'][1]:
                    mod.rand_act_prob[0] -= mod.rand_act_prob[1]
                new_unprocessed_state_val, experience_dict['reward'], \
                    experience_dict['done'] = env.run(experience_dict['action'])
                mod.add_frame(new_unprocessed_state_val)
                experience_dict['new_state'] = sess.run(
                    preprocess, feed_dict={frame_stack_ph: mod.get_stack()})
                exp.add_exp(experience_dict)  # add the experience
                # part 2 ---------
                batch_val = exp.get_batch(params['batch_size'])
                if batch_val is not None:
                    sess.run([train_opt],
                             feed_dict={
                                 batch_ph[i]: batch_val[i]
                                 for i in range(len(batch_ph))
                             })
                    if not time_step % params['target_update']:
                        sess.run(assign_list)
                    time_step += 1
                total_step += 1
                step_in_ep += 1
                total_r += experience_dict['reward']
                if experience_dict['done']:
                    cur_eps_run_time = ut.timer(time.time() - eps_run_time)
                    total_run_time = ut.timer(time.time() - total_start_time)
                    string = ("episodes ran: %d, steps ran in episode: %d, "
                              "total steps taken: %d, reward: %.4f, "
                              "episode run time: %s, total run time: %s")
                    print(string % (eps_step, step_in_ep, total_step, total_r,
                                    cur_eps_run_time, total_run_time))
                    break
            eps_step += 1
# Main function
if __name__ == '__main__':
    # set unity environment path (file_name)
    env = UnityEnvironment(file_name=config.env_name)
    # env = UnityEnvironment(file_name=config.env_name, worker_id=np.random.randint(100000))
    # select the default brain for unity
    default_brain = env.brain_names[0]
    brain = env.brains[default_brain]
    train_mode = config.train_mode
    device = config.device
    model_ = model.DQN(config.action_size, "main").to(device)
    target_model_ = model.DQN(config.action_size, "target").to(device)
    model_RND = model.RND(config.action_size, "RND").to(device)
    models = [model_, model_RND]
    # Pick out only the RND parameters that should be trained,
    # so they can be handed to the optimizer
    param_active_list = []
    param_frozen_list = []
    for name, param in model_RND.named_parameters():
        if str(name).startswith('model_active'):
            param_active_list.append(param)
        elif str(name).startswith('model_frozen'):
            param.requires_grad = False
            param_frozen_list.append(param)
parser = argparse.ArgumentParser()
parser.add_argument("-m", "--model",
                    required=True, help="Enter Model File")
parser.add_argument(
    "--env", default=DEFAULT_ENV_NAME,
    help="Enter environment name, default: {}".format(DEFAULT_ENV_NAME))
parser.add_argument("-r", "--record",
                    help="Enter directory to store recorded video")
args = parser.parse_args()
env = wrappers.make_env(args.env)
if args.record:
    env = gym.wrappers.Monitor(env, args.record)
net = model.DQN(env.observation_space.shape, env.action_space.n)
net.load_state_dict(torch.load(args.model))
state = env.reset()
total_reward = 0.0
while True:
    start_time = time.time()
    env.render()
    state_v = torch.tensor(np.array([state], copy=False))
    q_vals = net(state_v).data.numpy()[0]
    action = np.argmax(q_vals)
    state, reward, done, _ = env.step(action)
    total_reward += reward
    if done:
        break
"batch_size" : 32, "replay_initial" : 10000, "capacity" : 100000, "reward_steps" : 1, }, } params = HYPERPARAMS["breakout"] scores, eps_history = [], [] device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') env = wrappers.make_env(params["env_name"]) policy_network = model.DQN(env.observation_space.shape, env.action_space.n).to(device) target_network = ptan.agent.TargetNet(policy_network) optimizer = optim.Adam(policy_network.parameters(), lr=params["learning_rate"]) action_selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params["eps_start"]) agent = ptan.agent.DQNAgent(policy_network, action_selector, device) epsilon_tracker = EpsilonTracker(action_selector, params) exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params["gamma"], steps_count=params["reward_steps"]) buffer = ptan.experience.ExperienceReplayBuffer(exp_source, buffer_size=params["capacity"]) writer = SummaryWriter("run") current_step = 0 with utils.RewardTracker(writer, params) as reward_tracker:
    if render:
        time_to_sleep = wait_time - (time.time() - start_time)
        if time_to_sleep > 0:
            time.sleep(time_to_sleep)
    return total_reward


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda", default=False, action="store_true",
                        help="Render on graphics card (cuda:0).")
    parser.add_argument("--env", default=ENV_NAME,
                        help="Name of the environment, default=" + ENV_NAME)
    parser.add_argument("-m", "--model", help="Path to the DQN model file")
    args = parser.parse_args()
    device = torch.device(GRAPHICS_CARD if args.cuda else "cpu")
    env = wrappers.make_atari(args.env)
    env = wrappers.wrap_deepmind(env, False, False, True)
    net = model.DQN(4, env.action_space.n).to(device)
    net.load_state_dict(torch.load(args.model))
    score = play(env, net, True, device)
    print(f"Score: {score}")
    output = DQN(x_stack)
    loss = criterion(output, y_stack)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss


env = gym.make('CartPole-v0')
input_size = env.observation_space.shape[0]
output_size = env.action_space.n
# note: rebinding the name "model" shadows the imported module
model = model.DQN(input_size, output_size, [10])
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# Make replay buffer
REPLAY_MEMORY_SIZE = 50000
replay_buffer = deque()
gamma = 0.9
num_episodes = 5000
rList = []
loss_list = []

model.train()
for i in range(num_episodes):
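    # Hedged sketch (not from the source): how this truncated episode loop
    # typically continues for CartPole -- an epsilon-greedy rollout that fills
    # replay_buffer with transitions for the minibatch update above. Assumes
    # numpy is imported as np and that the network accepts a 1-D state tensor.
    e = 1.0 / ((i / 10) + 1)  # decaying exploration rate
    state = env.reset()
    done = False
    while not done:
        if np.random.rand() < e:
            action = env.action_space.sample()
        else:
            with torch.no_grad():
                q_values = model(torch.FloatTensor(state))
            action = int(q_values.argmax().item())
        next_state, reward, done, _ = env.step(action)
        replay_buffer.append((state, action, reward, next_state, done))
        if len(replay_buffer) > REPLAY_MEMORY_SIZE:
            replay_buffer.popleft()
        state = next_state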
# 3. environment reset
env_name = args.env_id.replace(
    "NoFrameskip-v4", "") if "NoFrameskip-v4" in args.env_id else args.env_id.replace(
        "-ramNoFrameskip-v4", "")
env_raw = make_atari(args.env_id)
env = wrap_deepmind(env_raw, frame_stack=False, episode_life=True,
                    clip_rewards=True)
c, h, w = m.fp(env.reset()).shape
n_actions = env.action_space.n

# 4. Network reset
policy_net = m.DQN(h, w, n_actions, device).to(device)
target_net = m.DQN(h, w, n_actions, device).to(device)
policy_net.apply(policy_net.init_weights)
# apply() runs init_weights on every submodule, so the init function does not
# need to change if the model architecture changes; that is the benefit of apply()
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

# 5. DQN hyperparameters
BATCH_SIZE = 32
GAMMA = 0.99
EPS_START = 1.
EPS_END = 0.1
EPS_DECAY = 1000000
TARGET_UPDATE = 10000
NUM_STEPS = 15000000
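# Hedged sketch (not from the source): the linear epsilon-annealing schedule
# these constants suggest; the helper name epsilon_by_step is hypothetical.
def epsilon_by_step(step):
    # Anneal linearly from EPS_START to EPS_END over EPS_DECAY steps, then hold.
    return max(EPS_END, EPS_START - (EPS_START - EPS_END) * step / EPS_DECAY)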
"--env", default=ENV_NAME, type=str, help="Name of environment, Default: {}".format(ENV_NAME)) parser.add_argument( "--reward", type=float, default=MEAN_REWARD_BOUND, help="Mean reward boundary to stop training, Default: {:.2f}".format( MEAN_REWARD_BOUND)) args = parser.parse_args() device = torch.device("cuda" if args.gpu else "cpu") env = wrappers.make_env(args.env) net = model.DQN(env.observation_space.shape, env.action_space.n).to(device) tgt_net = model.DQN(env.observation_space.shape, env.action_space.n).to(device) writer = SummaryWriter(logdir="logs", comment="-" + args.env) # print(net) buffer = ExperienceBuffer(REPLAY_SIZE) agent = Agent(env, buffer) epsilon = EPSILON_START optimier = optim.Adam(net.parameters(), lr=LEARNING_RATE) total_rewards = [] frame_idx = 0 best_mean_reward = None
def run_model(self, obs):
    data = Data('day', 'minute', '1', 'true', self.ticker, '10')
    prices = data.get_prices_formatted()
    env = enviornment.StocksEnv(prices)
    net = model.DQN(env.observation_space.shape[0], env.action_space.n)
    net.load_state_dict(torch.load(r'RL\saves\(episode-60800.000.data'))
    total_reward = 0.00
    total_balance = 10000
    step_idx = 0
    balance = []
    rewards = []
    profit = []
    epochs = self.epoch
    epoch_step = 1
    while epoch_step <= epochs:
        obs = env.reset()
        while True:
            step_idx += 1
            obs_v = torch.tensor([obs])
            out_v = net(obs_v)
            action_idx = out_v.max(dim=1)[1].item()
            if np.random.random() < self.epsilon:
                action_idx = env.action_space.sample()
            action = enviornment.Actions(action_idx)
            if action == enviornment.Actions.Buy and not env._state.have_position:
                start_price = env._state.curr_close()
                total_balance -= start_price
                balance.append(total_balance)
            obs, reward, done, _ = env.step(action_idx)
            total_reward += reward
            rewards.append(total_reward)
            if step_idx % 100 == 0:
                print("Epoch %d, Step_idx: %d reward = %.3f" %
                      (epoch_step, step_idx, total_reward))
            if done:
                profit_received = (env._state.curr_close() - start_price) / start_price
                profit.append(profit_received)
                total_balance += env._state.curr_close()
                balance.append(total_balance)
                break
        epoch_step += 1
    file_name_profit = r'RL\logs\profits\profit_%s_%s.txt' % (
        self.ticker, str(time.time()))
    file_name_reward = r'RL\logs\rewards\reward_{}_{}.txt'.format(
        self.ticker, str(time.time()))
    file_name_balance = r'RL\logs\balances\balance_{}_{}.txt'.format(
        self.ticker, str(time.time()))
    with open(file_name_profit, 'w') as f:
        f.writelines('%s,' % x for x in profit)
    with open(file_name_reward, 'w') as f:
        f.writelines('%s,' % x for x in rewards)
    with open(file_name_balance, 'w') as f:
        f.writelines('%s,' % x for x in balance)
if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda") parser.add_argument("--env", default=ENV_NAME, help="Name of the environment, default=" + ENV_NAME) args = parser.parse_args() device = torch.device(GRAPHICS_CARD if args.cuda else "cpu") env = wrappers.make_atari(args.env) env = wrappers.wrap_deepmind(env, episode_life=False, frame_stack=True) exp_buffer = ExperienceBuffer(REPLAY_MEMORY_SIZE) agent = Agent(env, exp_buffer) net = model.DQN(AGENT_HIST_LENGTH, env.action_space.n).to(device) tgt_net = model.DQN(AGENT_HIST_LENGTH, env.action_space.n).to(device) tgt_net.load_state_dict(net.state_dict()) criterion = nn.MSELoss() optimizer = optim.RMSprop(net.parameters(), lr=LEARNING_RATE, momentum=GRAD_MOMENTUM, eps=MIN_SQ_GRAD) writer = SummaryWriter(comment="-" + args.env) remaining_time_buffer = collections.deque(maxlen=100) last_100_rewards_training = collections.deque(maxlen=100) last_100_rewards_test = collections.deque(maxlen=100) episode_idx = 0 frame_idx = 0