def _log_best_network_env_info(self, net, summary_writer, env, test_env, gen=1):
    # Train
    infos = self.best_network.get_env_info(env, False)
    print('Training Data (best) ->> ', infos)
    for key in infos:
        summary_writer.add_summary(tfSummary(key, float(infos[key])), global_step=gen)
    infos = net.get_env_info(env, False)
    print('Training Data (latest) ->> ', infos)

    # Test
    if test_env:
        infos = self.best_network.get_env_info(test_env, False)
        print('Test Data (best) ->> ', infos)
        for key in infos:
            summary_writer.add_summary(tfSummary('test_' + key, float(infos[key])), global_step=gen)
        infos = net.get_env_info(test_env, False)
        print('Test Data (latest) ->> ', infos)
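# All of the training loops below log scalars through a small tfSummary helper
# from the repo's utilities. A minimal sketch of what such a helper typically
# looks like, assuming TensorFlow 1.x and that summary_writer is a
# tf.summary.FileWriter; the exact implementation in this codebase may differ.
import tensorflow as tf

def tfSummary(tag, val):
    """ Wrap a scalar value in a TF 1.x Summary proto so it can be passed to
        summary_writer.add_summary(). """
    return tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=val)])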
def train(self, env, args, summary_writer):
    results = []

    # First, gather experience
    # tqdm_e = tqdm(range(args.nb_episodes), desc='Score', leave=True, unit=" episodes")
    # for e in tqdm_e:
    for e in range(args.nb_episodes):
        # Reset episode
        time, cumul_reward, done = 0, 0, False
        old_state = env.reset()
        actions, states, rewards = [], [], []
        noise = OrnsteinUhlenbeckProcess(size=self.act_dim)

        while not done:
            if args.render:
                env.render()
            # Actor picks an action (following the deterministic policy)
            a = self.policy_action(old_state)
            # Clip continuous values to be valid w.r.t. environment
            a = np.clip(a + noise.generate(time), -self.act_range, self.act_range)
            # Retrieve new state, reward, and whether the state is terminal
            new_state, r, done, _ = env.step(a)
            # Add outputs to memory buffer
            self.memorize(old_state, a, r, done, new_state)
            # Sample experience from buffer
            states, actions, rewards, dones, new_states, _ = self.sample_batch(args.batch_size)
            # Predict target q-values using target networks
            q_values = self.critic.target_predict([new_states, self.actor.target_predict(new_states)])
            # Compute critic target
            critic_target = self.bellman(rewards, q_values, dones)
            # Train both networks on sampled batch, update target networks
            self.update_models(states, actions, critic_target)
            # Update current state
            old_state = new_state
            cumul_reward += r
            time += 1

        # Gather stats every episode for plotting
        if (args.gather_stats):
            mean, stdev = gather_stats(self, env)
            results.append([e, mean, stdev])

        # Export results for Tensorboard
        score = tfSummary('score', cumul_reward)
        summary_writer.add_summary(score, global_step=e)
        summary_writer.flush()
        print("score", cumul_reward)

        # Display score
        # tqdm_e.set_description("Score: " + str(cumul_reward))
        # tqdm_e.refresh()

    return results
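# The DDPG loops in this file draw exploration noise from an
# OrnsteinUhlenbeckProcess with a generate(step) method. A minimal sketch of
# such a process with a linearly annealed sigma; the constructor defaults here
# are illustrative assumptions, not necessarily those of this codebase.
import numpy as np

class OrnsteinUhlenbeckProcess:
    def __init__(self, theta=0.15, mu=0.0, sigma=0.3, x0=0.0, dt=1e-2,
                 n_steps_annealing=100, size=1):
        self.theta, self.mu, self.sigma = theta, mu, sigma
        self.x0, self.dt, self.size = x0, dt, size
        # Anneal sigma linearly towards zero over n_steps_annealing steps
        self.sigma_step = -sigma / float(n_steps_annealing)

    def generate(self, step):
        sigma = max(0.0, self.sigma + self.sigma_step * step)
        x = (self.x0
             + self.theta * (self.mu - self.x0) * self.dt
             + sigma * np.sqrt(self.dt) * np.random.normal(size=self.size))
        self.x0 = x
        return x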
def train(self, env, args, summary_writer):
    """ Main DDQN Training Algorithm """

    results = []
    tqdm_e = tqdm(range(args.nb_episodes), desc='Score', leave=True, unit=" episodes")

    for e in tqdm_e:
        # Reset episode
        time, cumul_reward, done = 0, 0, False
        old_state = env.reset()

        while not done:
            if args.render:
                env.render()
            # Actor picks an action (following the policy)
            a = self.policy_action(old_state)
            # Retrieve new state, reward, and whether the state is terminal
            new_state, r, done, _ = env.step(a)
            # Memorize for experience replay
            self.memorize(old_state, a, r, done, new_state)
            # Update current state
            old_state = new_state
            cumul_reward += r
            time += 1

            # Train DDQN and transfer weights to target network
            if (self.buffer.size() > args.batch_size):
                self.train_agent(args.batch_size)
                self.agent.transfer_weights()

        # Gather stats every episode for plotting
        if (args.gather_stats):
            mean, stdev = gather_stats(self, env)
            results.append([e, mean, stdev])

        # Export results for Tensorboard
        score = tfSummary('score', cumul_reward)
        summary_writer.add_summary(score, global_step=e)
        summary_writer.flush()

        # Display score
        tqdm_e.set_description("Score: " + str(cumul_reward))
        tqdm_e.refresh()

        # Periodically save weights
        if (e % self.save_interval == 0) & (e != 0):
            self.save_weights(self.export_path, e)

    return results
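# Several loops in this file call gather_stats(agent, env) when
# args.gather_stats is set. A minimal sketch of such a helper, assuming it
# simply replays a handful of evaluation episodes with the current policy and
# returns the mean and standard deviation of the episode returns; the episode
# count and exact behaviour in this codebase may differ.
import numpy as np

def gather_stats(agent, env, n_episodes=10):
    scores = []
    for _ in range(n_episodes):
        old_state = env.reset()
        cumul_r, done = 0.0, False
        while not done:
            a = agent.policy_action(old_state)
            old_state, r, done, _ = env.step(a)
            cumul_r += r
        scores.append(cumul_r)
    return np.mean(scores), np.std(scores)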
def train(self, env, args, summary_writer):
    """ Main A2C Training Algorithm """

    results = []

    # Main Loop
    tqdm_e = tqdm(range(args.nb_episodes), desc='Score', leave=True, unit=" episodes")
    for e in tqdm_e:
        # Reset episode
        time, cumul_reward, done = 0, 0, False
        old_state = env.reset()
        actions, states, rewards = [], [], []

        while not done:
            if args.render:
                env.render()
            # Actor picks an action (following the policy)
            a = self.policy_action(old_state)
            # Retrieve new state, reward, and whether the state is terminal
            new_state, r, done, _ = env.step(a)
            # Memorize (s, a, r) for training
            actions.append(to_categorical(a, self.act_dim))
            rewards.append(r)
            states.append(old_state)
            # Update current state
            old_state = new_state
            cumul_reward += r
            time += 1

        # Train using discounted rewards, i.e. compute updates
        self.train_models(states, actions, rewards, done)

        # Gather stats every episode for plotting
        if (args.gather_stats):
            mean, stdev = gather_stats(self, env)
            results.append([e, mean, stdev])

        # Export results for Tensorboard
        score = tfSummary('score', cumul_reward)
        summary_writer.add_summary(score, global_step=e)
        summary_writer.flush()

        # Display score
        tqdm_e.set_description("Score: " + str(cumul_reward))
        tqdm_e.refresh()

    return results
def train(self, env, args, summary_writer):
    """ Main DDQN Training Algorithm """

    results = []
    tqdm_e = tqdm(range(args.nb_episodes), desc='Score', leave=True, unit=" episodes")

    for e in tqdm_e:
        # Reset episode
        time, cumul_reward, done = 0, 0, False
        old_state = env.reset()

        while not done:
            if args.render:
                env.render()
            # Actor picks an action (following the policy)
            a = self.policy_action(old_state)
            # Retrieve new state, reward, and whether the state is terminal
            new_state, r, done, _ = env.step(a)
            # Memorize for experience replay
            self.memorize(old_state, a, r, done, new_state)
            # Update current state
            old_state = new_state
            cumul_reward += r
            time += 1

            # Train DDQN and transfer weights to target network
            if (self.buffer.size() > args.batch_size):
                self.train_agent(args.batch_size)
                self.agent.transfer_weights()

        # Gather stats every episode for plotting
        if (args.gather_stats):
            mean, stdev = gather_stats(self, env)
            results.append([e, mean, stdev])

        # Export results for Tensorboard
        score = tfSummary('score', cumul_reward)
        summary_writer.add_summary(score, global_step=e)
        summary_writer.flush()

        # Display score
        tqdm_e.set_description("Score: " + str(cumul_reward))
        tqdm_e.refresh()

    return results
def training_thread(agent, Nmax, env, action_dim, f, summary_writer, tqdm, render):
    """ Build threads to run shared computation across """

    global episode
    while episode < Nmax:

        # Reset episode
        time, cumul_reward, done = 0, 0, False
        old_state = env.reset()
        actions, states, rewards = [], [], []

        while not done and episode < Nmax:
            if render:
                with lock:
                    env.render()
            # Actor picks an action (following the policy)
            a = agent.policy_action(np.expand_dims(old_state, axis=0))
            # Retrieve new state, reward, and whether the state is terminal
            new_state, r, done, _ = env.step(a)
            # Memorize (s, a, r) for training
            actions.append(to_categorical(a, action_dim))
            rewards.append(r)
            states.append(old_state)
            # Update current state
            old_state = new_state
            cumul_reward += r
            time += 1
            # Asynchronous training
            if (time % f == 0 or done):
                lock.acquire()
                agent.train_models(states, actions, rewards, done)
                agent.global_rewards.append(cumul_reward)
                lock.release()
                actions, states, rewards = [], [], []

        # Export results for Tensorboard
        score = tfSummary('score', cumul_reward)
        summary_writer.add_summary(score, global_step=episode)
        summary_writer.flush()

        # Update episode count
        with lock:
            tqdm.set_description("Score: " + str(cumul_reward))
            tqdm.update(1)
            if (episode < Nmax):
                episode += 1
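# training_thread relies on module-level shared state: a global `episode`
# counter and a `lock` guarding the shared model, the renderer, and the shared
# tqdm bar. A minimal sketch of that scaffolding and of how the A3C worker
# threads might be launched; every name here other than training_thread is an
# assumption for illustration only.
import threading

episode = 0
lock = threading.Lock()

def launch_threads(agent, Nmax, make_env, action_dim, f, summary_writer, tqdm_e,
                   render=False, n_threads=4):
    # One environment instance per worker thread, all sharing the same agent
    threads = [
        threading.Thread(
            target=training_thread,
            args=(agent, Nmax, make_env(), action_dim, f, summary_writer, tqdm_e, render),
            daemon=True)
        for _ in range(n_threads)
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join()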
def train(self, env, args, summary_writer, envtest=None):
    """ Main DDQN Training Algorithm """

    results = []
    tqdm_e = tqdm(range(args.nb_episodes), desc='Score', leave=True, unit=" episodes")
    epoch = 0
    gross_profit = 0
    WritetoCsvFile("logFile_1.csv", [
        "stage", "file", "history_win", "stop", "usevol", "dueling", "traineval",
        "allprices", "allprices2", "allprices3", "ma1", "ma2", "madifference",
        "hidema", "candlenum", "hidden_dim", "maxProfit", "maxLOSS", "avgProfit",
        "avgLOSS", "countprofit", "countloss", "maxdrop", "Total profit",
        "total_reward", "TRADES", "epoch"
    ])
    WritetoCsvFile("logFileDetail.csv", [
        "stage", "file", "history_win", "stop", "usevol", "dueling", "traineval",
        "allprices", "allprices2", "allprices3", "ma1", "ma2", "madifference",
        "hidema", "candlenum", "hidden_dim", 'maxProfit', 'maxLOSS', 'avgProfit',
        'avgLOSS', 'maxdrop', 'Total profit', 'gross profit', "total_reward",
        'TRADES', 'epoch'
    ])

    for e in tqdm_e:
        # Reset episode
        time, cumul_reward, done = 0, 0, False
        old_state = env.reset()

        # Per-episode trading statistics
        total_reward = 0
        total_profit = 0
        total_loss = 0
        total_profitMax = 0
        total_profitMin = 0
        max_drop = 0
        profitLst = []
        lossLst = []
        trades = 0
        step = 0

        while not done:
            # if args.render: env.render()
            # Actor picks an action (following the policy)
            a = self.policy_action(old_state)
            # Retrieve new state, reward, terminal flag, and trade information
            # new_state, r, done, _ = env.step(a)
            new_state, r, done, buy, sell, profit = env.step(a)
            total_reward += r
            if profit != 0:
                trades += 1
                total_profit += profit
                if total_profit > total_profitMax:
                    total_profitMax = total_profit
                    total_profitMin = total_profit
                if total_profit < total_profitMin:
                    total_profitMin = total_profit
                try:
                    if total_profitMax != 0 and max_drop < (total_profitMax - total_profitMin) / total_profitMax:
                        max_drop = (total_profitMax - total_profitMin) / total_profitMax
                except:
                    max_drop = 0
                if profit > 0:
                    profitLst.append(profit)
                elif profit < 0:
                    lossLst.append(profit)

            step += 1
            if step % 1500 == 0:
                print(
                    'maxProfit: {} maxLOSS: {} avgProfit: {:01.2f} avgLOSS: {:01.2f} maxdrop: {:.2%} Total profit: {}/{} TRADES: {} '
                    .format(np.max(profitLst + [0]), -np.min(lossLst + [0]),
                            np.mean(profitLst + [0]), -np.mean(lossLst + [0]),
                            max_drop, total_profit, gross_profit, trades))
                WritetoCsvFile("logFileDetail.csv", [
                    "train", args.trainf, args.history_win, args.stop, args.usevol,
                    args.dueling, args.traineval, args.allprices, args.allprices2,
                    args.allprices3, args.ma1, args.ma2, args.madifference,
                    args.hidema, args.candlenum, args.hidden_dim,
                    np.max(profitLst + [0]), -np.min(lossLst + [0]),
                    np.mean(profitLst + [0]), -np.mean(lossLst + [0]),
                    max_drop, total_profit, gross_profit, total_reward, trades, epoch
                ])

            # done = True if step == len(env.data) - 3 else False

            # Memorize for experience replay
            self.memorize(old_state, a, r, done, new_state)
            # Update current state
            old_state = new_state
            cumul_reward += r
            time += 1

            # Train DDQN and transfer weights to target network
            if (self.buffer.size() > args.batch_size):
                self.train_agent(args.batch_size)
                self.agent.transfer_weights()

        gross_profit += total_profit

        # Gather stats every episode for plotting
        if (args.gather_stats):
            mean, stdev = gather_stats(self, env)
            results.append([e, mean, stdev])

        # Export results for Tensorboard
        score = tfSummary('score', cumul_reward)
        l_profit = tfSummary('profit', total_profit)
        l_aprofit = tfSummary('average profit', np.mean(profitLst))
        l_aloss = tfSummary('l_aloss', -np.mean(lossLst))
        l_trades = tfSummary('l_trades', trades)
        summary_writer.add_summary(score, global_step=e)
        summary_writer.add_summary(l_profit, global_step=e)
        summary_writer.add_summary(l_aprofit, global_step=e)
        summary_writer.add_summary(l_aloss, global_step=e)
        summary_writer.add_summary(l_trades, global_step=e)
        summary_writer.flush()

        # Display score
        tqdm_e.set_description("Score: " + str(cumul_reward))
        tqdm_e.refresh()

        self.agent.saveModel("./models/model_ep", "")
        results = [
            np.max(profitLst + [0]), -np.min(lossLst + [0]),
            np.mean(profitLst + [0]), -np.mean(lossLst + [0]),
            len(profitLst), len(lossLst), max_drop, total_profit,
            total_reward, trades
        ]
        WritetoCsvFile("logFile_1.csv", [
            "train", args.trainf, args.history_win, args.stop, args.usevol,
            args.dueling, args.traineval, args.allprices, args.allprices2,
            args.allprices3, args.ma1, args.ma2, args.madifference, args.hidema,
            args.candlenum, args.hidden_dim
        ] + results + [epoch])

        if envtest:
            # If a test environment is provided, evaluate after every epoch
            newargs = args
            newargs.traineval = False
            self.evaluate(envtest, newargs, summary_writer, model=None, epoch=epoch)
        epoch += 1

    return results
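# The trading loop above appends rows to its CSV logs through WritetoCsvFile.
# A minimal sketch of such a helper, assuming it simply appends one row per
# call; the actual implementation in this codebase may differ.
import csv

def WritetoCsvFile(filename, row):
    with open(filename, 'a', newline='') as f:
        csv.writer(f).writerow(row)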
def train(self, summary_writer):
    env = CarEnv()
    results = []
    i = 0

    # First, gather experience
    tqdm_e = tqdm(range(2000), desc='Score', leave=True, unit=" episodes")
    for e in tqdm_e:
        # Reset episode
        time, cumul_reward, done = 0, 0, False
        old_state = env.reset()
        old_state = np.array(old_state).reshape(40, )
        actions, states, rewards = [], [], []
        noise = OrnsteinUhlenbeckProcess(size=self.act_dim)

        while not done:
            # if args.render: env.render()
            # Actor picks an action (following the deterministic policy)
            a = self.policy_action(old_state)
            # Clip continuous values to be valid w.r.t. environment
            a = np.clip(a + noise.generate(time), -self.act_range, self.act_range)
            a = float(a[0])
            # Retrieve new state, reward, and whether the state is terminal
            new_state, r, done, _ = env.step(a, time)
            print("Now r is {}".format(r))

            # Build the next stacked observation: shift the 40-dim window
            # left by one 4-dim frame and append the new state at the end
            temp_next = old_state.copy()
            for k in range(0, 36, 4):
                temp_next[k:k + 4] = temp_next[k + 4:k + 8]
            temp_next[36:40] = new_state
            temp_next = np.array(temp_next).reshape(40, )

            # Add outputs to memory buffer
            self.memorize(old_state, a, r, done, temp_next)
            old_state = temp_next.copy()
            cumul_reward += r
            time += 1

        # Episode is over, so destroy the actors spawned in the scenario
        for actor in env.actor_list:
            actor.destroy()

        # Sample experience from buffer and run 50 gradient updates per episode
        for _ in range(50):
            states, actions, rewards, dones, new_states, _ = self.sample_batch(64)
            # Predict target q-values using target networks
            q_values = self.critic.target_predict([new_states, self.actor.target_predict(new_states)])
            # Compute critic target
            critic_target = self.bellman(rewards, q_values, dones)
            # Train both networks on sampled batch, update target networks
            self.update_models(states, actions, critic_target)
        print("learning happened")

        # mean, stdev, velocity_data, action_data, vehicle_x_data, vehicle_y_data, obj_x_data, obj_y_data = gather_stats(self, env, velocity_data, action_data, vehicle_x_data, vehicle_y_data, obj_x_data, obj_y_data)
        mean, stdev = gather_stats(self, env)
        results.append([e, mean, stdev])

        # Export results for Tensorboard
        print(cumul_reward)
        score = tfSummary('score', cumul_reward)
        summary_writer.add_summary(score, global_step=e)
        summary_writer.flush()

        # Display score
        tqdm_e.set_description("Score: " + str(cumul_reward))
        tqdm_e.refresh()

        # Periodically dump the logged stats to CSV
        i += 1
        if i % 10 == 0:
            df = pd.DataFrame(np.array(results))
            df.to_csv("DDPG" + "/logs.csv",
                      header=['Episode', 'Mean', 'Stddev'],
                      float_format='%10.5f')

    return results
def fit(self, env, summary_writer, debug=False, num_cpus=4, is_market=False,
        env_args={}, test_env_args=None, env_version='v1'):

    stagnation = 1
    best_so_far = 0

    # Init test env
    test_env = None
    if env_version == 'v1':
        test_env = MarketEnvironmentV1(**test_env_args) if test_env_args else None
    if env_version == 'v2':
        test_env = MarketEnvironmentV2(**test_env_args) if test_env_args else None

    # Create environments for the whole population
    envs = []
    if is_market:
        if env_version == 'v1':
            envs = [MarketEnvironmentV1(**env_args) for i in range(self.population_size)]
        if env_version == 'v2':
            envs = [MarketEnvironmentV2(**env_args) for i in range(self.population_size)]
    else:
        envs = [Environment(**env_args) for i in range(self.population_size)]

    # Iterate over all generations
    tqdm_e = tqdm(total=self.generations, desc='Generation', leave=True, unit=" gen")
    for gen_i in range(self.generations):

        # Evaluate the whole population in parallel
        args = [(self, self.networks[i], envs[i]) for i in range(self.population_size)]
        with Pool(num_cpus) as p:
            rewards = np.array(p.map(_run_par_evaluate, args))

        # Track best score per generation
        self.fitness.append(np.max(rewards))

        # Select the best network
        best_network = np.argmax(rewards)

        # Select the top n networks
        n = int(self.survival_ratio * self.population_size)
        top_n_index = np.argsort(rewards)[-n:]

        # Create the child networks
        new_networks = []
        for _ in range(self.population_size - n):
            # Origin is 0 for a child of two parents, 1 for a single parent,
            # and 2 for a network copied from the previous run
            origin = np.random.choice([0, 1, 2], p=[
                self.both_parent_percentage,
                self.one_parent_percentage,
                1 - self.both_parent_percentage - self.one_parent_percentage
            ])
            # Both parents
            if origin == 0:
                new_net = NeuralNet(parent1=self.networks[random.randint(0, len(top_n_index) - 1)],
                                    parent2=self.networks[random.randint(0, len(top_n_index) - 1)],
                                    var=self.mutation_variance)
            # One parent
            elif origin == 1:
                new_net = NeuralNet(parent1=self.networks[random.randint(0, len(top_n_index) - 1)],
                                    parent2=None,
                                    var=self.mutation_variance)
            else:
                # Copy from another network of the previous run (aside from the chosen best)
                index = top_n_index[0]
                while index not in top_n_index:
                    index = random.randint(0, len(self.networks) - 1)
                new_net = self.networks[index]
            new_networks.append(new_net)

        # Set the new population
        maintain_best_n = [self.networks[i] for i in top_n_index]
        self.networks = maintain_best_n + new_networks

        # Export results for Tensorboard
        r_max = rewards.max()
        r_mean = rewards.mean()
        r_std = rewards.std()
        self.insert_info(r_max, r_mean, r_std)
        summary_writer.add_summary(tfSummary('Max rewards', r_max), global_step=gen_i)
        summary_writer.add_summary(tfSummary('Mean rewards', r_mean), global_step=gen_i)
        summary_writer.add_summary(tfSummary('STD rewards', r_std), global_step=gen_i)

        # Update stagnation
        if r_max > best_so_far:
            best_so_far = r_max
            stagnation = 1
        else:
            stagnation += 1

        # Update tqdm
        tqdm_e.set_description('Generation:' + str(gen_i + 1) +
                               '| Highest Reward:' + str(r_max) +
                               '| Average Reward:' + str(r_mean) +
                               '| std Reward: ' + str(r_std) +
                               '| Stagnation: ' + str(stagnation) +
                               '| Population size: ' + str(len(self.networks)))

        # Save current weights
        self.best_network = self.networks[best_network]
        if debug:
            self._log_best_network_env_info(maintain_best_n[0], summary_writer, envs[0], test_env, gen_i)
        self.save_weights(gen_i, maintain_best_n[0], self.save_path)

        # Update logs
        summary_writer.flush()
        tqdm_e.update(1)
        tqdm_e.refresh()

        # If stagnating for too long, stop early
        if stagnation > 10 and self.stagnation_end:
            break

    # Close the environments
    [e.close() for e in envs]

    # Return the best network
    self.best_network = self.networks[best_network]
    return self.global_info
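# fit() farms evaluation out to a multiprocessing Pool via _run_par_evaluate.
# Because Pool.map pickles its callable, this must be a module-level function
# taking a single (trainer, network, env) tuple and returning that network's
# episode reward. A minimal sketch; the evaluate(env)-style method assumed on
# the network is illustrative and may be named differently in this codebase.
def _run_par_evaluate(packed_args):
    trainer, network, env = packed_args
    # Run one evaluation episode of this network in its own environment
    return network.evaluate(env)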
def train(self, env, args, summary_writer):
    """ Main A2C Training Algorithm """

    # self.pretrain_random(env, args, summary_writer)
    results = []
    possible_states = [np.asarray(0), np.asarray(1)]

    # Main Loop
    tqdm_e = tqdm(range(args.nb_episodes), desc='Score', leave=True, unit=" episodes")
    for e in tqdm_e:
        # Reset episode
        time, cumul_reward, done = 0, 0, False
        old_state = env.reset()
        actions, states, rewards = [], [], []

        while not done:
            if args.render:
                env.render()
            # if (e % 64 == 1) & (e > 30):
            #     if args.render: env.render()

            # Actor picks an action: fully random for the first 30 episodes,
            # then 50% random / 50% following the policy
            if e < 30:
                a = [random.choice(possible_states),
                     random.choice(possible_states),
                     random.choice(possible_states),
                     random.choice(possible_states)]
            elif np.random.rand() < 0.5:
                a = [random.choice(possible_states),
                     random.choice(possible_states),
                     random.choice(possible_states),
                     random.choice(possible_states)]
            else:
                a = self.policy_action(old_state, e)  # feedforward

            # Retrieve new state, reward, and whether the state is terminal
            new_state, r, done, _ = env.step(a)

            # Memorize (s, a, r) for training
            # actions.append(to_categorical(a, self.act_dim))
            actions.append(a)
            states.append(old_state)

            # Compute the novelty bonus (RND disabled here)
            last_state = states[-1].reshape((1, 4, self.env_dim[1]))
            novelty = 0
            # novelty = self.rnd_opt([last_state, last_state])[0]
            rewards.append(r + 0.0001 * novelty)

            # Update current state
            old_state = new_state
            cumul_reward += r + 0.0001 * novelty
            time += 1

        # Store the episode in the HER buffer, then train using discounted rewards
        self.her.add(states, np.asarray(actions), rewards)
        # only update every 10 episodes?
        if e > 24:
            for item in self.her.sample():
                states, actions, rewards, completed = item
                states = np.asarray(states)[-min(1000, len(rewards)):]
                actions = np.asarray(actions)[-min(1000, len(rewards)):]
                rewards = np.asarray(rewards)[-min(1000, len(rewards)):]
                self.train_models(states, actions, rewards, completed)

        # Gather stats every episode for plotting
        if (args.gather_stats):
            mean, stdev = gather_stats(self, env)
            results.append([e, mean, stdev])

        # Export results for Tensorboard
        score = tfSummary('score', cumul_reward)
        summary_writer.add_summary(score, global_step=e)
        summary_writer.flush()

        # Display score
        tqdm_e.set_description("Score: " + str(cumul_reward))
        tqdm_e.refresh()

    return results
def pretrain_random(self, env, args, summary_writer, train_steps=200, env_steps=100):
    """ Generate a somewhat random output so that the agent explores. """

    results = []

    # Main Loop
    tqdm_e = tqdm(range(train_steps), desc='pretrain', leave=True, unit=" episodes")
    for e in tqdm_e:
        # Reset episode
        time, cumul_reward, done = 0, 0, False
        old_state = env.reset()
        actions, states, rewards = [], [], []
        old_a = np.asarray(np.zeros_like(self.policy_action(old_state, e)))

        while not done:
            # if args.render: env.render()
            # Actor picks an action (following the policy)
            a = self.policy_action(old_state, e)  # feedforward

            # Retrieve new state (the environment reward is discarded) and terminal flag
            new_state, _, done, _ = env.step(a)
            # Intrinsic reward: a randomly chosen squared difference between
            # the first two components of the current and previous actions
            r = np.random.choice(((np.asarray(a).reshape(-1) - old_a.reshape(-1)) ** 2)[:2])
            old_a = np.asarray(a)

            # Memorize (s, a, r) for training
            actions.append(to_categorical(a, self.act_dim))
            rewards.append(r)
            states.append(old_state)

            # Compute the novelty
            last_state = states[-1].reshape((1, 4, 4))
            novelty = self.rnd_opt([last_state, last_state])

            # Update current state
            old_state = new_state
            cumul_reward += r
            time += 1

        # Train using discounted rewards, i.e. compute updates (retry once on failure)
        try:
            self.train_models(states, np.asarray(actions), rewards, done)
        except:
            print('error training critic')
            self.train_models(states, np.asarray(actions), rewards, done)

        # Gather stats every episode for plotting
        if (args.gather_stats):
            mean, stdev = gather_stats(self, env)
            results.append([e, mean, stdev])

        # Export results for Tensorboard
        score = tfSummary('score', cumul_reward)
        summary_writer.add_summary(score, global_step=e)
        summary_writer.flush()

        # Display score
        tqdm_e.set_description("Score:{}, Nov.: {}".format(str(cumul_reward), novelty))
        tqdm_e.refresh()