class TensorforceAgent:
    def __init__(self, actions):
        preprocessing_config = [{"type": "grayscale"}]
        exploration_config = dict(
            type="epsilon_anneal",
            initial_epsilon=0.25,
            final_epsilon=0.01,
            timesteps=1000000
        )
        network_spec = [
            dict(type='conv2d', size=16, window=8, stride=4, activation='lrelu'),
            dict(type='conv2d', size=32, window=4, stride=2, activation='lrelu'),
            dict(type='flatten'),
            dict(type='dense', size=256, activation='lrelu')
        ]
        self.network_path = "network/"
        self.agent = PPOAgent(
            actions=dict(type='int', num_actions=len(actions)),
            states=dict(type='float', shape=(35, 150, 3)),
            network=network_spec,
            actions_exploration=exploration_config,
            states_preprocessing=preprocessing_config
        )

    def act(self, obs):
        # Cut out only the part of the observation that is needed
        partly = np.delete(obs, np.s_[96:], 0)
        partly = np.delete(partly, np.s_[0:26], 0)
        partly = np.delete(partly, np.s_[35:45], 0)
        partly = np.delete(partly, np.s_[38:53], 0)
        partly = np.delete(partly, np.s_[31:35], 0)
        partly = np.delete(partly, np.s_[10:16], 0)
        frame = np.delete(partly, np.s_[150:], 1)
        # scipy.misc.imsave('outfile.jpg', frame)
        return self.agent.act(frame)

    def load(self):
        import os
        if os.path.isdir(self.network_path):
            try:
                self.agent.restore_model(self.network_path)
            except:
                print("Failed to load model")

    def observe(self, terminal=False, reward=0):
        return self.agent.observe(terminal, reward)

    def save_model(self):
        import os
        if not os.path.isdir(self.network_path):
            os.makedirs(self.network_path)
        self.agent.save_model(self.network_path)
def main():
    env = gym.make('CartPole-v0')
    # (4,)
    print(env.observation_space.shape)
    # [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]
    print(env.observation_space.high)
    # [-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38]
    print(env.observation_space.low)
    # 2
    print(env.action_space.n)

    agent = PPOAgent(
        states=dict(type='float', shape=env.observation_space.shape),
        network=[
            dict(type='dense', size=32, activation='relu'),
            dict(type='dense', size=32, activation='relu'),
        ],
        actions=dict(type='int', num_actions=env.action_space.n),
        step_optimizer=dict(type='adam', learning_rate=1e-4)
    )

    model_dir = 'models/cartpole'
    if os.path.exists(f'{model_dir}/checkpoint'):
        agent.restore_model(directory=model_dir)

    try:
        for ep in range(2000):
            observation = env.reset()
            done = False
            ep_reward = 0
            while not done:
                # env.render()
                states = observation / 4
                action = agent.act(states=states)
                observation, reward, done, info = env.step(action)
                agent.observe(reward=reward, terminal=done)
                ep_reward += reward
                if done:
                    print(f'ep = {ep}, ep_reward = {ep_reward}')
    except Exception as e:
        raise e
    finally:
        agent.save_model(directory=f'{model_dir}/agent')
class ForwardActor:
    def __init__(self):
        actions = {}
        for i in range(12):
            actions[str(i)] = {'type': 'float'}  # 'num_actions': 10
        network_spec = [
            dict(type='dense', size=100, activation='relu'),
            dict(type='dense', size=100, activation='relu')
        ]
        self.agent = PPOAgent(
            states=dict(type='float', shape=(12, )),
            actions=actions,
            batching_capacity=2000,
            network=network_spec,
            step_optimizer=dict(type='adam', learning_rate=1e-4),
        )

    def act(self, state):
        jp = np.expand_dims(np.nan_to_num(np.array(state["JointPosition"])), axis=0)
        jv = np.expand_dims(np.array(state["JointVelocity"]), axis=0)
        # actiondict = self.agent.act(np.concatenate([jp, jv], axis=1))
        actiondict = self.agent.act(jp)
        action = np.zeros(12)
        for i in range(12):
            action[i] = actiondict[str(i)][0]
        action = np.nan_to_num(action)
        # print(action)
        return np.clip(action, -1.0, 1.0)

    def observe(self, reward, terminal):
        self.agent.observe(reward=reward, terminal=terminal)

    def save(self, directory):
        self.agent.save_model(directory=directory)

    def restore(self, directory):
        self.agent.restore_model(directory=directory)
    # PPOAgent
    likelihood_ratio_clipping=0.2,
    step_optimizer=dict(
        type='adam',
        learning_rate=2.5 * 1e-3
    ),
    subsampling_fraction=0.0625,
    optimization_steps=50,
    execution=dict(
        type='single',
        session_config=None,
        distributed_spec=None
    )
)

agent.restore_model(directory='./load/')
print('Model restored')
# print(agent.act(states=np.zeros(environment.states['shape']), deterministic=False, buffered=False, independent=True))

# TODO: Create my own runner with multithreading
# Create the runner
runner = Runner(agent=agent, environment=environment)

# Load latest checkpoint
if RENDER:
    input('Ready to start.')

runner.run(episodes=1, max_episode_timesteps=20, deterministic=False)
print('Score achieved : {}'.format(
    np.array(runner.episode_rewards[-1]) / np.array(runner.episode_timesteps[-1])))

runner.close()
while True:
    if game_name == None:
        print(
            "You're starting the training with Unity Editor. You can test the correct interactions between "
            "Unity and Tensorforce, but for a complete training you must start it with a built environment."
        )

    # Close the environment
    if environment != None:
        environment.close()

    # If model name is not None, restore the parameters
    if use_model == 'y':
        directory = os.path.join(os.getcwd(), "saved/")
        agent.restore_model(directory, model_name)

    # Open the environment with all the desired flags
    environment = UnityEnvWrapper(game_name, no_graphics=True, seed=int(time.time()),
                                  worker_id=work_id, with_stats=args.with_stats,
                                  size_stats=11, size_global=10, agent_separate=False,
                                  with_class=False, with_hp=False, with_previous=lstm,
                                  verbose=False, manual_input=False)
    optimization_steps=25,
    execution=dict(
        type='single',
        session_config=None,
        distributed_spec=None
    )
)

restore_path = None
if(os.path.exists("saved_models/checkpoint")):
    restore_path = './saved_models'

if restore_path is not None:
    printi("restore the model")
    agent.restore_model(restore_path)
else:
    print('Trained Network not found...')

if(os.path.exists("saved_models/test_strategy.csv")):
    os.remove("saved_models/test_strategy.csv")
if(os.path.exists("saved_models/test_strategy_avg.csv")):
    os.remove("saved_models/test_strategy_avg.csv")


def one_run():
    printi("start simulation")
    state = environment.reset()
    environment.render = True
    null_action = np.zeros(environment.actions['shape'])
]
states = env.states,
actions = env.actions,
network = dense_lstm_net

agent = PPOAgent(states=env.states,
                 actions=env.actions,
                 network=dense_lstm_net,
                 update_mode=dict(unit='episodes', batch_size=35),
                 memory=dict(type='latest', include_next_states=False,
                             capacity=(164 * 35 * 54 * 4)),
                 step_optimizer=dict(type='adam', learning_rate=1e-4))

agent.restore_model(directory='smaLSTM')

# Create the runner
runner = Runner(agent=agent, environment=env)

lofasz = 0

# Callback function printing episode statistics
t = list()
rew = list()
modelSaves = 1


def episode_finished(r):
class SerpentPPO:

    def __init__(self, frame_shape=None, game_inputs=None):
        if frame_shape is None:
            raise SerpentError("A 'frame_shape' tuple kwarg is required...")

        states_spec = {"type": "float", "shape": frame_shape}

        if game_inputs is None:
            raise SerpentError("A 'game_inputs' dict kwarg is required...")

        self.game_inputs = game_inputs
        self.game_inputs_mapping = self._generate_game_inputs_mapping()

        actions_spec = {"type": "int", "num_actions": len(self.game_inputs)}

        network_spec = [
            {"type": "conv2d", "size": 1, "window": 2, "stride": 1},
            {"type": "flatten"},
            # {"type": "dense", "size": 64},
            {"type": "dense", "size": 6}
        ]

        self.agent = PPOAgent(
            states=states_spec,
            actions=actions_spec,
            network=network_spec,
            batched_observe=256,
            batching_capacity=1000,
            # BatchAgent
            # keep_last_timestep=True,
            # PPOAgent
            step_optimizer=dict(
                type='adam',
                learning_rate=1e-4
            ),
            optimization_steps=10,
            # Model
            scope='ppo'
            # discount=0.97,
            # DistributionModel
            # distributions=None,
            # entropy_regularization=0.01,
            # PGModel
            # baseline_mode=None,
            # baseline=None,
            # baseline_optimizer=None,
            # gae_lambda=None,
            # PGLRModel
            # likelihood_ratio_clipping=None,
            # summary_spec=summary_spec,
            # distributed_spec=None,
            # More info
            # device=None,
            # session_config=None,
            # saver=None,
            # variable_noise=None,
            # states_preprocessing_spec=None,
            # explorations_spec=None,
            # reward_preprocessing_spec=None,
            # execution=None,
            # actions_exploration=None,
            # update_mode=None,
            # memory=None,
            # subsampling_fraction=0.1
        )

    def generate_action(self, game_frame_buffer):
        states = np.stack(game_frame_buffer, axis=2)
        # Get prediction from agent, execute
        action = self.agent.act(states)
        label = self.game_inputs_mapping[action]
        return action, label, self.game_inputs[label]

    def observe(self, reward=0, terminal=False):
        self.agent.observe(reward=reward, terminal=terminal)

    def _generate_game_inputs_mapping(self):
        mapping = dict()
        for index, key in enumerate(self.game_inputs):
            mapping[index] = key
        return mapping

    def save_model(self):
        self.agent.save_model(
            directory=os.path.join(os.getcwd(), "datasets", "bomberman", "ppo_model"),
            append_timestep=False)

    def restore_model(self):
        self.agent.restore_model(
            directory=os.path.join(os.getcwd(), "datasets", "bomberman"))
    # distributions_spec=None,
    entropy_regularization=0.01,
    # PGModel
    baseline_mode=None,
    baseline=None,
    baseline_optimizer=None,
    gae_lambda=None,
    # PGLRModel
    likelihood_ratio_clipping=0.2,
    # summary_spec=None,
    # distributed_spec=None
)

path = os.getcwd()
print(path)
try:
    agent.restore_model(path)
except:
    pass

# Create the runner
# runner = ThreadedRunnerMod(agent=agent, environment=env, save_frequency=100, save_frequency_unit='e')
runner = Runner(agent=agent, environment=env)


# Callback function printing episode statistics
def episode_finished(r):
    print(
        "Finished episode {ep} after {ts} timesteps ({d} days) (reward: {reward})"
        .format(ep=r.episode, ts=r.episode_timestep, d=int(r.episode_timestep / 24),
    network=[
        dict(type='dense', size=256),
        dict(type='dense', size=256),
        dict(type='dense', size=256)
    ],
    update_mode=dict(unit='episodes', batch_size=10),
    # PGModel
    baseline_mode='states',
    baseline=dict(type='mlp', sizes=[256, 256, 256]),
    baseline_optimizer=dict(type='multi_step',
                            optimizer=dict(type='adam', learning_rate=1e-3),
                            num_steps=5),
    gae_lambda=0.97,
    step_optimizer=dict(type='adam', learning_rate=1e-4))

agent.restore_model('./models', 'net-297000-0.57-5126573')

runner = Runner(agent=agent, environment=environment)
start_time = time.perf_counter()


def episode_finished(r):
    if r.episode % 100 == 0:
        sps = r.timestep / (time.time() - r.start_time)
        logger.info(
            "Finished episode {ep} after {ts} timesteps. Steps Per Second {sps}"
            .format(ep=r.episode, ts=r.timestep, sps=sps))
        logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
        logger.info("Episode timesteps: {}".format(r.episode_timestep))
        # logger.info("Episode largest tile: {}".format(r.environment.largest_tile))
                                           momentum=0.9),
                            num_steps=10),
    gae_lambda=0.95,
    # PPOAgent
    likelihood_ratio_clipping=0.2,
    step_optimizer=dict(type='momentum',
                        learning_rate=learning_rate,
                        momentum=0.9),
    subsampling_fraction=0.0625,
    discount=0.95,
    optimization_steps=50,
    execution=dict(type='single',
                   session_config=None,
                   distributed_spec=None))
# print('Agent created')

# Synchronize with master agent
agent.restore_model(directory=save_path)
# print('Agent restored')

# Synchronize
comm.Barrier()
# print('Yo')

episode = 0
data_buffer = []

if process_id == 0:
    pbar = tqdm.tqdm(total=nprocs * batch_allocation)

# Run this single worker (episode loop) as long as the episode threshold has not been reached.
while not should_stop:
    state = env.reset()
    # print('Calling reset')
    agent.reset()
    ],
    batching_capacity=4096,
    step_optimizer=dict(type='adam', learning_rate=1e-3),
    optimization_steps=10,
    scope='ppo',
    discount=0.99,
    entropy_regularization=0.01,
    baseline_mode=None,
    baseline=None,
    baseline_optimizer=None,
    gae_lambda=None,
    likelihood_ratio_clipping=0.2,
)

if "--resume" in sys.argv:
    agent.restore_model(directory="models/")

runner = Runner(agent=agent, environment=env)


def episode_finished(r):
    print("[{ep}] @ {ts}ts -> \t{reward}".format(ep=r.episode,
                                                 ts=r.episode_timestep,
                                                 reward=r.episode_rewards[-1]))
    training_progress.append(r.episode_rewards[-1])
    if r.episode % 100 == 0:
        env.visualize = True
        agent.save_model(directory="models/")
        plt.scatter(range(len(training_progress)), training_progress, s=1)
        plt.title("Cart Pole Training Progress\n3-layer 10-neurons/layer ReLU")
        plt.xlabel("Episodes")
class ForwardActorSimple:
    def __init__(self):
        actions = {}
        actions_exp = {}
        for i in range(12):
            actions[str(i)] = {'type': 'float'}  # 'num_actions': 10
            actions_exp[str(i)] = dict(type='ornstein_uhlenbeck', sigma=0.1, mu=0.0, theta=0.1)

        preprocessing_config = [{"type": "standardize"}]
        preprocessing_config = None

        customnet = dict(type=CustomNetwork)
        layerSize = 300
        network_spec = [
            dict(type='dense', size=100),
            dict(type='lstm', size=100)
        ]
        '''
        network_spec = [
            dict(type='dense', size=100),
            dict(type='internal_lstm', size=100)
        ]
        '''
        network_spec = [
            dict(type='dense', size=layerSize, activation='selu'),
            dict(type='dense', size=layerSize, activation='selu'),
            dict(type='dense', size=layerSize, activation='selu')
        ]

        self.agent = PPOAgent(
            states=dict(type='float', shape=(12 + 9, )),
            actions=actions,
            batching_capacity=1000,
            network=network_spec,
            states_preprocessing=preprocessing_config,
            actions_exploration=actions_exp,
            step_optimizer=dict(type='adam', learning_rate=1e-5),
        )

    def act(self, state):
        jp = np.expand_dims(np.nan_to_num(np.array(state["JointPosition"])), axis=0)
        # jv = np.expand_dims(np.array(state["JointVelocity"]), axis=0)
        orient = np.expand_dims(np.array(state["bodyRot"]), axis=0)
        actiondict = self.agent.act(
            np.nan_to_num(np.concatenate([jp, orient], axis=1)) / 5.0)
        # actiondict = self.agent.act(jp)
        action = np.zeros(12)
        for i in range(12):
            action[i] = actiondict[str(i)][0]
        action = np.nan_to_num(action)
        # print(action)
        return np.clip(action, -1.0, 1.0)

    def observe(self, reward, terminal):
        self.agent.observe(reward=reward, terminal=terminal)

    def save(self, directory):
        self.agent.save_model(directory=directory)

    def restore(self, directory):
        self.agent.restore_model(directory=directory)
                            optimizer=dict(type='adam', learning_rate=1e-3),
                            num_steps=5),
    gae_lambda=0.97,
    # PGLRModel
    likelihood_ratio_clipping=0.2,
    # PPOAgent
    step_optimizer=dict(type='adam', learning_rate=1e-3),
    subsampling_fraction=0.2,
    optimization_steps=25,
    execution=dict(type='single',
                   session_config=None,
                   distributed_spec=None))

if ARGS.load:
    agent.restore_model(directory=ARGS.save_dir)


def end(r):
    return end_of_episode(plotter, r)


runner = Runner(agent=agent, environment=environment)
runner.run(num_episodes=ARGS.epochs, episode_finished=end)
runner.close()

prune(INPUT_DIR, OUTPUT_DIR, DURATION, TRAFFIC_FILES, ARGS.offset,
      ARGS.epochs, (algo, conf))

plotter.plot_avgreward(
    "reward.txt", "avgreward_%s_%s" % (algo, ARGS.epochs + ARGS.offset))
plotter.plot_train_bw('results', '%s_train_bw' % algo, TRAFFIC_FILES,
                      (algo, conf))
plotter.plot_train_bw_iface('results', '%s_env_bw_alt' % algo,
                       step_optimizer=dict(type='adam', learning_rate=1e-4))


def episode_finished_train(r):
    print("Trained mother: " + str(r.episode_rewards[-1]))
    train_reward.append(r.episode_rewards[-1])
    plt.plot(train_reward, 'r+')
    plt.pause(0.01)
    return True


f = open("smaLSTM/checkpoint", "r")
lines = f.readlines()

train_reward = list()
validator_reward = list()

for i in range(1, 60):
    split = lines[i].split()
    model_path = split[1]
    print(model_path[1:len(model_path) - 1])
    real_model_path = model_path[1:len(model_path) - 1]
    train_agent.restore_model(directory='smaLSTM', file=real_model_path)

    train_runner = Runner(agent=train_agent, environment=train_env)
    train_runner.run(episodes=1,
                     max_episode_timesteps=(candles.candle_nums + 100),
                     episode_finished=episode_finished_train,
                     deterministic=True)
class PPOAgent(Agent):

    def __init__(self, name, game_inputs=None, callbacks=None, input_shape=None,
                 input_type=None, use_tensorboard=True, tensorforce_kwargs=None):
        super().__init__(name, game_inputs=game_inputs, callbacks=callbacks)

        if input_shape is None or not isinstance(input_shape, tuple):
            raise SerpentError("'input_shape' should be a tuple...")

        if input_type is None or input_type not in ["bool", "int", "float"]:
            raise SerpentError(
                "'input_type' should be one of bool|int|float...")

        states_spec = {"type": input_type, "shape": input_shape}

        # TODO: Support multiple actions
        # TODO: Support continuous action spaces
        actions_spec = {"type": "int", "num_actions": len(self.game_inputs)}

        summary_spec = None
        if use_tensorboard:
            summary_spec = {
                "directory": "./tensorboard/",
                "steps": 50,
                "labels": [
                    "configuration", "gradients_scalar", "regularization",
                    "inputs", "losses", "variables"
                ]
            }

        default_network_spec = [
            {"type": "conv2d", "size": 32, "window": 8, "stride": 4},
            {"type": "conv2d", "size": 64, "window": 4, "stride": 2},
            {"type": "conv2d", "size": 64, "window": 3, "stride": 1},
            {"type": "flatten"},
            {"type": "dense", "size": 1024}
        ]

        agent_kwargs = dict(batch_size=1024,
                            batched_observe=1024,
                            network_spec=default_network_spec,
                            device=None,
                            session_config=None,
                            saver_spec=None,
                            distributed_spec=None,
                            discount=0.99,
                            variable_noise=None,
                            states_preprocessing_spec=None,
                            explorations_spec=None,
                            reward_preprocessing_spec=None,
                            distributions_spec=None,
                            entropy_regularization=0.01,
                            keep_last_timestep=True,
                            baseline_mode=None,
                            baseline=None,
                            baseline_optimizer=None,
                            gae_lambda=None,
                            likelihood_ratio_clipping=None,
                            step_optimizer=None,
                            optimization_steps=10)

        if isinstance(tensorforce_kwargs, dict):
            for key, value in tensorforce_kwargs.items():
                if key in agent_kwargs:
                    agent_kwargs[key] = value

        self.agent = TFPPOAgent(states_spec=states_spec,
                                actions_spec=actions_spec,
                                summary_spec=summary_spec,
                                scope="ppo",
                                **agent_kwargs)

        try:
            self.restore_model()
        except Exception:
            pass

    def generate_action(self, state, **kwargs):
        if isinstance(state, GameFrame):
            self.current_state = state.frame
        elif isinstance(state, GameFrameBuffer):
            self.current_state = np.stack(
                [game_frame.frame for game_frame in state.frames], axis=2)
        else:
            self.current_state = state

        action = self.agent.act(self.current_state)
        label = self.game_inputs_mapping[action]

        return label, self.game_inputs[label]

    def observe(self, reward=0, terminal=False, **kwargs):
        if self.current_state is None:
            return None

        if self.callbacks.get("before_observe") is not None:
            self.callbacks["before_observe"]()

        will_update = self.agent.batch_count == self.agent.batch_size - 1

        if will_update:
            if self.callbacks.get("before_update") is not None:
                self.callbacks["before_update"]()

            self.agent.observe(reward=reward, terminal=terminal)
            self.save_model()

            if self.callbacks.get("after_update") is not None:
                self.callbacks["after_update"]()
        else:
            self.agent.observe(reward=reward, terminal=terminal)

        self.current_state = None

        self.current_reward = reward
        self.cumulative_reward += reward

        if self.callbacks.get("after_observe") is not None:
            self.callbacks["after_observe"]()

    def save_model(self):
        self.agent.save_model(
            directory=os.path.join(os.getcwd(), "datasets", self.name, self.name),
            append_timestep=False)

    def restore_model(self):
        self.agent.restore_model(
            directory=os.path.join(os.getcwd(), "datasets", self.name))
class TensorForcePpoAgent(BaselineAgent):  # class TensorForcePpoAgent(BaseAgent):
    """The TensorForcePpoAgent. Acts through the algorithm, not here."""

    def __init__(self, character=characters.Bomber, algorithm='ppo',
                 checkpoint='models/checkpoint'):
        super(TensorForcePpoAgent, self).__init__(character)
        self.algorithm = algorithm
        self.checkpoint = checkpoint
        self.agent = None
        self.state = {}
        self.env = None

        self.version = self.reload_version()
        print("TensorForcePpoAgent {} initialized.".format(self.version))

    def reload_version(self, filename='VERSION'):
        version = None
        for line in open(filename, 'r'):
            version = line.strip().split('=')[1]
            break
        return version

    def episode_end(self, reward):
        # print("i've got rewards {}".format(reward))
        pass

    def act(self, obs, action_space):
        """This agent has its own way of inducing actions.
        See train_with_tensorforce."""
        print("obs '{}'".format(obs))
        agent_state = self.env.featurize(obs)
        print("featurize '{}'".format(agent_state))
        action = self.agent.act(agent_state)
        return action

    def initialize(self, env):
        from gym import spaces
        from tensorforce.agents import PPOAgent

        self.env = env

        # Without an activation function, deeper stacks suffer from decay.
        network_spec = [
            dict(type='dense', size=64),
            dict(type='dense', size=64)
        ]

        summarizer = dict(
            directory="board",
            steps=50,
            labels=[
                "graph", "losses", "total-loss", "variables", "inputs",
                "states", "actions", "rewards", "gradients",
                "gradients_histogram", "gradients_scalar", "regularization"
                # "configuration"
            ])

        if self.algorithm == "ppo":
            if type(env.action_space) == spaces.Tuple:
                actions = {
                    str(num): {
                        'type': 'int',
                        'num_actions': space.n
                    }
                    for num, space in enumerate(env.action_space.spaces)
                }
            else:
                actions = dict(type='int', num_actions=env.action_space.n)

            self.agent = PPOAgent(
                states=dict(type='float', shape=env.observation_space.shape),
                actions=actions,
                network=network_spec,
                summarizer=summarizer,
                # Agent
                states_preprocessing=None,
                actions_exploration=None,
                reward_preprocessing=None,
                # MemoryModel
                update_mode=dict(
                    unit='episodes',
                    # 100 episodes per update
                    batch_size=100,
                    # Every 10 episodes
                    frequency=10),
                memory=dict(type='latest',
                            include_next_states=False,
                            capacity=5000),
                # DistributionModel
                distributions=None,
                entropy_regularization=0.01,
                # PGModel
                baseline_mode='states',
                baseline=dict(type='mlp', sizes=[64, 64]),
                baseline_optimizer=dict(type='multi_step',
                                        optimizer=dict(type='adam',
                                                       learning_rate=1e-3),
                                        num_steps=5),
                gae_lambda=0.97,
                # PGLRModel
                likelihood_ratio_clipping=0.2,
                # PPOAgent
                step_optimizer=dict(type='adam', learning_rate=1e-3),
                subsampling_fraction=0.2,
                optimization_steps=25,
                execution=dict(type='single',
                               session_config=None,
                               distributed_spec=None))
            # batching_capacity=1000,
            # step_optimizer=dict(type='adam', learning_rate=1e-4))

        self.restore_model_if_exists(self.checkpoint)
        return self.agent

    def restore_model_if_exists(self, checkpoint):
        if os.path.isfile(checkpoint):
            pardir = os.path.abspath(os.path.join(checkpoint, os.pardir))
            self.agent.restore_model(pardir)
            print("tensorforce model '{}' restored.".format(pardir))

    def save_model(self, checkpoint):
        pardir = os.path.abspath(os.path.join(checkpoint, os.pardir))
        if not os.path.exists(pardir):
            os.mkdir(pardir)
            print("checkpoint dir '{}' created.".format(pardir))
        checkpoint_path = self.agent.save_model(pardir, False)
        print("checkpoint model '{}' saved.".format(checkpoint_path))
def episode_finished_train(r):
    print("Trained mother: " + str(r.episode_rewards[-1]))
    train_reward.append(r.episode_rewards[-1])
    plt.plot(train_reward, 'r+')
    plt.pause(0.01)
    return True


f = open("longlong/checkpoint", "r")
lines = f.readlines()

train_reward = list()
validator_reward = list()

for i in range(20, len(lines) - 1):
    print(i)
    split = lines[i].split()
    model_path = split[1]
    print(model_path[1:len(model_path) - 1])
    real_model_path = model_path[1:len(model_path) - 1]
    print(real_model_path)
    agent.restore_model(directory='longlong', file=real_model_path)

    train_runner = Runner(agent=agent, environment=env)
    train_runner.run(episodes=1,
                     max_episode_timesteps=(candles.candle_nums + 100),
                     episode_finished=episode_finished_train,
                     deterministic=True)
states = env.states,
actions = env.actions,
network = dense_lstm_net

print(states)

agent = PPOAgent(states=env.states,
                 actions=env.actions,
                 network=dense_lstm_net,
                 update_mode=dict(unit='episodes', batch_size=30),
                 memory=dict(type='latest', include_next_states=False,
                             capacity=(164 * 30 * 30)),
                 step_optimizer=dict(type='adam', learning_rate=1e-3))

agent.restore_model(directory='traning',
                    file='forex_agent_sma_lstm_15week_train_-1817639')
agent.memory = memory = dict(type='latest', include_next_states=False,
                             capacity=(164 * 30 * 50))

# Create the runner
runner = Runner(agent=agent, environment=env)

lofasz = 0

# Callback function printing episode statistics
t = list()
rew = list()
def main(
        mode,  # 'train' or 'test'
        episode=2000,
        window_size=30,  # number of past timesteps the agent's brain looks back on
        init_invest=20000,
        model_path=None,
        addition_train=False,
        selected_learn='dqn',  # 'dqn' or 'ppo'
        selected_trading=[],
        selected_subject=[],
        ui_windows=None,  # the currently open UI object
):
    global gl_ui_window
    gl_ui_window = ui_windows

    set_model_path(model_path if not model_path is None
                   else os.path.join(os.getcwd(), 'model'))

    if not 'model' in os.listdir(os.getcwd()):
        os.makedirs('model')

    # create environment for train and test
    DATA_PATH = '../daily_data'
    environment = create_gold_env(window_size=window_size,
                                  path=DATA_PATH,
                                  train=True if mode == 'train' else False,
                                  selected_trading=selected_trading,
                                  selected_subject=selected_subject,
                                  init_invest=init_invest)

    network_spec = create_network_spec()
    baseline_spec = create_baseline_spec()

    if selected_learn == 'ppo':
        agent = PPOAgent(
            discount=0.9999,
            states=environment.states,
            actions=environment.actions,
            network=network_spec,
            # Agent
            states_preprocessing=None,
            actions_exploration=None,
            reward_preprocessing=None,
            # MemoryModel
            update_mode=dict(
                unit='timesteps',  # 'episodes',
                # 10 episodes per update
                batch_size=32,
                # Every 10 episodes
                frequency=10),
            memory=dict(type='latest', include_next_states=False, capacity=50000),
            # DistributionModel
            distributions=None,
            entropy_regularization=0.0,  # None
            # PGModel
            baseline_mode='states',
            baseline=dict(type='custom', network=baseline_spec),
            baseline_optimizer=dict(
                type='multi_step',
                optimizer=dict(
                    type='adam',
                    learning_rate=(1e-4)  # 3e-4
                ),
                num_steps=5),
            gae_lambda=0,  # 0
            # PGLRModel
            likelihood_ratio_clipping=0.2,
            # PPOAgent
            step_optimizer=dict(
                type='adam',
                learning_rate=(1e-4)  # 1e-4
            ),
            subsampling_fraction=0.2,  # 0.1
            optimization_steps=10,
            execution=dict(type='single',
                           session_config=None,
                           distributed_spec=None))
    else:  # learn_model == 'dqn' or etc.
        agent = DQNAgent(
            states=environment.states,
            actions=environment.actions,
            network=[
                dict(type='flatten'),
                dict(type='dense', size=32, activation='relu'),
                dict(type='dense', size=32, activation='relu'),
            ],
        )

    if mode == 'test' or addition_train == True:
        if len([elem for elem in os.listdir(LOAD_DIR)
                if 'trading_model' in elem]) >= 3:
            agent.restore_model(LOAD_DIR)
            print('loaded')
        elif mode == 'test':
            ui_windows.setInfo(msg="No trading model to load appears to exist.")
            return

    runner = Runner(agent=agent, environment=environment)

    if mode == 'train':
        kwargs = dict(episodes=episode,
                      max_episode_timesteps=16000,
                      episode_finished=episode_finished)
    else:  # mode == 'test'
        kwargs = dict(num_episodes=episode,
                      deterministic=True,
                      testing=True,
                      episode_finished=print_simple_log)
    runner.run(**kwargs)

    # TODO: Store per-episode portfolio results in TFTraderEnv and set the data on the UI each step.
    # setResult(????)

    msg = "{mode} finished. Total episodes: {ep}. \nAverage reward of last 100 episodes: {ar}.".format(
        mode="Training" if mode == 'train' else "Testing",
        ep=runner.episode,
        ar=np.mean(runner.episode_rewards[-100:]))
    print(msg)
    ui_windows.setInfo(msg=msg)
def main(args):
    version = 'v1'
    episodes = args.episodes
    visualize = args.visualize

    config = ffa_v0_fast_env()
    env = Pomme(**config["env_kwargs"])
    env.seed(0)

    agent = PPOAgent(
        states=dict(type='float', shape=(11, 11, 12)),
        actions=dict(type='int', num_actions=env.action_space.n),
        network=[
            # (9, 9, 12)
            dict(type='conv2d', size=12, window=3, stride=1),
            # (7, 7, 8)
            dict(type='conv2d', size=8, window=3, stride=1),
            # (5, 5, 4)
            dict(type='conv2d', size=4, window=3, stride=1),
            # (100)
            dict(type='flatten'),
            dict(type='dense', size=64, activation='relu'),
            dict(type='dense', size=16, activation='relu'),
        ],
        batching_capacity=1000,
        step_optimizer=dict(type='adam', learning_rate=1e-4))

    if os.path.exists(os.path.join('models', version, 'checkpoint')):
        agent.restore_model(directory=os.path.join('models', version))

    agents = []
    for agent_id in range(3):
        # agents.append(RandomAgent(config["agent"](agent_id, config["game_type"])))
        # agents.append(StoppingAgent(config["agent"](agent_id, config["game_type"])))
        agents.append(
            SimpleAgent(config["agent"](agent_id, config["game_type"])))

    agent_id += 1
    agents.append(
        TensorforceAgent(config["agent"](agent_id, config["game_type"])))

    env.set_agents(agents)
    env.set_training_agent(agents[-1].agent_id)
    env.set_init_game_state(None)

    wrapped_env = WrappedEnv(env, agent, visualize)
    runner = Runner(agent=agent, environment=wrapped_env)

    try:
        runner.run(episodes=episodes, max_episode_timesteps=100)
    except Exception as e:
        raise e
    finally:
        agent.save_model(directory=os.path.join('models', version, 'agent'))

    win_count = len(
        list(filter(lambda reward: reward == 1, runner.episode_rewards)))
    print('Stats: ')
    print(f'  runner.episode_rewards = {runner.episode_rewards}')
    print(f'  win count = {win_count}')

    try:
        runner.close()
    except AttributeError as e:
        raise e
def main():
    env = gym.make('Breakout-v0')
    # (210, 160, 3)
    print(env.observation_space.shape)
    # [[[255...]]]
    print(env.observation_space.high)
    # [[[0...]]]
    print(env.observation_space.low)
    # 4
    print(env.action_space.n)

    agent = PPOAgent(
        # (210, 160, 3)
        states=dict(type='float', shape=env.observation_space.shape),
        network=[
            # (51, 29, 32)
            dict(type='conv2d', size=32, window=8, stride=4, activation='relu'),
            # (24, 18, 64)
            dict(type='conv2d', size=64, window=4, stride=2, activation='relu'),
            # (22, 16, 64)
            dict(type='conv2d', size=64, window=3, stride=1, activation='relu'),
            # 22528
            dict(type='flatten'),
            dict(type='dense', size=512, activation='relu'),
            dict(type='dense', size=32, activation='relu'),
        ],
        # batching_capacity=10,
        memory=dict(
            type='latest',
            include_next_states=False,
            capacity=1000,
        ),
        # update=dict(unit='timesteps', batch_size=64),
        actions=dict(type='int', num_actions=env.action_space.n),
        step_optimizer=dict(type='adam', learning_rate=1e-4))

    model_dir = 'models/breakout'

    # load model
    if os.path.exists(f'{model_dir}/checkpoint'):
        agent.restore_model(directory=model_dir)

    try:
        for step in range(100000):
            observation = env.reset()
            done = False
            step_reward = 0
            while not done:
                # env.render()
                # from PIL import Image
                # pil_img = Image.fromarray(observation)
                # pil_img.save('./observation.png')
                states = observation / 256
                action = agent.act(states=states)
                observation, reward, done, info = env.step(action)
                reward = reward / 10
                agent.observe(reward=reward, terminal=done)
                step_reward += reward
                if done:
                    print(f'step = {step}, reward = {step_reward}')
    except Exception as e:
        raise e
    finally:
        agent.save_model(directory=f'{model_dir}/agent')
def main(argv):
    logging_basicConfig(level=INFO)
    logger = getLogger(__file__)
    logger.setLevel(INFO)

    environment = OpenAIGym(
        gym_id='MoveToBeacon-bbueno5000-v0',
        monitor=FLAGS.monitor,
        monitor_safe=FLAGS.monitor_safe,
        monitor_video=FLAGS.monitor_video,
        visualize=FLAGS.visualize)

    # if FLAGS.agent_config is not None:
    #     with open(FLAGS.agent_config, 'r') as fp:
    #         agent_config = json.load(fp=fp)
    # else:
    #     raise TensorForceError(
    #         "No agent configuration provided.")

    # if FLAGS.network is not None:
    #     with open(FLAGS.network, 'r') as fp:
    #         network = json.load(fp=fp)
    # else:
    #     network = None
    #     logger.info(
    #         "No network configuration provided.")

    network_spec = [
        dict(type='flatten'),
        dict(type='dense', size=32),
        dict(type='dense', size=32)
    ]

    agent = PPOAgent(
        states=environment.states,
        actions=environment.actions,
        network=network_spec
    )

    if FLAGS.load:
        load_dir = path.dirname(FLAGS.load)
        if not path.isdir(load_dir):
            raise OSError(
                "Could not load agent from {}: No such directory.".format(load_dir))
        agent.restore_model(FLAGS.load)

    if FLAGS.save:
        save_dir = path.dirname(FLAGS.save)
        if not path.isdir(save_dir):
            try:
                mkdir(save_dir, 0o755)
            except OSError:
                raise OSError(
                    "Cannot save agent to dir {} ()".format(save_dir))

    if FLAGS.debug:
        logger.info("-" * 16)
        logger.info("Configuration:")
        logger.info(agent)

    runner = Runner(
        agent=agent,
        environment=environment,
        repeat_actions=1)

    if FLAGS.debug:
        report_episodes = 1
    else:
        report_episodes = 100

    logger.info(
        "Starting {agent} for Environment {env}".format(
            agent=agent, env=environment))

    def episode_finished(r, id_):
        if r.episode % report_episodes == 0:
            steps_per_second = r.timestep / (time() - r.start_time)
            logger.info("Finished episode {:d} after {:d} timesteps. Steps Per Second {:0.2f}".format(
                r.agent.episode, r.episode_timestep, steps_per_second))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {:0.2f}".format(
                sum(r.episode_rewards[-500:]) / min(500, len(r.episode_rewards))))
            logger.info("Average of last 100 rewards: {:0.2f}".format(
                sum(r.episode_rewards[-100:]) / min(100, len(r.episode_rewards))))
        if FLAGS.save and FLAGS.save_episodes is not None and not r.episode % FLAGS.save_episodes:
            logger.info("Saving agent to {}".format(FLAGS.save))
            r.agent.save_model(FLAGS.save)
        return True

    runner.run(
        num_timesteps=FLAGS.timesteps,
        num_episodes=FLAGS.num_episodes,
        max_episode_timesteps=FLAGS.max_episode_timesteps,
        deterministic=FLAGS.deterministic,
        episode_finished=episode_finished,
        testing=FLAGS.test,
        sleep=FLAGS.sleep)
    runner.close()

    logger.info("Learning completed.")
    logger.info("Total episodes: {ep}".format(ep=runner.agent.episode))
class TensorForceAgent(BaseAgent):
    """The TensorForceAgent. Acts through the algorithm, not here."""

    def __init__(self, character=characters.Bomber, algorithm='ppo',
                 checkpoint='models/ppo'):
        super(TensorForceAgent, self).__init__(character)
        self.algorithm = algorithm
        self.checkpoint = checkpoint
        self.agent = None
        self.state = {}
        self.env = None

    def act(self, obs, action_space):
        """This agent has its own way of inducing actions.
        See train_with_tensorforce."""
        agent_state = self.env.featurize(obs)
        action = self.agent.act(agent_state)
        return action

    def initialize(self, env):
        from gym import spaces
        from tensorforce.agents import PPOAgent

        self.env = env

        if self.algorithm == "ppo":
            if type(env.action_space) == spaces.Tuple:
                actions = {
                    str(num): {
                        'type': 'int',
                        'num_actions': space.n
                    }
                    for num, space in enumerate(env.action_space.spaces)
                }
            else:
                actions = dict(type='int', num_actions=env.action_space.n)

            self.agent = PPOAgent(
                states=dict(type='float', shape=env.observation_space.shape),
                actions=actions,
                network=[
                    dict(type='dense', size=64),
                    dict(type='dense', size=64)
                ],
                batching_capacity=1000,
                step_optimizer=dict(type='adam', learning_rate=1e-4))

        self.restore_model_if_exists(self.checkpoint)
        return self.agent

    def restore_model_if_exists(self, checkpoint):
        pardir = os.path.abspath(os.path.join(checkpoint, os.pardir))
        if os.path.exists(pardir):
            self.agent.restore_model(pardir)
            print("tensorforce model '{}' restored.".format(pardir))

    def save_model(self, checkpoint):
        pardir = os.path.abspath(os.path.join(checkpoint, os.pardir))
        if not os.path.exists(pardir):
            os.mkdir(pardir)
            print("checkpoint dir '{}' created.".format(pardir))
        checkpoint_path = self.agent.save_model(pardir, False)
        print("checkpoint model '{}' saved.".format(checkpoint_path))
def episode_finished_train(r):
    print("Trained mother: " + str(r.episode_rewards[-1]))
    train_reward.append(r.episode_rewards[-1])
    plt.plot(train_reward, 'r+')
    plt.pause(0.01)
    return True


f = open("forex_models_gradient_2/checkpoint", "r")
lines = f.readlines()

train_reward = list()
validator_reward = list()

for i in range(1, len(lines) - 1):
    split = lines[i].split()
    model_path = split[1]
    print(model_path[1:len(model_path) - 1])
    real_model_path = model_path[1:len(model_path) - 1]
    train_agent.restore_model(directory='forex_models_gradient_2',
                              file=real_model_path)

    train_runner = Runner(agent=train_agent, environment=train_env)
    train_runner.run(episodes=1,
                     max_episode_timesteps=(candles.candle_nums + 100),
                     episode_finished=episode_finished_train,
                     deterministic=True)
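The examples above all share the same checkpoint round-trip: probe for a TensorFlow "checkpoint" index file, call agent.restore_model(...) when it exists, and persist the trained weights with agent.save_model(...). Below is a minimal, self-contained sketch of just that pattern, assuming the Tensorforce 0.4.x API used throughout this page; the 'models/demo' directory and the tiny dense network are placeholders chosen for illustration, not taken from any example above.

# Minimal save/restore sketch (assumed Tensorforce 0.4.x; 'models/demo' is a placeholder path)
import os

from tensorforce.agents import PPOAgent

agent = PPOAgent(
    states=dict(type='float', shape=(4,)),
    actions=dict(type='int', num_actions=2),
    network=[dict(type='dense', size=32), dict(type='dense', size=32)],
    step_optimizer=dict(type='adam', learning_rate=1e-4))

model_dir = 'models/demo'

# The TensorFlow saver writes a 'checkpoint' index file; its presence signals a restorable model.
if os.path.exists(os.path.join(model_dir, 'checkpoint')):
    agent.restore_model(directory=model_dir)

# ... run the usual act/observe training loop here ...

os.makedirs(model_dir, exist_ok=True)
agent.save_model(directory=model_dir, append_timestep=False)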