Example No. 1
class TensorforceAgent:
    def __init__(self, actions):
        preprocessing_config = [
            {
                "type": "grayscale"
            }
        ]
        exploration_config = dict(
            type="epsilon_anneal",
            initial_epsilon=0.25,
            final_epsilon=0.01,
            timesteps=1000000
        )

        network_spec = [
            dict(type='conv2d', size=16, window=8, stride=4, activation='lrelu'),
            dict(type='conv2d', size=32, window=4, stride=2, activation='lrelu'),
            dict(type='flatten'),
            dict(type='dense', size=256, activation='lrelu')
        ]
        self.network_path = "network/"
        self.agent = PPOAgent(
            actions = dict(type='int', num_actions=len(actions)),
            states = dict(type='float', shape=(35, 150, 3)),
            network = network_spec,
            actions_exploration = exploration_config,
            states_preprocessing = preprocessing_config
        )

    def act(self, obs):
        # Crop the observation down to the (35, 150, 3) region the network expects
        partly = np.delete(obs, np.s_[96:], 0)
        partly = np.delete(partly, np.s_[0:26], 0)
        partly = np.delete(partly, np.s_[35:45], 0)
        partly = np.delete(partly, np.s_[38:53], 0)
        partly = np.delete(partly, np.s_[31:35], 0)
        partly = np.delete(partly, np.s_[10:16], 0)
        frame = np.delete(partly, np.s_[150:], 1)

        #scipy.misc.imsave('outfile.jpg', frame)

        return self.agent.act(frame)

    def load(self):
        import os
        if os.path.isdir(self.network_path):
            try:
                self.agent.restore_model(self.network_path)
            except Exception:
                print("Failed to load model")

    def observe(self, terminal=False, reward=0):
        return self.agent.observe(terminal, reward)

    def save_model(self):
        import os
        if not os.path.isdir(self.network_path):
            os.makedirs(self.network_path)
        self.agent.save_model(self.network_path)
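
A minimal smoke test for the wrapper above (an addition, not part of the original example), assuming Tensorforce 0.4-style imports for the class itself; the frame size, reward and action names are placeholders.

import numpy as np
# assumed import for the class above: from tensorforce.agents import PPOAgent

agent = TensorforceAgent(actions=["left", "right", "jump"])   # hypothetical action list
agent.load()   # restores from network/ if a checkpoint directory already exists

for step in range(10):
    fake_obs = np.zeros((210, 160, 3), dtype=np.float32)   # stand-in for a real frame
    action_index = agent.act(fake_obs)                      # crops to (35, 150, 3) internally
    agent.observe(terminal=(step == 9), reward=0.0)         # placeholder reward signal

agent.save_model()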
def main():
    env = gym.make('CartPole-v0')

    # (4,)
    print(env.observation_space.shape)
    # [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]
    print(env.observation_space.high)
    # [-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38]
    print(env.observation_space.low)
    # 2
    print(env.action_space.n)

    agent = PPOAgent(
        states=dict(type='float', shape=env.observation_space.shape),
        network=[
            dict(type='dense', size=32, activation='relu'),
            dict(type='dense', size=32, activation='relu'),
        ],
        actions=dict(type='int', num_actions=env.action_space.n),
        step_optimizer=dict(type='adam', learning_rate=1e-4)
    )

    model_dir = 'models/cartpole'

    if os.path.exists(f'{model_dir}/checkpoint'):
        agent.restore_model(directory=model_dir)

    try:
        for ep in range(2000):
            observation = env.reset()
            done = False
            ep_reward = 0
            while not done:
                # env.render()

                states = observation / 4

                action = agent.act(states=states)

                observation, reward, done, info = env.step(action)

                agent.observe(reward=reward, terminal=done)

                ep_reward += reward

                if done:
                    print(f'ep = {ep}, ep_reward = {ep_reward}')
    except Exception as e:
        raise e
    finally:
        agent.save_model(directory=f'{model_dir}/agent')
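
A small evaluation sketch for the trained CartPole agent (an addition, not in the original); it reuses the agent and env built in main() and the deterministic/independent flags of the act() call, which also appear elsewhere in these examples.

def evaluate(agent, env, episodes=10):
    # Greedy rollouts that do not feed the agent's internal update buffer.
    total = 0.0
    for _ in range(episodes):
        observation = env.reset()
        done = False
        while not done:
            action = agent.act(states=observation / 4,
                               deterministic=True,
                               independent=True)
            observation, reward, done, info = env.step(action)
            total += reward
    return total / episodes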
Example No. 3
class ForwardActor:
    def __init__(self):

        actions = {}
        for i in range(12):
            actions[str(i)] = {'type': 'float'}  # 'num_actions': 10

        network_spec = [
            dict(type='dense', size=100, activation='relu'),
            dict(type='dense', size=100, activation='relu')
        ]

        self.agent = PPOAgent(
            states=dict(type='float', shape=(12, )),
            actions=actions,
            batching_capacity=2000,
            network=network_spec,
            step_optimizer=dict(type='adam', learning_rate=1e-4),
        )

    def act(self, state):
        jp = np.expand_dims(np.nan_to_num(np.array(state["JointPosition"])),
                            axis=0)
        jv = np.expand_dims(np.array(state["JointVelocity"]), axis=0)

        #actiondict = self.agent.act( np.concatenate([jp,jv],axis=1))
        actiondict = self.agent.act(jp)

        action = np.zeros(12)
        for i in range(12):
            action[i] = actiondict[str(i)][0]
        action = np.nan_to_num(action)
        #print(action)
        return np.clip(action, -1.0, 1.0)

    def observe(self, reward, terminal):
        self.agent.observe(reward=reward, terminal=terminal)

    def save(self, directory):
        self.agent.save_model(directory=directory)

    def restore(self, directory):
        self.agent.restore_model(directory=directory)
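
A rough driver loop for ForwardActor (an illustrative addition); the simulator is assumed to report a dict with 12-element "JointPosition" and "JointVelocity" arrays, and the random joints, zero reward and checkpoint directory are placeholders.

import numpy as np

actor = ForwardActor()

for step in range(100):
    state = {
        "JointPosition": np.random.uniform(-1.0, 1.0, size=12),
        "JointVelocity": np.random.uniform(-1.0, 1.0, size=12),
    }
    action = actor.act(state)                        # 12 continuous values clipped to [-1, 1]
    actor.observe(reward=0.0, terminal=(step == 99))

actor.save("forward_actor_model/")                   # hypothetical checkpoint directory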
Example No. 4
    # PPOAgent
    likelihood_ratio_clipping=0.2,
    step_optimizer=dict(
        type='adam',
        learning_rate=2.5 * 1e-3
    ),
    subsampling_fraction=0.0625,
    optimization_steps=50,
    execution=dict(
        type='single',
        session_config=None,
        distributed_spec=None
    )
)

agent.restore_model(directory='./load/')
print('Model restored')
#print(agent.act(states=np.zeros(environment.states['shape']), deterministic=False, buffered=False, independent = True))

# TODO : Create my own runner with multithreading
# Create the runner
runner = Runner(agent=agent, environment=environment)

# Load latest checkpoint
if RENDER:
    input('Ready to start.')
    runner.run(episodes=1, max_episode_timesteps=20, deterministic=False)
    print('Score achieved : {}'.format(np.array(runner.episode_rewards[-1]) / np.array(runner.episode_timesteps[-1])))

runner.close()
    while True:

        if game_name is None:
            print(
                "You're starting the training with Unity Editor. You can test the correct interactions between "
                "Unity and Tensorforce, but for a complete training you must start it with a built environment."
            )

        # Close the environment
        if environment is not None:
            environment.close()

        # If model name is not None, restore the parameters
        if use_model == 'y':
            directory = os.path.join(os.getcwd(), "saved/")
            agent.restore_model(directory, model_name)

        # Open the environment with all the desired flags
        environment = UnityEnvWrapper(game_name,
                                      no_graphics=True,
                                      seed=int(time.time()),
                                      worker_id=work_id,
                                      with_stats=args.with_stats,
                                      size_stats=11,
                                      size_global=10,
                                      agent_separate=False,
                                      with_class=False,
                                      with_hp=False,
                                      with_previous=lstm,
                                      verbose=False,
                                      manual_input=False)
    optimization_steps=25,
    execution=dict(
        type='single',
        session_config=None,
        distributed_spec=None
    )
)

restore_path = None
if(os.path.exists("saved_models/checkpoint")):
    restore_path = './saved_models'


if restore_path is not None:
    printi("restore the model")
    agent.restore_model(restore_path)
else :
    print('Trained Network not found...')

if(os.path.exists("saved_models/test_strategy.csv")):
    os.remove("saved_models/test_strategy.csv")

if(os.path.exists("saved_models/test_strategy_avg.csv")):
    os.remove("saved_models/test_strategy_avg.csv")

def one_run():

    printi("start simulation")
    state = environment.reset()
    environment.render = True
    null_action = np.zeros(environment.actions['shape'])
Example No. 7
]

states = env.states
actions = env.actions
network = dense_lstm_net

agent = PPOAgent(states=env.states,
                 actions=env.actions,
                 network=dense_lstm_net,
                 update_mode=dict(unit='episodes', batch_size=35),
                 memory=dict(type='latest',
                             include_next_states=False,
                             capacity=(164 * 35 * 54 * 4)),
                 step_optimizer=dict(type='adam', learning_rate=1e-4))

agent.restore_model(directory='smaLSTM')

# Create the runner
runner = Runner(agent=agent, environment=env)

lofasz = 0

# Callback function printing episode statistics

t = list()
rew = list()

modelSaves = 1


def episode_finished(r):
Example No. 8
class SerpentPPO:

    def __init__(self, frame_shape=None, game_inputs=None):

        if frame_shape is None:
            raise SerpentError("A 'frame_shape' tuple kwarg is required...")

        states_spec = {"type": "float", "shape": frame_shape}

        if game_inputs is None:
            raise SerpentError("A 'game_inputs' dict kwarg is required...")

        self.game_inputs = game_inputs
        self.game_inputs_mapping = self._generate_game_inputs_mapping()

        actions_spec = {"type": "int", "num_actions": len(self.game_inputs)}

        network_spec = [
            {"type": "conv2d", "size": 1, "window": 2, "stride": 1},
            {"type": "flatten"},
            # {"type": "dense", "size": 64},
            {"type": "dense", "size": 6}
        ]

        self.agent = PPOAgent(
            states=states_spec,
            actions=actions_spec,
            network=network_spec,

            batched_observe=256,
            batching_capacity=1000,
            # BatchAgent
            #keep_last_timestep=True,
            # PPOAgent
            step_optimizer=dict(
                type='adam',
                learning_rate=1e-4
            ),
            optimization_steps=10,
            # Model
            scope='ppo'
            # discount=0.97,
            # DistributionModel
            # distributions=None,
            # entropy_regularization=0.01,
            # PGModel
            # baseline_mode=None,
            # baseline=None,
            # baseline_optimizer=None,
            # gae_lambda=None,
            # PGLRModel
            # likelihood_ratio_clipping=None,
            # summary_spec=summary_spec,
            # distributed_spec=None,
            # More info
            # device=None,
            # session_config=None,
            # saver=None,
            # variable_noise=None,
            # states_preprocessing_spec=None,
            # explorations_spec=None,
            # reward_preprocessing_spec=None,
            # execution=None,
            # actions_exploration=None,
            # update_mode=None,
            # memory=None,
            # subsampling_fraction=0.1
        )

    def generate_action(self, game_frame_buffer):
        states = np.stack(
            game_frame_buffer,
            axis=2
        )

        # Get prediction from agent, execute
        action = self.agent.act(states)
        label = self.game_inputs_mapping[action]

        return action, label, self.game_inputs[label]

    def observe(self, reward=0, terminal=False):
        self.agent.observe(reward=reward, terminal=terminal)

    def _generate_game_inputs_mapping(self):
        mapping = dict()

        for index, key in enumerate(self.game_inputs):
            mapping[index] = key

        return mapping

    def save_model(self):
        self.agent.save_model(directory=os.path.join(os.getcwd(), "datasets", "bomberman", "ppo_model"), append_timestep=False)

    def restore_model(self):
        self.agent.restore_model(directory=os.path.join(os.getcwd(), "datasets", "bomberman"))
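
A hedged usage sketch for SerpentPPO (not in the original); the frame shape, the plain-list frame buffer and the key bindings are illustrative stand-ins for what SerpentAI's frame grabber and game plugin would normally supply.

import numpy as np

game_inputs = {"MOVE LEFT": ["a"], "MOVE RIGHT": ["d"], "DROP BOMB": ["space"]}
ppo = SerpentPPO(frame_shape=(100, 100, 4), game_inputs=game_inputs)

# Four 100x100 grayscale frames; generate_action() stacks them along axis 2.
frame_buffer = [np.random.rand(100, 100).astype(np.float32) for _ in range(4)]

action, label, key_presses = ppo.generate_action(frame_buffer)
ppo.observe(reward=0.0, terminal=False)
print(action, label, key_presses)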
Example No. 9
    #distributions_spec=None,
    entropy_regularization=0.01,
    # PGModel
    baseline_mode=None,
    baseline=None,
    baseline_optimizer=None,
    gae_lambda=None,
    # PGLRModel
    likelihood_ratio_clipping=0.2,
    #summary_spec=None,
    #distributed_spec=None
)
path = os.getcwd()
print(path)
try:
    agent.restore_model(path)
except Exception:
    # no saved model to restore; start from scratch
    pass

# Create the runner
#runner = ThreadedRunnerMod(agent=agent, environment=env,  save_frequency=100, save_frequency_unit='e')
runner = Runner(agent=agent, environment=env)


# Callback function printing episode statistics
def episode_finished(r):
    print(
        "Finished episode {ep} after {ts} timesteps ({d} days) (reward: {reward})"
        .format(ep=r.episode,
                ts=r.episode_timestep,
                d=int(r.episode_timestep / 24),
Example No. 10
    network=[
        dict(type='dense', size=256),
        dict(type='dense', size=256),
        dict(type='dense', size=256)
    ],
    update_mode=dict(unit='episodes', batch_size=10),
    # PGModel
    baseline_mode='states',
    baseline=dict(type='mlp', sizes=[256, 256, 256]),
    baseline_optimizer=dict(type='multi_step',
                            optimizer=dict(type='adam', learning_rate=1e-3),
                            num_steps=5),
    gae_lambda=0.97,
    step_optimizer=dict(type='adam', learning_rate=1e-4))

agent.restore_model('./models', 'net-297000-0.57-5126573')

runner = Runner(agent=agent, environment=environment)

start_time = time.perf_counter()


def episode_finished(r):
    if r.episode % 100 == 0:
        sps = r.timestep / (time.time() - r.start_time)
        logger.info(
            "Finished episode {ep} after {ts} timesteps. Steps Per Second {sps}"
            .format(ep=r.episode, ts=r.timestep, sps=sps))
        logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
        logger.info("Episode timesteps: {}".format(r.episode_timestep))
        #logger.info("Episode largest tile: {}".format(r.environment.largest_tile))
Example No. 11
                                           momentum=0.9),
                            num_steps=10),
    gae_lambda=0.95,

    # PPOAgent
    likelihood_ratio_clipping=0.2,
    step_optimizer=dict(type='momentum',
                        learning_rate=learning_rate,
                        momentum=0.9),
    subsampling_fraction=0.0625,
    discount=0.95,
    optimization_steps=50,
    execution=dict(type='single', session_config=None, distributed_spec=None))
#print('Agent created')
# Synchronize with master agent
agent.restore_model(directory=save_path)
#print('Agent restored')

# Synchronize
comm.Barrier()
#print('Yo')

episode = 0
data_buffer = []
if process_id == 0:
    pbar = tqdm.tqdm(total=nprocs * batch_allocation)
# Run this single worker (episode loop) as long as the episode threshold has not been reached.
while not should_stop:
    state = env.reset()
    #print('Calling reset')
    agent.reset()
    ],
    batching_capacity=4096,
    step_optimizer=dict(type='adam', learning_rate=1e-3),
    optimization_steps=10,
    scope='ppo',
    discount=0.99,
    entropy_regularization=0.01,
    baseline_mode=None,
    baseline=None,
    baseline_optimizer=None,
    gae_lambda=None,
    likelihood_ratio_clipping=0.2,
)

if "--resume" in sys.argv:
    agent.restore_model(directory="models/")

runner = Runner(agent=agent, environment=env)


def episode_finished(r):
    print("[{ep}] @ {ts}ts -> \t{reward}".format(ep=r.episode,
                                                 ts=r.episode_timestep,
                                                 reward=r.episode_rewards[-1]))
    training_progress.append(r.episode_rewards[-1])
    if r.episode % 100 == 0:
        env.visualize = True
        agent.save_model(directory="models/")
        plt.scatter(range(len(training_progress)), training_progress, s=1)
        plt.title("Cart Pole Training Progress\n3-layer 10-neurons/layer ReLU")
        plt.xlabel("Episodes")
Example No. 13
class ForwardActorSimple:
    def __init__(self):

        actions = {}
        actions_exp = {}
        for i in range(12):
            actions[str(i)] = {'type': 'float'}  # 'num_actions': 10
            actions_exp[str(i)] = dict(type='ornstein_uhlenbeck',
                                       sigma=0.1,
                                       mu=0.0,
                                       theta=0.1)

        preprocessing_config = [{"type": "standardize"}]

        preprocessing_config = None  # preprocessing disabled; the standardize spec above is kept for reference

        customnet = dict(type=CustomNetwork)
        layerSize = 300
        network_spec = [
            dict(type='dense', size=100),
            dict(type='lstm', size=100)
        ]
        '''
        network_spec = [
            dict(type='dense', size=100),
            dict(type='internal_lstm', size=100)
        ]
        '''

        network_spec = [
            dict(type='dense', size=layerSize, activation='selu'),
            dict(type='dense', size=layerSize, activation='selu'),
            dict(type='dense', size=layerSize, activation='selu')
        ]

        self.agent = PPOAgent(
            states=dict(type='float', shape=(12 + 9, )),
            actions=actions,
            batching_capacity=1000,
            network=network_spec,
            states_preprocessing=preprocessing_config,
            actions_exploration=actions_exp,
            step_optimizer=dict(type='adam', learning_rate=1e-5),
        )

    def act(self, state):
        jp = np.expand_dims(np.nan_to_num(np.array(state["JointPosition"])),
                            axis=0)
        #jv = np.expand_dims(np.array(state["JointVelocity"]), axis=0)
        orient = np.expand_dims(np.array(state["bodyRot"]), axis=0)
        actiondict = self.agent.act(
            np.nan_to_num(np.concatenate([jp, orient], axis=1)) / 5.0)
        #actiondict = self.agent.act(jp)

        action = np.zeros(12)
        for i in range(12):
            action[i] = actiondict[str(i)][0]
        action = np.nan_to_num(action)
        #print(action)
        return np.clip(action, -1.0, 1.0)

    def observe(self, reward, terminal):
        self.agent.observe(reward=reward, terminal=terminal)

    def save(self, directory):
        self.agent.save_model(directory=directory)

    def restore(self, directory):
        self.agent.restore_model(directory=directory)
Example No. 14
                                        optimizer=dict(type='adam',
                                                       learning_rate=1e-3),
                                        num_steps=5),
                gae_lambda=0.97,
                # PGLRModel
                likelihood_ratio_clipping=0.2,
                # PPOAgent
                step_optimizer=dict(type='adam', learning_rate=1e-3),
                subsampling_fraction=0.2,
                optimization_steps=25,
                execution=dict(type='single',
                               session_config=None,
                               distributed_spec=None))

            if ARGS.load:
                agent.restore_model(directory=ARGS.save_dir)

            def end(r):
                return end_of_episode(plotter, r)

            runner = Runner(agent=agent, environment=environment)
            runner.run(num_episodes=ARGS.epochs, episode_finished=end)
            runner.close()
            prune(INPUT_DIR, OUTPUT_DIR, DURATION, TRAFFIC_FILES, ARGS.offset,
                  ARGS.epochs, (algo, conf))
        plotter.plot_avgreward(
            "reward.txt",
            "avgreward_%s_%s" % (algo, ARGS.epochs + ARGS.offset))
        plotter.plot_train_bw('results', '%s_train_bw' % algo, TRAFFIC_FILES,
                              (algo, conf))
        plotter.plot_train_bw_iface('results', '%s_env_bw_alt' % algo,
Example No. 15
                       step_optimizer=dict(type='adam', learning_rate=1e-4))


def episode_finished_train(r):
    print("Trained mother: " + str(r.episode_rewards[-1]))

    train_reward.append(r.episode_rewards[-1])
    plt.plot(train_reward, 'r+')
    plt.pause(0.01)
    return True


f = open("smaLSTM/checkpoint", "r")

lines = f.readlines()

train_reward = list()
validator_reward = list()

for i in range(1, 60):
    split = lines[i].split()
    model_path = split[1]
    print(model_path[1:len(model_path) - 1])
    real_model_path = model_path[1:len(model_path) - 1]  # strip the surrounding quotes
    train_agent.restore_model(directory='smaLSTM', file=real_model_path)
    train_runner = Runner(agent=train_agent, environment=train_env)
    train_runner.run(episodes=1,
                     max_episode_timesteps=(candles.candle_nums + 100),
                     episode_finished=episode_finished_train,
                     deterministic=True)
Example No. 16
class PPOAgent(Agent):
    def __init__(self,
                 name,
                 game_inputs=None,
                 callbacks=None,
                 input_shape=None,
                 input_type=None,
                 use_tensorboard=True,
                 tensorforce_kwargs=None):
        super().__init__(name, game_inputs=game_inputs, callbacks=callbacks)

        if input_shape is None or not isinstance(input_shape, tuple):
            raise SerpentError("'input_shape' should be a tuple...")

        if input_type is None or input_type not in ["bool", "int", "float"]:
            raise SerpentError(
                "'input_type' should be one of bool|int|float...")

        states_spec = {"type": input_type, "shape": input_shape}

        # TODO: Support multiple actions
        # TODO: Support continuous action spaces
        actions_spec = {"type": "int", "num_actions": len(self.game_inputs)}

        summary_spec = None

        if use_tensorboard:
            summary_spec = {
                "directory": "./tensorboard/",
                "steps": 50,
                "labels": [
                    "configuration", "gradients_scalar", "regularization",
                    "inputs", "losses", "variables"
                ]
            }

        default_network_spec = [{
            "type": "conv2d",
            "size": 32,
            "window": 8,
            "stride": 4
        }, {
            "type": "conv2d",
            "size": 64,
            "window": 4,
            "stride": 2
        }, {
            "type": "conv2d",
            "size": 64,
            "window": 3,
            "stride": 1
        }, {
            "type": "flatten"
        }, {
            "type": "dense",
            "size": 1024
        }]

        agent_kwargs = dict(batch_size=1024,
                            batched_observe=1024,
                            network_spec=default_network_spec,
                            device=None,
                            session_config=None,
                            saver_spec=None,
                            distributed_spec=None,
                            discount=0.99,
                            variable_noise=None,
                            states_preprocessing_spec=None,
                            explorations_spec=None,
                            reward_preprocessing_spec=None,
                            distributions_spec=None,
                            entropy_regularization=0.01,
                            keep_last_timestep=True,
                            baseline_mode=None,
                            baseline=None,
                            baseline_optimizer=None,
                            gae_lambda=None,
                            likelihood_ratio_clipping=None,
                            step_optimizer=None,
                            optimization_steps=10)

        if isinstance(tensorforce_kwargs, dict):
            for key, value in tensorforce_kwargs.items():
                if key in agent_kwargs:
                    agent_kwargs[key] = value

        self.agent = TFPPOAgent(states_spec=states_spec,
                                actions_spec=actions_spec,
                                summary_spec=summary_spec,
                                scope="ppo",
                                **agent_kwargs)

        try:
            self.restore_model()
        except Exception:
            pass

    def generate_action(self, state, **kwargs):
        if isinstance(state, GameFrame):
            self.current_state = state.frame
        elif isinstance(state, GameFrameBuffer):
            self.current_state = np.stack(
                [game_frame.frame for game_frame in state.frames], axis=2)
        else:
            self.current_state = state

        action = self.agent.act(self.current_state)
        label = self.game_inputs_mapping[action]

        return label, self.game_inputs[label]

    def observe(self, reward=0, terminal=False, **kwargs):
        if self.current_state is None:
            return None

        if self.callbacks.get("before_observe") is not None:
            self.callbacks["before_observe"]()

        will_update = self.agent.batch_count == self.agent.batch_size - 1

        if will_update:
            if self.callbacks.get("before_update") is not None:
                self.callbacks["before_update"]()

            self.agent.observe(reward=reward, terminal=terminal)
            self.save_model()

            if self.callbacks.get("after_update") is not None:
                self.callbacks["after_update"]()
        else:
            self.agent.observe(reward=reward, terminal=terminal)

        self.current_state = None

        self.current_reward = reward
        self.cumulative_reward += reward

        if self.callbacks.get("after_observe") is not None:
            self.callbacks["after_observe"]()

    def save_model(self):
        self.agent.save_model(directory=os.path.join(os.getcwd(), "datasets",
                                                     self.name, self.name),
                              append_timestep=False)

    def restore_model(self):
        self.agent.restore_model(
            directory=os.path.join(os.getcwd(), "datasets", self.name))
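
A hedged construction sketch for the wrapper above, showing how tensorforce_kwargs overrides the defaults; the game inputs, shapes and optimizer values are illustrative only, and the Serpent classes (SerpentError, GameFrame, GameFrameBuffer, TFPPOAgent) are assumed to be imported as in the surrounding project.

game_inputs = {"JUMP": ["space"], "IDLE": []}      # hypothetical key bindings

agent = PPOAgent(
    name="demo",
    game_inputs=game_inputs,
    input_shape=(84, 84, 4),
    input_type="float",
    use_tensorboard=False,
    tensorforce_kwargs=dict(
        batch_size=128,                                        # overrides the 1024 default
        step_optimizer=dict(type="adam", learning_rate=1e-4),  # fills the None default
        unknown_key="ignored",                                 # dropped by the key filter
    ),
)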
Example No. 17
class TensorForcePpoAgent(BaselineAgent):
    # class TensorForcePpoAgent(BaseAgent):
    """The TensorForcePpoAgent. Acts through the algorith, not here."""
    def __init__(self,
                 character=characters.Bomber,
                 algorithm='ppo',
                 checkpoint='models/checkpoint'):
        super(TensorForcePpoAgent, self).__init__(character)
        self.algorithm = algorithm
        self.checkpoint = checkpoint
        self.agent = None
        self.state = {}
        self.env = None
        self.version = self.reload_version()
        print("TensorForcePpoAgent {} iniitialized.".format(self.version))

    def reload_version(self, filename='VERSION'):
        version = None
        for line in open(filename, 'r'):
            version = line.strip().split('=')[1]
            break
        return version

    def episode_end(self, reward):
        # print("i've got rewards {}".format(reward))
        pass

    def act(self, obs, action_space):
        """This agent has its own way of inducing actions. See train_with_tensorforce."""
        print("obs '{}'".format(obs))
        agent_state = self.env.featurize(obs)
        print("featureize '{}'".format(agent_state))
        action = self.agent.act(agent_state)
        return action

    def initialize(self, env):
        from gym import spaces
        from tensorforce.agents import PPOAgent
        self.env = env

        # No activation function here, so a deeper stack runs into a decay problem.
        network_spec = [
            dict(type='dense', size=64),
            dict(type='dense', size=64)
        ]

        summarizer = dict(
            directory="board",
            steps=50,
            labels=[
                "graph", "losses", "total-loss", "variables", "inputs",
                "states", "actions", "rewards", "gradients",
                "gradients_histogram", "gradients_scalar", "regularization"
                # "configuration"
            ])

        if self.algorithm == "ppo":
            if isinstance(env.action_space, spaces.Tuple):
                actions = {
                    str(num): {
                        'type': 'int',
                        'num_actions': space.n
                    }
                    for num, space in enumerate(env.action_space.spaces)
                }
            else:
                actions = dict(type='int', num_actions=env.action_space.n)

            self.agent = PPOAgent(
                states=dict(type='float', shape=env.observation_space.shape),
                actions=actions,
                network=network_spec,
                summarizer=summarizer,
                # Agent
                states_preprocessing=None,
                actions_exploration=None,
                reward_preprocessing=None,
                # MemoryModel
                update_mode=dict(
                    unit='episodes',
                    # 100 episodes per update
                    batch_size=100,
                    # Every 10 episodes
                    frequency=10),
                memory=dict(type='latest',
                            include_next_states=False,
                            capacity=5000),
                # DistributionModel
                distributions=None,
                entropy_regularization=0.01,
                # PGModel
                baseline_mode='states',
                baseline=dict(type='mlp', sizes=[64, 64]),
                baseline_optimizer=dict(type='multi_step',
                                        optimizer=dict(type='adam',
                                                       learning_rate=1e-3),
                                        num_steps=5),
                gae_lambda=0.97,
                # PGLRModel
                likelihood_ratio_clipping=0.2,
                # PPOAgent
                step_optimizer=dict(type='adam', learning_rate=1e-3),
                subsampling_fraction=0.2,
                optimization_steps=25,
                execution=dict(type='single',
                               session_config=None,
                               distributed_spec=None))
            # batching_capacity=1000,
            # step_optimizer=dict(type='adam', learning_rate=1e-4))

            self.restore_model_if_exists(self.checkpoint)

        return self.agent

    def restore_model_if_exists(self, checkpoint):
        if os.path.isfile(checkpoint):
            pardir = os.path.abspath(os.path.join(checkpoint, os.pardir))
            self.agent.restore_model(pardir)
            print("tensorforce model '{}' restored.".format(pardir))

    def save_model(self, checkpoint):
        pardir = os.path.abspath(os.path.join(checkpoint, os.pardir))
        if not os.path.exists(pardir):
            os.mkdir(pardir)
            print("checkpoint dir '{}' created.".format(pardir))
        checkpoint_path = self.agent.save_model(pardir, False)
        print("checkpoint model '{}' saved.".format(checkpoint_path))
Example No. 18
def episode_finished_train(r):
    print("Trained mother: " + str(r.episode_rewards[-1]))

    train_reward.append(r.episode_rewards[-1])
    plt.plot(train_reward, 'r+')
    plt.pause(0.01)
    return True


f = open("longlong/checkpoint", "r")

lines = f.readlines()

train_reward = list()
validator_reward = list()


for i in range(20, len(lines) - 1):
    print(i)
    split = lines[i].split()
    model_path = split[1]
    print(model_path[1:len(model_path) - 1])
    real_model_path = model_path[1:len(model_path) - 1]
    print(real_model_path)
    agent.restore_model(directory='longlong', file=real_model_path)
    train_runner = Runner(agent=agent, environment=env)
    train_runner.run(episodes=1,
                     max_episode_timesteps=(candles.candle_nums + 100),
                     episode_finished=episode_finished_train,
                     deterministic=True)



Example No. 19
states = env.states
actions = env.actions
network = dense_lstm_net

print(states)

agent = PPOAgent(states=env.states,
                 actions=env.actions,
                 network=dense_lstm_net,
                 update_mode=dict(unit='episodes', batch_size=30),
                 memory=dict(type='latest',
                             include_next_states=False,
                             capacity=(164 * 30 * 30)),
                 step_optimizer=dict(type='adam', learning_rate=1e-3))

agent.restore_model(directory='traning',
                    file='forex_agent_sma_lstm_15week_train_-1817639')

agent.memory = memory = dict(type='latest',
                             include_next_states=False,
                             capacity=(164 * 30 * 50))

# Create the runner
runner = Runner(agent=agent, environment=env)

lofasz = 0

# Callback function printing episode statistics

t = list()
rew = list()
Example No. 20
def main(
        mode,  # 'train'  or 'test'
        episode=2000,
        window_size=30,  # number of past timesteps the agent's brain looks back over
        init_invest=20000,
        model_path=None,
        addition_train=False,
        selected_learn='dqn',  # 'dqn' or 'ppo'
        selected_trading=[],
        selected_subject=[],
        ui_windows=None,  # the UI object currently on screen
):
    global gl_ui_window
    gl_ui_window = ui_windows

    set_model_path(model_path if model_path is not None
                   else os.path.join(os.getcwd(), 'model'))
    if 'model' not in os.listdir(os.getcwd()):
        os.makedirs('model')

    # create environment for train and test
    DATA_PATH = '../daily_data'
    environment = create_gold_env(window_size=window_size,
                                  path=DATA_PATH,
                                  train=True if mode == 'train' else False,
                                  selected_trading=selected_trading,
                                  selected_subject=selected_subject,
                                  init_invest=init_invest)

    network_spec = create_network_spec()
    baseline_spec = create_baseline_spec()

    if selected_learn == 'ppo':
        agent = PPOAgent(
            discount=0.9999,
            states=environment.states,
            actions=environment.actions,
            network=network_spec,
            # Agent
            states_preprocessing=None,
            actions_exploration=None,
            reward_preprocessing=None,
            # MemoryModel
            update_mode=dict(
                unit='timesteps',  #'episodes',
                # each update uses a batch of 32 timesteps
                batch_size=32,
                # update every 10 timesteps
                frequency=10),
            memory=dict(type='latest',
                        include_next_states=False,
                        capacity=50000),
            # DistributionModel
            distributions=None,
            entropy_regularization=0.0,  # None
            # PGModel
            baseline_mode='states',
            baseline=dict(type='custom', network=baseline_spec),
            baseline_optimizer=dict(
                type='multi_step',
                optimizer=dict(
                    type='adam',
                    learning_rate=(1e-4)  # 3e-4
                ),
                num_steps=5),
            gae_lambda=0,  # 0
            # PGLRModel
            likelihood_ratio_clipping=0.2,
            # PPOAgent
            step_optimizer=dict(
                type='adam',
                learning_rate=(1e-4)  # 1e-4
            ),
            subsampling_fraction=0.2,  # 0.1
            optimization_steps=10,
            execution=dict(type='single',
                           session_config=None,
                           distributed_spec=None))
    else:  # learn_model=='dqn' or etc.
        agent = DQNAgent(
            states=environment.states,
            actions=environment.actions,
            network=[
                dict(type='flatten'),
                dict(type='dense', size=32, activation='relu'),
                dict(type='dense', size=32, activation='relu'),
            ],
        )

    if mode == 'test' or addition_train:
        if len([elem for elem in os.listdir(LOAD_DIR)
                if 'trading_model' in elem]) >= 3:
            agent.restore_model(LOAD_DIR)
            print('loaded')
        elif mode == 'test':
            # "It looks like there is no trading model to load."
            ui_windows.setInfo(msg="로딩할 트레이딩모델이 존재하지 않는 것으로 보입니다.")
            return

    runner = Runner(agent=agent, environment=environment)
    if mode == 'train':
        kwargs = dict(episodes=episode,
                      max_episode_timesteps=16000,
                      episode_finished=episode_finished)
    else:  # mode=='test'
        kwargs = dict(num_episodes=episode,
                      deterministic=True,
                      testing=True,
                      episode_finished=print_simple_log)
    runner.run(**kwargs)

    # TODO: store per-episode portfolio results in TFTraderEnv and push the data to the UI at every step.
    # setResult(????)
    msg = "{mode} finished. Total episodes: {ep}. \nAverage reward of last 100 episodes: {ar}.".format(
        mode="Training" if mode == 'train' else "Testing",
        ep=runner.episode,
        ar=np.mean(runner.episode_rewards[-100:]))
    print(msg)
    ui_windows.setInfo(msg=msg)
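
A hedged invocation of the trading main() above; the stub UI object and the empty selections are placeholders for what the real Qt window and configuration would pass in, and a run still requires the data under ../daily_data.

class _StubUI:
    # minimal stand-in for the UI window main() expects
    def setInfo(self, msg):
        print(msg)


if __name__ == '__main__':
    main(mode='train',
         episode=500,
         window_size=30,
         init_invest=20000,
         selected_learn='ppo',
         selected_trading=[],      # placeholder selections
         selected_subject=[],
         ui_windows=_StubUI())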
Example No. 21
def main(args):
    version = 'v1'
    episodes = args.episodes
    visualize = args.visualize

    config = ffa_v0_fast_env()
    env = Pomme(**config["env_kwargs"])
    env.seed(0)

    agent = PPOAgent(
        states=dict(type='float', shape=(11, 11, 12)),
        actions=dict(type='int', num_actions=env.action_space.n),
        network=[
            # (9, 9, 12)
            dict(type='conv2d', size=12, window=3, stride=1),
            # (7, 7, 8)
            dict(type='conv2d', size=8, window=3, stride=1),
            # (5, 5, 4)
            dict(type='conv2d', size=4, window=3, stride=1),
            # (100)
            dict(type='flatten'),
            dict(type='dense', size=64, activation='relu'),
            dict(type='dense', size=16, activation='relu'),
        ],
        batching_capacity=1000,
        step_optimizer=dict(type='adam', learning_rate=1e-4))

    if os.path.exists(os.path.join('models', version, 'checkpoint')):
        agent.restore_model(directory=os.path.join('models', version))

    agents = []
    for agent_id in range(3):
        # agents.append(RandomAgent(config["agent"](agent_id, config["game_type"])))
        # agents.append(StoppingAgent(config["agent"](agent_id, config["game_type"])))
        agents.append(
            SimpleAgent(config["agent"](agent_id, config["game_type"])))

    agent_id += 1  # the learning agent takes the next id after the three SimpleAgents
    agents.append(
        TensorforceAgent(config["agent"](agent_id, config["game_type"])))
    env.set_agents(agents)
    env.set_training_agent(agents[-1].agent_id)
    env.set_init_game_state(None)

    wrapped_env = WrappedEnv(env, agent, visualize)
    runner = Runner(agent=agent, environment=wrapped_env)

    try:
        runner.run(episodes=episodes, max_episode_timesteps=100)
    except Exception as e:
        raise e
    finally:
        agent.save_model(directory=os.path.join('models', version, 'agent'))

    win_count = len(
        list(filter(lambda reward: reward == 1, runner.episode_rewards)))
    print('Stats: ')
    print(f'  runner.episode_rewards = {runner.episode_rewards}')
    print(f'  win count = {win_count}')

    try:
        runner.close()
    except AttributeError as e:
        raise e
def main():
    env = gym.make('Breakout-v0')

    # (210, 160, 3)
    print(env.observation_space.shape)
    # [[[255...]]]
    print(env.observation_space.high)
    # [[[0...]]]
    print(env.observation_space.low)
    # 4
    print(env.action_space.n)

    agent = PPOAgent(
        # (210, 160, 3)
        states=dict(type='float', shape=env.observation_space.shape),
        network=[
            # (51, 29, 32)
            dict(type='conv2d', size=32, window=8, stride=4,
                 activation='relu'),
            # (24, 18, 64)
            dict(type='conv2d', size=64, window=4, stride=2,
                 activation='relu'),
            # (22, 16, 64)
            dict(type='conv2d', size=64, window=3, stride=1,
                 activation='relu'),
            # 22528
            dict(type='flatten'),
            dict(type='dense', size=512, activation='relu'),
            dict(type='dense', size=32, activation='relu'),
        ],
        # batching_capacity=10,
        memory=dict(
            type='latest',
            include_next_states=False,
            capacity=1000,
        ),
        # update=dict(unit='timesteps', batch_size=64),
        actions=dict(type='int', num_actions=env.action_space.n),
        step_optimizer=dict(type='adam', learning_rate=1e-4))

    model_dir = 'models/breakout'

    # load model
    if os.path.exists(f'{model_dir}/checkpoint'):
        agent.restore_model(directory=model_dir)

    try:
        for step in range(100000):
            observation = env.reset()

            done = False
            step_reward = 0
            while not done:
                # env.render()

                # from PIL import Image
                # pil_img = Image.fromarray(observation)
                # pil_img.save('./observation.png')

                states = observation / 256

                action = agent.act(states=states)

                observation, reward, done, info = env.step(action)

                reward = reward / 10

                agent.observe(reward=reward, terminal=done)

                step_reward += reward

                if done:
                    print(f'step = {step}, reward = {step_reward}')
    except Exception as e:
        raise e
    finally:
        agent.save_model(directory=f'{model_dir}/agent')
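
The Breakout agent above consumes raw RGB frames; below is a hedged variant of its constructor showing how the grayscale preprocessing spec from the first example could be plugged in to reduce the input to one channel. The network and optimizer values are illustrative, and env is assumed to be the same Gym environment created in main().

preprocessing_config = [
    dict(type='grayscale'),   # (210, 160, 3) -> single-channel input before the conv stack
]

agent = PPOAgent(
    states=dict(type='float', shape=env.observation_space.shape),
    states_preprocessing=preprocessing_config,
    actions=dict(type='int', num_actions=env.action_space.n),
    network=[
        dict(type='conv2d', size=32, window=8, stride=4, activation='relu'),
        dict(type='flatten'),
        dict(type='dense', size=256, activation='relu'),
    ],
    step_optimizer=dict(type='adam', learning_rate=1e-4))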
Example No. 23
def main(argv):
    logging_basicConfig(level=INFO)
    logger = getLogger(__file__)
    logger.setLevel(INFO)

    environment = OpenAIGym(
        gym_id='MoveToBeacon-bbueno5000-v0',
        monitor=FLAGS.monitor,
        monitor_safe=FLAGS.monitor_safe,
        monitor_video=FLAGS.monitor_video,
        visualize=FLAGS.visualize)

    # if FLAGS.agent_config is not None:
    #     with open(FLAGS.agent_config, 'r') as fp:
    #         agent_config = json.load(fp=fp)
    # else:
    #     raise TensorForceError(
    #         "No agent configuration provided.")

    # if FLAGS.network is not None:
    #     with open(FLAGS.network, 'r') as fp:
    #         network = json.load(fp=fp)
    # else:
    #     network = None
    #     logger.info(
    #         "No network configuration provided.")

    network_spec = [
        dict(type='flatten'),
        dict(type='dense', size=32),
        dict(type='dense', size=32)
        ]

    agent = PPOAgent(
        states=environment.states,
        actions=environment.actions,
        network=network_spec
        )

    if FLAGS.load:
        load_dir = path.dirname(FLAGS.load)
        if not path.isdir(load_dir):
            raise OSError(
                "Could not load agent from {}: No such directory.".format(load_dir))
        agent.restore_model(FLAGS.load)

    if FLAGS.save:
        save_dir = path.dirname(FLAGS.save)
        if not path.isdir(save_dir):
            try:
                mkdir(save_dir, 0o755)
            except OSError:
                raise OSError(
                    "Cannot save agent to dir {} ()".format(save_dir))

    if FLAGS.debug:
        logger.info("-" * 16)
        logger.info("Configuration:")
        logger.info(agent)

    runner = Runner(
        agent=agent,
        environment=environment,
        repeat_actions=1)

    if FLAGS.debug:
        report_episodes = 1
    else:
        report_episodes = 100

    logger.info(
        "Starting {agent} for Environment {env}".format(
            agent=agent, env=environment))

    def episode_finished(r, id_):
        if r.episode % report_episodes == 0:
            steps_per_second = r.timestep / (time() - r.start_time)
            logger.info("Finished episode {:d} after {:d} timesteps. Steps Per Second {:0.2f}".format(
                r.agent.episode, r.episode_timestep, steps_per_second))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {:0.2f}".format(
                sum(r.episode_rewards[-500:]) / min(500, len(r.episode_rewards))))
            logger.info("Average of last 100 rewards: {:0.2f}".format(
                sum(r.episode_rewards[-100:]) / min(100, len(r.episode_rewards))))
        if FLAGS.save and FLAGS.save_episodes is not None and not r.episode % FLAGS.save_episodes:
            logger.info("Saving agent to {}".format(FLAGS.save))
            r.agent.save_model(FLAGS.save)
        return True

    runner.run(
        num_timesteps=FLAGS.timesteps,
        num_episodes=FLAGS.num_episodes,
        max_episode_timesteps=FLAGS.max_episode_timesteps,
        deterministic=FLAGS.deterministic,
        episode_finished=episode_finished,
        testing=FLAGS.test,
        sleep=FLAGS.sleep)

    runner.close()

    logger.info("Learning completed.")
    logger.info("Total episodes: {ep}".format(ep=runner.agent.episode))
Example No. 24
class TensorForceAgent(BaseAgent):
    """The TensorForceAgent. Acts through the algorith, not here."""
    def __init__(self,
                 character=characters.Bomber,
                 algorithm='ppo',
                 checkpoint='models/ppo'):
        super(TensorForceAgent, self).__init__(character)
        self.algorithm = algorithm
        self.checkpoint = checkpoint
        self.agent = None
        self.state = {}
        self.env = None

    def act(self, obs, action_space):
        """This agent has its own way of inducing actions. See train_with_tensorforce."""
        agent_state = self.env.featurize(obs)
        action = self.agent.act(agent_state)
        return action

    def initialize(self, env):
        from gym import spaces
        from tensorforce.agents import PPOAgent
        self.env = env

        if self.algorithm == "ppo":
            if isinstance(env.action_space, spaces.Tuple):
                actions = {
                    str(num): {
                        'type': 'int',
                        'num_actions': space.n
                    }
                    for num, space in enumerate(env.action_space.spaces)
                }
            else:
                actions = dict(type='int', num_actions=env.action_space.n)

            self.agent = PPOAgent(states=dict(
                type='float', shape=env.observation_space.shape),
                                  actions=actions,
                                  network=[
                                      dict(type='dense', size=64),
                                      dict(type='dense', size=64)
                                  ],
                                  batching_capacity=1000,
                                  step_optimizer=dict(type='adam',
                                                      learning_rate=1e-4))

            self.restore_model_if_exists(self.checkpoint)

        return self.agent

    def restore_model_if_exists(self, checkpoint):
        pardir = os.path.abspath(os.path.join(checkpoint, os.pardir))
        if os.path.exists(pardir):
            self.agent.restore_model(pardir)
            print("tensorforce model '{}' restored.".format(pardir))

    def save_model(self, checkpoint):
        pardir = os.path.abspath(os.path.join(checkpoint, os.pardir))
        if not os.path.exists(pardir):
            os.mkdir(pardir)
            print("checkpoint dir '{}' created.".format(pardir))
        checkpoint_path = self.agent.save_model(pardir, False)
        print("checkpoint model '{}' saved.".format(checkpoint_path))
Example No. 25

def episode_finished_train(r):
    print("Trained mother: " + str(r.episode_rewards[-1]))

    train_reward.append(r.episode_rewards[-1])
    plt.plot(train_reward, 'r+')
    plt.pause(0.01)
    return True


f = open("forex_models_gradient_2/checkpoint", "r")

lines = f.readlines()

train_reward = list()
validator_reward = list()


for i in range(1, len(lines) - 1):
    split = lines[i].split()
    model_path = split[1]
    print(model_path[1:len(model_path) - 1])
    real_model_path = model_path[1:len(model_path) - 1]
    train_agent.restore_model(directory='forex_models_gradient_2', file=real_model_path)
    train_runner = Runner(agent=train_agent, environment=train_env)
    train_runner.run(episodes=1,
                     max_episode_timesteps=(candles.candle_nums + 100),
                     episode_finished=episode_finished_train,
                     deterministic=True)
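
The three checkpoint-replay scripts above (Examples No. 15, 18 and 25) parse TensorFlow's plain-text checkpoint index file by hand: split()[1] grabs the quoted path and the slice strips the quotes. Below is a hedged helper doing the same thing more explicitly, assuming the standard all_model_checkpoint_paths: "model-123" line format.

def checkpoint_files(checkpoint_index_path):
    # Collect the checkpoint names listed in a TensorFlow 'checkpoint' index file.
    names = []
    with open(checkpoint_index_path, "r") as f:
        for line in f:
            key, _, value = line.partition(":")
            if key.strip() == "all_model_checkpoint_paths":
                names.append(value.strip().strip('"'))
    return names

# e.g.:
# for name in checkpoint_files("forex_models_gradient_2/checkpoint"):
#     train_agent.restore_model(directory='forex_models_gradient_2', file=name)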