# model = Sequential()
# model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
# model.add(Dense(16))
# model.add(Activation('relu'))
# model.add(Dense(16))
# model.add(Activation('relu'))
# model.add(Dense(16))
# model.add(Activation('relu'))
# model.add(Dense(nb_actions))
# model.add(Activation('softmax'))
print(model.summary())

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = EpisodeParameterMemory(limit=1000, window_length=1)
cem = CEMAgent(model=model, nb_actions=nb_actions, memory=memory, batch_size=50,
               nb_steps_warmup=2000, train_interval=50, elite_frac=0.05)
cem.compile()

# Okay, now it's time to learn something! Visualizing training slows it down quite a lot, so it is
# disabled here. You can always safely abort training prematurely using Ctrl + C.
cem.fit(env, nb_steps=100000, visualize=False, verbose=2)
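# The snippet above assumes `env`, `nb_actions`, and `model` already exist, along with the Keras and
# keras-rl imports. A minimal sketch of those assumptions (CartPole-v0 is just a placeholder choice;
# `model` itself can be obtained by uncommenting the architecture above):
import gym
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from rl.agents.cem import CEMAgent
from rl.memory import EpisodeParameterMemory

env = gym.make('CartPole-v0')
nb_actions = env.action_space.n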
observation_n = env.observation_space.shape[0]

# create a model
model = Sequential()
model.add(Flatten(input_shape=(args.batch_size, observation_n)))

# Complex Deep NN Model
for i in range(args.hidden_layers):
    model.add(Dense(args.hidden_units))
    model.add(Activation(args.activation_function))
model.add(Dense(nb_actions))
model.add(Activation('softmax'))
print(model.summary())

Agent = AGENT_DIC[args.agent]
if args.agent == 'cem':
    memory = EpisodeParameterMemory(limit=args.memory_limit, window_length=args.batch_size)
    agent = Agent(model=model, nb_actions=nb_actions, memory=memory, batch_size=args.batch_size,
                  nb_steps_warmup=args.steps_warmup, train_interval=1, elite_frac=args.elite_frac)
    agent.compile()
elif args.agent == 'dqn':
    memory = SequentialMemory(limit=args.memory_limit, window_length=args.batch_size)
    policy = BoltzmannQPolicy()
    agent = DQNAgent(model=model, nb_actions=nb_actions, memory=memory,
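# The snippet above relies on a command-line parser and an AGENT_DIC mapping that are not shown.
# A hypothetical sketch of what they might look like (all names and default values here are
# assumptions, not taken from the project):
import argparse
from rl.agents.cem import CEMAgent
from rl.agents.dqn import DQNAgent

AGENT_DIC = {'cem': CEMAgent, 'dqn': DQNAgent}  # hypothetical mapping

parser = argparse.ArgumentParser()
parser.add_argument('--agent', default='cem', choices=list(AGENT_DIC.keys()))
parser.add_argument('--hidden_layers', type=int, default=3)
parser.add_argument('--hidden_units', type=int, default=16)
parser.add_argument('--activation_function', default='relu')
parser.add_argument('--batch_size', type=int, default=1)
parser.add_argument('--memory_limit', type=int, default=1000)
parser.add_argument('--steps_warmup', type=int, default=2000)
parser.add_argument('--elite_frac', type=float, default=0.05)
args = parser.parse_args()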
def main():
    """Build model and train on environment."""
    #env = MarketEnv(("ES", "FUT", "GLOBEX", "USD"), obs_xform=xform.BinaryDelta(3), episode_steps=STEPS_PER_EPISODE, client_id=3)
    #env = MarketEnv("BTC-USD", max_quantity=10, quantity_increment=1, obs_type='time', obs_size=30, obs_xform=xform.BinaryDelta(3), episode_steps=STEPS_PER_EPISODE, client_id=2, loglevel=logging.DEBUG)
    env = gym.make('trading-v0').env
    env.initialise(symbol='000001', start='2012-01-01', end='2017-01-01', days=252)
    #env = MarketEnv(("AAPL", "STK", "SMART", "USD"), obs_xform=xform.BinaryDelta(3), episode_steps=STEPS_PER_EPISODE, client_id=4)
    nb_actions = 3  # Keras-RL CEM is a discrete agent

    # Option 1: Simple model
    # model = Sequential([
    #     Flatten(input_shape=(1,) + env.observation_space.shape),
    #     Dense(nb_actions),
    #     Activation('softmax')
    # ])

    # Option 2: deep network
    hidden_nodes = reduce(operator.mul, env.observation_space.shape, 1)
    model = Sequential([
        Flatten(input_shape=(1,) + env.observation_space.shape),
        Dense(hidden_nodes),
        Activation('relu'),
        Dense(hidden_nodes),
        Activation('relu'),
        Dense(hidden_nodes),
        Activation('relu'),
        Dense(nb_actions),
        Activation('softmax')
    ])
    print(model.summary())

    param_logger = CEMParamLogger('cem_{}_params.json'.format('aaaa'))
    callbacks = [
        param_logger,
        FileLogger('cem_{}_log.json'.format('aaaa'), interval=STEPS_PER_EPISODE)
    ]

    # Start with the last saved params if present
    theta_init = param_logger.read_params()
    if theta_init is not None:
        print('Starting with parameters from {}:\n{}'.format(
            param_logger.params_filename, theta_init))

    # Remember the parameters and rewards for the last `limit` episodes.
    memory = EpisodeParameterMemory(limit=EPISODES, window_length=1)
    cem = CEMAgent(model=model, nb_actions=nb_actions, memory=memory, batch_size=EPISODES,
                   nb_steps_warmup=WARMUMP_EPISODES * STEPS_PER_EPISODE,
                   train_interval=TRAIN_INTERVAL_EPISODES, elite_frac=0.2,
                   theta_init=theta_init, processor=DiscreteProcessor(),
                   noise_decay_const=0, noise_ampl=0)
    """
    :param memory: Remembers the parameters and rewards for the last `limit` episodes.
    :param int batch_size: Randomly sample this many episode parameters from memory before taking
        the top `elite_frac` to construct the next-generation parameters.
    :param int nb_steps_warmup: Run for this many steps (total) to fill memory before training.
    :param int train_interval: Train (update parameters) every this many episodes.
    :param float elite_frac: Take this top fraction of the `batch_size` randomly sampled parameters
        from the episode memory to construct new parameters.
    """
    cem.compile()
    cem.fit(env, nb_steps=STEPS_PER_EPISODE * EPISODES, visualize=False, verbose=2,
            callbacks=callbacks)
    # cem.save_weights('cem_{}_weights.h5f'.format(env.instrument.symbol), overwrite=True)
    cem.test(env, nb_episodes=2, visualize=True)
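# main() above assumes several module-level names that are not shown. A hypothetical sketch of
# those assumptions (values are placeholders, and WARMUMP_EPISODES keeps the snippet's spelling;
# CEMParamLogger and DiscreteProcessor are project-specific helpers and are not reproduced here):
import operator
from functools import reduce

import gym
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from rl.agents.cem import CEMAgent
from rl.memory import EpisodeParameterMemory
from rl.callbacks import FileLogger

STEPS_PER_EPISODE = 252        # placeholder: steps per trading episode
EPISODES = 100                 # placeholder: episodes kept in memory / sampled per batch
WARMUMP_EPISODES = 5           # placeholder: warm-up episodes before training starts
TRAIN_INTERVAL_EPISODES = 10   # placeholder: update parameters every this many episodes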
def train():
    # Get the environment and extract the number of actions.
    env = gym.make(ENV_NAME)
    np.random.seed(123)
    env.seed(123)
    nb_actions = env.action_space.n
    obs_dim = env.observation_space.shape[0]

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    # Option 1: Simple model
    # model = Sequential()
    # model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    # model.add(Dense(nb_actions))
    # model.add(Activation('softmax'))

    # Option 2: deep network
    model = Sequential()
    model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions))
    model.add(Activation('softmax'))
    model.summary()

    # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
    # even the metrics!
    memory = EpisodeParameterMemory(limit=1000, window_length=1)

    if REWARD == "normal":
        cem = CEMAgent(model=model, nb_actions=nb_actions, memory=memory, batch_size=50,
                       nb_steps_warmup=2000, train_interval=50, elite_frac=0.05)
        cem.compile()
        history_normal = cem.fit(env, nb_steps=100000, visualize=False, verbose=2)
        cem.save_weights(os.path.join(LOG_DIR, 'cem_normal_{}_params.h5f'.format(ENV_NAME)),
                         overwrite=True)
        cem.test(env, nb_episodes=5, visualize=False)
        pandas.DataFrame(history_normal.history).to_csv(os.path.join(LOG_DIR, "normal.csv"))

    elif REWARD == "noisy":
        if not SMOOTH:
            processor_noisy = CartpoleProcessor(e_=ERR_N, e=ERR_P, smooth=False, surrogate=False)
        else:
            processor_noisy = CartpoleProcessor(e_=ERR_N, e=ERR_P, smooth=True, surrogate=False)
        # processor_surrogate = CartpoleSurrogateProcessor(e_=ERR_N, e=ERR_P, surrogate=False)
        cem = CEMAgent(model=model, nb_actions=nb_actions, memory=memory, batch_size=50,
                       nb_steps_warmup=2000, train_interval=50, elite_frac=0.05,
                       processor=processor_noisy)
        cem.compile()
        history_noisy = cem.fit(env, nb_steps=100000, visualize=False, verbose=2)
        if not SMOOTH:
            cem.save_weights(os.path.join(LOG_DIR, 'cem_noisy_{}_params.h5f'.format(ENV_NAME)),
                             overwrite=True)
            pandas.DataFrame(history_noisy.history).to_csv(os.path.join(LOG_DIR, "noisy.csv"))
        else:
            cem.save_weights(os.path.join(LOG_DIR, 'cem_noisy_smooth_{}_params.h5f'.format(ENV_NAME)),
                             overwrite=True)
            pandas.DataFrame(history_noisy.history).to_csv(os.path.join(LOG_DIR, "noisy_smooth.csv"))
        cem.test(env, nb_episodes=5, visualize=False)

    elif REWARD == "surrogate":
        if not SMOOTH:
            processor_surrogate = CartpoleProcessor(e_=ERR_N, e=ERR_P, smooth=False, surrogate=True)
        else:
            processor_surrogate = CartpoleProcessor(e_=ERR_N, e=ERR_P, smooth=True, surrogate=True)
        # processor_surrogate = CartpoleSurrogateProcessor(e_=ERR_N, e=ERR_P, surrogate=True)
        cem = CEMAgent(model=model, nb_actions=nb_actions, memory=memory, batch_size=50,
                       nb_steps_warmup=2000, train_interval=50, elite_frac=0.05,
                       processor=processor_surrogate)
        cem.compile()
        history_surrogate = cem.fit(env, nb_steps=100000, visualize=False, verbose=2)
        if not SMOOTH:
            cem.save_weights(os.path.join(LOG_DIR, 'cem_surrogate_{}_params.h5f'.format(ENV_NAME)),
                             overwrite=True)
            pandas.DataFrame(history_surrogate.history).to_csv(os.path.join(LOG_DIR, "surrogate.csv"))
        else:
            cem.save_weights(os.path.join(LOG_DIR, 'cem_surrogate_smooth_{}_params.h5f'.format(ENV_NAME)),
                             overwrite=True)
            pandas.DataFrame(history_surrogate.history).to_csv(os.path.join(LOG_DIR, "surrogate_smooth.csv"))
        cem.test(env, nb_episodes=5, visualize=False)

    else:
        raise NotImplementedError
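# train() above relies on module-level configuration that is not shown. A hypothetical sketch of
# those assumptions (names of the imports follow the calls above; all values are placeholders, and
# CartpoleProcessor is a project-specific Processor that perturbs rewards, not reproduced here):
import os

import gym
import numpy as np
import pandas
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from rl.agents.cem import CEMAgent
from rl.memory import EpisodeParameterMemory

ENV_NAME = 'CartPole-v0'   # placeholder environment name
REWARD = 'normal'          # one of 'normal', 'noisy', 'surrogate'
SMOOTH = False             # whether to smooth the perturbed rewards
ERR_N, ERR_P = 0.1, 0.1    # placeholder noise rates passed to CartpoleProcessor
LOG_DIR = './logs'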
sys.path.append(".") from patternmatching.gray.incremental.query_call import load_graph, parse_args from patternmatching.gray.incremental.rl_model import GraphEnv logging.basicConfig(level=logging.INFO) policies = { "bqp": BoltzmannQPolicy(), # Unstable "gqp": GreedyQPolicy(), "egqp": EpsGreedyQPolicy(eps=0.1) # eps should be around 0.1 } window_length = 5 # Should be less than 20 (too large value will not converge Q-values) memories = { "epm": EpisodeParameterMemory(limit=20, window_length=window_length), # Non-episodic "sm": SequentialMemory(limit=20, window_length=window_length) # should use this } argv = sys.argv if len(argv) < 4: print("Usage: python %s [ConfFile] [Policy] [Memory]" % argv[0]) exit(1) policy_name = argv[2] if not policy_name in policies: print("Please specify correct policy name: %s" % str(policies.keys())) exit(1) policy = policies[policy_name]
# Option 2: deep network
# model = Sequential()
# model.add(Dense(16, input_dim=obs_dim))
# model.add(Activation('relu'))
# model.add(Dense(16))
# model.add(Activation('relu'))
# model.add(Dense(16))
# model.add(Activation('relu'))
# model.add(Dense(nb_actions))
# model.add(Activation('softmax'))
print(model.summary())

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
# keras-rl's EpisodeParameterMemory takes a `window_length` argument; 1 is the usual setting for CEM.
memory = EpisodeParameterMemory(limit=1000, window_length=1)
cem = CEMAgent(model=model, nb_actions=nb_actions, memory=memory, batch_size=50,
               nb_steps_warmup=2000, train_interval=50, elite_frac=0.05)
cem.compile()

# Okay, now it's time to learn something! Visualizing training slows it down quite a lot, so it is
# disabled here. You can always safely abort training prematurely using Ctrl + C.
cem.fit(env, nb_steps=100000, visualize=False, verbose=2)
def run_agent(agent):
    print("started new process")
    import tensorflow as tf
    from keras.backend.tensorflow_backend import set_session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    set_session(tf.Session(config=config))

    WINDOW_LENGTH = 1
    num_actions = 3
    view_shape = (21, 21)
    input_shape = (WINDOW_LENGTH,) + view_shape
    env = RestrictedViewTronEnv(agent, 10)

    model = Sequential()
    model.add(Permute((2, 3, 1), input_shape=input_shape))
    model.add(Conv2D(16, (3, 3), padding="same"))
    model.add(Activation("relu"))
    model.add(Conv2D(32, (3, 3), padding="same"))
    model.add(Activation("relu"))
    model.add(Flatten())
    model.add(Dense(256))
    model.add(Activation("relu"))
    model.add(Dense(num_actions))
    model.add(Activation('softmax'))

    np.random.seed(2363)
    # policy = LinearAnnealedPolicy(BoltzmannQPolicy(), attr='tau', value_max=2.,
    #                               value_min=.1, value_test=.1, nb_steps=1000000 // 10)
    processor = TronProcessor()
    memory = EpisodeParameterMemory(limit=1000000, window_length=WINDOW_LENGTH)
    cem = CEMAgent(model, nb_actions=num_actions, memory=memory,
                   nb_steps_warmup=50000 // 5, train_interval=4)
    # dqn.compile(Adam(lr=.00025), metrics=["mae"])
    cem.compile()

    weights_filename = 'tmp/dqn_test_weights.h5f'
    checkpoint_weights_filename = 'tmp/dqn_test_weights_{step}.h5f'
    log_filename = 'tmp/dqn_test_log.json'
    callbacks = [ModelIntervalCheckpoint(checkpoint_weights_filename, interval=250000 // 10)]
    callbacks += [FileLogger(log_filename, interval=10000)]

    def train(transfer=False):
        print(cem.get_config())  # todo save to file
        if transfer:
            cem.load_weights(weights_filename)
        cem.fit(env, callbacks=callbacks, nb_steps=1750000 // 10, log_interval=10000)
        cem.save_weights(weights_filename, overwrite=True)
        cem.test(env, nb_episodes=20, visualize=True)

    def opponent():
        cem.load_weights('tmp/dqn_test_weights.h5f')
        cem.test(env, nb_episodes=200000, visualize=False)

    def test():
        cem.load_weights('tmp/dqn_test_weights.h5f')
        cem.test(env, nb_episodes=20, visualize=True)

    # opponent()
    train()  # True
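# The "started new process" message suggests run_agent is spawned in its own process. A hypothetical
# launcher sketch (my_agent is a placeholder for whatever game-side agent object
# RestrictedViewTronEnv expects in this project):
from multiprocessing import Process

if __name__ == '__main__':
    my_agent = ...  # placeholder: project-specific agent object
    worker = Process(target=run_agent, args=(my_agent,))
    worker.start()
    worker.join()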
def main(params=None):
    """
    Performs training and evaluation with the given params.
    :return: model
    """
    if params is None:
        params = {
            'model_type': 'dqn_agent',
            'l1_out': 128,
            'l2_out': 64,
            'gamma': 0.5,
            'target_model_update': 1,
            'delta_clip': 0.01,
            'nb_steps_warmup': 1000,
            # Filled in so the defaults cover every key referenced below (assumed values).
            'enable_double_dqn__': True,
            'dueling_type__': 'avg',
            'name': 'dqn'
        }

    model_type = 'dqn_agent'
    env_player = SimpleRLPlayer(battle_format="gen8randombattle")
    # print('env_player', env_player)
    # print('help', help(env_player))
    env_player2 = SimpleRLPlayer(battle_format="gen8randombattle")
    opponent = RandomPlayer(battle_format="gen8randombattle")
    second_opponent = MaxDamagePlayer(battle_format="gen8randombattle")

    # Output dimension
    n_action = len(env_player.action_space)

    # model_params = {
    #     'n_actions': n_action,
    #     'l1_out': 128,
    #     'l2_out': 64,
    #     'model_type': params['model_type']
    # }
    model_params = params
    model_params['n_actions'] = n_action
    model = get_model(model_params)
    # print('first model summary')
    # print(model.summary())

    # model = Sequential()
    # model.add(Dense(128, activation="elu", input_shape=(1, 10)))
    # # Our embedding has shape (1, 10), which affects our hidden layer
    # # dimension and output dimension.
    # # Flattening resolves potential issues that would arise otherwise.
    # model.add(Flatten())
    # model.add(Dense(64, activation="elu"))
    # model.add(Dense(n_action, activation="linear"))

    # elu activation is similar to relu:
    # https://ml-cheatsheet.readthedocs.io/en/latest/activation_functions.html#elu

    # Determine memory type
    if params['model_type'] in {'dqn_agent', 'sarsa_agent'}:
        # memory = SequentialMemory(limit=10000, window_length=1)
        memory = SequentialMemory(limit=NB_TRAINING_STEPS, window_length=1)
    else:
        memory = EpisodeParameterMemory(limit=10000, window_length=1)

    # Simple epsilon greedy.
    # What is a linear annealed policy?
    # - this policy gives gradually decreasing thresholds for the epsilon-greedy policy
    # - it acts as a wrapper around epsilon greedy to feed in a custom threshold
    pol_steps = NB_TRAINING_STEPS
    policy = LinearAnnealedPolicy(
        EpsGreedyQPolicy(),
        attr="eps",
        value_max=1.0,
        value_min=0.05,
        value_test=0,
        nb_steps=pol_steps,
    )
    # pol_steps = NB_TRAINING_STEPS
    policy_boltz = BoltzmannQPolicy(tau=1)
    # policy = LinearAnnealedPolicy(
    #     BoltzmannQPolicy(),
    #     attr="tau",
    #     value_max=1.0,
    #     value_min=0.05,
    #     value_test=0,
    #     nb_steps=pol_steps,
    # )
    policy = policy_boltz

    # Defining our agent
    # model = tf.keras.models.load_model('dqn_v_dqn')
    if params['model_type'] == 'dqn_agent':
        dqn = DQNAgent(
            model=model,
            nb_actions=len(env_player.action_space),
            policy=policy,
            memory=memory,
            nb_steps_warmup=params['nb_steps_warmup'],
            gamma=params['gamma'],
            target_model_update=params['target_model_update'],
            # delta_clip=0.01,
            delta_clip=params['delta_clip'],
            enable_double_dqn=params['enable_double_dqn__'],
            enable_dueling_network=params['enable_double_dqn__'],
            dueling_type=params['dueling_type__'])
        dqn.compile(Adam(lr=0.00025), metrics=["mae"])
    elif params['model_type'] == 'sarsa_agent':
        dqn = SARSAAgent(model=model,
                         nb_actions=len(env_player.action_space),
                         policy=policy,
                         nb_steps_warmup=params['nb_steps_warmup'],
                         gamma=params['gamma'],
                         delta_clip=params['delta_clip'])
        dqn.compile(Adam(lr=0.00025), metrics=["mae"])
    else:
        # CEMAgent
        # https://towardsdatascience.com/cross-entropy-method-for-reinforcement-learning-2b6de2a4f3a0
        dqn = CEMAgent(model=model,
                       nb_actions=len(env_player.action_space),
                       memory=memory,
                       nb_steps_warmup=params['nb_steps_warmup'])
        # CEMAgent has a different compile signature (no optimizer or metrics).
        dqn.compile()
        # dqn.compile(Adam(lr=0.00025), metrics=["mae"])

    # Opponent DQN
    dqn_opponent = DQNAgent(
        model=model,
        nb_actions=len(env_player.action_space),
        policy=policy,
        memory=memory,
        nb_steps_warmup=params['nb_steps_warmup'],
        gamma=params['gamma'],
        target_model_update=params['target_model_update'],
        # delta_clip=0.01,
        delta_clip=params['delta_clip'],
        enable_double_dqn=params['enable_double_dqn__'],
        enable_dueling_network=params['enable_double_dqn__'],
        dueling_type=params['dueling_type__'])
    dqn_opponent.compile(Adam(lr=0.00025), metrics=["mae"])

    # NB_TRAINING_STEPS = NB_TRAINING_STEPS
    # rl_opponent = TrainedRLPlayer(model)

    # Training: alternate opponents over several rounds.
    rounds = 4
    n_steps = NB_TRAINING_STEPS // rounds
    for k in range(rounds):
        env_player.play_against(
            env_algorithm=dqn_training,
            opponent=opponent,
            env_algorithm_kwargs={
                "dqn": dqn,
                "nb_steps": n_steps
            },
        )
        env_player.play_against(
            env_algorithm=dqn_training,
            opponent=second_opponent,
            env_algorithm_kwargs={
                "dqn": dqn,
                "nb_steps": n_steps
            },
        )

    name = params["name"] + "_model"
    model.save(name)
    # loaded_model = tf.keras.models.load_model(name)

    # Evaluation
    print("Results against random player:")
    env_player.play_against(
        env_algorithm=dqn_evaluation,
        opponent=opponent,
        env_algorithm_kwargs={
            "dqn": dqn,
            "nb_episodes": NB_EVALUATION_EPISODES
        },
    )

    print("\nResults against max player:")
    env_player.play_against(
        env_algorithm=dqn_evaluation,
        opponent=second_opponent,
        env_algorithm_kwargs={
            "dqn": dqn,
            "nb_episodes": NB_EVALUATION_EPISODES
        },
    )

    return model
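# The training loop above passes `dqn_training` and `dqn_evaluation` callbacks that are not shown,
# along with the NB_TRAINING_STEPS / NB_EVALUATION_EPISODES constants. A minimal sketch of what the
# two callbacks typically look like in poke-env's keras-rl examples (assumed, not taken from this
# project):
def dqn_training(player, dqn, nb_steps):
    dqn.fit(player, nb_steps=nb_steps)
    player.complete_current_battle()


def dqn_evaluation(player, dqn, nb_episodes):
    # Reset battle statistics so the win count reflects only this evaluation run.
    player.reset_battles()
    dqn.test(player, nb_episodes=nb_episodes, visualize=False, verbose=False)
    print("Won %d battles out of %d episodes" % (player.n_won_battles, nb_episodes))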
# Standard DQN model architecture.
input_shape = (WINDOW_LENGTH * INPUT_SHAPE[0],)
frame = Input(shape=input_shape)
dense = Dense(512, activation='relu')(frame)
dense = Dense(512, activation='relu')(dense)
buttons = Dense(nb_actions, activation='linear')(dense)
buttons = Softmax()(buttons)
model = Model(inputs=frame, outputs=buttons)
print(model.summary())

processor = AtariProcessor()

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = EpisodeParameterMemory(limit=100000, window_length=WINDOW_LENGTH)
cem = CEMAgent(model=model, nb_actions=nb_actions, memory=memory, processor=processor,
               batch_size=50, nb_steps_warmup=2000, train_interval=50, elite_frac=0.05)
cem.compile()

# Okay, now it's time to learn something! Visualizing training slows it down quite a lot, so it is
# disabled here. You can always safely abort training prematurely using Ctrl + C.
cem.fit(env, nb_steps=1000000, visualize=False, verbose=2)
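# The model above assumes several names defined elsewhere. A hypothetical sketch of those
# assumptions (values and the environment are placeholders; AtariProcessor is this project's
# Processor subclass and is not reproduced here):
import gym
from keras.models import Model
from keras.layers import Input, Dense, Softmax
from rl.agents.cem import CEMAgent
from rl.memory import EpisodeParameterMemory

WINDOW_LENGTH = 4                   # placeholder: number of stacked observations
INPUT_SHAPE = (128,)                # placeholder: per-frame feature length (e.g. Atari RAM)
env = gym.make('Breakout-ram-v0')   # placeholder environment choice
nb_actions = env.action_space.n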
# model = Sequential()
# model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
# model.add(Dense(16))
# model.add(Activation('relu'))
# model.add(Dense(16))
# model.add(Activation('relu'))
# model.add(Dense(16))
# model.add(Activation('relu'))
# model.add(Dense(nb_actions))
# model.add(Activation('softmax'))
print(model.summary())

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = EpisodeParameterMemory(limit=MEMORY_LIMIT, window_length=WINDOW_LENGHT)
cem = CEMAgent(model=model, nb_actions=nb_actions, memory=memory, batch_size=BATCH_SIZE,
               nb_steps_warmup=NB_STEPS_WARMUP, train_interval=TRAIN_INTERVAL,
               elite_frac=ELITE_FRAC)
cem.compile()

# Okay, now it's time to learn something! Visualizing the training slows it down quite a lot, so it
# is controlled by VISUALIZE_TRAIN here. You can always safely abort training prematurely using
# Ctrl + C.
cem.fit(env, nb_steps=NB_STEPS, visualize=VISUALIZE_TRAIN, verbose=VERBOSE)
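# The constants and the `model`/`env`/`nb_actions` names above are assumed to be defined earlier in
# the script. A hypothetical sketch of those constants (values are placeholders that mirror the
# common keras-rl CEM example settings; WINDOW_LENGHT keeps the snippet's spelling):
MEMORY_LIMIT = 1000
WINDOW_LENGHT = 1
BATCH_SIZE = 50
NB_STEPS_WARMUP = 2000
TRAIN_INTERVAL = 50
ELITE_FRAC = 0.05
NB_STEPS = 100000
VISUALIZE_TRAIN = False
VERBOSE = 2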