def test_double_dqn():
    env = TwoRoundDeterministicRewardEnv()
    np.random.seed(123)
    env.seed(123)
    random.seed(123)
    nb_actions = env.action_space.n

    # Next, we build a very simple model.
    model = Sequential()
    model.add(Dense(16, input_shape=(1,)))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))

    memory = SequentialMemory(limit=1000, window_length=1)
    policy = EpsGreedyQPolicy(eps=.1)
    dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory,
                   nb_steps_warmup=50, target_model_update=1e-1, policy=policy,
                   enable_double_dqn=True)
    dqn.compile(Adam(lr=1e-3))

    dqn.fit(env, nb_steps=2000, visualize=False, verbose=0)
    policy.eps = 0.
    h = dqn.test(env, nb_episodes=20, visualize=False)
    assert_allclose(np.mean(h.history['episode_reward']), 3.)
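# The integration test above (and test_duel_dqn further down) omits its imports.
# A minimal header it appears to rely on, assuming keras-rl 0.4.x and an old gym
# release that still ships the debugging envs -- treat the exact module paths as
# assumptions rather than the original file's header:
import random
import numpy as np
from numpy.testing import assert_allclose
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import Adam
from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory
from gym.envs.debugging import TwoRoundDeterministicRewardEnv  # path varies by gym version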
def run():
    env = game_env.MeleeEnv()
    nb_actions = env.action_space.shape[0]
    actor = build_network(env, nb_actions)
    critic, action_input = build_critic(env, nb_actions)
    # window_length is a required argument of keras-rl's SequentialMemory; 1 is assumed here.
    memory = SequentialMemory(limit=25000, window_length=1)
    # random_process = OrnsteinUhlenbeckProcess(theta=.15, mu=0., sigma=.3, size=nb_actions)
    agent = DQNAgent(
        batch_size=1000,
        nb_actions=nb_actions,
        model=actor,
        # processor=Process(),
        # window_length=4,
        # critic_action_input=action_input,
        memory=memory,
        nb_steps_warmup=100)
    # nb_steps_warmup_critic=100, nb_steps_warmup_actor=100,
    # random_process=random_process, gamma=.95, target_model_update=1e-1,
    # delta_range=(-10., 10.)
    agent.compile(RMSprop(lr=.0005), metrics=['mae'])
    agent.fit(env, nb_steps=100000, visualize=True, verbose=1,
              nb_max_start_steps=100,
              start_step_policy=lambda x: np.random.randint(nb_actions))

    # After training is done, we save the final weights.
    agent.save_weights('ddpg_{}_weights.h5f'.format(str(random.randrange(0, 100000))),
                       overwrite=True)
def train(learn_rate, model_update_interval, steps):
    dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory,
                   nb_steps_warmup=50000, target_model_update=model_update_interval,
                   policy=policy, gamma=.99, train_interval=4)
    dqn.compile(Adam(lr=learn_rate), metrics=['mae'])
    dqn.fit(env, nb_steps=steps, verbose=2, visualize=VISUALIZE)
    dqn.save_weights(SAVEFILE_FOLDER + "/dqn_pong_params.h5f", overwrite=True)
class DqnAgent(Agent):
    def __init__(self,
                 env: gym.Env,
                 memory=SequentialMemory(limit=50000, window_length=1),
                 logger=Logger(),
                 boxes_resolution=10,
                 nb_steps_warmup=20,
                 hidden_layers=[16, 16, 16],
                 policy=BoltzmannQPolicy(),
                 target_model_update=1e-2,
                 optimizer=Adam(lr=1e-3)):
        self.env = env
        if isinstance(boxes_resolution, int):
            boxes_resolution = (boxes_resolution,) * len(env.action_space.shape)
        self.boxes_resolution = boxes_resolution
        self.nb_actions = np.zeros(boxes_resolution).size

        model = Sequential()
        model.add(Flatten(input_shape=(1,) + env.observation_space.shape))  # TODO check this
        for l in hidden_layers:
            model.add(Dense(l, activation='relu'))
        model.add(Dense(self.nb_actions, activation='linear'))
        # TODO move this to util file?
        self.model = model
        print("dqn model summary :{0}".format(model.summary()))

        self.dqn = DQNAgent(model=model,
                            nb_actions=self.nb_actions,
                            memory=memory,
                            nb_steps_warmup=nb_steps_warmup,
                            target_model_update=target_model_update,
                            policy=policy,
                            processor=DqnProcessor(self.boxes_resolution,
                                                   env.action_space.low,
                                                   env.action_space.high))
        self.dqn.compile(optimizer=optimizer, metrics=['mae'])
        super().__init__(env, logger)

    def act(self, state, explore):
        action = self.dqn.processor.process_action(self.dqn.forward(state))
        return action

    def train(self, nb_episodes=1000, verbose=2, visualize=True):
        self.dqn.fit(env=self.env, nb_steps=nb_episodes, verbose=verbose,
                     visualize=visualize)
def test_duel_dqn():
    env = TwoRoundDeterministicRewardEnv()
    np.random.seed(123)
    env.seed(123)
    random.seed(123)
    nb_actions = env.action_space.n

    # Next, we build a very simple model.
    model = Sequential()
    model.add(Dense(16, input_shape=(1,)))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions, activation='linear'))

    memory = SequentialMemory(limit=1000, window_length=1)
    policy = EpsGreedyQPolicy(eps=.1)
    dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory,
                   nb_steps_warmup=50, target_model_update=1e-1, policy=policy,
                   enable_double_dqn=False, enable_dueling_network=True)
    dqn.compile(Adam(lr=1e-3))

    dqn.fit(env, nb_steps=2000, visualize=False, verbose=0)
    policy.eps = 0.
    h = dqn.test(env, nb_episodes=20, visualize=False)
    assert_allclose(np.mean(h.history['episode_reward']), 3.)
agent = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=100,
                   policy=policy)
agent.compile(Adam(lr=1e-3), metrics=['mae'])

# Training and testing modes
mode = args.learnmode
filename = 'dqn_6'  # dqn_discrete5_4lay_epsgreedpol_rwd0.01and_neg0.1_targetonly_60deg

if mode == 'train':
    # Train the agent
    tb = TensorBoard(log_dir='./logs/log_{}'.format(filename))
    hist = agent.fit(env, nb_steps=100000, visualize=True, verbose=2,
                     nb_max_episode_steps=500, callbacks=[tb])  # 20s episodes

    # Print history contents: episode_reward, nb_episode_steps, nb_steps
    print("history contents : ", hist.history.keys())

    # Summarize history for accuracy
    import matplotlib.pyplot as plt
    plt.plot(hist.history['episode_reward'])
    plt.plot(hist.history['nb_episode_steps'])
    plt.title('learning')
    plt.xlabel('episode')
    plt.legend(['episode_reward', 'nb_episode_steps'], loc='upper left')
    plt.show()
# The original snippet starts mid-function; the header and the Sequential() line below
# are reconstructed from the call site agent(env.observation_space.shape[0], env.action_space.n).
def agent(states, actions):
    model = Sequential()
    model.add(Flatten(input_shape=(1, states)))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(52, activation='relu'))
    model.add(Dense(actions, activation='linear'))
    return model


model = agent(env.observation_space.shape[0], env.action_space.n)

STEPS_RICERCA = 50000
memory = SequentialMemory(limit=10000, window_length=1)
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1.,
                              value_min=.1, value_test=.05, nb_steps=1000)
dqn = DQNAgent(model=model, nb_actions=env.action_space.n, memory=memory,
               nb_steps_warmup=10, target_model_update=1e-2, policy=policy,
               enable_dueling_network=True, dueling_type='avg')
dqn.compile(Adam(lr=1e-3), metrics=['mae'])
dqn.fit(env, nb_steps=STEPS_RICERCA, visualize=False, verbose=1)

scores = dqn.test(env, nb_episodes=100, visualize=False)
print('Average score over 100 test games:{}'.format(
    np.mean(scores.history['episode_reward'])))
## Set up the agent for training ##
memory = SequentialMemory(limit=params.REPLAY_BUFFER_SIZE, window_length=1)
agent = DQNAgent(model=model, policy=BoltzmannQPolicy(), memory=memory,
                 nb_actions=action_size)
agent.compile(Adam(lr=params.LR_MODEL), metrics=[params.METRICS])

## Train ##
if args.train:
    check_overwrite('DQN', params.ENV, args.model)
    history = agent.fit(env, nb_steps=params.N_STEPS_TRAIN,
                        visualize=args.visualize, verbose=1,
                        nb_max_episode_steps=env._max_episode_steps,
                        log_interval=params.LOG_INTERVAL)
    agent.save_weights(WEIGHTS_FILES, overwrite=True)
    save_plot_reward('DQN', params.ENV, history, args.model, params.PARAMS)

## Test ##
if not args.train:
    agent.load_weights(WEIGHTS_FILES)
    history = agent.test(env, nb_episodes=params.N_EPISODE_TEST,
                         visualize=args.visualize,
                         nb_max_episode_steps=env._max_episode_steps)
    save_result('DQN', params.ENV, history, args.model, params.PARAMS)
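# The block above reads configuration from a project-specific params module and a few
# helper functions (check_overwrite, save_plot_reward, save_result) that are not shown.
# An illustrative params.py placeholder -- the names come from the snippet, every value
# is an assumption:
REPLAY_BUFFER_SIZE = 50000   # SequentialMemory limit
LR_MODEL = 1e-3              # Adam learning rate
METRICS = 'mae'              # metric passed to agent.compile
ENV = 'CartPole-v1'          # environment name used for bookkeeping
N_STEPS_TRAIN = 100000       # training steps passed to agent.fit
N_EPISODE_TEST = 10          # episodes passed to agent.test
LOG_INTERVAL = 10000         # log interval for agent.fit
PARAMS = {}                  # free-form dict recorded alongside results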
class Player:
    """Mandatory class with the player methods"""

    def __init__(self, name='DQN', load_model=None, env=None):
        """Initialization of an agent"""
        self.equity_alive = 0
        self.actions = []
        self.last_action_in_stage = ''
        self.temp_stack = []
        self.name = name
        self.autoplay = True

        self.dqn = None
        self.model = None
        self.env = env

        # if load_model:
        #     self.model = self.load_model(load_model)

    def initiate_agent(self, env, model_name=None, load_memory=None, load_model=None,
                       load_optimizer=None, load_dqn=None, batch_size=500,
                       learn_rate=1e-3):
        """Initiate a deep Q agent"""
        # tf.compat.v1.disable_eager_execution()
        self.env = env
        nb_actions = self.env.action_space.n

        if load_model:
            pass
            # self.model, trainable_model, target_model = self.load_model(load_model)
            # print(self.model.history)
        else:
            pass

        self.model = Sequential()
        self.model.add(Dense(512, activation='relu', input_shape=env.observation_space))
        self.model.add(Dropout(0.2))
        self.model.add(Dense(512, activation='relu'))
        self.model.add(Dropout(0.2))
        self.model.add(Dense(512, activation='relu'))
        self.model.add(Dropout(0.2))
        self.model.add(Dense(nb_actions, activation='linear'))

        # Finally, we configure and compile our agent. You can use every built-in
        # Keras optimizer and even the metrics!
        if load_memory:
            # print(load_memory)
            # exit()
            try:
                memory = self.load_memory(load_memory)
            except:
                # NOTE: if loading fails, memory stays undefined below.
                pass
        else:
            memory = SequentialMemory(limit=memory_limit, window_length=window_length)

        self.batch_size = batch_size

        self.policy = CustomEpsGreedyQPolicy()
        self.policy.env = self.env

        self.test_policy = CustomEpsGreedyQPolicy()
        self.test_policy.eps = 0.05
        self.test_policy.env = self.env

        self.reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.2, patience=5,
                                           min_lr=1e-4)

        nb_actions = env.action_space.n

        self.dqn = DQNAgent(model=self.model, nb_actions=nb_actions, memory=memory,
                            nb_steps_warmup=nb_steps_warmup, target_model_update=1e-2,
                            policy=self.policy, test_policy=self.test_policy,
                            processor=CustomProcessor(), batch_size=self.batch_size,
                            train_interval=train_interval,
                            enable_double_dqn=enable_double_dqn)
        # timestr = time.strftime("%Y%m%d-%H%M%S") + "_" + str(model_name)
        # self.tensorboard = MyTensorBoard(log_dir='./Graph/{}'.format(timestr), player=self)
        self.dqn.compile(Adam(lr=learn_rate), metrics=['mae'])

        if load_model:
            self.load_model(load_model)
            # self.dqn.trainable_model = trainable_model
            # self.dqn.target_model = target_model
            # self.reduce_lr = ReduceLROnPlateau
        if load_optimizer:
            self.load_optimizer_weights(load_optimizer)

    def start_step_policy(self, observation):
        """Custom policy for random decisions for warm up."""
        log.info("Random action")
        _ = observation
        legal_moves_limit = [move.value for move in self.env.info['legal_moves']]
        action = np.random.choice(legal_moves_limit)
        return action

    def train(self, env_name, batch_size=500, policy_epsilon=0.2):
        """Train a model"""
        # initiate training loop
        train_vars = {'batch_size': batch_size, 'policy_epsilon': policy_epsilon}
        timestr = time.strftime("%Y%m%d-%H%M%S") + "_" + str(env_name)
        tensorboard = TensorBoard(log_dir='./Graph/{}'.format(timestr),
                                  histogram_freq=0, write_graph=True,
                                  write_images=False)

        self.dqn.fit(self.env, nb_max_start_steps=nb_max_start_steps,
                     nb_steps=nb_steps, visualize=False, verbose=2,
                     start_step_policy=self.start_step_policy,
                     callbacks=[tensorboard])
        self.policy.eps = policy_epsilon

        self.dqn.save_weights("dqn_{}_model.h5".format(env_name), overwrite=True)

        # Save memory
        pickle.dump(self.dqn.memory, open("train_memory_{}.p".format(env_name), "wb"))

        # Save optimizer weights
        symbolic_weights = getattr(self.dqn.trainable_model.optimizer, 'weights')
        optim_weight_values = K.batch_get_value(symbolic_weights)
        pickle.dump(optim_weight_values,
                    open('optimizer_weights_{}.p'.format(env_name), "wb"))

        # # Dump dqn
        # pickle.dump(self.dqn, open("dqn_{}.p".format(env_name), "wb"))

        # Finally, evaluate our algorithm for 5 episodes.
        self.dqn.test(self.env, nb_episodes=5, visualize=False)

    def load_model(self, env_name):
        """Load a model"""
        # Load the architecture
        # with open('dqn_{}_json.json'.format(env_name), 'r') as architecture_json:
        #     dqn_json = json.load(architecture_json)
        self.dqn.load_weights("dqn_{}_model.h5".format(env_name))
        # model = keras.models.load_model("dqn_{}_model.h5".format(env_name))
        # trainable_model = keras.models.load_model("dqn_{}_trainable_model.h5".format(env_name))
        # target_model = keras.models.load_model("dqn_{}_target_model.h5".format(env_name), overwrite=True)
        # return model, trainable_model, target_model

    def load_memory(self, model_name):
        memory = pickle.load(open('train_memory_{}.p'.format(model_name), "rb"))
        return memory

    def load_optimizer_weights(self, env_name):
        optim_weights = pickle.load(
            open('optimizer_weights_{}.p'.format(env_name), "rb"))
        self.dqn.trainable_model.optimizer.set_weights(optim_weights)

    def play(self, nb_episodes=5, render=False):
        """Let the agent play"""
        memory = SequentialMemory(limit=memory_limit, window_length=window_length)
        policy = CustomEpsGreedyQPolicy()

        class CustomProcessor(Processor):  # pylint: disable=redefined-outer-name
            """The agent and the environment"""

            def process_state_batch(self, batch):
                """
                Given a state batch, I want to remove the second dimension, because
                it's useless and prevents me from feeding the tensor into my CNN
                """
                return np.squeeze(batch, axis=1)

            def process_info(self, info):
                processed_info = info['player_data']
                if 'stack' in processed_info:
                    processed_info = {'x': 1}
                return processed_info

        nb_actions = self.env.action_space.n

        self.dqn = DQNAgent(model=self.model, nb_actions=nb_actions, memory=memory,
                            nb_steps_warmup=nb_steps_warmup, target_model_update=1e-2,
                            policy=policy, processor=CustomProcessor(),
                            batch_size=batch_size, train_interval=train_interval,
                            enable_double_dqn=enable_double_dqn)
        self.dqn.compile(Adam(lr=1e-3), metrics=['mae'])  # pylint: disable=no-member
        self.dqn.test(self.env, nb_episodes=nb_episodes, visualize=render)

    def action(self, action_space, observation, info):  # pylint: disable=no-self-use
        """Mandatory method that calculates the move based on the observation array
        and the action space."""
        _ = observation  # not using the observation for random decision
        _ = info

        this_player_action_space = {Action.FOLD, Action.CHECK, Action.CALL,
                                    Action.RAISE_POT, Action.RAISE_HALF_POT,
                                    Action.RAISE_2POT}
        _ = this_player_action_space.intersection(set(action_space))

        action = None
        return action
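# The Player class above depends on several module-level constants that the snippet
# never defines. A placeholder with illustrative values (the names come from the code;
# the numbers are assumptions, not the project's actual configuration):
window_length = 1              # observations stacked per state
nb_max_start_steps = 1         # random steps at the start of each episode
train_interval = 100           # train every N steps
nb_steps_warmup = 50           # steps collected before training starts
nb_steps = 100000              # total training steps for dqn.fit
memory_limit = nb_steps // 2   # replay buffer size
batch_size = 500               # minibatch size
enable_double_dqn = False      # plain DQN unless double DQN is wanted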
def run():
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', choices=['train', 'test'], default='train')
    parser.add_argument('--env-name', type=str, default='iemocap-rl-v3.1')
    parser.add_argument('--weights', type=str, default=None)
    parser.add_argument('--policy', type=str, default='EpsGreedyQPolicy')
    parser.add_argument('--data-version', nargs='+',
                        choices=[DataVersions.IEMOCAP, DataVersions.SAVEE,
                                 DataVersions.IMPROV, DataVersions.ESD,
                                 DataVersions.EMODB, DataVersions.KITCHEN_EMODB,
                                 DataVersions.KITCHEN_ESD,
                                 DataVersions.KITCHEN_ESD_DB0,
                                 DataVersions.KITCHEN_ESD_DBn5,
                                 DataVersions.KITCHEN_ESD_DBn10,
                                 DataVersions.KITCHEN_ESD_DBp5,
                                 DataVersions.KITCHEN_ESD_DBp10],
                        type=str2dataset, default=DataVersions.IEMOCAP)
    parser.add_argument('--data-split', nargs='+', type=float, default=None)
    parser.add_argument('--zeta-nb-steps', type=int, default=100000)
    parser.add_argument('--nb-steps', type=int, default=500000)
    parser.add_argument('--eps', type=float, default=0.1)
    parser.add_argument('--pre-train', type=str2bool, default=False)
    parser.add_argument('--pre-train-dataset',
                        choices=[DataVersions.IEMOCAP, DataVersions.IMPROV,
                                 DataVersions.SAVEE, DataVersions.ESD,
                                 DataVersions.EMODB],
                        type=str2dataset, default=DataVersions.IEMOCAP)
    parser.add_argument('--pre-train-data-split', type=float, default=None)
    parser.add_argument('--warmup-steps', type=int, default=50000)
    parser.add_argument('--pretrain-epochs', type=int, default=64)
    parser.add_argument('--testing-dataset', type=str2dataset, default=None,
                        choices=[DataVersions.IEMOCAP, DataVersions.IMPROV,
                                 DataVersions.SAVEE, DataVersions.ESD,
                                 DataVersions.COMBINED, DataVersions.EMODB,
                                 DataVersions.KITCHEN_EMODB, DataVersions.KITCHEN_ESD,
                                 DataVersions.KITCHEN_ESD_DB0,
                                 DataVersions.KITCHEN_ESD_DBn5,
                                 DataVersions.KITCHEN_ESD_DBn10,
                                 DataVersions.KITCHEN_ESD_DBp5,
                                 DataVersions.KITCHEN_ESD_DBp10])
    parser.add_argument('--gpu', type=int, default=1)
    parser.add_argument('--wandb-disable', type=str2bool, default=False,
                        choices=[True, False])
    parser.add_argument('--wandb-mode', type=str, default='online',
                        choices=['online', 'offline'])
    parser.add_argument('--double-dqn', type=str2bool, default=False,
                        choices=[True, False])
    parser.add_argument('--dueling-network', type=str2bool, default=False,
                        choices=[True, False])
    parser.add_argument('--dueling-type', type=str, default='avg',
                        choices=['avg', 'max', 'naive'])
    parser.add_argument('--schedule-csv', type=str, default=None)
    parser.add_argument('--schedule-idx', type=int, default=None)

    args = parser.parse_args()

    os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)

    print("Tensorflow version:", tf.__version__)

    if os.path.exists(f'{RESULTS_ROOT}/{time_str}'):
        raise RuntimeError(
            f'Results directory {RESULTS_ROOT}/{time_str} already exists')

    gpus = tf.config.list_physical_devices('GPU')
    if gpus:
        try:
            # Currently, memory growth needs to be the same across GPUs
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            logical_gpus = tf.config.list_logical_devices('GPU')
            print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
        except RuntimeError as e:
            # Memory growth must be set before GPUs have been initialized
            print(e)

    tf.compat.v1.experimental.output_all_intermediates(True)

    policy = parse_policy(args)

    data_version_map = {}

    custom_data_split = []
    if args.data_split is not None:
        if len(args.data_split) == 1 and len(args.data_version) > 1:
            for i in range(len(args.data_version)):
                custom_data_split.append(args.data_split[0])
        elif 1 < len(args.data_split) != len(args.data_version) > 1:
            raise RuntimeError(
                "--data-split should have either one value or as many values as --data-version")
        else:
            custom_data_split = args.data_split
    else:
        for i in range(len(args.data_version)):
            custom_data_split.append(None)

    if len(args.data_version) == 1:
        target_datastore = get_datastore(
            data_version=args.data_version[0],
            custom_split=None if args.data_split is None else args.data_split[0])
        data_version_map[args.data_version[0]] = target_datastore
        env = get_environment(
            data_version=args.data_version[0],
            datastore=target_datastore,
            custom_split=None if args.data_split is None else args.data_split[0])
    else:
        ds = []
        for i in range(len(args.data_version)):
            d = get_datastore(data_version=args.data_version[i],
                              custom_split=custom_data_split[i])
            data_version_map[args.data_version[i]] = d
            ds.append(d)
        target_datastore = combine_datastores(ds)
        env = get_environment(data_version=DataVersions.COMBINED,
                              datastore=target_datastore, custom_split=None)

    for k in args.__dict__.keys():
        print("\t{} :\t{}".format(k, args.__dict__[k]))
        env.__setattr__("_" + k, args.__dict__[k])

    experiment_name = "P-{}-S-{}-e-{}-pt-{}".format(args.policy, args.zeta_nb_steps,
                                                    args.eps, args.pre_train)
    if args.pre_train:
        experiment_name = "P-{}-S-{}-e-{}-pt-{}-pt-w-{}".format(
            args.policy, args.zeta_nb_steps, args.eps, args.pre_train,
            args.pre_train_dataset.name)
    env.__setattr__("_experiment", experiment_name)

    nb_actions = env.action_space.n

    input_layer = Input(shape=(1, NUM_MFCC, NO_features))
    model = models.get_model_9_rl(input_layer, model_name_prefix='mfcc')

    memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)

    dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory,
                   nb_steps_warmup=args.warmup_steps, gamma=.99,
                   target_model_update=10000, train_interval=4, delta_clip=1.,
                   enable_double_dqn=args.double_dqn,
                   enable_dueling_network=args.dueling_network,
                   dueling_type=args.dueling_type)
    # dqn.compile(Adam(learning_rate=.00025), metrics=['mae', 'accuracy'])
    dqn.compile('adam', metrics=['mae', 'accuracy'])

    pre_train_datastore: Datastore = None
    if args.pre_train:
        if args.pre_train_dataset == args.data_version:
            raise RuntimeError("Pre-train and target datasets cannot be the same")
        else:
            pre_train_datastore = get_datastore(
                data_version=args.pre_train_dataset,
                custom_split=args.pre_train_data_split)

        assert pre_train_datastore is not None

        (x_train, y_train, y_gen_train), _ = pre_train_datastore.get_data()

        pre_train_log_dir = f'{RESULTS_ROOT}/{time_str}/logs/pre_train'
        if not os.path.exists(pre_train_log_dir):
            os.makedirs(pre_train_log_dir)

        dqn.pre_train(x=x_train.reshape((len(x_train), 1, NUM_MFCC, NO_features)),
                      y=y_train, epochs=args.pretrain_epochs, batch_size=128,
                      log_base_dir=pre_train_log_dir)

    if args.mode == 'train':
        models_dir = f'{RESULTS_ROOT}/{time_str}/models'
        log_dir = f'{RESULTS_ROOT}/{time_str}/logs'
        if not os.path.exists(models_dir):
            os.makedirs(models_dir)
        if not os.path.exists(log_dir):
            os.makedirs(log_dir)

        print(f"Models: {models_dir}")

        # Okay, now it's time to learn something! We capture the interrupt exception
        # so that training can be prematurely aborted. Notice that now you can use
        # the built-in Keras callbacks!
        weights_filename = f'{models_dir}/dqn_{args.env_name}_weights.h5f'
        checkpoint_weights_filename = models_dir + '/dqn_' + args.env_name + '_weights_{step}.h5f'
        log_filename = log_dir + '/dqn_{}_log.json'.format(args.env_name)

        callbacks = [ModelIntervalCheckpoint(checkpoint_weights_filename,
                                             interval=250000)]
        callbacks += [FileLogger(log_filename, interval=10)]

        if not args.wandb_disable:
            wandb_project_name = 'zeta-policy'
            wandb_dir = f'{RESULTS_ROOT}/{time_str}/wandb'
            if not os.path.exists(wandb_dir):
                os.makedirs(wandb_dir)
            callbacks += [WandbLogger(project=wandb_project_name, name=args.env_name,
                                      mode=args.wandb_mode, dir=wandb_dir)]

        dqn.fit(env, callbacks=callbacks, nb_steps=args.nb_steps, log_interval=10000)

        model = dqn.model

        # After training is done, we save the final weights one more time.
        dqn.save_weights(weights_filename, overwrite=True)

        # Testing with Labelled Data
        testing_dataset = args.testing_dataset
        if testing_dataset is not None:
            if testing_dataset == DataVersions.COMBINED:
                if pre_train_datastore is not None:
                    testing_datastore = combine_datastores(
                        [target_datastore, pre_train_datastore])
                else:
                    testing_datastore = target_datastore
            else:
                testing_datastore = data_version_map[testing_dataset]
        else:  # testing dataset is not defined
            if pre_train_datastore is not None:
                testing_datastore = combine_datastores(
                    [target_datastore, pre_train_datastore])
            else:
                testing_datastore = target_datastore

        x_test, y_test, _ = testing_datastore.get_testing_data()

        test_loss, test_mae, test_acc, test_mean_q = model.evaluate(
            x_test.reshape((len(x_test), 1, NUM_MFCC, NO_features)), y_test,
            verbose=1)
        print(f"Test\n\t Accuracy: {test_acc}")

        store_results(f"{log_dir}/results.txt", args=args, experiment=experiment_name,
                      time_str=time_str, test_loss=test_loss, test_acc=test_acc)

        # # Finally, evaluate our algorithm for 10 episodes.
        # dqn.test(env, nb_episodes=10, visualize=False)

    elif args.mode == 'test':
        weights_filename = f'rl-files/models/dqn_{args.env_name}_weights.h5f'
        if args.weights:
            weights_filename = args.weights
        dqn.load_weights(weights_filename)
        dqn.test(env, nb_episodes=10, visualize=True)

    if args.schedule_csv is not None:
        from scheduler_callback import callback
        callback(args.schedule_csv, args.schedule_idx)
def deep_q_learning():
    """Implementation of keras-rl deep Q learning."""
    env_name = 'neuron_poker-v0'
    stack = 100
    env = gym.make(env_name, num_of_players=5, initial_stacks=stack)

    np.random.seed(123)
    env.seed(123)

    env.add_player(EquityPlayer(name='equity/50/50', min_call_equity=.5,
                                min_bet_equity=-.5))
    env.add_player(EquityPlayer(name='equity/50/80', min_call_equity=.8,
                                min_bet_equity=-.8))
    env.add_player(EquityPlayer(name='equity/70/70', min_call_equity=.7,
                                min_bet_equity=-.7))
    env.add_player(EquityPlayer(name='equity/20/30', min_call_equity=.2,
                                min_bet_equity=-.3))
    env.add_player(RandomPlayer())
    # The shell player is used as a callback hook into keras-rl.
    env.add_player(PlayerShell(name='keras-rl', stack_size=stack))

    env.reset()

    nb_actions = len(env.action_space)

    # Next, we build a very simple model.
    from keras import Sequential
    from keras.optimizers import Adam
    from keras.layers import Dense, Dropout
    from rl.memory import SequentialMemory
    from rl.agents import DQNAgent
    from rl.policy import BoltzmannQPolicy

    model = Sequential()
    model.add(Dense(64, activation='relu', input_shape=env.observation_space))
    model.add(Dropout(0.2))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(nb_actions, activation='linear'))
    print(model.summary())

    # Finally, we configure and compile our agent. You can use every built-in
    # Keras optimizer and even the metrics!
    memory = SequentialMemory(limit=50000, window_length=1)
    policy = BoltzmannQPolicy()
    dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory,
                   nb_steps_warmup=10, target_model_update=1e-2, policy=policy)
    dqn.compile(Adam(lr=1e-3), metrics=['mae'])

    # Okay, now it's time to learn something! We visualize the training here for
    # show, but this slows down training quite a lot. You can always safely abort
    # the training prematurely using Ctrl + C.
    dqn.fit(env, nb_steps=50000, visualize=True, verbose=2)

    # After training is done, we save the final weights.
    dqn.save_weights('dqn_{}_weights.h5f'.format(env_name), overwrite=True)

    # Finally, evaluate our algorithm for 5 episodes.
    dqn.test(env, nb_episodes=5, visualize=True)
# The original snippet starts mid-function; the header and the Input/Flatten lines
# below are assumed, reconstructed from the call site
# build_model(env.observation_space.shape[0], num_actions).
def build_model(num_states, num_actions):
    input = Input(shape=(1, num_states))
    x = Flatten()(input)
    x = Dense(16, activation='relu')(x)
    x = Dense(16, activation='relu')(x)
    x = Dense(16, activation='relu')(x)
    output = Dense(num_actions, activation='linear')(x)
    model = Model(inputs=input, outputs=output)
    print(model.summary())
    return model


env = gym.make("CartPole-v1")
num_actions = env.action_space.n
model = build_model(env.observation_space.shape[0], num_actions)

memory = SequentialMemory(limit=50000, window_length=1)
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1.,
                              value_min=.1, value_test=.05, nb_steps=10000)
dqn = DQNAgent(model=model, nb_actions=num_actions, memory=memory,
               nb_steps_warmup=10, enable_double_dqn=False, policy=policy)
# keras-rl needs the agent itself to be compiled before fit(); compiling only the
# bare Keras model (as the original snippet did) is not enough.
dqn.compile(Adam(lr=1e-3), metrics=['mae'])
dqn.fit(env, nb_steps=500, visualize=False, verbose=2)
# even the metrics!
policy = EpsGreedyQPolicy()
memory = SequentialMemory(limit=5000, window_length=1)
agent = DQNAgent(model=model, memory=memory, policy=policy, nb_actions=nb_actions,
                 nb_steps_warmup=500, target_model_update=1e-2)
agent.compile(Adam(lr=1e-3), metrics=['mse'])

# %%
# Okay, now it's time to learn something! We visualize the training here for show,
# but this slows down training quite a lot. You can always safely abort the training
# prematurely using Ctrl + C.
agent.fit(env, nb_steps=50000, visualize=False, verbose=1, nb_max_episode_steps=1000)

# %%
# After training is done, we save the final weights.
agent.save_weights('ddpg_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

# %%
# Finally, evaluate our algorithm for 5 episodes.
agent.test(env, nb_episodes=5, nb_max_episode_steps=1000, visualize=False)

# %%
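# The convolutional Q-network snippets below (a training variant and an evaluation
# variant) use env, state_size and num_actions without defining them. A minimal
# setup they appear to assume, given the commented-out (84, 84, 4) shape and
# window_length=4 -- the environment name and frame size here are assumptions:
import gym
import tensorflow.keras as k
from tensorflow.keras import layers

env = gym.make('BreakoutDeterministic-v4')   # placeholder Atari environment
state_size = (84, 84)                        # per-frame size after preprocessing
num_actions = env.action_space.n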
# The original snippet starts mid-function; the header below is reconstructed from
# the call site build_model(state_size, num_actions).
def build_model(state_size, num_actions):
    # inputs = layers.Input(shape=(84, 84, 4,))
    inputs = layers.Input(shape=(4,) + state_size)
    layer1 = layers.Conv2D(32, 8, strides=4, activation="relu")(inputs)
    layer2 = layers.Conv2D(64, 4, strides=2, activation="relu")(layer1)
    layer3 = layers.Conv2D(64, 3, strides=1, activation="relu")(layer2)
    layer4 = layers.Flatten()(layer3)
    layer5 = layers.Dense(512, activation="relu")(layer4)
    action = layers.Dense(num_actions, activation="linear")(layer5)
    return k.Model(inputs=inputs, outputs=action)


model = build_model(state_size, num_actions)
model.summary()

policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1.,
                              value_min=.1, value_test=0.1, nb_steps=1000000)
memory = SequentialMemory(limit=1000000, window_length=4)
agent = DQNAgent(model=model, policy=policy, nb_actions=num_actions, memory=memory,
                 nb_steps_warmup=50000)
agent.compile(k.optimizers.Adam(learning_rate=.00025), metrics=['mae'])
agent.fit(env, nb_steps=100000, log_interval=1000, visualize=False, verbose=2)
agent.save_weights('policy.h5', overwrite=True)
model.add(Activation('linear'))
print(model.summary())

# Finally, we configure and compile our agent. You can use every built-in Keras
# optimizer and even the metrics!
memory = SequentialMemory(limit=50000, window_length=1)
policy = BoltzmannQPolicy()
dqn = DQNAgent(model=model, nb_actions=4, memory=memory, nb_steps_warmup=100,
               target_model_update=1e-2, policy=policy)
dqn.compile(Adam(lr=1e-4), metrics=['mae', 'accuracy'])

# Okay, now it's time to learn something! We visualize the training here for show,
# but this slows down training quite a lot. You can always safely abort the training
# prematurely using Ctrl + C.
nb_steps = {(5, 5): 10000, (10, 10): 15000, (20, 20): 1000000}
dqn.fit(env, nb_steps=nb_steps[world_size], visualize=False, verbose=2)

hist = dqn.test(env, nb_episodes=200, visualize=False)
results = hist.history['nb_steps']
print(f'\n####{world_size}####')
print(np.mean(results))
print(st.t.interval(0.9, len(results) - 1, loc=np.mean(results),
                    scale=st.sem(results)))
print('##########')
""" build the keras model for deep learning """ # inputs = layers.Input(shape=(84, 84, 4,)) inputs = layers.Input(shape=(4, ) + state_size) layer1 = layers.Conv2D(32, 8, strides=4, activation="relu")(inputs) layer2 = layers.Conv2D(64, 4, strides=2, activation="relu")(layer1) layer3 = layers.Conv2D(64, 3, strides=1, activation="relu")(layer2) layer4 = layers.Flatten()(layer3) layer5 = layers.Dense(512, activation="relu")(layer4) action = layers.Dense(num_actions, activation="linear")(layer5) return k.Model(inputs=inputs, outputs=action) model = build_model(state_size, num_actions) model.summary() """ policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1, value_test=0.1, nb_steps=1000000) """ memory = SequentialMemory(limit=1000000, window_length=4) agent = DQNAgent(model=model, policy=GreedyQPolicy(), nb_actions=num_actions, memory=memory, nb_steps_warmup=50000) agent.compile(k.optimizers.Adam(learning_rate=.00025), metrics=['mae']) """ agent.fit(env, nb_steps=10000, log_interval=1000, visualize=False, verbose=2) """ agent.load_weights('policy.h5') agent.test(env, nb_episodes=10, visualize=False)
               target_model_update=INTERVAL, train_interval=4, delta_clip=1.)
dqn.compile(Adam(lr=.00025), metrics=['mae'])

# Okay, now it's time to learn something! We capture the interrupt exception so that
# training can be prematurely aborted. Notice that now you can use the built-in
# Keras callbacks!
weights_filename = 'save/dqn_{}_weights.h5f'.format('act')
checkpoint_weights_filename = 'save/dqn_weights_{step}.h5f'
log_filename = 'save/dqn_{}_log.json'.format('act')
callbacks = [ModelIntervalCheckpoint(checkpoint_weights_filename, interval=INTERVAL)]
dqn.fit(env, nb_steps=STEPS, log_interval=INTERVAL, visualize=False,
        callbacks=callbacks)

# After training is done, we save the final weights one more time.
dqn.save_weights(weights_filename, overwrite=True)

# Finally, evaluate our algorithm for 10 episodes.
dqn.test(env, nb_episodes=10, visualize=True)

# # Okay, now it's time to learn something! We visualize the training here for show,
# # but this slows down training quite a lot. You can always safely abort the
# # training prematurely using Ctrl + C.
# dqn.fit(env, nb_steps=len(price_history.get_dataframe()) - 5, visualize=True,
#         verbose=2, callbacks=[TrainEpisodeLogger()])
# # After training is done, we save the final weights.
# dqn.save_weights('dqn_{}_weights.h5f'.format('price_action'), overwrite=True)
def run():
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', choices=['train', 'test'], default='train')
    parser.add_argument('--env-name', type=str, default='iemocap-rl-v3.1')
    parser.add_argument('--weights', type=str, default=None)
    parser.add_argument('--policy', type=str, default='EpsGreedyQPolicy')
    parser.add_argument('--data-version',
                        choices=[DataVersions.IEMOCAP, DataVersions.SAVEE,
                                 DataVersions.IMPROV],
                        type=str2dataset, default=DataVersions.IEMOCAP)
    parser.add_argument('--disable-wandb', type=str2bool, default=False)
    parser.add_argument('--zeta-nb-steps', type=int, default=1000000)
    parser.add_argument('--nb-steps', type=int, default=500000)
    parser.add_argument('--max-train-steps', type=int, default=440000)
    parser.add_argument('--eps', type=float, default=0.1)
    parser.add_argument('--pre-train', type=str2bool, default=False)
    parser.add_argument('--pre-train-dataset',
                        choices=[DataVersions.IEMOCAP, DataVersions.IMPROV,
                                 DataVersions.SAVEE],
                        type=str2dataset, default=DataVersions.IEMOCAP)
    parser.add_argument('--warmup-steps', type=int, default=50000)
    parser.add_argument('--pretrain-epochs', type=int, default=64)
    parser.add_argument('--gpu', type=int, default=1)

    args = parser.parse_args()

    os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1)
    config = tf.ConfigProto(gpu_options=gpu_options)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    tf.compat.v1.keras.backend.set_session(sess)

    policy = parse_policy(args)
    data_version = args.data_version

    env: gym.Env = None
    if data_version == DataVersions.IEMOCAP:
        env = IemocapEnv(data_version)
    if data_version == DataVersions.SAVEE:
        env = SaveeEnv(data_version)
    if data_version == DataVersions.IMPROV:
        env = ImprovEnv(data_version)

    for k in args.__dict__.keys():
        print("\t{} :\t{}".format(k, args.__dict__[k]))
        env.__setattr__("_" + k, args.__dict__[k])

    experiment_name = "P-{}-S-{}-e-{}-pt-{}".format(args.policy, args.zeta_nb_steps,
                                                    args.eps, args.pre_train)
    if args.pre_train:
        experiment_name = "P-{}-S-{}-e-{}-pt-{}-pt-w-{}".format(
            args.policy, args.zeta_nb_steps, args.eps, args.pre_train,
            args.pre_train_dataset.name)
    env.__setattr__("_experiment", experiment_name)

    nb_actions = env.action_space.n

    input_layer = Input(shape=(1, NUM_MFCC, NO_features))
    model = models.get_model_9_rl(input_layer, model_name_prefix='mfcc')

    memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)

    dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory,
                   nb_steps_warmup=args.warmup_steps, gamma=.99,
                   target_model_update=10000, train_interval=4, delta_clip=1.,
                   train_max_steps=args.max_train_steps)
    dqn.compile(Adam(lr=.00025), metrics=['mae'])

    if args.pre_train:
        from feature_type import FeatureType

        datastore: Datastore = None
        if args.pre_train_dataset == DataVersions.IEMOCAP:
            from datastore_iemocap import IemocapDatastore
            datastore = IemocapDatastore(FeatureType.MFCC)
        if args.pre_train_dataset == DataVersions.IMPROV:
            from datastore_improv import ImprovDatastore
            datastore = ImprovDatastore(22)
        if args.pre_train_dataset == DataVersions.SAVEE:
            from datastore_savee import SaveeDatastore
            datastore = SaveeDatastore(FeatureType.MFCC)
        assert datastore is not None

        x_train, y_train, y_gen_train = datastore.get_pre_train_data()

        dqn.pre_train(x=x_train.reshape((len(x_train), 1, NUM_MFCC, NO_features)),
                      y=y_train, EPOCHS=args.pretrain_epochs, batch_size=128)

    if args.mode == 'train':
        # Okay, now it's time to learn something! We capture the interrupt exception
        # so that training can be prematurely aborted. Notice that now you can use
        # the built-in Keras callbacks!
        weights_filename = 'rl-files/models/dqn_{}_weights.h5f'.format(args.env_name)
        checkpoint_weights_filename = 'rl-files/models/dqn_' + args.env_name + '_weights_{step}.h5f'
        log_filename = 'rl-files/logs/dqn_{}_log.json'.format(args.env_name)

        callbacks = [ModelIntervalCheckpoint(checkpoint_weights_filename,
                                             interval=250000)]
        callbacks += [FileLogger(log_filename, interval=100)]
        if not args.disable_wandb:
            wandb_project_name = 'zeta-policy'
            callbacks += [WandbLogger(project=wandb_project_name, name=args.env_name)]

        dqn.fit(env, callbacks=callbacks, nb_steps=args.nb_steps, log_interval=10000)

        # After training is done, we save the final weights one more time.
        dqn.save_weights(weights_filename, overwrite=True)

        # Finally, evaluate our algorithm for 10 episodes.
        dqn.test(env, nb_episodes=10, visualize=False)

    elif args.mode == 'test':
        weights_filename = 'rl-files/models/dqn_{}_weights.h5f'.format(args.env_name)
        if args.weights:
            weights_filename = args.weights
        dqn.load_weights(weights_filename)
        dqn.test(env, nb_episodes=10, visualize=True)
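# The epsilon-decay example below trains on an env and a Network() builder that are
# not defined in the snippet. A minimal assumed setup it could run against (CartPole
# and the small dense network here are placeholders; Callback comes from keras-rl,
# which is what provides the on_episode_begin hook used by the callback):
import gym
from keras.models import Sequential
from keras.layers import Flatten, Dense
from keras.optimizers import Adam
from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory
from rl.callbacks import Callback

env = gym.make('CartPole-v1')  # placeholder environment


def Network():
    # Placeholder Q-network sized to the placeholder env above.
    model = Sequential()
    model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(env.action_space.n, activation='linear'))
    return model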
# policies:
# callback
class EpsDecayCallback(Callback):
    def __init__(self, eps_policy, decay_rate=0.95):
        self.eps_policy = eps_policy
        self.decay_rate = decay_rate

    def on_episode_begin(self, episode, logs={}):
        self.eps_policy.eps *= self.decay_rate


policy = EpsGreedyQPolicy(eps=1.0)
memory = SequentialMemory(limit=500000, window_length=1)
agent = DQNAgent(model=Network(), policy=policy, memory=memory,
                 enable_double_dqn=False, nb_actions=env.action_space.n,
                 nb_steps_warmup=10, target_model_update=1e-2)
agent.compile(optimizer=Adam(lr=0.002, decay=2.25e-05), metrics=['mse'])
agent.fit(env=env, callbacks=[EpsDecayCallback(eps_policy=policy, decay_rate=0.975)],
          verbose=2, nb_steps=300000)
agent.save_weights('model.hdf5')
agent.test(env=env, nb_episodes=100, visualize=True)
obsprocesser = ObservationProcessor()
dqn = DQNAgent(model=model, nb_actions=2, memory=memory, nb_steps_warmup=60,
               target_model_update=1e-2, policy=policy, processor=obsprocesser)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])
# dqn.load_weights(model_path)
# dqn.load_weights('dqn_{}_weights.h5f'.format('trading'))

# Okay, now it's time to learn something! We visualize the training here for show,
# but this slows down training quite a lot. You can always safely abort the training
# prematurely using Ctrl + C.
dqn.fit(env, nb_steps=50000, visualize=False, verbose=2, callbacks=[chk_point])

# After training is done, we save the final weights.
dqn.save_weights(model_path, overwrite=True)
# dqn.save_weights('dqn_{}_weights.h5f'.format('trading'), overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
dqn.test(env, nb_episodes=5, visualize=True)
# Finally, we configure and compile our agent. You can use every built-in Keras
# optimizer and even the metrics!
memory = SequentialMemory(limit=1000000, window_length=10)
processor = AtariProcessor()

# Select a policy. We use eps-greedy action selection, which means that a random
# action is selected with probability eps. We anneal eps from 1.0 to 0.1 over the
# course of 1M steps. This is done so that the agent initially explores the
# environment (high eps) and then gradually sticks to what it knows (low eps). We
# also set a dedicated eps value that is used during testing. Note that we set it
# to 0.05 so that the agent still performs some random actions. This ensures that
# the agent cannot get stuck.
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1.,
                              value_min=.1, value_test=.05, nb_steps=1000000)

# The trade-off between exploration and exploitation is difficult and an on-going
# research topic. If you want, you can experiment with the parameters or use a
# different policy. Another popular one is Boltzmann-style exploration:
# policy = BoltzmannQPolicy(tau=1.)
# Feel free to give it a try!

dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory,
               processor=processor, nb_steps_warmup=50, gamma=.99,
               target_model_update=10000, train_interval=4, delta_clip=1.)
dqn.compile(Adam(lr=.00025), metrics=['mae'])

dqn.fit(env, nb_steps=1750000, log_interval=10000, nb_max_episode_steps=50)
    # print(model.summary())
    print(model.output._keras_shape)
    return model


if __name__ == '__main__':
    env = myTGym(episode_type='0', percent_goal_profit=2, percent_stop_loss=5)
    # s1, s2, s3 = env.reset()
    # state = aggregate_state(s1, s2, s3)

    memory = SequentialMemory(limit=50000, window_length=1)
    policy = BoltzmannQPolicy()
    model = build_network()

    dqn = DQNAgent(model=model, nb_actions=2, memory=memory, nb_steps_warmup=10,
                   target_model_update=1e-2, policy=policy)
    dqn.compile(Adam(lr=1e-3), metrics=['mae'])

    # Okay, now it's time to learn something! We visualize the training here for
    # show, but this slows down training quite a lot. You can always safely abort
    # the training prematurely using Ctrl + C.
    dqn.fit(env, nb_steps=50000, visualize=False, verbose=2)

    # After training is done, we save the final weights.
    dqn.save_weights('dqn_{}_weights.h5f'.format('trading'), overwrite=True)

    # Finally, evaluate our algorithm for 5 episodes.
    dqn.test(env, nb_episodes=5, visualize=True)
class Player:
    """Mandatory class with the player methods"""

    def __init__(self, name='DQN', load_model=None, env=None):
        """Initialization of an agent"""
        self.equity_alive = 0
        self.actions = []
        self.last_action_in_stage = ''
        self.temp_stack = []
        self.name = name
        self.autoplay = True

        self.dqn = None
        self.model = None
        self.env = env

        if load_model:
            self.load(load_model)

    def initiate_agent(self, env):
        """Initiate a deep Q agent"""
        tf.compat.v1.disable_eager_execution()

        self.env = env

        nb_actions = self.env.action_space.n

        self.model = Sequential()
        self.model.add(Dense(512, activation='relu', input_shape=env.observation_space))
        self.model.add(Dropout(0.2))
        self.model.add(Dense(512, activation='relu'))
        self.model.add(Dropout(0.2))
        self.model.add(Dense(512, activation='relu'))
        self.model.add(Dropout(0.2))
        self.model.add(Dense(nb_actions, activation='linear'))

        # Finally, we configure and compile our agent. You can use every built-in
        # Keras optimizer and even the metrics!
        memory = SequentialMemory(limit=memory_limit, window_length=window_length)
        policy = TrumpPolicy()

        nb_actions = env.action_space.n

        self.dqn = DQNAgent(model=self.model, nb_actions=nb_actions, memory=memory,
                            nb_steps_warmup=nb_steps_warmup, target_model_update=1e-2,
                            policy=policy, processor=CustomProcessor(),
                            batch_size=batch_size, train_interval=train_interval,
                            enable_double_dqn=enable_double_dqn)
        self.dqn.compile(Adam(lr=1e-3), metrics=['mae'])

    def start_step_policy(self, observation):
        """Custom policy for random decisions for warm up."""
        log.info("Random action")
        _ = observation
        action = self.env.action_space.sample()
        return action

    def train(self, env_name):
        """Train a model"""
        # initiate training loop
        timestr = time.strftime("%Y%m%d-%H%M%S") + "_" + str(env_name)
        tensorboard = TensorBoard(log_dir='./Graph/{}'.format(timestr),
                                  histogram_freq=0, write_graph=True,
                                  write_images=False)

        self.dqn.fit(self.env, nb_max_start_steps=nb_max_start_steps,
                     nb_steps=nb_steps, visualize=False, verbose=2,
                     start_step_policy=self.start_step_policy,
                     callbacks=[tensorboard])

        # Save the architecture
        dqn_json = self.model.to_json()
        Path("dqn_results").mkdir(parents=True, exist_ok=True)
        with open("dqn_results/dqn_{}_json.json".format(env_name), "w") as json_file:
            json.dump(dqn_json, json_file)

        # After training is done, we save the final weights.
        self.dqn.save_weights('dqn_results/dqn_{}_weights.h5'.format(env_name),
                              overwrite=True)

        # Finally, evaluate our algorithm for 5 episodes.
        self.dqn.test(self.env, nb_episodes=5, visualize=False)

    def load(self, env_name):
        """Load a model"""
        # Load the architecture
        with open('dqn_results/dqn_{}_json.json'.format(env_name), 'r') as architecture_json:
            dqn_json = json.load(architecture_json)

        self.model = model_from_json(dqn_json)
        self.model.load_weights('dqn_results/dqn_{}_weights.h5'.format(env_name))

    def play(self, nb_episodes=5, render=False):
        """Let the agent play"""
        memory = SequentialMemory(limit=memory_limit, window_length=window_length)
        policy = TrumpPolicy()

        class CustomProcessor(Processor):  # pylint: disable=redefined-outer-name
            """The agent and the environment"""

            def process_state_batch(self, batch):
                """
                Given a state batch, I want to remove the second dimension, because
                it's useless and prevents me from feeding the tensor into my CNN
                """
                return np.squeeze(batch, axis=1)

            def process_info(self, info):
                processed_info = info['player_data']
                if 'stack' in processed_info:
                    processed_info = {'x': 1}
                return processed_info

        nb_actions = self.env.action_space.n

        self.dqn = DQNAgent(model=self.model, nb_actions=nb_actions, memory=memory,
                            nb_steps_warmup=nb_steps_warmup, target_model_update=1e-2,
                            policy=policy, processor=CustomProcessor(),
                            batch_size=batch_size, train_interval=train_interval,
                            enable_double_dqn=enable_double_dqn)
        self.dqn.compile(Adam(lr=1e-3), metrics=['mae'])  # pylint: disable=no-member
        self.dqn.test(self.env, nb_episodes=nb_episodes, visualize=render)

    def action(self, action_space, observation, info):  # pylint: disable=no-self-use
        """Mandatory method that calculates the move based on the observation array
        and the action space."""
        _ = observation  # not using the observation for random decision
        _ = info

        this_player_action_space = {Action.FOLD, Action.CHECK, Action.CALL,
                                    Action.RAISE_POT, Action.RAISE_HALF_POT,
                                    Action.RAISE_2POT}
        _ = this_player_action_space.intersection(set(action_space))

        action = None
        return action
def main():
    """Create environment, build models, train."""
    # env = MarketEnv(("ES", "FUT", "GLOBEX", "USD"), obs_xform=xform.Basic(30, 4),
    #                 episode_steps=STEPS_PER_EPISODE, client_id=3)
    # env = MarketEnv(("EUR", "CASH", "IDEALPRO", "USD"), max_quantity=20000,
    #                 quantity_increment=20000, obs_xform=xform.Basic(30, 4),
    #                 episode_steps=STEPS_PER_EPISODE, client_id=5, afterhours=False)
    env = gym.make('trading-v0').env
    env.initialise(symbol='000001', start='2012-01-01', end='2017-01-01', days=252)
    nb_actions = env.action_space.n
    obs_size = np.product(env.observation_space.shape)

    # # Actor model
    # dropout = 0.1
    # actor = Sequential([
    #     Flatten(input_shape=(1,) + env.observation_space.shape),
    #     BatchNormalization(),
    #     Dense(obs_size, activation='relu'),
    #     GaussianDropout(dropout),
    #     BatchNormalization(),
    #     Dense(obs_size, activation='relu'),
    #     GaussianDropout(dropout),
    #     BatchNormalization(),
    #     Dense(obs_size, activation='relu'),
    #     GaussianDropout(dropout),
    #     BatchNormalization(),
    #     Dense(1, activation='tanh'),
    # ])
    # print('Actor model')
    # actor.summary()

    # action_input = Input(shape=(1,), name='action_input')
    # observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input')
    # flattened_observation = Flatten()(observation_input)
    # x = concatenate([action_input, flattened_observation])
    # x = BatchNormalization()(x)
    # x = Dense(obs_size + 1, activation='relu')(x)
    # x = GaussianDropout(dropout)(x)
    # x = Dense(obs_size + 1, activation='relu')(x)
    # x = GaussianDropout(dropout)(x)
    # x = Dense(obs_size + 1, activation='relu')(x)
    # x = GaussianDropout(dropout)(x)
    # x = Dense(obs_size + 1, activation='relu')(x)
    # x = GaussianDropout(dropout)(x)
    # x = Dense(1, activation='linear')(x)
    # critic = Model(inputs=[action_input, observation_input], outputs=x)
    # print('\nCritic Model')
    # critic.summary()

    from keras.models import Sequential
    from keras.layers import Dense, Activation, Flatten
    from keras.optimizers import Adam

    model = Sequential()
    model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    model.add(Dense(160))
    model.add(Activation('relu'))
    model.add(Dense(160))
    model.add(Activation('relu'))
    model.add(Dense(160))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))
    print(model.summary())

    memory = SequentialMemory(limit=EPISODES * STEPS_PER_EPISODE, window_length=1)
    random_process = OrnsteinUhlenbeckProcess(theta=.5, mu=0., sigma=.5)
    # agent = DQNAgent(nb_actions=1, actor=actor, critic=critic,
    #                  critic_action_input=action_input, memory=memory,
    #                  nb_steps_warmup_critic=STEPS_PER_EPISODE * WARMUP_EPISODES,
    #                  nb_steps_warmup_actor=STEPS_PER_EPISODE * WARMUP_EPISODES,
    #                  random_process=random_process, gamma=0.95,
    #                  target_model_update=0.01)
    from rl.policy import BoltzmannQPolicy
    policy = BoltzmannQPolicy()
    agent = DQNAgent(model=model, nb_actions=nb_actions, memory=memory,
                     nb_steps_warmup=10, target_model_update=1e-2, policy=policy)
    agent.compile(Adam(lr=1e-3), metrics=['mae'])

    # weights_filename = 'ddpg_{}_weights.h5f'.format(env.instrument.symbol)
    try:
        # agent.load_weights(weights_filename)
        # print('Using weights from {}'.format(weights_filename))
        # DDPGAgent actually uses two separate files for actor and critic derived
        # from this filename
        pass
    except IOError:
        pass

    agent.fit(env, nb_steps=EPISODES * STEPS_PER_EPISODE, visualize=False, verbose=2)
    # agent.save_weights(weights_filename, overwrite=True)
    agent.test(env, nb_episodes=5, visualize=True,
               nb_max_episode_steps=STEPS_PER_EPISODE)