# Replay memory, annealed epsilon-greedy exploration and the DQN agent itself.
memory = PriorityMemory(
    limit=MEMORY_SIZE,
    minority_chance=MINORITY_CHANCE,
    window_length=1,
)
policy = LinearAnnealedPolicy(
    EpsGreedyQPolicy(),
    attr="eps",
    value_max=EPS_MAX,
    value_min=EPS_MIN,
    value_test=0.05,
    nb_steps=EPS_STEPS,
)
dqn = DQNAgent(
    model=model,
    policy=policy,
    nb_actions=2,
    memory=memory,
    processor=processor,
    nb_steps_warmup=WARMUP_STEPS,
    gamma=GAMMA,
    target_model_update=TARGET_MODEL_UPDATE,
    train_interval=1,
    delta_clip=1,
    batch_size=BATCH_SIZE,
    enable_double_dqn=DOUBLE_DQN,
)
dqn.compile(Adam(lr=LR))

# The Metrics callback receives the validation split and an interval of 5,000.
metrics = Metrics(X_val, y_val, interval=5_000)
dqn.fit(
    env,
    nb_steps=training_steps,
    log_interval=LOG_INTERVAL,
    callbacks=[metrics],
)

# Persist the target network (not the online network) once training is done.
dqn.target_model.save(FP_MODEL)

# Validate on test dataset
self.rewards[self.episode] = rw self.avgrewards[self.episode] = np.mean(self.rewardbuf) self.plot() self.episode += 1 def plot(self): self.grphinst.set_ydata(self.rewards) self.grphavg.set_ydata(self.avgrewards) plt.draw() plt.pause(0.01) dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10, target_model_update=1e-2, policy=policy, enable_double_dqn=False) dqn.compile(Adam(lr=0.002, decay=2.25e-05), metrics=['mse']) cbs = [EpsDecayCallback(eps_poilcy=policy, decay_rate=0.975)] cbs += [LivePlotCallback(nb_episodes=4000, avgwindow=20)] dqn.fit(env, nb_steps=1000000, visualize=True, verbose=2, callbacks=cbs) dqn.save_weights('monitor/dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True) # evaluate the algorithm for 100 episodes. dqn.test(env, nb_episodes=100, visualize=True) env.close()
def run(self):
    """Run every walk of the walk-forward experiment.

    A walk spans ``self.walkSize`` starting at ``self.currentStartingPoint``.
    For each walk the method:

    1. opens ``<outputFileName><walk number>.csv`` and writes the header row;
    2. rebuilds the replay memory and the DQN agent and reloads ``q.weights``;
    3. resolves the train / validation / test index limits in ``self.sp``;
    4. for every ``(eps, epochs)`` pair in ``self.explorations`` trains on the
       training window and tests on the validation and test windows, writing
       one CSV row per epoch;
    5. moves the starting point forward by ``self.testSize`` and saves the two
       ensemble frames.

    The loop stops once a full walk no longer fits before ``self.endingPoint``.
    """
    hourStep = datetime.timedelta(hours=1)

    def locate(*offsets):
        """Return the position in ``self.sp`` of the starting point plus offsets.

        When that date is missing from the index, the starting point itself is
        pushed forward one hour at a time until the lookup succeeds.
        """
        while True:
            target = self.currentStartingPoint
            # Offsets are added one at a time, in the same order as the
            # original expressions.
            for offset in offsets:
                target = target + offset
            try:
                return self.sp.get_loc(target)
            except KeyError:
                # Only "date not in index" is retried; a bare except would also
                # swallow KeyboardInterrupt and make this loop unstoppable.
                self.currentStartingPoint += hourStep

    headerColumns = [
        "Iteration",
        "trainAccuracy", "trainCoverage", "trainReward",
        "trainLong%", "trainShort%", "trainLongAcc", "trainShortAcc",
        "trainLongPrec", "trainShortPrec",
        "validationAccuracy", "validationCoverage", "validationReward",
        "validationLong%", "validationShort%", "validationLongAcc",
        "validationShortAcc", "validLongPrec", "validShortPrec",
        "testAccuracy", "testCoverage", "testReward",
        "testLong%", "testShort%", "testLongAcc", "testShortAcc",
        "testLongPrec", "testShortPrec",
    ]

    # Placeholders so the first "del" inside the epoch loop has something to drop.
    trainEnv = validEnv = testEnv = " "
    iteration = -1

    # Keep walking while a whole walk (train + validation + test) still fits.
    while self.currentStartingPoint + self.walkSize <= self.endingPoint:
        # Iteration is the index of the current walk.
        iteration += 1

        self.outputFile = open(self.outputFileName + str(iteration + 1) + ".csv", "w+")
        try:
            self.outputFile.write(",".join(headerColumns) + "\n")

            # Drop the previous memory and agent before building fresh ones.
            del self.memory
            del self.agent
            self.memory = SequentialMemory(limit=10000, window_length=1)
            self.agent = DQNAgent(model=self.model, policy=self.policy,
                                  nb_actions=self.nbActions, memory=self.memory,
                                  nb_steps_warmup=200, target_model_update=1e-1,
                                  enable_double_dqn=True, enable_dueling_network=True)
            self.agent.compile(Adam(lr=1e-3), metrics=['mae'])
            # Reload the weights stored in "q.weights" at the start of every walk.
            self.agent.load_weights("q.weights")

            # ---------------- TRAINING LIMITS ----------------
            trainMinLimit = locate()
            trainMaxLimit = locate(self.trainSize)

            # ---------------- VALIDATION LIMITS ----------------
            # Validation starts on the element right after the training window.
            validMinLimit = trainMaxLimit + 1
            validMaxLimit = locate(self.trainSize, self.validationSize)

            # ---------------- TESTING LIMITS ----------------
            # Testing starts on the element right after the validation window.
            testMinLimit = validMaxLimit + 1
            testMaxLimit = locate(self.trainSize, self.validationSize, self.testSize)

            # Frames indexed by the unique dates of each window; they are handed
            # to the validation / test environments and saved after the walk.
            # (.loc replaces the removed pandas .ix accessor.)
            ensambleValid = pd.DataFrame(
                index=self.dates[validMinLimit:validMaxLimit].loc[:, 'Date'].drop_duplicates().tolist())
            ensambleTest = pd.DataFrame(
                index=self.dates[testMinLimit:testMaxLimit].loc[:, 'Date'].drop_duplicates().tolist())
            ensambleValid.index.name = 'Date'
            ensambleTest.index.name = 'Date'

            # Each exploration entry is (epsilon, number of epochs).
            for eps in self.explorations:
                # Probability of taking a random action during these epochs.
                self.policy.eps = eps[0]

                for i in range(0, eps[1]):
                    # Rebuild the three environments with their callbacks.
                    del trainEnv
                    trainEnv = SpEnv(operationCost=self.operationCost,
                                     minLimit=trainMinLimit, maxLimit=trainMaxLimit,
                                     callback=self.trainer, isOnlyShort=self.isOnlyShort)
                    del validEnv
                    validEnv = SpEnv(operationCost=self.operationCost,
                                     minLimit=validMinLimit, maxLimit=validMaxLimit,
                                     callback=self.validator, isOnlyShort=self.isOnlyShort,
                                     ensamble=ensambleValid, columnName="iteration" + str(i))
                    del testEnv
                    testEnv = SpEnv(operationCost=self.operationCost,
                                    minLimit=testMinLimit, maxLimit=testMaxLimit,
                                    callback=self.tester, isOnlyShort=self.isOnlyShort,
                                    ensamble=ensambleTest, columnName="iteration" + str(i))

                    # Reset the callbacks so their statistics cover this epoch only.
                    self.trainer.reset()
                    self.validator.reset()
                    self.tester.reset()

                    # Train on the training window.
                    trainEnv.resetEnv()
                    self.agent.fit(trainEnv,
                                   nb_steps=floor(self.trainSize.days - self.trainSize.days * 0.2),
                                   visualize=False, verbose=0)
                    (_, trainCoverage, trainAccuracy, trainReward, trainLongPerc,
                     trainShortPerc, trainLongAcc, trainShortAcc, trainLongPrec,
                     trainShortPrec) = self.trainer.getInfo()
                    print(str(i) + " TRAIN: acc: " + str(trainAccuracy) + " cov: " + str(trainCoverage) + " rew: " + str(trainReward))

                    # Test on the validation window.
                    validEnv.resetEnv()
                    self.agent.test(validEnv,
                                    nb_episodes=floor(self.validationSize.days - self.validationSize.days * 0.2),
                                    visualize=False, verbose=0)
                    (_, validCoverage, validAccuracy, validReward, validLongPerc,
                     validShortPerc, validLongAcc, validShortAcc, validLongPrec,
                     validShortPrec) = self.validator.getInfo()
                    print(str(i) + " VALID: acc: " + str(validAccuracy) + " cov: " + str(validCoverage) + " rew: " + str(validReward))

                    # Test on the testing window.
                    # NOTE(review): the episode count is derived from validationSize,
                    # not testSize - looks like a copy-paste slip; confirm before changing.
                    testEnv.resetEnv()
                    self.agent.test(testEnv,
                                    nb_episodes=floor(self.validationSize.days - self.validationSize.days * 0.2),
                                    visualize=False, verbose=0)
                    (_, testCoverage, testAccuracy, testReward, testLongPerc,
                     testShortPerc, testLongAcc, testShortAcc, testLongPrec,
                     testShortPrec) = self.tester.getInfo()
                    print(str(i) + " TEST: acc: " + str(testAccuracy) + " cov: " + str(testCoverage) + " rew: " + str(testReward))
                    print(" ")

                    # One CSV row per epoch, in the same order as the header.
                    rowValues = [
                        i,
                        trainAccuracy, trainCoverage, trainReward, trainLongPerc,
                        trainShortPerc, trainLongAcc, trainShortAcc, trainLongPrec,
                        trainShortPrec,
                        validAccuracy, validCoverage, validReward, validLongPerc,
                        validShortPerc, validLongAcc, validShortAcc, validLongPrec,
                        validShortPrec,
                        testAccuracy, testCoverage, testReward, testLongPerc,
                        testShortPerc, testLongAcc, testShortAcc, testLongPrec,
                        testShortPrec,
                    ]
                    self.outputFile.write(",".join(str(value) for value in rowValues) + "\n")
        finally:
            # Close the walk file even when training raises.
            self.outputFile.close()

        # The next walk starts one test window after the current one.
        self.currentStartingPoint += self.testSize

        # Save the per-epoch validation / test frames for later ensembling.
        ensambleValid.to_csv("./Output/ensemble/" + self.ensembleFolderName + "/walk" + str(iteration) + "ensemble_valid.csv")
        ensambleTest.to_csv("./Output/ensemble/" + self.ensembleFolderName + "/walk" + str(iteration) + "ensemble_test.csv")
# Training-time policy: epsilon-greedy whose eps is annealed from `eps` to 0.
# The agent is only tested below, where the greedy test policy is supplied.
policy = LinearAnnealedPolicy(
    EpsGreedyQPolicy(eps=eps),
    attr='eps',
    value_max=eps,
    value_min=0,
    value_test=0,
    nb_steps=nb_steps,
)
test_policy = GreedyQPolicy()

####################################################################
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    nb_steps_warmup=window_length + batch_size,
    target_model_update=0.02,
    policy=policy,
    test_policy=test_policy,
    batch_size=batch_size,
    train_interval=train_interval,
    gamma=gamma,
)
dqn.compile(Adam(lr=0.00025), metrics=['mae'])

# Restore previously saved weights before evaluating.
dqn.load_weights('dqn_{}_weights.h5f'.format('lunar'))

####################################################################
nb_episodes = 10
history = dqn.test(env, nb_episodes=nb_episodes)
# Select a policy. We use eps-greedy action selection, which means that a random
# action is selected with probability eps. We anneal eps from 1.0 to 0.1 over the
# course of 1M steps, so that the agent initially explores the environment (high
# eps) and then gradually sticks to what it knows (low eps). A dedicated eps value
# of 0.05 is used during testing so that the agent still performs some random
# actions, which ensures that it cannot get stuck.
policy = LinearAnnealedPolicy(
    EpsGreedyQPolicy(),
    attr='eps',
    value_max=1,
    value_min=.1,
    value_test=.05,
    nb_steps=1000000,
)

# The trade-off between exploration and exploitation is difficult and an on-going
# research topic. If you want, you can experiment with the parameters or use a
# different policy. Another popular one is Boltzmann-style exploration:
# policy = BoltzmannQPolicy(tau=1.)
# Feel free to give it a try!

dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    policy=policy,
    memory=memory,
    processor=processor,
    nb_steps_warmup=50000,
    gamma=.99,
    target_model_update=10000,
    train_interval=4,
    delta_clip=1.,
)
dqn.compile(Adam(lr=.00025), metrics=['mae'])

if args.mode == 'train':
    # Okay, now it's time to learn something! We capture the interrupt exception
    # so that training can be prematurely aborted. Notice that you can use the
    # built-in Keras callbacks!
    os.chdir(DIR)
    os.chdir('runs')
    # Use the caller-supplied run id, or derive one from the env name and the clock.
    run_id = args.run_id if args.run_id else '{}_{}'.format(args.env_name, time())
    os.mkdir(run_id)
    weights_filename = '{}/weights.h5f'.format(run_id)
    checkpoint_weights_filename = weights_filename + '_{step}.h5f'
model.add(Dense(512, activation='linear')) model.add(LeakyReLU(alpha=.001)) model.add(Dense(1024, activation='linear')) model.add(LeakyReLU(alpha=.001)) model.add(Dense(512, activation='linear')) model.add(LeakyReLU(alpha=.001)) model.add(Dense(nb_actions)) model.add(Activation('linear')) policy = EpsGreedyQPolicy() memory = SequentialMemory(limit=10000, window_length=50) dqn = DQNAgent(model=model, policy=policy, nb_actions=nb_actions, memory=memory, nb_steps_warmup=400, target_model_update=1e-1, enable_double_dqn=True, enable_dueling_network=True) dqn.compile(Adam(lr=1e-3), metrics=['mae']) outputFile = open("2105.csv", "w+") outputFile.write( "iteration,trainAccuracy,trainCoverage,trainReward,validationAccuracy,validationCoverage,validationReward\n" ) iteration = 0 for i in range(0, 100): dqn.fit(trainEnv, nb_steps=3000, visualize=False,
# Last hidden layer (one unit per entry of env.VoltageMag) and the linear
# output layer with one Q-value per action.
model.add(Dense(len(env.VoltageMag)))
model.add(Activation('relu'))
model.add(Dense(env.action_space.n))
model.add(Activation('linear'))
print(model.summary())

#
policy = LinearAnnealedPolicy(
    EpsGreedyQPolicy(),
    attr='eps',
    value_max=1,
    value_min=1e-2,
    value_test=.05,
    nb_steps=20000,
)
memory = SequentialMemory(limit=50000, window_length=1)
dqn = DQNAgent(
    model=model,
    nb_actions=env.action_space.n,
    memory=memory,
    nb_steps_warmup=1000,
    target_model_update=1e-3,
    policy=policy,
)
dqn.compile(Adam(lr=1e-1), metrics=['mae'])

#
# sys.stdout = open(currentDirectory + "/test4.txt", "w")
history = dqn.fit(env, nb_steps=20000, verbose=2)
# sys.stdout.close()

# Save model
dqn.model.save(currentDirectory + "/saved_model")
print("Saved model to disk")

# Enjoy trained agent
def training_game():
    """Train a dueling double-DQN agent on the "ForceField" map, then test it.

    Builds the environment, the network, the replay memory and an annealed
    epsilon-greedy policy, optionally restores weights when ``LOAD_MODEL`` is
    set, trains for 3000 steps with TensorBoard, file logging and periodic
    weight saving, stores the final weights and runs 30 test episodes.
    """
    import os  # local import: only needed to build the TensorBoard log path

    env = Environment(
        map_name="ForceField",
        visualize=True,
        game_steps_per_episode=150,
        agent_interface_format=features.AgentInterfaceFormat(
            feature_dimensions=features.Dimensions(screen=64, minimap=32)))

    input_shape = (_SIZE, _SIZE, 1)
    nb_actions = 12  # Number of actions

    model = neural_network_model(input_shape, nb_actions)
    memory = SequentialMemory(limit=5000, window_length=_WINDOW_LENGTH)
    processor = SC2Proc()

    # Policy
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr="eps", value_max=1,
                                  value_min=0.2, value_test=.0, nb_steps=1e2)

    # Agent
    dqn = DQNAgent(
        model=model,
        nb_actions=nb_actions,
        memory=memory,
        enable_double_dqn=True,
        enable_dueling_network=True,
        # 2019-07-12 GU Zhan (Sam): warm-up raised from 500 to 1500 steps.
        nb_steps_warmup=1500,
        target_model_update=1e-2,
        policy=policy,
        batch_size=150,
        processor=processor,
        delta_clip=1)
    dqn.compile(Adam(lr=.001), metrics=["mae", "acc"])

    # Tensorboard callback. The log directory is built with os.path.join so the
    # same code works on Windows and POSIX; the former literal '.\Graph\issgz'
    # relied on invalid escape sequences and only made sense on Windows.
    callbacks = keras.callbacks.TensorBoard(
        log_dir=os.path.join('.', 'Graph', 'issgz'),
        histogram_freq=0,
        write_graph=True,
        write_images=False)

    # Save the parameters and upload them when needed
    name = "agent"
    w_file = "dqn_{}_weights.h5f".format(name)

    if LOAD_MODEL:
        dqn.load_weights(w_file)

    class Saver(Callback):
        """Overwrite the weight file at the end of every 200th episode."""

        def on_episode_end(self, episode, logs=None):
            if episode % 200 == 0:
                self.model.save_weights(w_file, overwrite=True)

    s = Saver()
    logs = FileLogger('DQN_Agent_log.csv', interval=1)

    dqn.fit(env, callbacks=[callbacks, s, logs], nb_steps=3000,
            action_repetition=2, log_interval=1e4, verbose=2)

    dqn.save_weights(w_file, overwrite=True)
    dqn.test(env, action_repetition=2, nb_episodes=30, visualize=False)
## Init ML-model for agent
limit = observation_shape[1]
model = cnn_model_2in_with_feedback(
    (limit, 4),
    (limit, 4),
    market.feedback_shape,
    nb_actions,
    'softmax',
)

## Init RL-method parameters
memory = SequentialMemory(limit=10000, window_length=1)
# TODO implement policies for multiple symbols
policy = BoltzmannQPolicy()

## Init RL agent
agent = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    nb_steps_warmup=1000,
    target_model_update=1e-2,
    policy=policy,
    processor=MultiInputProcessor(3),
    enable_dueling_network=True,
    dueling_type='avg',
)
agent.compile(Adam(lr=1e-3), metrics=['mae'])

try:
    ## Comment here if you want to start learning again
    agent.load_weights('{p}/dqn_{fn}_weights.h5f'.format(p=PATH, fn=ENV_NAME))
except (OSError, ValueError) as load_error:
    # Missing or incompatible weights are reported and training starts fresh.
    print(load_error)
env.BitInt.step = step_size
n_steps = end_buffer / step_size
gamma = 0.95

# Do prefitting
# The same annealed epsilon-greedy policy is used for training and for testing.
policy = LinearAnnealedPolicy(
    EpsGreedyQPolicy(),
    attr='eps',
    value_max=1,
    value_min=0.05,
    value_test=0,
    nb_steps=100000,
)
agent = DQNAgent(
    model=actor,
    nb_actions=nb_actions,
    policy=policy,
    memory=memory,
    test_policy=policy,
    nb_steps_warmup=50,
    gamma=gamma,
    target_model_update=10000,
    train_interval=4,
    delta_clip=1.,
)
agent.compile(Adam(lr=.0005, clipnorm=1.), metrics=['mae'])
agent.fit(
    env,
    nb_steps=100000,
    visualize=False,
    verbose=1,
    nb_max_episode_steps=n_steps,
    log_interval=1000,
)

# Do early stopping fit
test_scores = []
def main():
    """Alternate DQN training and testing on EURUSD until interrupted.

    Each round trains for ``env.split_point`` steps, runs one test episode and
    computes the reward as the test environment's balance minus its starting
    balance. Whenever that reward beats the best one seen so far (and is not
    zero after truncation), the test history and the weights are saved under
    ``./info`` and ``./model``. When the ``LoadWeights`` setting is not
    ``'no'``, weights are loaded from ``model/<LoadWeights>.h5f`` at start-up
    (if the file exists) and written back after every round.

    Returns when a KeyboardInterrupt is raised during the test phase.
    """
    ml_variables = FXU.getMLVariables()

    ### Clear the actions table
    FXU.execute_query_db("DELETE FROM metaactions")

    env = ForexEnv(type="train", inputSymbol="EURUSD", show_trade=True)
    env_test = ForexEnv(type="test", inputSymbol="EURUSD", show_trade=True)

    n_actions = env.action_space.n
    print("Number of actions : ", n_actions)

    model = create_model(shape=env.observation_space.shape, n_actions=n_actions)
    # summary() prints by itself; wrapping it in print() only added a stray "None".
    model.summary()

    #### Configuring the agent
    memory = SequentialMemory(limit=100000, window_length=env.window_size)
    policy = EpsGreedyQPolicy()
    # enable the dueling network
    # you can specify the dueling_type to one of {'avg','max','naive'}
    dqn = DQNAgent(model=model, nb_actions=n_actions, memory=memory,
                   nb_steps_warmup=100, enable_dueling_network=True,
                   dueling_type='naive', target_model_update=1e-2, policy=policy)
    dqn.compile(Adam(lr=1e-4), metrics=['mse'])

    # One path for both loading and saving, built with the platform separator
    # instead of a hard-coded backslash.
    resume_path = None
    if ml_variables['LoadWeights'] != 'no':
        resume_path = ospath.join('model', ml_variables['LoadWeights'] + ".h5f")
        if ospath.isfile(resume_path):
            print("Weights exist, so Going to load the weights")
            dqn.load_weights(resume_path)

    max_reward = -1000000
    while True:
        # Train :
        dqn.fit(env, nb_steps=env.split_point, nb_max_episode_steps=60000,
                visualize=False, verbose=2)
        try:
            info = dqn.test(env_test, nb_episodes=1, visualize=False)
            reward = env_test.balance - env_test.starting_balance
            print("reward : ", reward)
            # Keep a snapshot only for a new best, non-zero reward.
            if reward > max_reward and int(reward) != 0:
                max_reward = int(reward)
                np.array([info.history]).dump(
                    './info/duel_dqn_reward_{0}_{1}.info'.format(
                        env_test.symbol, max_reward))
                dqn.save_weights('./model/duel_dqn_reward_{0}_{1}.h5f'.format(
                    env_test.symbol, max_reward))
        except KeyboardInterrupt:
            return

        ##### Saving weights after each fitting to resume afterwards ###############
        if resume_path is not None:
            dqn.save_weights(filepath=resume_path, overwrite=True)
from rl.memory import SequentialMemory
from rl.core import Processor
from rl.callbacks import ModelIntervalCheckpoint
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.losses import categorical_crossentropy
from keras.optimizers import Adam

env = gym.make("slitherio-v0", headless=False, width=500, height=500)

# model = conv_model(env)
model = full_combined_conv_lstm_model(env)
# model = load_model("current_model.h5")

# Dump the weights before and after loading the saved file so the change is visible.
print(model.weights)
model.load_weights("new_current_model.h5")
print(model.weights)

# Fixed 20% random actions; the agent is only tested here, never fitted.
policy = EpsGreedyQPolicy(eps=0.2)
memory = SequentialMemory(limit=DQN_MEMORY_SIZE, window_length=1)
dqn = DQNAgent(
    model=model,
    nb_actions=env.action_space.n,
    memory=memory,
    # nb_steps_warmup=DQN_MEMORY_SIZE,
    target_model_update=1e-2,
    policy=policy,
    processor=LSTMProcessor(),
)
dqn.compile(Adam(lr=1e-3), metrics=["mae"])

dqn.test(env, nb_episodes=5, visualize=True)
def main(args):
    """Train a DQN agent on the AutoturnSF environment.

    Creates the data directory named by ``args.data`` when it does not exist,
    builds a three-hidden-layer dense network sized by ``args.neurons`` with
    the activation named by ``args.activation``, optionally loads weights from
    ``args.weights`` and then fits the agent, logging through a
    ``TrainEpisodeFileLogger`` that is given the log and weights file paths.

    Raises:
        Exception: if ``args.data`` exists but is not a directory.
    """
    base = os.path.expanduser(args.data)
    if not os.path.exists(base):
        os.makedirs(base)
    if not os.path.isdir(base):
        raise Exception("Specified data directory is not a directory.")

    # The alias encodes the run configuration and prefixes both output files.
    alias = "dqn-%s-%s-%s-%d-%d-%d" % (args.reward, args.activation,
                                       args.policy, args.neurons,
                                       args.interval, args.memlength)
    logfile = os.path.join(base, "%s_log.tsv" % alias)
    weightsfile = os.path.join(base, "%s_weights.h5f" % alias)

    env = AutoturnSF_Env(alias, 4, visualize=args.visualize,
                         reward=args.reward, port=args.port)
    nb_actions = env.action_space.n

    model = Sequential()
    model.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
    for _ in range(3):
        model.add(Dense(args.neurons))
        model.add(get_activation(args.activation))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))
    # summary() prints by itself; wrapping it in print() only added a stray "None".
    model.summary()

    # Integer division keeps the step count an int on Python 3; it feeds the
    # memory limit, warm-up, target-update interval and total step count.
    STEPS_PER_EPISODE = int(env.get_max_frames() // args.frameskip)

    memory = SequentialMemory(limit=STEPS_PER_EPISODE * 500)
    #policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=.5, value_min=.05, value_test=.01, nb_steps=STEPS_PER_EPISODE*1000)
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps',
                                  value_max=.05, value_min=.001,
                                  value_test=.001,
                                  nb_steps=STEPS_PER_EPISODE * 1000)
    agent = DQNAgent(model=model, nb_actions=nb_actions, policy=policy,
                     window_length=1, memory=memory,
                     train_interval=args.interval,
                     nb_steps_warmup=STEPS_PER_EPISODE * 1, gamma=.99,
                     target_model_update=STEPS_PER_EPISODE * 1)
    agent.compile(Adam(lr=.0001), metrics=['mae'])

    if args.weights is not None and os.path.isfile(args.weights):
        print("Loading weights from file: %s" % args.weights)
        agent.load_weights(args.weights)

    log = TrainEpisodeFileLogger(env, agent, logfile, weightsfile)
    agent.fit(env, nb_steps=STEPS_PER_EPISODE * 10000, visualize=True,
              verbose=2, action_repetition=args.frameskip, callbacks=[log])
""" import gym import h5py import keras as K from keras import layers from keras.optimizers import Adam import numpy as np from PIL import Image from rl.core import Processor from rl.agents.dqn import DQNAgent from rl.memory import SequentialMemory from rl.policy import GreedyQPolicy create_q_model = __import__('train').create_q_model AtariProcessor = __import__('train').AtariProcessor if __name__ == '__main__': env = gym.make('BreakoutNoFrameskip-v4') state = env.reset() actions = env.action_space.n model = K.models.load_model('policy.h5') memory = SequentialMemory(limit=1000000, window_length=4) policy = GreedyQPolicy() process = AtariProcessor() dqn = DQNAgent(model=model, nb_actions=actions, memory=memory, policy=policy, processor=process) dqn.compile(optimizer=Adam(lr=.00025, clipnorm=1.0), metrics=['mae']) dqn.test(env, nb_episodes=10, visualize=True)
processor = AtariProcessor()

# Epsilon-greedy exploration annealed from 1.0 down to 0.1 over 1.25M steps.
policy = LinearAnnealedPolicy(
    EpsGreedyQPolicy(),
    attr='eps',
    value_max=1.,
    value_min=.1,
    value_test=.05,
    nb_steps=1250000,
)

dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    policy=policy,
    memory=memory,
    processor=processor,
    enable_double_dqn=True,
    enable_dueling_network=True,
    nb_steps_warmup=50000,
    gamma=.99,
    target_model_update=10000,
    train_interval=4,
    delta_clip=1.,
)

#Prioritized Memories typically use lower learning rates
dqn.compile(Adam(lr=.00025 / 4), metrics=['mae'])

folder_path = '../model_saves/PDD/'

if args.mode == 'train':
    weights_filename = folder_path + 'pdd_dqn_{}_weights.h5f'.format(args.env_name)
    checkpoint_weights_filename = (
        folder_path + 'pdd_dqn_' + args.env_name + '_weights_{step}.h5f')
# Fix both the NumPy and the environment seeds.
np.random.seed(42)
env.seed(42)

nb_actions = env.action_space.n
window = 4

model = create_q_model(nb_actions, window)
memory = SequentialMemory(limit=1000000, window_length=window)
processor = Atari2DProcessor()
policy = LinearAnnealedPolicy(
    EpsGreedyQPolicy(),
    attr='eps',
    value_max=1.,
    value_min=.1,
    value_test=.05,
    nb_steps=1000000,
)
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    nb_steps_warmup=10,
    processor=processor,
    memory=memory,
    policy=policy,
)
dqn.compile(K.optimizers.Adam(lr=.00025), metrics=['mae'])

# load weights.
dqn.load_weights('policy.h5')

# evaluate algorithm for 10 episodes.
dqn.test(env, nb_episodes=10, visualize=True)
value_test=.05, nb_steps=1000000) test_policy = EpsGreedyQPolicy(eps=0.05) if bool(args.double_dqn): print("DOUBLE DQN") if bool(args.dueling): print("DUELING NETWORK") dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, test_policy=test_policy, memory=memory, processor=processor, nb_steps_warmup=args.nb_steps_warmup, gamma=args.gamma, target_model_update=args.target_model_update, enable_double_dqn=bool(args.double_dqn), enable_dueling_network=bool(args.dueling), train_interval=4, delta_clip=1.) dqn.compile(Adam(lr=args.learning_rate), metrics=['mae']) if args.mode == 'train': weights_filename = os.path.join( args.log_dir, 'dqn_{}_weights.h5f'.format(args.env_name)) checkpoint_weights_filename = os.path.join( args.log_dir, 'dqn_' + args.env_name + '_weights_{step}.h5f')
processor = AtariProcessor() #This is the important difference. Rather than using an E Greedy approach, where #we keep the network consistent but randomize the way we interpret its predictions, #in NoisyNet we are adding noise to the network and simply choosing the best value. policy = GreedyQPolicy() #N-step loss with n of 3 dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory, processor=processor, enable_double_dqn=True, enable_dueling_network=True, nb_steps_warmup=10000, gamma=.99, target_model_update=10000, train_interval=4, delta_clip=1., n_step=3, custom_model_objects={"NoisyNetDense": NoisyNetDense}) #Prioritized Memories typically use lower learning rates dqn.compile(Adam(lr=.00025 / 4), metrics=['mae']) folder_path = 'model_saves/' if args.mode == 'train': checkpoint_weights_filename = folder_path + 'advanced_dqn_' + args.env_name + '_weights_{step}.h5f' callbacks = [
print("[INFO] Building model...")
print("Environment size is: rows: {} cols: {}".format(
    env.GAME_SIZE_ROWS, env.GAME_SIZE_COLS))
print("Memory window length is: {}".format(WINDOW_LENGTH))

input_rows = SokobanEnv.GAME_SIZE_ROWS
input_cols = SokobanEnv.GAME_SIZE_COLS

model = make_custom_model()
model.summary()

print("[INFO] Building DQNAgent...")
basic_memory = SequentialMemory(limit=MEMORY_LIMIT, window_length=WINDOW_LENGTH)

# Boltzmann exploration is active; the annealed eps-greedy alternative is kept
# below for reference.
#action_choice_policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1, value_test=.05, nb_steps=NUMBER_OF_STEPS_FOR_EXPLORATION)
action_choice_policy = BoltzmannQPolicy(tau=1., clip=(-500., 500.))

dqn = DQNAgent(
    model=model,
    nb_actions=NUMBER_OF_POSSIBLE_ACTIONS,
    policy=action_choice_policy,
    memory=basic_memory,
    processor=bugfix_processor,
    batch_size=MEMORY_REPLAY_BATCH_SIZE,
    enable_double_dqn=True,
    enable_dueling_network=True,
    nb_steps_warmup=NUMBER_OF_STEPS_FOR_WARMUP,
    gamma=GAMMA,
    target_model_update=10000,
    train_interval=4,
    delta_clip=1.,
)

opt = Adam(lr=.00025)  # default
#opt = Nadam(lr=.0005)
#opt = SGD(lr=0.00025, momentum=0.9, nesterov=True)
dqn.compile(optimizer=opt, metrics=['mae'])

if DO_LOAD_WIEGHTS:
    print("[INFO] Loading weights from file: {}".format(WEIGHTS_FILE_NAME))
    load_agent_weights(dqn, WEIGHTS_FILE_NAME)

if DO_LOAD_GAMES_FROM_FILES:
    print("[INFO] Loading recored games. Thisa may take a while...")
    # Pre-fill the agent's replay memory from recorded games.
    game_record_loader = SokobanManualGameMemoryLoader(
        agent_memory=basic_memory,
        memory_limit=LOADED_GAMES_MEMORY_LIMIT)
    game_record_loader.load_all_games()
def main():
    """
    Parses command line arguments, sets training environment parameters, creates
    deep Q-network and trains it on gym environment.

    One simulation ``Model`` is built for every combination of fleet size,
    surge multiplier, percent-knowing-fare and pro-driver share. In each one
    the last vehicle is flagged as an AV, a ``RebalancingEnv`` is wrapped
    around the model, and a small dense DQN is trained for ``nb_steps`` steps
    (``--nb``, default 300) before its weights are saved.
    """
    parser = argparse.ArgumentParser(
        description="Simulation of drivers' behavior")
    parser.add_argument(
        '-f', '--fleet',
        help='Fleet sizes to simulate, formatted as comma-separated list (i.e. "-f 250,275,300")')
    parser.add_argument(
        '-m', '--multiplier',
        help='Surge multiplier, formatted as comma-separated list (i.e. "-m 1,1.5,2")')
    parser.add_argument('-b', '--bonus', type=int, help='Bonus')
    parser.add_argument('-d', '--demand', help='Percent false demand ')
    parser.add_argument(
        '-k', '--know',
        help='Percent knowing fare, formatted as comma-separated list (i.e. "-k 1,1.5,2") ')
    parser.add_argument(
        '-p', '--pro',
        help='Percent pro drivers, formatted as comma-separated list (i.e. "-p 1,1.5,2") ')
    parser.add_argument(
        '-av', '--av',
        help='Percent AV drivers, formatted as comma-separated list (i.e. "-av 1,1.5,2") ')
    parser.add_argument('-nb', '--nb', help='number of steps to train Rl ')
    args = parser.parse_args()

    def float_list(raw):
        """Turn a comma-separated string into a list of floats."""
        return [float(x) for x in raw.split(',')]

    if args.fleet:
        fleet_sizes = [int(x) for x in args.fleet.split(',')]
    else:
        fleet_sizes = FLEET_SIZE
    surges = float_list(args.multiplier) if args.multiplier else [SURGE_MULTIPLIER]
    perc_know = float_list(args.know) if args.know else [PERCE_KNOW]
    # "is not None" so that an explicit "--bonus 0" is honoured.
    bonus = args.bonus if args.bonus is not None else BONUS
    pro_share = float_list(args.pro) if args.pro else [PRO_SHARE]
    percent_false_demand = float(args.demand) if args.demand else PERCENT_FALSE_DEMAND
    # NOTE(review): av_share is parsed but not used anywhere below.
    av_share = float_list(args.av) if args.av else [1]
    # argparse delivers --nb as a string; the agent needs an integer step count.
    nb_steps = int(args.nb) if args.nb else 300

    for fleet_size in fleet_sizes:
        for surge in surges:
            for perc_k in perc_know:
                for pro_s in pro_share:
                    m = Model(ZONE_IDS,
                              DEMAND_SOURCE,
                              WARMUP_TIME_HOUR,
                              ANALYSIS_TIME_HOUR,
                              fleet_size=fleet_size,
                              pro_share=pro_s,
                              surge_multiplier=surge,
                              bonus=bonus,
                              percent_false_demand=percent_false_demand,
                              percentage_know_fare=perc_k)

                    # make one veh to be AV
                    veh = m.vehilcs[-1]
                    veh.is_AV = True

                    env = RebalancingEnv(m, penalty=-0)

                    nb_actions = env.action_space.n
                    input_shape = (1, ) + env.state.shape

                    model = Sequential()
                    model.add(Flatten(input_shape=input_shape))
                    model.add(Dense(256, activation='relu'))
                    model.add(Dense(nb_actions, activation='linear'))

                    memory = SequentialMemory(limit=2000, window_length=1)
                    policy = EpsGreedyQPolicy()
                    dqn = DQNAgent(model=model,
                                   nb_actions=nb_actions,
                                   memory=memory,
                                   nb_steps_warmup=100,
                                   target_model_update=1e-2,
                                   policy=policy,
                                   gamma=.99)
                    dqn.compile(Adam(lr=0.001, epsilon=0.05, decay=0.0),
                                metrics=['mae'])

                    dqn.fit(env, nb_steps=nb_steps, action_repetition=1,
                            visualize=False, verbose=2)
                    # NOTE(review): the file name depends only on nb_steps, so every
                    # parameter combination overwrites the previous weights file.
                    dqn.save_weights('new_dqn_weights_%s.h5f' % (nb_steps),
                                     overwrite=True)
policy = LinearAnnealedPolicy( EpsGreedyQPolicy(), attr="eps", value_max=1.0, value_min=0.05, value_test=0, nb_steps=10000, ) # Defining our DQN dqn = DQNAgent( model=model, nb_actions=18, policy=policy, memory=memory, nb_steps_warmup=1000, gamma=0.5, target_model_update=1, delta_clip=0.01, enable_double_dqn=True, ) dqn.compile(Adam(lr=0.00025), metrics=["mae"]) # Training env_player.play_against( env_algorithm=dqn_training, opponent=third_opponent, env_algorithm_kwargs={ "dqn": dqn, "nb_steps": NB_TRAINING_STEPS
def main():
    """
    Train a dueling DQN on the EURUSD training environment in an endless
    train/test loop and record every test result in the database.

    Each iteration trains for ``env.split_point * TrainingEpisodesNumber``
    steps, saves the training weights, runs one test episode on the test
    environment, stores an extra weights snapshot when the profit exceeds 500
    and the minimum portfolio stayed above the threshold, and inserts a row
    into the ``reinforcetests`` table. A KeyboardInterrupt raised during the
    test phase ends the loop.
    """
    ml_variables = FXU.getMLVariables()
    sqlEngine = FXU.getSQLEngine()
    reinforce_test_tablename = "reinforcetests"

    ### Clear the actions table
    FXU.execute_query_db("DELETE FROM metaactions", sqlEngine)

    env = ForexEnv(type="train", inputSymbol="EURUSD", show_trade=True)
    env_test = ForexEnv(type="test", inputSymbol="EURUSD", show_trade=True)

    n_actions = env.action_space.n
    print("Number of actions : ", n_actions)
    model = create_model(shape=env.observation_space.shape,
                         n_actions=n_actions)
    print(model.summary())

    #### Configuring the agent
    memory = SequentialMemory(limit=100000, window_length=env.window_size)
    policy = EpsGreedyQPolicy()
    # enable the dueling network
    # you can specify the dueling_type to one of {'avg','max','naive'}
    dqn = DQNAgent(model=model,
                   nb_actions=n_actions,
                   memory=memory,
                   nb_steps_warmup=100,
                   enable_dueling_network=True,
                   dueling_type='avg',
                   target_model_update=1e-2,
                   policy=policy)
    dqn.compile(Adam(lr=1e-4), metrics=['mae'])

    minPortfolioThreshold = 0.4
    training_episodes_n = int(ml_variables['TrainingEpisodesNumber'])

    # One path for both resuming and saving. The resume check previously used
    # a backslash-separated path that only matched the saved file on Windows,
    # so previous weights were never found elsewhere.
    training_weights_path = './model/dqnTrainingWeights_{0}.h5f'.format(
        env.symbol)

    ##### Load weights if available to resume previous learning
    if ml_variables['LoadWeights'] != 'no':
        if ospath.isfile(training_weights_path):
            print(
                "Weights for the previous session exist, so Going to load the weights"
            )
            dqn.load_weights(training_weights_path)

    while True:
        # Train :
        dqn.fit(env,
                nb_steps=(env.split_point * training_episodes_n),
                nb_max_episode_steps=60000,
                visualize=False,
                verbose=2)
        dqn.save_weights(training_weights_path, overwrite=True)

        try:
            dqn.test(env_test, nb_episodes=1, visualize=False)
            # Profit is read from the test environment itself rather than
            # from the history returned by dqn.test.
            reward = env_test.portfolio - env_test.starting_balance
            print("Total Profit : ", reward)
            now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

            # Keep a separate snapshot for runs that were clearly profitable
            # without ever dropping below the portfolio threshold.
            if reward > 500 and env_test.minPortfolio > (
                    minPortfolioThreshold * env_test.starting_balance):
                dqn.save_weights('./model/duel_dqn_reward_{0}_{1}.h5f'.format(
                    env_test.symbol, int(reward)),
                                 overwrite=True)

            # NOTE(review): the statement is assembled with str.format from
            # values produced by this program; if FXU.execute_query_db
            # supports bound parameters, prefer them.
            FXU.execute_query_db(
                "INSERT INTO {0}(Symbol,StartingBalance,TotalProfit,Time,MinPortfolio) VALUES('{1}','{2}','{3}','{4}','{5}')"
                .format(reinforce_test_tablename, env_test.symbol,
                        env_test.starting_balance, reward, now,
                        env_test.minPortfolio), sqlEngine)
        except KeyboardInterrupt:
            return
def run_agent(agent):
    """
    Build a convolutional DQN for the restricted-view Tron environment and
    start training it.

    TensorFlow is imported and configured inside the function (GPU memory
    growth enabled) before any Keras object is created. The function then
    builds the network, the Boltzmann exploration policy, the replay memory
    and the agent, defines the `train`, `opponent` and `test` entry points
    and finally calls `train()`.

    :param agent: passed unchanged to RestrictedViewTronEnv
    """
    print("started new process")
    import tensorflow as tf
    from keras.backend.tensorflow_backend import set_session

    # Let TensorFlow allocate GPU memory on demand.
    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True
    set_session(tf.Session(config=session_config))

    WINDOW_LENGTH = 1
    num_actions = 3
    view_shape = (41, 41)
    input_shape = (WINDOW_LENGTH, ) + view_shape

    env = RestrictedViewTronEnv(agent, 20)

    # Two convolution layers followed by a dense head with one linear output per action.
    model = Sequential([
        Permute((2, 3, 1), input_shape=input_shape),
        Conv2D(16, (3, 3), padding="same"),
        Activation("relu"),
        Conv2D(32, (3, 3), padding="same"),
        Activation("relu"),
        Flatten(),
        Dense(256),
        Activation("relu"),
        Dense(num_actions, activation="linear"),
    ])

    np.random.seed(1111)

    # Boltzmann exploration whose temperature "tau" is annealed from 2.0 to 0.1.
    policy = LinearAnnealedPolicy(BoltzmannQPolicy(),
                                  attr='tau',
                                  value_max=2.,
                                  value_min=.1,
                                  value_test=.1,
                                  nb_steps=1000000 // 10)
    processor = TronProcessor()
    memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)

    dqn = DQNAgent(model,
                   nb_actions=num_actions,
                   policy=policy,
                   memory=memory,
                   processor=processor,
                   nb_steps_warmup=50000 // 5,
                   gamma=.9,
                   target_model_update=10000,
                   train_interval=4,
                   delta_clip=1.)
    dqn.compile(Adam(lr=.00025), metrics=["mae"])

    weights_filename = 'tmp/dqn_test_weights.h5f'
    checkpoint_weights_filename = 'tmp/dqn_test_weights_{step}.h5f'
    log_filename = 'tmp/dqn_test_log.json'
    # Periodic weight checkpoints plus a JSON training log.
    callbacks = [
        ModelIntervalCheckpoint(checkpoint_weights_filename,
                                interval=250000 // 10),
        FileLogger(log_filename, interval=10000),
    ]

    def train(transfer=False):
        # Optionally continue from previously saved weights, then train,
        # save and run 20 visualized test episodes.
        print(dqn.get_config())
        if transfer:
            dqn.load_weights(weights_filename)
        dqn.fit(env,
                callbacks=callbacks,
                nb_steps=1750000 // 10,
                log_interval=10000)
        dqn.save_weights(weights_filename, overwrite=True)
        dqn.test(env, nb_episodes=20, visualize=True)

    def opponent():
        # Play a long run of test episodes with the saved weights.
        dqn.load_weights('tmp/dqn_test_weights.h5f')
        dqn.test(env, nb_episodes=200000, visualize=False)

    def test():
        # Evaluate the saved weights for 200 episodes.
        dqn.load_weights('tmp/dqn_test_weights.h5f')
        #dqn.load_weights('tmp/dqn-1/dqn_test_weights.h5f')
        dqn.test(env, nb_episodes=200, visualize=False)

    # uncomment for starting an 'opponent' agent
    #opponent()

    # uncomment for training an agent
    train()  #True
def main(train_data, test_data, FOLDER):
    """
    Initialize all parameters, the neural net and the agent, then run
    training (optionally with validation) followed by testing.

    :param train_data: training data handed to the environment
    :param test_data: test data handed to the environment
    :param FOLDER: first positional argument of the environment constructors
    """
    # save in a file the parameters you are using for this model
    write_model_info()

    # set up Environment and variables; both environments share the same
    # leading positional arguments
    shared_args = (FOLDER, STEPS, train_data, test_data, TEST_POINTS)
    if METHOD == trailing:
        env = TrailEnv(*shared_args,
                       limit_data=DATA_SIZE,
                       one_hot=ONE_HOT,
                       cost=COST,
                       margin=MARGIN,
                       turn=TURN,
                       ce=CE,
                       dp=DP,
                       normalize_in=NORMALIZE_IN,
                       reset_margin=RESET_FROM_MARGIN)
    else:
        env = DengEnv(*shared_args,
                      window_in=WINDOW_LENGTH,
                      limit_data=DATA_SIZE,
                      one_hot=ONE_HOT,
                      cost=COST_D)

    # set up the model (build the agent's network)
    model = set_model(env)

    # replay memory used to store experience; limit is the maximum number of
    # entries kept
    memory = SequentialMemory(limit=MEM_SIZE, window_length=WINDOW_LENGTH)

    # Exploration policy: an eps-greedy Q policy has to balance exploration
    # against exploitation, and this wrapper controls the eps value.
    # value_max / value_min: eps starts at the maximum and decays to the minimum.
    # nb_steps: number of training steps.
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                                  attr='eps',
                                  value_max=1.0,
                                  value_min=0.1,
                                  value_test=0.05,
                                  nb_steps=EXPLORE_STEPS)

    # set up number of actions (outputs)
    nb_actions = env.action_space.n

    # set up keras-rl agent
    # nb_steps_warmup: how much experience is collected before training starts
    agent_kwargs = dict(model=model,
                        gamma=GAMMA,
                        nb_actions=nb_actions,
                        memory=memory,
                        batch_size=BATCH_SIZE,
                        nb_steps_warmup=1000,
                        target_model_update=TAR_MOD_UP,
                        policy=policy,
                        delta_clip=DELTA_CLIP)
    dqn = DQNAgent(**agent_kwargs)
    dqn.compile(Adam(lr=LR, decay=LR_DEC), metrics=['mse'])

    if START_FROM_TRAINED:
        dqn.load_weights(TRAINED_WEIGHTS)

    if VALIDATE:
        print('=====================1=======================')
        train_w_validation(env, dqn)
    else:
        print('=====================2=======================')
        train(env, dqn)

    # tally long / short positions
    fin_stats(env, STEPS)
    test(env, dqn)
# Last layers of the Q-network: a 16-unit hidden layer and one linear
# output per action.
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions, activation='linear'))
print(model.summary())

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=50000, window_length=1)
policy = BoltzmannQPolicy()
# enable the dueling network
# you can specify the dueling_type to one of {'avg','max','naive'}
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10,
               enable_dueling_network=True, dueling_type='avg', target_model_update=1e-2, policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

if args.mode == 'train':
    # Okay, now it's time to learn something! We visualize the training here for show, but this
    # slows down training quite a lot. You can always safely abort the training prematurely using
    # Ctrl + C.
    # callbacks = []
    # if model_checkpoints:
    #     callbacks += [
    #         ModelIntervalCheckpoint(
    #             './checkpoints/checkpoint_weights.h5f',
def __init__(self, number_of_training_steps=1e5, gamma=0.999, load_weights=False,
             visualize=False, dueling_network=True, double_dqn=True, nn_type='mlp',
             **kwargs):
    """
    Agent constructor: creates the gym environment, the Q-network, the
    replay memory and the compiled DQN agent.

    :param number_of_training_steps: int, number of steps to train agent for
    :param gamma: float, value between 0 and 1 used to discount future DQN returns
    :param load_weights: boolean, import existing weights
    :param visualize: boolean, visualize environment
    :param dueling_network: boolean, use dueling network architecture
    :param double_dqn: boolean, use double DQN for Q-value approximation
    :param nn_type: str, network type passed to `create_model` as `name`
    :param kwargs: forwarded unchanged to `gym.make`

    Environment options arriving through `kwargs` (presumably consumed by
    the environment itself - verify against its constructor):
        step_size: int, number of steps to take in env for a given simulation step
        window_size: int, number of lags to include in observation
        max_position: int, maximum number of positions able to be held in inventory
        fitting_file: str, file used for z-score fitting
        testing_file: str, file used for dqn experiment
        env / id: environment name
        seed: int, random seed number
        action_repeats: int, number of steps to take in environment between actions
        format_3d: boolean, format observation as matrix or tensor
        train: boolean, train or test agent
        z_score: boolean, standardize observation space
    """
    # Agent arguments
    # self.env_name = id
    self.neural_network_type = nn_type
    self.load_weights = load_weights
    self.number_of_training_steps = number_of_training_steps
    self.visualize = visualize

    # Create environment; its id is read from the wrapped environment
    self.env = gym.make(**kwargs)
    self.env_name = self.env.env.id

    # Create agent
    # NOTE: 'Keras-RL' uses its own frame-stacker
    self.memory_frame_stack = 1  # Number of frames to stack e.g., 1.
    self.model = self.create_model(name=self.neural_network_type)
    self.memory = SequentialMemory(limit=10000,
                                   window_length=self.memory_frame_stack)
    self.train = self.env.env.training
    script_path = os.path.realpath(__file__)
    self.cwd = os.path.dirname(script_path)

    # create the agent
    agent_options = dict(
        model=self.model,
        nb_actions=self.env.action_space.n,
        memory=self.memory,
        processor=None,
        nb_steps_warmup=500,
        enable_dueling_network=dueling_network,
        dueling_type='avg',
        enable_double_dqn=double_dqn,
        gamma=gamma,
        target_model_update=1000,
        delta_clip=1.0,
    )
    self.agent = DQNAgent(**agent_options)
    self.agent.compile(Adam(lr=float("3e-4")), metrics=['mae'])
# Small fully connected Q-network: the observation is flattened, passed
# through two 5-unit hidden layers and mapped to one linear output per action.
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Activation('relu'))
model.add(Dense(5))
model.add(Activation('relu'))
model.add(Dense(5))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=50000, window_length=1)
policy = BoltzmannQPolicy(tau=0.01)
#policy = EpsGreedyQPolicy(eps=0.2)
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10,target_model_update=1e-2, policy=policy, gamma=0.5)
#dqn = SarsaAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, policy=policy)
#dqn = DDPGAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10,target_model_update=1e-2, policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Okay, now it's time to learn something! If we visualize the training here for show, this
# slows down training quite a lot. We can also always safely abort the training prematurely using
# Ctrl + C.
env.testing = False
foo = dqn.fit(env, nb_steps=200, visualize=False, verbose = 0 )

# Current UTC time as a Unix timestamp; it makes the output file name unique.
d = datetime.utcnow()
unixtime = calendar.timegm(d.utctimetuple())

###save env.unwrapped.totalStates and env.unwrapped.actions as: [state,action] pairs
# NOTE(review): opening in 'wb' for csv.writer is the Python 2 convention; under
# Python 3 writing rows to a binary file raises TypeError - confirm the target
# interpreter (Python 3 would need mode 'w' with newline='').
with open('sartFeedback_' + str(unixtime) + '.csv', 'wb') as f:
    writer = csv.writer(f)
# Final layers of the Q-network: a 128-unit hidden layer and one linear
# output per available action.
model.add(Dense(128, activation='relu'))
model.add(Dense(env.action_space.n, activation='linear'))

# replay memory and the policy used for action selection
memory = SequentialMemory(limit=50000, window_length=1)
policy = EpsGreedyQPolicy()
#policy = MaxBoltzmannQPolicy()

# prints number of actions
print("--------------------------------------")
print(env.action_space.n)
print("---------------------------------------")

# deep q learning agent (dueling + double DQN)
agent_settings = dict(
    model=model,
    nb_actions=env.action_space.n,
    enable_dueling_network=True,
    enable_double_dqn=True,
    memory=memory,
    target_model_update=1e-2,
    nb_steps_warmup=2000,
    policy=policy,
)
player1 = DQNAgent(**agent_settings)
#target_model_update=1e-2,

# compiles and fits the agent, then saves its weights
player1.compile(Adam(lr=1e-3), metrics=['mae'])
player1.fit(env, action_repetition=3, nb_steps=10000, visualize=False)
print(model.summary())
player1.save_weights(r'../models/mk_18.h5f', overwrite=True)
#player1.test(env, nb_episodes=10, visualize=False)
def __init__(self, model, explorations, trainSize, validationSize, testSize, outputFile, begin, end, nbActions, isOnlyShort, ensembleFolderName, operationCost=0):
    """
    Set up the agent, the walk boundaries and the hourly date index.

    model: network handed to the DQN agent
    explorations: stored as received for later use
    trainSize, validationSize, testSize: sizes of the three parts of one walk;
        their sum is stored as walkSize
    outputFile: stored as outputFileName
    begin, end: first starting point and ending point of the walks
    nbActions: number of actions of the agent
    isOnlyShort, ensembleFolderName: stored as received
    operationCost: cost applied to long and short operations (default 0)
    """
    self.isOnlyShort = isOnlyShort
    self.ensembleFolderName = ensembleFolderName

    # Policy, explorations, actions and model as received by parameters
    self.policy = EpsGreedyQPolicy()
    self.explorations = explorations
    self.nbActions = nbActions
    self.model = model

    # Replay memory
    self.memory = SequentialMemory(limit=10000, window_length=1)

    # Agent built from the parameters received, compiled with the Adam
    # optimizer and the mean absolute error metric
    agent_kwargs = dict(model=self.model,
                        policy=self.policy,
                        nb_actions=self.nbActions,
                        memory=self.memory,
                        nb_steps_warmup=200,
                        target_model_update=1e-1,
                        enable_double_dqn=True,
                        enable_dueling_network=True)
    self.agent = DQNAgent(**agent_kwargs)
    self.agent.compile(Adam(lr=1e-3), metrics=['mae'])

    # Persist the initial (random) weights in the q.weights file
    self.agent.save_weights("q.weights", overwrite=True)

    # The walks start at the initial date and stop at the final date
    self.currentStartingPoint = begin
    self.endingPoint = end

    # Training, validation and testing sizes as informed by the call;
    # one walk covers all three
    self.trainSize = trainSize
    self.validationSize = validationSize
    self.testSize = testSize
    self.walkSize = trainSize + validationSize + testSize

    # Hourly dataset: the raw frame is kept in self.dates and the file is read
    # a second time to derive the index below
    self.dates = pd.read_csv('./datasets/'+MK+'Hour.csv')
    hourly = pd.read_csv('./datasets/'+MK+'Hour.csv')

    # Combine the Date and Time columns into one Datetime column, index the
    # frame by it, drop the two source columns and keep only the index: the
    # date/time values are what define train, validation and test of each walk
    hourly['Datetime'] = pd.to_datetime(hourly['Date'] + ' ' + hourly['Time'])
    hourly = hourly.set_index('Datetime')
    hourly = hourly.drop(['Time', 'Date'], axis=1)
    self.sp = hourly.index

    # Operation cost is the cost for long and short
    self.operationCost = operationCost

    # Callbacks for training, validation and test, used to show results for
    # each episode
    self.trainer = ValidationCallback()
    self.validator = ValidationCallback()
    self.tester = ValidationCallback()
    self.outputFileName = outputFile
def build_train_test(args, timesteps):
    """
    Build the Atari DQN agent and either train or test it.

    :param args: namespace providing `env_name`, `mode` ('train' or 'test')
        and, in test mode, `weights` (optional path of the weights to load)
    :param timesteps: number of training steps passed to `dqn.fit`
    :return: the value returned by `dqn.test` after 10 evaluation episodes
    :raises ValueError: if `args.mode` is neither 'train' nor 'test'
    """
    # Reject an unsupported mode before anything expensive is built; the
    # function previously failed only at the very end with an unbound `result`.
    if args.mode not in ('train', 'test'):
        raise ValueError(
            "Unsupported mode {!r}; expected 'train' or 'test'.".format(
                args.mode))

    # Get the environment and extract the number of actions.
    env = gym.make(args.env_name)
    np.random.seed(123)
    env.seed(123)
    nb_actions = env.action_space.n

    # Next, we build our model. We use the same model that was described by Mnih et al. (2015).
    input_shape = (WINDOW_LENGTH, ) + INPUT_SHAPE
    model = Sequential()
    if K.image_dim_ordering() == 'tf':
        # (width, height, channels)
        model.add(Permute((2, 3, 1), input_shape=input_shape))
    elif K.image_dim_ordering() == 'th':
        # (channels, width, height)
        model.add(Permute((1, 2, 3), input_shape=input_shape))
    else:
        raise RuntimeError('Unknown image_dim_ordering.')
    model.add(Convolution2D(32, (8, 8), strides=(4, 4)))
    model.add(Activation('relu'))
    model.add(Convolution2D(64, (4, 4), strides=(2, 2)))
    model.add(Activation('relu'))
    model.add(Convolution2D(64, (3, 3), strides=(1, 1)))
    model.add(Activation('relu'))
    model.add(Flatten())
    model.add(Dense(512))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))
    print(model.summary())

    # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
    # even the metrics!
    memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)
    processor = AtariProcessor()

    # Select a policy. We use eps-greedy action selection, which means that a random action is selected
    # with probability eps. We anneal eps from 1.0 to 0.1 over the course of 1M steps. This is done so that
    # the agent initially explores the environment (high eps) and then gradually sticks to what it knows
    # (low eps). We also set a dedicated eps value that is used during testing. Note that we set it to 0.05
    # so that the agent still performs some random actions. This ensures that the agent cannot get stuck.
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                                  attr='eps',
                                  value_max=1.,
                                  value_min=.1,
                                  value_test=.05,
                                  nb_steps=1000000)

    # The trade-off between exploration and exploitation is difficult and an on-going research topic.
    # If you want, you can experiment with the parameters or use a different policy. Another popular one
    # is Boltzmann-style exploration:
    # policy = BoltzmannQPolicy(tau=1.)
    # Feel free to give it a try!

    dqn = DQNAgent(model=model,
                   nb_actions=nb_actions,
                   policy=policy,
                   memory=memory,
                   processor=processor,
                   nb_steps_warmup=50000,
                   gamma=.99,
                   target_model_update=10000,
                   train_interval=4,
                   delta_clip=1.)
    dqn.compile(Adam(lr=.00025), metrics=['mae'])

    # Default weights file, shared by the train (save) and test (load) paths.
    default_weights_filename = 'dqn_{}_weights.h5f'.format(args.env_name)

    if args.mode == 'train':
        # Okay, now it's time to learn something! Notice that you can use the
        # built-in Keras callbacks: periodic checkpoints and a JSON log file.
        checkpoint_weights_filename = 'dqn_' + args.env_name + '_weights_{step}.h5f'
        log_filename = 'dqn_{}_log.json'.format(args.env_name)
        callbacks = [
            ModelIntervalCheckpoint(checkpoint_weights_filename,
                                    interval=250000),
            FileLogger(log_filename, interval=100),
        ]
        dqn.fit(env,
                callbacks=callbacks,
                nb_steps=timesteps,
                log_interval=10000)

        # After training is done, we save the final weights one more time.
        dqn.save_weights(default_weights_filename, overwrite=True)

        # Finally, evaluate our algorithm for 10 episodes.
        return dqn.test(env, nb_episodes=10, visualize=False)

    # args.mode == 'test': explicitly given weights take precedence over the default file.
    weights_filename = args.weights or default_weights_filename
    dqn.load_weights(weights_filename)
    return dqn.test(env, nb_episodes=10, visualize=False)