Python DQNAgent.DQNAgent示例，rl.agents.dqn.DQNAgent.DQNAgent Python示例

示例#1

0

显示文件

# Experience memory handed to the agent; each stored state is a single
# observation (window_length=1).
memory = PriorityMemory(limit=MEMORY_SIZE,
                        minority_chance=MINORITY_CHANCE,
                        window_length=1)
# Epsilon-greedy policy whose `eps` attribute is annealed between EPS_MAX and
# EPS_MIN over EPS_STEPS steps; 0.05 is the value configured for testing.
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                              attr="eps",
                              value_max=EPS_MAX,
                              value_min=EPS_MIN,
                              value_test=0.05,
                              nb_steps=EPS_STEPS)

# Two-action DQN agent; model, processor and all hyper-parameters come from
# names defined earlier in the file.
dqn = DQNAgent(model=model,
               policy=policy,
               nb_actions=2,
               memory=memory,
               processor=processor,
               nb_steps_warmup=WARMUP_STEPS,
               gamma=GAMMA,
               target_model_update=TARGET_MODEL_UPDATE,
               train_interval=1,
               delta_clip=1,
               batch_size=BATCH_SIZE,
               enable_double_dqn=DOUBLE_DQN)
dqn.compile(Adam(lr=LR))

# Validation callback built from the held-out data (X_val, y_val).
# NOTE(review): `interval=5_000` is presumably a step count - confirm in Metrics.
metrics = Metrics(X_val, y_val, interval=5_000)
dqn.fit(env,
        nb_steps=training_steps,
        log_interval=LOG_INTERVAL,
        callbacks=[metrics])
# Persist the target network (not the online model) to FP_MODEL.
dqn.target_model.save(FP_MODEL)

# Validate on test dataset

示例#2

0

显示文件

        self.rewards[self.episode] = rw
        self.avgrewards[self.episode] = np.mean(self.rewardbuf)
        self.plot()
        self.episode += 1

    def plot(self):
        """Load the reward and average-reward series into their plot lines and redraw."""
        line_data_pairs = (
            (self.grphinst, self.rewards),
            (self.grphavg, self.avgrewards),
        )
        for line, values in line_data_pairs:
            line.set_ydata(values)
        plt.draw()
        # Short pause after the draw call, as in the original.
        plt.pause(0.01)


# Plain (non-double) DQN agent built from the model, memory and policy defined
# earlier in the file.
dqn = DQNAgent(model=model,
               nb_actions=nb_actions,
               memory=memory,
               nb_steps_warmup=10,
               target_model_update=1e-2,
               policy=policy,
               enable_double_dqn=False)
dqn.compile(Adam(lr=0.002, decay=2.25e-05), metrics=['mse'])

# Callbacks: epsilon decay on the shared policy object plus a live reward plot.
# NOTE(review): the keyword is spelled `eps_poilcy`; it must match the parameter
# name declared by EpsDecayCallback (defined outside this excerpt) - confirm.
cbs = [EpsDecayCallback(eps_poilcy=policy, decay_rate=0.975)]
cbs += [LivePlotCallback(nb_episodes=4000, avgwindow=20)]
dqn.fit(env, nb_steps=1000000, visualize=True, verbose=2, callbacks=cbs)

# Save the trained weights, replacing any existing file for this environment.
dqn.save_weights('monitor/dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

# Evaluate the trained agent for 100 episodes.
dqn.test(env, nb_episodes=100, visualize=True)

env.close()

示例#3

0

显示文件

    def run(self):
        """Run every walk: train, validate and test the agent, logging the metrics.

        A walk spans ``self.walkSize`` from ``self.currentStartingPoint``; walks
        repeat while the walk still ends on or before ``self.endingPoint``, and
        the starting point advances by ``self.testSize`` after each walk.

        For each walk the method:
          * writes a CSV named ``self.outputFileName`` + 1-based walk number with
            a header and one row of train/validation/test metrics per epoch;
          * rebuilds ``self.memory`` and ``self.agent`` and loads "q.weights";
          * saves the validation and test ensemble frames under
            ``./Output/ensemble/<self.ensembleFolderName>/`` using the 0-based
            walk number.
        """
        oneHour = datetime.timedelta(hours=1)

        def locate(*offsets):
            """Return the position in ``self.sp`` of the starting point plus ``offsets``.

            Offsets are added one at a time, left to right. While the resulting
            timestamp is not present, ``self.currentStartingPoint`` is pushed
            forward by one hour and the lookup is retried.
            """
            while True:
                target = self.currentStartingPoint
                for offset in offsets:
                    target = target + offset
                try:
                    return self.sp.get_loc(target)
                except KeyError:
                    # Only a missing timestamp triggers the retry; anything else
                    # (including Ctrl-C) propagates instead of looping forever.
                    # NOTE(review): assumes self.sp is a pandas Index - confirm.
                    self.currentStartingPoint += oneHour

        # Column names of the per-walk CSV, in the order the rows are written.
        header = (
            "Iteration",
            "trainAccuracy", "trainCoverage", "trainReward",
            "trainLong%", "trainShort%",
            "trainLongAcc", "trainShortAcc",
            "trainLongPrec", "trainShortPrec",
            "validationAccuracy", "validationCoverage", "validationReward",
            "validationLong%", "validationShort%",
            "validationLongAcc", "validationShortAcc",
            "validLongPrec", "validShortPrec",
            "testAccuracy", "testCoverage", "testReward",
            "testLong%", "testShort%",
            "testLongAcc", "testShortAcc",
            "testLongPrec", "testShortPrec",
        )

        # Placeholders so the first ``del`` inside the epoch loop has a name to drop.
        trainEnv = validEnv = testEnv = None

        # Index of the current walk (0-based).
        iteration = -1

        while self.currentStartingPoint + self.walkSize <= self.endingPoint:
            iteration += 1

            # The handle stays reachable as self.outputFile, as before, and is
            # closed even when training raises.
            with open(self.outputFileName + str(iteration + 1) + ".csv", "w+") as self.outputFile:
                self.outputFile.write(",".join(header) + "\n")

                # Drop the previous memory and agent before building fresh ones.
                del self.memory
                del self.agent
                self.memory = SequentialMemory(limit=10000, window_length=1)
                self.agent = DQNAgent(model=self.model, policy=self.policy, nb_actions=self.nbActions,
                                      memory=self.memory, nb_steps_warmup=200, target_model_update=1e-1,
                                      enable_double_dqn=True, enable_dueling_network=True)
                self.agent.compile(Adam(lr=1e-3), metrics=['mae'])

                # Every walk starts from the same saved weights.
                self.agent.load_weights("q.weights")

                # Resolve the index bounds of the three windows.
                # NOTE(review): a retry in a later lookup shifts
                # currentStartingPoint after earlier limits were resolved, and
                # those limits are not recomputed (kept from the original).
                trainMinLimit = locate()
                trainMaxLimit = locate(self.trainSize)
                validMinLimit = trainMaxLimit + 1
                validMaxLimit = locate(self.trainSize, self.validationSize)
                testMinLimit = validMaxLimit + 1
                testMaxLimit = locate(self.trainSize, self.validationSize, self.testSize)

                # Frames indexed by the unique dates of each window; handed to
                # the validation/test environments and saved after the walk.
                # ``.loc`` replaces ``.ix``, which was removed in pandas 1.0.
                ensambleValid = pd.DataFrame(
                    index=self.dates[validMinLimit:validMaxLimit].loc[:, 'Date'].drop_duplicates().tolist())
                ensambleTest = pd.DataFrame(
                    index=self.dates[testMinLimit:testMaxLimit].loc[:, 'Date'].drop_duplicates().tolist())
                ensambleValid.index.name = 'Date'
                ensambleTest.index.name = 'Date'

                # Each entry of self.explorations is (epsilon, number of epochs).
                for eps in self.explorations:
                    self.policy.eps = eps[0]

                    for i in range(0, eps[1]):
                        # Release each previous environment before creating its replacement.
                        del trainEnv
                        trainEnv = SpEnv(operationCost=self.operationCost, minLimit=trainMinLimit,
                                         maxLimit=trainMaxLimit, callback=self.trainer,
                                         isOnlyShort=self.isOnlyShort)
                        del validEnv
                        validEnv = SpEnv(operationCost=self.operationCost, minLimit=validMinLimit,
                                         maxLimit=validMaxLimit, callback=self.validator,
                                         isOnlyShort=self.isOnlyShort, ensamble=ensambleValid,
                                         columnName="iteration" + str(i))
                        del testEnv
                        testEnv = SpEnv(operationCost=self.operationCost, minLimit=testMinLimit,
                                        maxLimit=testMaxLimit, callback=self.tester,
                                        isOnlyShort=self.isOnlyShort, ensamble=ensambleTest,
                                        columnName="iteration" + str(i))

                        # Reset the three metric callbacks.
                        self.trainer.reset()
                        self.validator.reset()
                        self.tester.reset()

                        # Training phase: 80% of the training window's day count.
                        trainEnv.resetEnv()
                        self.agent.fit(trainEnv,
                                       nb_steps=floor(self.trainSize.days - self.trainSize.days * 0.2),
                                       visualize=False, verbose=0)
                        (_, trainCoverage, trainAccuracy, trainReward, trainLongPerc, trainShortPerc,
                         trainLongAcc, trainShortAcc, trainLongPrec, trainShortPrec) = self.trainer.getInfo()
                        print(str(i) + " TRAIN:  acc: " + str(trainAccuracy) + " cov: " + str(trainCoverage) + " rew: " + str(trainReward))

                        # Validation phase.
                        validEnv.resetEnv()
                        self.agent.test(validEnv,
                                        nb_episodes=floor(self.validationSize.days - self.validationSize.days * 0.2),
                                        visualize=False, verbose=0)
                        (_, validCoverage, validAccuracy, validReward, validLongPerc, validShortPerc,
                         validLongAcc, validShortAcc, validLongPrec, validShortPrec) = self.validator.getInfo()
                        print(str(i) + " VALID:  acc: " + str(validAccuracy) + " cov: " + str(validCoverage) + " rew: " + str(validReward))

                        # Test phase.
                        # NOTE(review): the episode count is derived from
                        # validationSize, not testSize; kept as in the original -
                        # confirm whether that is intended.
                        testEnv.resetEnv()
                        self.agent.test(testEnv,
                                        nb_episodes=floor(self.validationSize.days - self.validationSize.days * 0.2),
                                        visualize=False, verbose=0)
                        (_, testCoverage, testAccuracy, testReward, testLongPerc, testShortPerc,
                         testLongAcc, testShortAcc, testLongPrec, testShortPrec) = self.tester.getInfo()
                        print(str(i) + " TEST:  acc: " + str(testAccuracy) + " cov: " + str(testCoverage) + " rew: " + str(testReward))
                        print(" ")

                        # One CSV row per epoch, in header order.
                        row = (
                            i,
                            trainAccuracy, trainCoverage, trainReward,
                            trainLongPerc, trainShortPerc,
                            trainLongAcc, trainShortAcc,
                            trainLongPrec, trainShortPrec,
                            validAccuracy, validCoverage, validReward,
                            validLongPerc, validShortPerc,
                            validLongAcc, validShortAcc,
                            validLongPrec, validShortPrec,
                            testAccuracy, testCoverage, testReward,
                            testLongPerc, testShortPerc,
                            testLongAcc, testShortAcc,
                            testLongPrec, testShortPrec,
                        )
                        self.outputFile.write(",".join(str(value) for value in row) + "\n")

            # The next walk starts one test window later.
            self.currentStartingPoint += self.testSize

            # Save the per-epoch ensemble frames of this walk for later processing.
            ensambleValid.to_csv("./Output/ensemble/" + self.ensembleFolderName + "/walk" + str(iteration) + "ensemble_valid.csv")
            ensambleTest.to_csv("./Output/ensemble/" + self.ensembleFolderName + "/walk" + str(iteration) + "ensemble_test.csv")

示例#4

0

显示文件

# Training policy: epsilon-greedy, wrapped so that `eps` is annealed from its
# initial value to 0 over nb_steps; 0 is the value configured for testing.
policy = EpsGreedyQPolicy(eps=eps)
policy = LinearAnnealedPolicy(policy,
                              attr='eps',
                              value_max=eps,
                              value_min=0,
                              value_test=0,
                              nb_steps=nb_steps)
# Separate greedy policy handed to the agent as `test_policy`.
test_policy = GreedyQPolicy()

####################################################################

# Agent configuration; the warm-up spans one observation window plus one batch.
dqn = DQNAgent(model=model,
               nb_actions=nb_actions,
               memory=memory,
               nb_steps_warmup=window_length + batch_size,
               target_model_update=0.02,
               policy=policy,
               test_policy=test_policy,
               batch_size=batch_size,
               train_interval=train_interval,
               gamma=gamma)

dqn.compile(Adam(lr=0.00025), metrics=['mae'])

# Load previously saved weights from 'dqn_lunar_weights.h5f'.
dqn.load_weights('dqn_{}_weights.h5f'.format('lunar'))

####################################################################

# Evaluate for 10 episodes and keep the returned history object.
nb_episodes = 10

history = dqn.test(env, nb_episodes=nb_episodes)

示例#5

0

显示文件

# Select a policy. We use eps-greedy action selection, which means that a random action is selected
# with probability eps. We anneal eps from 1.0 to 0.1 over the course of 1M steps. This is done so that
# the agent initially explores the environment (high eps) and then gradually sticks to what it knows
# (low eps). We also set a dedicated eps value that is used during testing. Note that we set it to 0.05
# so that the agent still performs some random actions. This ensures that the agent cannot get stuck.
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1, value_min=.1, value_test=.05,
                              nb_steps=1000000)

# The trade-off between exploration and exploitation is difficult and an on-going research topic.
# If you want, you can experiment with the parameters or use a different policy. Another popular one
# is Boltzmann-style exploration:
# policy = BoltzmannQPolicy(tau=1.)
# Feel free to give it a try!

# Build the agent from the model, memory and processor defined earlier in the
# file, then compile it with Adam and report mean absolute error.
dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory,
               processor=processor, nb_steps_warmup=50000, gamma=.99, target_model_update=10000,
               train_interval=4, delta_clip=1.)
dqn.compile(Adam(lr=.00025), metrics=['mae'])

if args.mode == 'train':
    # Okay, now it's time to learn something! We capture the interrupt exception so that training
    # can be prematurely aborted. Notice that you can use the built-in Keras callbacks!
    os.chdir(DIR)
    os.chdir('runs')
    if not args.run_id:
        run_id = '{}_{}'.format(args.env_name, time())
    else: 
        run_id = args.run_id
    os.mkdir(run_id)
    weights_filename = '{}/weights.h5f'.format(run_id)
    checkpoint_weights_filename = weights_filename + '_{step}.h5f'

示例#6

0

显示文件

# Remaining layers of the Q-network: three linear Dense layers (512, 1024, 512
# units), each followed by a LeakyReLU, then one linear output per action.
model.add(Dense(512, activation='linear'))
model.add(LeakyReLU(alpha=.001))
model.add(Dense(1024, activation='linear'))
model.add(LeakyReLU(alpha=.001))
model.add(Dense(512, activation='linear'))
model.add(LeakyReLU(alpha=.001))
model.add(Dense(nb_actions))
model.add(Activation('linear'))

# Epsilon-greedy policy with its default settings.
policy = EpsGreedyQPolicy()

# Memory of up to 10000 entries; each state is a window of 50 observations.
memory = SequentialMemory(limit=10000, window_length=50)
# Agent with both the double-DQN and dueling-network options enabled.
dqn = DQNAgent(model=model,
               policy=policy,
               nb_actions=nb_actions,
               memory=memory,
               nb_steps_warmup=400,
               target_model_update=1e-1,
               enable_double_dqn=True,
               enable_dueling_network=True)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Results file: write the CSV header once, before the training loop.
# NOTE(review): the handle is opened without a context manager and no close()
# is visible in this excerpt - confirm it is closed later.
outputFile = open("2105.csv", "w+")
outputFile.write(
    "iteration,trainAccuracy,trainCoverage,trainReward,validationAccuracy,validationCoverage,validationReward\n"
)
iteration = 0

for i in range(0, 100):
    dqn.fit(trainEnv,
            nb_steps=3000,
            visualize=False,

示例#7

0

显示文件

# Final layers of the Q-network: a ReLU layer with one unit per entry of
# env.VoltageMag, then a linear output with one unit per discrete action.
model.add(Dense(len(env.VoltageMag)))
model.add(Activation('relu'))
model.add(Dense(env.action_space.n))
model.add(Activation('linear'))
print(model.summary())
# Exploration: eps annealed from 1 to 1e-2 over 20000 steps; 0.05 is the value
# configured for testing.
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                              attr='eps',
                              value_max=1,
                              value_min=1e-2,
                              value_test=.05,
                              nb_steps=20000)
memory = SequentialMemory(limit=50000, window_length=1)
dqn = DQNAgent(model=model,
               nb_actions=env.action_space.n,
               memory=memory,
               nb_steps_warmup=1000,
               target_model_update=1e-3,
               policy=policy)

dqn.compile(Adam(lr=1e-1), metrics=['mae'])
# Train for 20000 steps and keep the returned history object.
# sys.stdout = open(currentDirectory + "/test4.txt", "w")
history = dqn.fit(env, nb_steps=20000, verbose=2)
# sys.stdout.close()

# Save the underlying model (dqn.model) under currentDirectory.
dqn.model.save(currentDirectory + "/saved_model")

print("Saved model to disk")

# Enjoy trained agent

示例#8

0

显示文件

def training_game():
    """Train a dueling double-DQN agent on the "ForceField" map, then test it.

    Builds the environment, network, memory, processor and annealed
    epsilon-greedy policy, optionally loads existing weights (``LOAD_MODEL``),
    trains for 3000 steps with TensorBoard, weight-saving and file-logging
    callbacks, saves the final weights to ``dqn_agent_weights.h5f`` and runs
    30 test episodes.
    """
    env = Environment(
        map_name="ForceField",
        visualize=True,
        game_steps_per_episode=150,
        agent_interface_format=features.AgentInterfaceFormat(
            feature_dimensions=features.Dimensions(screen=64, minimap=32)))

    input_shape = (_SIZE, _SIZE, 1)
    nb_actions = 12  # Number of actions

    model = neural_network_model(input_shape, nb_actions)
    memory = SequentialMemory(limit=5000, window_length=_WINDOW_LENGTH)

    processor = SC2Proc()

    # Policy: eps annealed from 1 to 0.2 over 100 steps; 0 is the value
    # configured for testing.
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                                  attr="eps",
                                  value_max=1,
                                  value_min=0.2,
                                  value_test=.0,
                                  nb_steps=1e2)

    # Agent
    dqn = DQNAgent(
        model=model,
        nb_actions=nb_actions,
        memory=memory,
        enable_double_dqn=True,
        enable_dueling_network=True,
        nb_steps_warmup=1500,
        target_model_update=1e-2,
        policy=policy,
        batch_size=150,
        processor=processor,
        delta_clip=1)

    dqn.compile(Adam(lr=.001), metrics=["mae", "acc"])

    # TensorBoard callback. The log directory is a Windows-style relative path;
    # the raw string keeps the backslashes literal instead of relying on
    # unrecognised escape sequences (\G, \i).
    callbacks = keras.callbacks.TensorBoard(log_dir=r'.\Graph\issgz',
                                            histogram_freq=0,
                                            write_graph=True,
                                            write_images=False)

    # Weights file used for loading, periodic saving and the final save.
    name = "agent"
    w_file = "dqn_{}_weights.h5f".format(name)

    if LOAD_MODEL:
        dqn.load_weights(w_file)

    class Saver(Callback):
        """Callback that saves the weights to ``w_file`` every 200th episode."""

        def on_episode_end(self, episode, logs=None):
            # `logs` is unused; None replaces the shared mutable default `{}`.
            if episode % 200 == 0:
                # NOTE(review): self.model is presumably attached by the
                # training loop before callbacks fire - confirm.
                self.model.save_weights(w_file, overwrite=True)

    s = Saver()
    logs = FileLogger('DQN_Agent_log.csv', interval=1)

    dqn.fit(env,
            callbacks=[callbacks, s, logs],
            nb_steps=3000,
            action_repetition=2,
            log_interval=1e4,
            verbose=2)

    dqn.save_weights(w_file, overwrite=True)
    dqn.test(env, action_repetition=2, nb_episodes=30, visualize=False)

示例#9

0

显示文件

文件： bot-rl_v0.2.py 项目： tau-lex/market-analysis-system

## Init ML-model for agent
limit = observation_shape[1]
model = cnn_model_2in_with_feedback(
    (limit, 4), (limit, 4), market.feedback_shape, nb_actions, 'softmax')

## Init RL-metod parameters
memory = SequentialMemory(limit=10000, window_length=1)
# TODO implement policies for multiply symbols
policy = BoltzmannQPolicy()

## Init RL agent
agent = DQNAgent(model=model,
                 nb_actions=nb_actions,
                 memory=memory,
                 nb_steps_warmup=1000,
                 target_model_update=1e-2,
                 policy=policy,
                 processor=MultiInputProcessor(3),
                 enable_dueling_network=True,
                 dueling_type='avg')
agent.compile(Adam(lr=1e-3), metrics=['mae'])

try:
    ## Comment here if you want to start learning again
    agent.load_weights('{p}/dqn_{fn}_weights.h5f'.format(p=PATH, fn=ENV_NAME))
    pass
except OSError as e:
    print(e)
except ValueError as e:
    print(e)

示例#10

0

显示文件

文件： EnvSim.py 项目： rbauld/TDI-capstone

# Configure the environment step and derive the per-episode step cap.
# NOTE(review): `/` yields a float on Python 3; confirm that
# nb_max_episode_steps accepts a non-integer value.
env.BitInt.step = step_size
n_steps = end_buffer / step_size
gamma = 0.95

# Do prefitting
# Exploration: eps annealed from 1 to 0.05 over 100000 steps; 0 is the value
# configured for testing.
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                              attr='eps',
                              value_max=1,
                              value_min=0.05,
                              value_test=0,
                              nb_steps=100000)
# The same annealed policy object is passed as both policy and test_policy.
agent = DQNAgent(model=actor,
                 nb_actions=nb_actions,
                 policy=policy,
                 memory=memory,
                 test_policy=policy,
                 nb_steps_warmup=50,
                 gamma=gamma,
                 target_model_update=10000,
                 train_interval=4,
                 delta_clip=1.)
agent.compile(Adam(lr=.0005, clipnorm=1.), metrics=['mae'])
agent.fit(env,
          nb_steps=100000,
          visualize=False,
          verbose=1,
          nb_max_episode_steps=n_steps,
          log_interval=1000)

# Do early stopping fit

# Starts empty; the code that fills it lies beyond this excerpt.
test_scores = []

示例#11

0

显示文件

文件： TradeTester.py 项目： rahulpradeep4218/Model5FX-RL

def main():
    """Train a dueling DQN on EURUSD indefinitely, testing after every fit.

    Clears the ``metaactions`` table, builds train and test environments and
    the agent, optionally resumes from the weights named by
    ``ml_variables['LoadWeights']``, then loops forever: fit on the training
    environment, run one test episode, save a snapshot when the test profit
    beats the best so far, and (when resuming is enabled) overwrite the resume
    weights. A KeyboardInterrupt during the test phase ends the function.
    """
    ml_variables = FXU.getMLVariables()

    # Clear the actions table.
    FXU.execute_query_db("DELETE FROM metaactions")
    env = ForexEnv(type="train", inputSymbol="EURUSD", show_trade=True)
    env_test = ForexEnv(type="test", inputSymbol="EURUSD", show_trade=True)

    n_actions = env.action_space.n
    print("Number of actions : ", n_actions)
    model = create_model(shape=env.observation_space.shape,
                         n_actions=n_actions)
    print(model.summary())

    # Configure the agent.
    memory = SequentialMemory(limit=100000, window_length=env.window_size)
    policy = EpsGreedyQPolicy()

    # Enable the dueling network; dueling_type is one of {'avg', 'max', 'naive'}.
    dqn = DQNAgent(model=model,
                   nb_actions=n_actions,
                   memory=memory,
                   nb_steps_warmup=100,
                   enable_dueling_network=True,
                   dueling_type='naive',
                   target_model_update=1e-2,
                   policy=policy)
    dqn.compile(Adam(lr=1e-4), metrics=['mse'])

    # Resume file, or None when resuming is disabled. Built with the platform
    # separator so it lands in the same "model" directory as the best-reward
    # snapshots below, instead of a hard-coded Windows backslash.
    # NOTE(review): assumes `ospath` is os.path (it already provides isfile).
    resume_path = None
    if ml_variables['LoadWeights'] != 'no':
        resume_path = ospath.join('model', ml_variables['LoadWeights'] + ".h5f")
        if ospath.isfile(resume_path):
            print("Weights exist, so Going to load the weights")
            dqn.load_weights(resume_path)
    max_reward = -1000000

    while True:

        # Train:
        dqn.fit(env,
                nb_steps=env.split_point,
                nb_max_episode_steps=60000,
                visualize=False,
                verbose=2)

        try:
            info = dqn.test(env_test, nb_episodes=1, visualize=False)
            # Reward is the change in the test environment's balance.
            reward = env_test.balance - env_test.starting_balance
            print("reward : ", reward)
            # Keep a snapshot only for a new best, non-zero (after truncation) reward.
            if reward > int(max_reward) and int(reward) != 0:
                max_reward = int(reward)
                np.array([info.history
                          ]).dump('./info/duel_dqn_reward_{0}_{1}.info'.format(
                              env_test.symbol, max_reward))
                dqn.save_weights('./model/duel_dqn_reward_{0}_{1}.h5f'.format(
                    env_test.symbol, max_reward))
        except KeyboardInterrupt:
            return

        # Save the weights after each fit so training can be resumed later.
        if resume_path is not None:
            dqn.save_weights(filepath=resume_path, overwrite=True)

示例#12

0

显示文件

文件： test_slither.py 项目： AlmightyYakob/deep_slither

from rl.memory import SequentialMemory
from rl.core import Processor
from rl.callbacks import ModelIntervalCheckpoint

from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.losses import categorical_crossentropy
from keras.optimizers import Adam

# Create the slither.io environment with a visible (non-headless) 500x500 window.
env = gym.make("slitherio-v0", headless=False, width=500, height=500)
# model = conv_model(env)
model = full_combined_conv_lstm_model(env)
# model = load_model("current_model.h5")
# Print the weights before and after loading so the change can be inspected.
print(model.weights)
model.load_weights("new_current_model.h5")
print(model.weights)
# Fixed epsilon-greedy policy (eps=0.2); single-observation memory.
policy = EpsGreedyQPolicy(eps=0.2)
memory = SequentialMemory(limit=DQN_MEMORY_SIZE, window_length=1)
dqn = DQNAgent(
    model=model,
    nb_actions=env.action_space.n,
    memory=memory,
    # nb_steps_warmup=DQN_MEMORY_SIZE,
    target_model_update=1e-2,
    policy=policy,
    processor=LSTMProcessor(),
)
dqn.compile(Adam(lr=1e-3), metrics=["mae"])

# Evaluation only: run five rendered test episodes with the loaded weights.
dqn.test(env, nb_episodes=5, visualize=True)

示例#13

0

显示文件

def main(args):
    """Train a DQN agent on the AutoturnSF environment.

    Reads its settings from ``args``: ``data`` (output directory, created when
    missing), ``reward``, ``activation``, ``policy``, ``neurons``,
    ``interval``, ``memlength`` (used in the run alias), ``visualize``,
    ``port``, ``frameskip`` and ``weights`` (optional weights file to load).

    Raises:
        Exception: if ``args.data`` exists but is not a directory.
    """
    base = os.path.expanduser(args.data)
    if not os.path.exists(base):
        os.makedirs(base)
    if not os.path.isdir(base):
        raise Exception("Specified data directory is not a directory.")

    # The alias names both the environment and the output files of this run.
    alias = "dqn-%s-%s-%s-%d-%d-%d" % (args.reward, args.activation,
                                       args.policy, args.neurons,
                                       args.interval, args.memlength)

    logfile = os.path.join(base, "%s_log.tsv" % alias)
    weightsfile = os.path.join(base, "%s_weights.h5f" % alias)

    env = AutoturnSF_Env(alias,
                         4,
                         visualize=args.visualize,
                         reward=args.reward,
                         port=args.port)

    nb_actions = env.action_space.n

    # Three hidden Dense layers of args.neurons units, then a linear output
    # with one unit per action.
    model = Sequential()
    model.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
    model.add(Dense(args.neurons))
    model.add(get_activation(args.activation))
    model.add(Dense(args.neurons))
    model.add(get_activation(args.activation))
    model.add(Dense(args.neurons))
    model.add(get_activation(args.activation))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))
    print(model.summary())

    # Floor division keeps every derived step count an integer on Python 3 as
    # well; true division would hand floats to the memory limit and nb_steps.
    STEPS_PER_EPISODE = env.get_max_frames() // args.frameskip

    memory = SequentialMemory(limit=STEPS_PER_EPISODE * 500)
    # Exploration: eps annealed from 0.05 to 0.001 over 1000 episodes' worth of
    # steps; 0.001 is the value configured for testing.
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                                  attr='eps',
                                  value_max=.05,
                                  value_min=.001,
                                  value_test=.001,
                                  nb_steps=STEPS_PER_EPISODE * 1000)
    # NOTE(review): window_length is passed to the agent rather than to the
    # memory here; confirm this matches the installed keras-rl version.
    agent = DQNAgent(model=model,
                     nb_actions=nb_actions,
                     policy=policy,
                     window_length=1,
                     memory=memory,
                     train_interval=args.interval,
                     nb_steps_warmup=STEPS_PER_EPISODE * 1,
                     gamma=.99,
                     target_model_update=STEPS_PER_EPISODE * 1)
    agent.compile(Adam(lr=.0001), metrics=['mae'])
    if args.weights is not None and os.path.isfile(args.weights):
        print("Loading weights from file: %s" % args.weights)
        agent.load_weights(args.weights)
    log = TrainEpisodeFileLogger(env, agent, logfile, weightsfile)
    agent.fit(env,
              nb_steps=STEPS_PER_EPISODE * 10000,
              visualize=True,
              verbose=2,
              action_repetition=args.frameskip,
              callbacks=[log])

示例#14

0

显示文件

"""
import gym
import h5py
import keras as K
from keras import layers
from keras.optimizers import Adam
import numpy as np
from PIL import Image
from rl.core import Processor
from rl.agents.dqn import DQNAgent
from rl.memory import SequentialMemory
from rl.policy import GreedyQPolicy
create_q_model = __import__('train').create_q_model
AtariProcessor = __import__('train').AtariProcessor

if __name__ == '__main__':
    # Create the Breakout environment and reset it once.
    # NOTE(review): `state` is not used again in the visible code.
    env = gym.make('BreakoutNoFrameskip-v4')
    state = env.reset()
    actions = env.action_space.n
    # Load the saved Keras model from 'policy.h5'.
    model = K.models.load_model('policy.h5')
    # Each state is a window of 4 observations.
    memory = SequentialMemory(limit=1000000, window_length=4)
    # Greedy policy (GreedyQPolicy) for this evaluation run.
    policy = GreedyQPolicy()
    process = AtariProcessor()
    dqn = DQNAgent(model=model,
                   nb_actions=actions,
                   memory=memory,
                   policy=policy,
                   processor=process)
    dqn.compile(optimizer=Adam(lr=.00025, clipnorm=1.0), metrics=['mae'])
    # Run ten rendered test episodes.
    dqn.test(env, nb_episodes=10, visualize=True)

示例#15

0

显示文件

processor = AtariProcessor()

# Exploration: eps annealed from 1.0 to 0.1 over 1.25M steps; 0.05 is the
# value configured for testing.
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                              attr='eps',
                              value_max=1.,
                              value_min=.1,
                              value_test=.05,
                              nb_steps=1250000)

# Agent with both the double-DQN and dueling-network options enabled; model and
# memory are defined earlier in the file.
dqn = DQNAgent(model=model,
               nb_actions=nb_actions,
               policy=policy,
               memory=memory,
               processor=processor,
               enable_double_dqn=True,
               enable_dueling_network=True,
               nb_steps_warmup=50000,
               gamma=.99,
               target_model_update=10000,
               train_interval=4,
               delta_clip=1.)

# Prioritized memories typically use lower learning rates.
dqn.compile(Adam(lr=.00025 / 4), metrics=['mae'])

folder_path = '../model_saves/PDD/'

if args.mode == 'train':
    # Final weights file and the per-step checkpoint filename template.
    weights_filename = folder_path + 'pdd_dqn_{}_weights.h5f'.format(
        args.env_name)
    checkpoint_weights_filename = folder_path + 'pdd_dqn_' + args.env_name + '_weights_{step}.h5f'

示例#16

0

显示文件

    np.random.seed(42)
    env.seed(42)
    nb_actions = env.action_space.n
    window = 4

    model = create_q_model(nb_actions, window)
    memory = SequentialMemory(limit=1000000, window_length=window)
    processor = Atari2DProcessor()

    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                                  attr='eps',
                                  value_max=1.,
                                  value_min=.1,
                                  value_test=.05,
                                  nb_steps=1000000)

    dqn = DQNAgent(model=model,
                   nb_actions=nb_actions,
                   nb_steps_warmup=10,
                   processor=processor,
                   memory=memory,
                   policy=policy)

    dqn.compile(K.optimizers.Adam(lr=.00025), metrics=['mae'])

    # load weights.
    dqn.load_weights('policy.h5')

    # evaluate algorithm for 10 episodes.
    dqn.test(env, nb_episodes=10, visualize=True)

示例#17

0

显示文件

文件： dqn_atari.py 项目： christopher-hsu/ray

                                  value_test=.05,
                                  nb_steps=1000000)
    test_policy = EpsGreedyQPolicy(eps=0.05)

    if bool(args.double_dqn):
        print("DOUBLE DQN")
    if bool(args.dueling):
        print("DUELING NETWORK")

    dqn = DQNAgent(model=model,
                   nb_actions=nb_actions,
                   policy=policy,
                   test_policy=test_policy,
                   memory=memory,
                   processor=processor,
                   nb_steps_warmup=args.nb_steps_warmup,
                   gamma=args.gamma,
                   target_model_update=args.target_model_update,
                   enable_double_dqn=bool(args.double_dqn),
                   enable_dueling_network=bool(args.dueling),
                   train_interval=4,
                   delta_clip=1.)

    dqn.compile(Adam(lr=args.learning_rate), metrics=['mae'])

    if args.mode == 'train':

        weights_filename = os.path.join(
            args.log_dir, 'dqn_{}_weights.h5f'.format(args.env_name))
        checkpoint_weights_filename = os.path.join(
            args.log_dir, 'dqn_' + args.env_name + '_weights_{step}.h5f')

示例#18

0

显示文件

processor = AtariProcessor()

#This is the important difference. Rather than using an E Greedy approach, where
#we keep the network consistent but randomize the way we interpret its predictions,
#in NoisyNet we are adding noise to the network and simply choosing the best value.
policy = GreedyQPolicy()

#N-step loss with n of 3
dqn = DQNAgent(model=model,
               nb_actions=nb_actions,
               policy=policy,
               memory=memory,
               processor=processor,
               enable_double_dqn=True,
               enable_dueling_network=True,
               nb_steps_warmup=10000,
               gamma=.99,
               target_model_update=10000,
               train_interval=4,
               delta_clip=1.,
               n_step=3,
               custom_model_objects={"NoisyNetDense": NoisyNetDense})

#Prioritized Memories typically use lower learning rates
dqn.compile(Adam(lr=.00025 / 4), metrics=['mae'])

folder_path = 'model_saves/'

if args.mode == 'train':
    checkpoint_weights_filename = folder_path + 'advanced_dqn_' + args.env_name + '_weights_{step}.h5f'
    callbacks = [

示例#19

0

显示文件

文件： TestDQN.py 项目： banangrg/rl-sokoban

    print("[INFO] Building model...")
    print("Environment size is: rows: " + str(env.GAME_SIZE_ROWS) + " cols: " + str(env.GAME_SIZE_COLS))
    print("Memory window length is: " + str(WINDOW_LENGTH))
    input_rows = SokobanEnv.GAME_SIZE_ROWS
    input_cols = SokobanEnv.GAME_SIZE_COLS
    model = make_custom_model()
    model.summary()

    print("[INFO] Building DQNAgent...")
    basic_memory = SequentialMemory(limit=MEMORY_LIMIT, window_length=WINDOW_LENGTH)

    #action_choice_policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1, value_test=.05, nb_steps=NUMBER_OF_STEPS_FOR_EXPLORATION)
    action_choice_policy = BoltzmannQPolicy(tau=1., clip=(-500., 500.))

    dqn = DQNAgent(model=model, nb_actions=NUMBER_OF_POSSIBLE_ACTIONS, policy=action_choice_policy, memory=basic_memory,
                   processor=bugfix_processor, batch_size=MEMORY_REPLAY_BATCH_SIZE,
                   enable_double_dqn=True, enable_dueling_network=True, nb_steps_warmup=NUMBER_OF_STEPS_FOR_WARMUP,
                   gamma=GAMMA, target_model_update=10000, train_interval=4, delta_clip=1.)

    opt = Adam(lr=.00025)       # default
    #opt = Nadam(lr=.0005)
    #opt = SGD(lr=0.00025, momentum=0.9, nesterov=True)
    dqn.compile(optimizer=opt, metrics=['mae'])

    if DO_LOAD_WIEGHTS:
        print("[INFO] Loading weights from file: " + WEIGHTS_FILE_NAME)
        load_agent_weights(dqn, WEIGHTS_FILE_NAME)

    if DO_LOAD_GAMES_FROM_FILES:
        print("[INFO] Loading recored games. Thisa may take a while...")
        game_record_loader = SokobanManualGameMemoryLoader(agent_memory=basic_memory, memory_limit=LOADED_GAMES_MEMORY_LIMIT)
        game_record_loader.load_all_games()

示例#20

0

显示文件

def main():
    """
    Parse command line arguments, build the simulation model and train a deep
    Q-network on the gym environment for every requested parameter combination.

    For each combination of fleet size, surge multiplier, share of drivers who
    know the fare and share of pro drivers, a fresh model, environment and DQN
    agent are created, the agent is trained for ``nb_steps`` steps and its
    weights are written to ``new_dqn_weights_<nb_steps>.h5f``. The file name
    does not depend on the combination, so each run overwrites the previous
    one.
    """
    parser = argparse.ArgumentParser(
        description="Simulation of drivers' behavior")
    parser.add_argument(
        '-f',
        '--fleet',
        help=
        'Fleet sizes to simulate, formatted as comma-separated list (i.e. "-f 250,275,300")'
    )
    parser.add_argument(
        '-m',
        '--multiplier',
        help=
        'Surge multiplier, formatted as comma-separated list (i.e. "-m 1,1.5,2")'
    )
    parser.add_argument('-b', '--bonus', type=int, help='Bonus')
    parser.add_argument('-d', '--demand', help='Percent false demand ')
    parser.add_argument(
        '-k',
        '--know',
        help=
        'Percent knowing fare, formatted as comma-separated list (i.e. "-k 1,1.5,2") '
    )
    parser.add_argument(
        '-p',
        '--pro',
        help=
        'Percent pro drivers, formatted as comma-separated list (i.e. "-p 1,1.5,2") '
    )
    parser.add_argument(
        '-av',
        '--av',
        help=
        'Percent AV drivers, formatted as comma-separated list (i.e. "-av 1,1.5,2") '
    )
    # type=int: the value is handed to dqn.fit() as a step count, so it must
    # not stay a string.
    parser.add_argument('-nb',
                        '--nb',
                        type=int,
                        help='number of steps to train Rl ')

    args = parser.parse_args()

    # Comma-separated options become lists; anything omitted falls back to the
    # corresponding module-level default.
    if args.fleet:
        fleet_sizes = [int(x) for x in args.fleet.split(',')]
    else:
        # NOTE(review): used as-is rather than wrapped in a list like the
        # other defaults, so FLEET_SIZE is presumably already iterable --
        # confirm.
        fleet_sizes = FLEET_SIZE

    if args.multiplier:
        surges = [float(x) for x in args.multiplier.split(',')]
    else:
        surges = [SURGE_MULTIPLIER]

    if args.know:
        perc_know = [float(x) for x in args.know.split(',')]
    else:
        perc_know = [PERCE_KNOW]

    # `is not None` so that an explicit "-b 0" is honoured instead of being
    # replaced by the default.
    if args.bonus is not None:
        bonus = args.bonus
    else:
        bonus = BONUS

    if args.pro:
        pro_share = [float(x) for x in args.pro.split(',')]
    else:
        pro_share = [PRO_SHARE]

    if args.demand:
        percent_false_demand = float(args.demand)
    else:
        percent_false_demand = PERCENT_FALSE_DEMAND

    # NOTE(review): av_share is parsed (which validates the input) but is not
    # used anywhere below.
    if args.av:
        av_share = [float(x) for x in args.av.split(',')]
    else:
        av_share = [1]

    if args.nb is not None:
        nb_steps = args.nb
    else:
        nb_steps = 300

    for fleet_size in fleet_sizes:
        for surge in surges:
            for perc_k in perc_know:
                for pro_s in pro_share:
                    m = Model(ZONE_IDS,
                              DEMAND_SOURCE,
                              WARMUP_TIME_HOUR,
                              ANALYSIS_TIME_HOUR,
                              fleet_size=fleet_size,
                              pro_share=pro_s,
                              surge_multiplier=surge,
                              bonus=bonus,
                              percent_false_demand=percent_false_demand,
                              percentage_know_fare=perc_k)

                    # make one veh to be AV
                    veh = m.vehilcs[-1]
                    veh.is_AV = True

                    env = RebalancingEnv(m, penalty=-0)

                    nb_actions = env.action_space.n
                    # Leading 1 matches the memory's window_length of 1.
                    input_shape = (1, ) + env.state.shape
                    input_dim = env.input_dim

                    # Small fully connected Q-network: one hidden layer and a
                    # linear output per action.
                    model = Sequential()
                    model.add(Flatten(input_shape=input_shape))
                    model.add(Dense(256, activation='relu'))
                    model.add(Dense(nb_actions, activation='linear'))

                    memory = SequentialMemory(limit=2000, window_length=1)
                    policy = EpsGreedyQPolicy()
                    dqn = DQNAgent(model=model,
                                   nb_actions=nb_actions,
                                   memory=memory,
                                   nb_steps_warmup=100,
                                   target_model_update=1e-2,
                                   policy=policy,
                                   gamma=.99)
                    dqn.compile(Adam(lr=0.001, epsilon=0.05, decay=0.0),
                                metrics=['mae'])

                    dqn.fit(env,
                            nb_steps=nb_steps,
                            action_repetition=1,
                            visualize=False,
                            verbose=2)
                    dqn.save_weights('new_dqn_weights_%s.h5f' % (nb_steps),
                                     overwrite=True)

示例#21

0

显示文件

文件： rl_3h_t1-2k.py 项目： wsepesi/poke-env

    policy = LinearAnnealedPolicy(
        EpsGreedyQPolicy(),
        attr="eps",
        value_max=1.0,
        value_min=0.05,
        value_test=0,
        nb_steps=10000,
    )

    # Defining our DQN
    dqn = DQNAgent(
        model=model,
        nb_actions=18,
        policy=policy,
        memory=memory,
        nb_steps_warmup=1000,
        gamma=0.5,
        target_model_update=1,
        delta_clip=0.01,
        enable_double_dqn=True,
    )

    dqn.compile(Adam(lr=0.00025), metrics=["mae"])

    # Training
    env_player.play_against(
        env_algorithm=dqn_training,
        opponent=third_opponent,
        env_algorithm_kwargs={
            "dqn": dqn,
            "nb_steps": NB_TRAINING_STEPS

示例#22

0

显示文件

文件： TradeTester.py 项目： rahulpradeep4218/ModelFX-RL-LSTM

def main():
    """
    Train a dueling DQN agent on the EURUSD training environment in an endless
    loop, evaluating it on the test environment after every training round.

    Each round trains the agent, saves the training weights, runs one test
    episode and records the resulting profit in the ``reinforcetests`` table.
    When the profit exceeds 500 and the minimum portfolio value stayed above
    the threshold, the weights are additionally saved under a profit-tagged
    file name. The function returns when a KeyboardInterrupt is raised inside
    the evaluation step.
    """
    ml_variables = FXU.getMLVariables()
    sqlEngine = FXU.getSQLEngine()
    reinforce_test_tablename = "reinforcetests"
    # NOTE(review): describes the metaactions table but is not referenced
    # anywhere in this function.
    actions_table_details = {
        'name': 'metaactions',
        'col': ['Action', 'Time'],
        'type': ['VARCHAR(20)', 'datetime'],
        'null': [False, False]
    }

    # Clear the actions table before starting.
    FXU.execute_query_db("DELETE FROM metaactions", sqlEngine)
    env = ForexEnv(type="train", inputSymbol="EURUSD", show_trade=True)
    env_test = ForexEnv(type="test", inputSymbol="EURUSD", show_trade=True)

    n_actions = env.action_space.n
    print("Number of actions : ", n_actions)
    model = create_model(shape=env.observation_space.shape,
                         n_actions=n_actions)
    print(model.summary())

    # Configure the agent.
    memory = SequentialMemory(limit=100000, window_length=env.window_size)
    policy = EpsGreedyQPolicy()

    # enable the dueling network
    # you can specify the dueling_type to one of {'avg','max','naive'}
    dqn = DQNAgent(model=model,
                   nb_actions=n_actions,
                   memory=memory,
                   nb_steps_warmup=100,
                   enable_dueling_network=True,
                   dueling_type='avg',
                   target_model_update=1e-2,
                   policy=policy)
    dqn.compile(Adam(lr=1e-4), metrics=['mae'])

    minPortfolioThreshold = 0.4

    training_episodes_n = int(ml_variables['TrainingEpisodesNumber'])

    # Load weights, if available, to resume previous learning. The path uses
    # the same './model/...' form as the save below; the former backslash
    # separator only resolved on Windows, so the resume silently never
    # happened elsewhere.
    if ml_variables['LoadWeights'] != 'no':
        model_file_name = "./model/dqnTrainingWeights_{0}.h5f".format(
            env_test.symbol)

        if ospath.isfile(model_file_name):
            print(
                "Weights for the previous session exist, so Going to load the weights"
            )
            dqn.load_weights(model_file_name)

    while True:
        # (A disabled variant that reloaded the most profitable saved weights
        # from the results table used to sit here as dead code.)

        # Train :
        dqn.fit(env,
                nb_steps=(env.split_point * training_episodes_n),
                nb_max_episode_steps=60000,
                visualize=False,
                verbose=2)
        dqn.save_weights('./model/dqnTrainingWeights_{0}.h5f'.format(
            env.symbol),
                         overwrite=True)
        try:
            info = dqn.test(env_test, nb_episodes=1, visualize=False)
            # Profit is read from the environment's own bookkeeping, not from
            # the history object returned by dqn.test().
            reward = env_test.portfolio - env_test.starting_balance
            print("Total Profit : ", reward)
            now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

            # Keep a separate copy of the weights for sufficiently profitable
            # runs that never dropped below the portfolio threshold.
            if reward > 500 and env_test.minPortfolio > (
                    minPortfolioThreshold * env_test.starting_balance):
                dqn.save_weights('./model/duel_dqn_reward_{0}_{1}.h5f'.format(
                    env_test.symbol, int(reward)),
                                 overwrite=True)

            # NOTE(review): the statement is assembled with str.format. The
            # values come from the environment, not from user input, but a
            # parameterized query would be safer if FXU supports one.
            FXU.execute_query_db(
                "INSERT INTO {0}(Symbol,StartingBalance,TotalProfit,Time,MinPortfolio) VALUES('{1}','{2}','{3}','{4}','{5}')"
                .format(reinforce_test_tablename, env_test.symbol,
                        env_test.starting_balance, reward, now,
                        env_test.minPortfolio), sqlEngine)
        except KeyboardInterrupt:
            return

示例#23

0

显示文件

def run_agent(agent):
    """
    Build a convolutional DQN for the restricted-view Tron environment and
    train it.

    A TensorFlow session with on-demand GPU memory growth is installed first.
    Then the environment, Q-network, annealed Boltzmann policy, replay memory
    and DQN agent are created. Three nested helpers (train / opponent / test)
    share that agent; only ``train()`` is invoked at the end.

    :param agent: passed straight to ``RestrictedViewTronEnv``.
    """
    print("started new process")

    import tensorflow as tf
    from keras.backend.tensorflow_backend import set_session

    # Let TensorFlow grow its GPU allocation as needed.
    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True
    set_session(tf.Session(config=session_config))

    window_length = 1
    nb_actions = 3
    frame_shape = (41, 41)

    env = RestrictedViewTronEnv(agent, 20)

    # Two 3x3 conv layers, one dense hidden layer and a linear Q-value head.
    # The Permute moves the window axis to the last position.
    model = Sequential()
    network_layers = [
        Permute((2, 3, 1), input_shape=(window_length, ) + frame_shape),
        Conv2D(16, (3, 3), padding="same"),
        Activation("relu"),
        Conv2D(32, (3, 3), padding="same"),
        Activation("relu"),
        Flatten(),
        Dense(256),
        Activation("relu"),
        Dense(nb_actions, activation="linear"),
    ]
    for layer in network_layers:
        model.add(layer)

    np.random.seed(1111)

    # Boltzmann exploration with `tau` annealed from 2.0 to 0.1.
    exploration = LinearAnnealedPolicy(BoltzmannQPolicy(),
                                       attr='tau',
                                       value_max=2.,
                                       value_min=.1,
                                       value_test=.1,
                                       nb_steps=100000)

    tron_processor = TronProcessor()
    replay = SequentialMemory(limit=1000000, window_length=window_length)

    dqn = DQNAgent(model=model,
                   nb_actions=nb_actions,
                   policy=exploration,
                   memory=replay,
                   processor=tron_processor,
                   nb_steps_warmup=10000,
                   gamma=.9,
                   target_model_update=10000,
                   train_interval=4,
                   delta_clip=1.)
    dqn.compile(Adam(lr=.00025), metrics=["mae"])

    weights_filename = 'tmp/dqn_test_weights.h5f'
    checkpoint_weights_filename = 'tmp/dqn_test_weights_{step}.h5f'
    log_filename = 'tmp/dqn_test_log.json'
    callbacks = [
        ModelIntervalCheckpoint(checkpoint_weights_filename, interval=25000),
        FileLogger(log_filename, interval=10000),
    ]

    def train(transfer=False):
        # Optionally continue from the saved weights, then fit, save and run
        # a short visual evaluation.
        print(dqn.get_config())

        if transfer:
            dqn.load_weights(weights_filename)

        dqn.fit(env,
                callbacks=callbacks,
                nb_steps=175000,
                log_interval=10000)
        dqn.save_weights(weights_filename, overwrite=True)
        dqn.test(env, nb_episodes=20, visualize=True)

    def opponent():
        # Run the saved agent for 200000 test episodes without rendering.
        dqn.load_weights(weights_filename)
        dqn.test(env, nb_episodes=200000, visualize=False)

    def test():
        # Evaluate the saved agent for 200 episodes without rendering.
        dqn.load_weights(weights_filename)
        dqn.test(env, nb_episodes=200, visualize=False)

    # Swap in `opponent()` to start an opponent agent instead.
    # `train(True)` resumes from the saved weights.
    train()

示例#24

0

显示文件

文件： dqn_agent.py 项目： MCCCSunny/trading-rl

def main(train_data, test_data, FOLDER):
    """
    Set up the environment, Q-network and keras-rl agent, then train,
    report statistics and test.

    :param train_data: training data handed to the environment.
    :param test_data: test data handed to the environment.
    :param FOLDER: first positional argument of the environment constructor.
    """
    # Save the parameters used for this model to a file.
    write_model_info()

    # Both environment types share the same leading positional arguments.
    shared_env_args = (FOLDER, STEPS, train_data, test_data, TEST_POINTS)
    if METHOD == trailing:
        env = TrailEnv(*shared_env_args,
                       limit_data=DATA_SIZE,
                       one_hot=ONE_HOT,
                       cost=COST,
                       margin=MARGIN,
                       turn=TURN,
                       ce=CE,
                       dp=DP,
                       normalize_in=NORMALIZE_IN,
                       reset_margin=RESET_FROM_MARGIN)
    else:
        env = DengEnv(*shared_env_args,
                      window_in=WINDOW_LENGTH,
                      limit_data=DATA_SIZE,
                      one_hot=ONE_HOT,
                      cost=COST_D)

    # Build the agent's model.
    model = set_model(env)

    # Replay memory for experience; `limit` is the maximum number of entries.
    memory = SequentialMemory(limit=MEM_SIZE, window_length=WINDOW_LENGTH)

    # Exploration policy. An eps-greedy Q policy has to balance exploration
    # against exploitation; the annealing wrapper controls that parameter,
    # starting at value_max and decaying to value_min over nb_steps steps.
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                                  attr='eps',
                                  value_max=1.0,
                                  value_min=0.1,
                                  value_test=0.05,
                                  nb_steps=EXPLORE_STEPS)

    # keras-rl agent. nb_actions is the number of actions (model outputs);
    # nb_steps_warmup is how many steps are collected before training starts.
    dqn = DQNAgent(model=model,
                   gamma=GAMMA,
                   nb_actions=env.action_space.n,
                   memory=memory,
                   batch_size=BATCH_SIZE,
                   nb_steps_warmup=1000,
                   target_model_update=TAR_MOD_UP,
                   policy=policy,
                   delta_clip=DELTA_CLIP)
    dqn.compile(Adam(lr=LR, decay=LR_DEC), metrics=['mse'])

    if START_FROM_TRAINED:
        dqn.load_weights(TRAINED_WEIGHTS)

    # Pick the training routine; the banner shows which path was taken.
    if not VALIDATE:
        print('=====================2=======================')
        train(env, dqn)
    else:
        print('=====================1=======================')
        train_w_validation(env, dqn)

    # Tally long and short positions, then run the test phase.
    fin_stats(env, STEPS)
    test(env, dqn)

示例#25

0

显示文件

model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions, activation='linear'))
print(model.summary())

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=50000, window_length=1)
policy = BoltzmannQPolicy()

# enable the dueling network
# you can specify the dueling_type to one of {'avg','max','naive'}
dqn = DQNAgent(model=model,
               nb_actions=nb_actions,
               memory=memory,
               nb_steps_warmup=10,
               enable_dueling_network=True,
               dueling_type='avg',
               target_model_update=1e-2,
               policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

if args.mode == 'train':
    # Okay, now it's time to learn something! We visualize the training here for show, but this
    # slows down training quite a lot. You can always safely abort the training prematurely using
    # Ctrl + C.

    # callbacks = []
    # if model_checkpoints:
    #     callbacks += [
    #         ModelIntervalCheckpoint(
    #             './checkpoints/checkpoint_weights.h5f',

示例#26

0

显示文件

    def __init__(self,
                 number_of_training_steps=1e5,
                 gamma=0.999,
                 load_weights=False,
                 visualize=False,
                 dueling_network=True,
                 double_dqn=True,
                 nn_type='mlp',
                 **kwargs):
        """
        Create the gym environment, the Q-network and the keras-rl DQN agent.

        :param number_of_training_steps: number of steps to train the agent for
        :param gamma: float, discount factor handed to the DQN agent
        :param load_weights: boolean, stored on the instance as-is
        :param visualize: boolean, stored on the instance as-is
        :param dueling_network: boolean, use the dueling network architecture
        :param double_dqn: boolean, use double DQN
        :param nn_type: str, network name passed to ``create_model``
        :param kwargs: forwarded unchanged to ``gym.make``. Presumably this
            carries the environment id plus environment options (step_size,
            window_size, max_position, fitting_file, testing_file, seed,
            action_repeats, format_3d, z_score, ...) -- confirm against the
            environment's constructor.
        """
        # Plain agent settings.
        self.number_of_training_steps = number_of_training_steps
        self.neural_network_type = nn_type
        self.visualize = visualize
        self.load_weights = load_weights

        # Environment.
        self.env = gym.make(**kwargs)
        self.env_name = self.env.env.id

        # NOTE: keras-rl does its own frame stacking, so a window of 1 is used.
        self.memory_frame_stack = 1
        self.model = self.create_model(name=self.neural_network_type)
        self.memory = SequentialMemory(limit=10000,
                                       window_length=self.memory_frame_stack)
        self.train = self.env.env.training
        self.cwd = os.path.dirname(os.path.realpath(__file__))

        # Agent.
        agent_settings = dict(model=self.model,
                              nb_actions=self.env.action_space.n,
                              memory=self.memory,
                              processor=None,
                              nb_steps_warmup=500,
                              enable_dueling_network=dueling_network,
                              dueling_type='avg',
                              enable_double_dqn=double_dqn,
                              gamma=gamma,
                              target_model_update=1000,
                              delta_clip=1.0)
        self.agent = DQNAgent(**agent_settings)
        self.agent.compile(Adam(lr=3e-4), metrics=['mae'])

示例#27

0

显示文件

model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Activation('relu'))
model.add(Dense(5))
model.add(Activation('relu'))
model.add(Dense(5))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=50000, window_length=1)
policy = BoltzmannQPolicy(tau=0.01)
#policy = EpsGreedyQPolicy(eps=0.2)
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10,target_model_update=1e-2, policy=policy, gamma=0.5)
#dqn = SarsaAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, policy=policy)
#dqn = DDPGAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10,target_model_update=1e-2, policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Okay, now it's time to learn something! If we visualize the training here for show, this
# slows down training quite a lot. We can also always safely abort the training prematurely using
# Ctrl + C.
env.testing = False
foo = dqn.fit(env, nb_steps=200, visualize=False, verbose = 0 )

d = datetime.utcnow()
unixtime = calendar.timegm(d.utctimetuple())
###save env.unwrapped.totalStates and env.unwrapped.actions as: [state,action] pairs
with open('sartFeedback_' + str(unixtime) + '.csv', 'wb') as f:
    writer = csv.writer(f)

示例#28

0

显示文件

model.add(Dense(128, activation='relu'))
model.add(Dense(env.action_space.n, activation='linear'))

#set the policy for action selection
policy = EpsGreedyQPolicy()
#policy = MaxBoltzmannQPolicy()
memory = SequentialMemory(limit=50000, window_length=1)

#prints number of actions
print("--------------------------------------")
print(env.action_space.n)
print("---------------------------------------")

#deep q learning agent
player1 = DQNAgent(model=model,
                   nb_actions=env.action_space.n,
                   enable_dueling_network=True,
                   enable_double_dqn=True,
                   memory=memory,
                   target_model_update=1e-2,
                   nb_steps_warmup=2000,
                   policy=policy)  #target_model_update=1e-2,

#fits the model and saves it
player1.compile(Adam(lr=1e-3), metrics=['mae'])
player1.fit(env, action_repetition=3, nb_steps=10000, visualize=False)
print(model.summary())
player1.save_weights(r'../models/mk_18.h5f', overwrite=True)

#player1.test(env, nb_episodes=10, visualize=False)

示例#29

0

显示文件

    def __init__(self, model, explorations, trainSize, validationSize, testSize,
                 outputFile, begin, end, nbActions, isOnlyShort,
                 ensembleFolderName, operationCost=0):
        """
        Build the DQN agent and load the hourly dataset.

        Creates an eps-greedy double/dueling DQN agent around ``model``,
        writes its initial weights to ``q.weights``, stores the walk
        configuration and reads the hourly CSV for market ``MK``.
        """
        self.isOnlyShort = isOnlyShort
        self.ensembleFolderName = ensembleFolderName

        # Policy, explorations, action count and model exactly as received.
        self.policy = EpsGreedyQPolicy()
        self.explorations = explorations
        self.nbActions = nbActions
        self.model = model

        # Replay memory.
        self.memory = SequentialMemory(limit=10000, window_length=1)

        # Agent, compiled with Adam and the mean-absolute-error metric.
        self.agent = DQNAgent(model=self.model,
                              policy=self.policy,
                              nb_actions=self.nbActions,
                              memory=self.memory,
                              nb_steps_warmup=200,
                              target_model_update=1e-1,
                              enable_double_dqn=True,
                              enable_dueling_network=True)
        self.agent.compile(Adam(lr=1e-3), metrics=['mae'])

        # Store the initial (untrained) weights in q.weights.
        self.agent.save_weights("q.weights", overwrite=True)

        # Walk configuration: the first walk starts at `begin`, and the last
        # one is bounded by `end`. One walk spans train + validation + test.
        self.currentStartingPoint = begin
        self.trainSize = trainSize
        self.validationSize = validationSize
        self.testSize = testSize
        self.walkSize = trainSize + validationSize + testSize
        self.endingPoint = end

        # Hourly dataset. `dates` keeps the raw table, while `sp` ends up as
        # only the combined Date+Time index that is later used to delimit
        # train/validation/test in each walk.
        hourly_csv = './datasets/'+MK+'Hour.csv'
        self.dates = pd.read_csv(hourly_csv)
        hourly = pd.read_csv(hourly_csv)
        hourly['Datetime'] = pd.to_datetime(hourly['Date'] + ' ' + hourly['Time'])
        self.sp = hourly.set_index('Datetime').drop(['Time','Date'], axis=1).index

        # Cost applied to long and short operations (defaults to 0).
        self.operationCost = operationCost

        # One callback each for training, validation and test results.
        self.trainer = ValidationCallback()
        self.validator = ValidationCallback()
        self.tester = ValidationCallback()
        self.outputFileName = outputFile

示例#30

0

显示文件

文件： dqn_atari.py 项目： avliu/keras-rl

def _build_atari_q_network(nb_actions):
    """Build the convolutional Q-network used for Atari DQN.

    The architecture follows the model described by Mnih et al. (2015):
    three convolution layers, one 512-unit dense layer and a linear output
    with one unit per action.

    Args:
        nb_actions: Number of units in the final (linear) layer.

    Returns:
        The uncompiled Keras ``Sequential`` model.

    Raises:
        RuntimeError: If the backend reports an image dimension ordering
            other than ``'tf'`` or ``'th'``.
    """
    input_shape = (WINDOW_LENGTH, ) + INPUT_SHAPE
    # Query the backend once; both branches below depend on the same value.
    dim_ordering = K.image_dim_ordering()

    model = Sequential()
    if dim_ordering == 'tf':
        # (width, height, channels)
        model.add(Permute((2, 3, 1), input_shape=input_shape))
    elif dim_ordering == 'th':
        # (channels, width, height)
        model.add(Permute((1, 2, 3), input_shape=input_shape))
    else:
        raise RuntimeError('Unknown image_dim_ordering.')
    model.add(Convolution2D(32, (8, 8), strides=(4, 4)))
    model.add(Activation('relu'))
    model.add(Convolution2D(64, (4, 4), strides=(2, 2)))
    model.add(Activation('relu'))
    model.add(Convolution2D(64, (3, 3), strides=(1, 1)))
    model.add(Activation('relu'))
    model.add(Flatten())
    model.add(Dense(512))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))
    return model


def build_train_test(args, timesteps):
    """Build a DQN agent for ``args.env_name``, then train and/or test it.

    In ``'train'`` mode the agent is fitted for ``timesteps`` steps, its
    weights are saved to ``dqn_<env_name>_weights.h5f`` and it is evaluated
    for 10 episodes. In ``'test'`` mode weights are loaded (from
    ``args.weights`` when set, otherwise from the default filename) and the
    agent is evaluated for 10 episodes.

    Args:
        args: Object exposing ``env_name`` and ``mode`` attributes, plus
            ``weights`` (only read in ``'test'`` mode).
        timesteps: Passed to ``dqn.fit`` as ``nb_steps`` in ``'train'`` mode.

    Returns:
        Whatever ``dqn.test`` returns for the 10 evaluation episodes.

    Raises:
        ValueError: If ``args.mode`` is neither ``'train'`` nor ``'test'``.
        RuntimeError: If the backend image dimension ordering is unknown.
    """
    # Fail fast on an unsupported mode: otherwise the environment, network
    # and replay memory would be built only to crash on an unbound `result`.
    valid_modes = ('train', 'test')
    if args.mode not in valid_modes:
        raise ValueError(
            'Unknown mode {!r}; expected one of {}.'.format(args.mode, valid_modes))

    # Get the environment and extract the number of actions.
    env = gym.make(args.env_name)
    np.random.seed(123)
    env.seed(123)
    nb_actions = env.action_space.n

    # Next, we build our model. We use the same model that was described by Mnih et al. (2015).
    model = _build_atari_q_network(nb_actions)
    # summary() prints the table itself and returns None, so it must not be
    # wrapped in print() (that would emit a stray "None" line).
    model.summary()

    # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
    # even the metrics!
    memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)
    processor = AtariProcessor()

    # Select a policy. We use eps-greedy action selection, which means that a random action is selected
    # with probability eps. We anneal eps from 1.0 to 0.1 over the course of 1M steps. This is done so that
    # the agent initially explores the environment (high eps) and then gradually sticks to what it knows
    # (low eps). We also set a dedicated eps value that is used during testing. Note that we set it to 0.05
    # so that the agent still performs some random actions. This ensures that the agent cannot get stuck.
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                                  attr='eps',
                                  value_max=1.,
                                  value_min=.1,
                                  value_test=.05,
                                  nb_steps=1000000)

    # The trade-off between exploration and exploitation is difficult and an on-going research topic.
    # If you want, you can experiment with the parameters or use a different policy. Another popular one
    # is Boltzmann-style exploration:
    # policy = BoltzmannQPolicy(tau=1.)
    # Feel free to give it a try!

    dqn = DQNAgent(model=model,
                   nb_actions=nb_actions,
                   policy=policy,
                   memory=memory,
                   processor=processor,
                   nb_steps_warmup=50000,
                   gamma=.99,
                   target_model_update=10000,
                   train_interval=4,
                   delta_clip=1.)
    dqn.compile(Adam(lr=.00025), metrics=['mae'])

    # Default location of the final weights, shared by both modes.
    weights_filename = 'dqn_{}_weights.h5f'.format(args.env_name)

    if args.mode == 'train':
        # Okay, now it's time to learn something! Notice that now you can use the built-in Keras
        # callbacks. NOTE(review): the upstream example relies on fit() handling the interrupt
        # exception so that training can be aborted early - confirm for the installed keras-rl.
        # '{step}' is deliberately left unformatted here; it is a placeholder for the
        # checkpoint callback, hence concatenation instead of str.format.
        checkpoint_weights_filename = 'dqn_' + args.env_name + '_weights_{step}.h5f'
        log_filename = 'dqn_{}_log.json'.format(args.env_name)
        callbacks = [
            ModelIntervalCheckpoint(checkpoint_weights_filename,
                                    interval=250000),
            FileLogger(log_filename, interval=100),
        ]
        dqn.fit(env,
                callbacks=callbacks,
                nb_steps=timesteps,
                log_interval=10000)

        # After training is done, we save the final weights one more time.
        dqn.save_weights(weights_filename, overwrite=True)
    else:
        # 'test' mode: an explicitly supplied weights file overrides the default.
        if args.weights:
            weights_filename = args.weights
        dqn.load_weights(weights_filename)

    # Finally, evaluate our algorithm for 10 episodes (both modes end this way).
    return dqn.test(env, nb_episodes=10, visualize=False)