def createAgentCont(dimAct, dimState, numBatchToKeep, numIterPerTrain):
    from pybrain.rl.learners import NFQ
    from pybrain.rl.learners.valuebased import ActionValueNetwork
    from pybrain.rl.agents import LearningAgent

    sizeBatch = numIterPerTrain
    # Use neuro-fitted Q-learning (Q-learning with a neural network instead of a lookup table)
    learner = NFQ(sizeBatch, numBatchToKeep)

    # Create a neural network model with dimState inputs and dimAct outputs.
    # The network itself has dimState + dimAct inputs and 1 output.
    numHidden = 20
    print('Using this many hidden layer neurons: ', numHidden)
    moduleNet = ActionValueNetwork(dimState, dimAct, numHidden)
    moduleNet.name = 'moduleNet'

    # Create a learning agent, using both the module and the learner
    agent = LearningAgent(moduleNet, learner)
    return agent
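# A minimal usage sketch for createAgentCont; the dimensions and batch settings
# below are illustrative assumptions, not values taken from the original code.
agent = createAgentCont(dimAct=2, dimState=4, numBatchToKeep=10, numIterPerTrain=100)
print('Created agent:', agent)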
def initExperiment(learnalg='Q', history=None, binEdges='10s',
                   scriptfile='./rlRunExperiment_v2.pl',
                   resetscript='./rlResetExperiment.pl'):
    if binEdges == '10s':
        centerBinEdges = centerBinEdges_10s
    elif binEdges == '30s':
        centerBinEdges = centerBinEdges_30s
    elif binEdges == 'lessperturbed':
        centerBinEdges = centerBinEdges_10s_lessperturbed
    elif binEdges is None:
        centerBinEdges = None
    else:
        raise Exception("No bins for given binEdges setting")

    env = OmnetEnvironment(centerBinEdges, scriptfile, resetscript)
    if history is not None:
        env.data = history['data']

    task = OmnetTask(env, centerBinEdges)
    if history is not None:
        task.allrewards = history['rewards']

    if learnalg == 'Q':
        nstates = env.numSensorBins ** env.numSensors
        if history is None:
            av_table = ActionValueTable(nstates, env.numActions)
            av_table.initialize(1.)
        else:
            av_table = history['av_table']
        learner = Q(0.1, 0.9)  # alpha, gamma
        learner._setExplorer(EpsilonGreedyExplorer(0.05))  # epsilon
    elif learnalg == 'NFQ':
        av_table = ActionValueNetwork(env.numSensors, env.numActions)
        learner = NFQ()
    else:
        raise Exception("learnalg unknown")

    agent = LearningAgent(av_table, learner)

    experiment = Experiment(task, agent)
    if history is None:
        experiment.nruns = 0
    else:
        experiment.nruns = history['nruns']
    return experiment
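# A minimal sketch of how the returned experiment might be driven; the
# interaction counts and the number of cycles are illustrative assumptions,
# not part of the original code.
experiment = initExperiment(learnalg='NFQ')
for _ in range(100):
    experiment.doInteractions(10)   # collect transitions from the environment
    experiment.agent.learn()        # fit the Q-network on the collected data
    experiment.agent.reset()        # clear the agent's history before the next batch
    experiment.nruns += 1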
def main():
    # Storing every state of 2048 in a table is not feasible:
    # there are on the order of 14^16 possible states.
    #controller = ActionValueTable(16, 4)
    #learner = Q()
    #controller.initialize(1.)
    controller = ActionValueNetwork(16, 4)
    learner = NFQ()
    #learner._setExplorer(EpsilonGreedyExplorer(0.0))
    agent = LearningAgent(controller, learner)

    score_list = []
    for i in range(10000):
        # if os.path.exists('./agent.dump'):
        #     with open('./agent.dump') as f:
        #         agent = pickle.load(f)

        print i, 'playing ...'
        score = play(agent)
        score_list.append(score)

        # With the Q learner this raised:
        #   TypeError: only length-1 arrays can be converted to Python scalars
        #   (in pybrain/rl/learners/valuebased/q.py)
        # Switching the learner from Q to NFQ fixed it.
        # => http://stackoverflow.com/questions/23755927/pybrain-training-a-actionvaluenetwork-doesnt-properly-work
        print i, 'learning ...'
        agent.learn()
        agent.reset()

        print i, 'evaluate sample ...'
        data = [[0, 0, 0, 0],
                [0, 0, 0, 0],
                [0, 0, 0, 2],
                [0, 0, 0, 2]]
        agent.integrateObservation(numpy.array(data).ravel())
        move = agent.getAction()
        print "  ", i, int(numpy.mean(score_list)), max(score_list), move

        if i % 20 == 0:
            print i, 'saving ...'
            with open('./agent.dump', 'w') as f:
                pickle.dump(agent, f)
            with open('./score.dump', 'w') as f:
                pickle.dump(score_list, f)
from scipy import *
import sys, time

from pybrain.rl.learners.valuebased import ActionValueNetwork
from pybrain.rl.agents import LearningAgent
from pybrain.rl.learners import Q, SARSA, NFQ
from pybrain.rl.experiments.episodic import EpisodicExperiment
from pybrain.rl.environments import Task

from tasktest import TestTask
from envtest import TestEnv

env = TestEnv()
task = TestTask(env)

controller = ActionValueNetwork(200, 3)
learner = NFQ()
agent = LearningAgent(controller, learner)

experiment = EpisodicExperiment(task, agent)

i = 0
while True:
    experiment.doEpisodes(10)
    print "Learning"
    agent.learn()
    agent.reset()
    i += 1
    print "Cycle: %d" % i
    if i > 60:
        agent.learning = False
from pybrain.rl.environments.cartpole import CartPoleEnvironment, DiscreteBalanceTask
from pybrain.rl.agents import LearningAgent
from pybrain.rl.learners.valuebased import NFQ, ActionValueNetwork
from pybrain.rl.experiments import EpisodicExperiment

from training import NFQTraining

task = DiscreteBalanceTask(CartPoleEnvironment(), 100)
action_value_function = ActionValueNetwork(4, 3,
        name='CartPoleNFQActionValueNetwork')
learner = NFQ()
#learner.gamma = 0.99
learner.explorer.epsilon = 0.4
task.discount = learner.gamma

agent = LearningAgent(action_value_function, learner)
performance_agent = LearningAgent(action_value_function, None)
experiment = EpisodicExperiment(task, agent)

tr = NFQTraining('cartpole_nfq', experiment, performance_agent)
tr.train(7000, performance_interval=1, n_performance_episodes=5)
from numpy import array, arange, meshgrid, pi, zeros, mean
from matplotlib import pyplot as plt

from pybrain.rl.environments.cartpole import CartPoleEnvironment, DiscreteBalanceTask, CartPoleRenderer
from pybrain.rl.learners.valuebased import NFQ, ActionValueNetwork
from pybrain.rl.agents import LearningAgent
from pybrain.rl.experiments import EpisodicExperiment

# switch this to True if you want to see the cart balancing the pole (slower)
render = False
#render = True

plt.ion()

env = CartPoleEnvironment()
if render:
    renderer = CartPoleRenderer()
    env.setRenderer(renderer)
    renderer.start()

module = ActionValueNetwork(4, 3)

task = DiscreteBalanceTask(env, 100)
learner = NFQ()
learner.explorer.epsilon = 0.4

agent = LearningAgent(module, learner)
testagent = LearningAgent(module, None)
experiment = EpisodicExperiment(task, agent)

def plotPerformance(values, fig):
    plt.figure(fig.number)
    plt.clf()
    plt.plot(values, 'o-')
    plt.gcf().canvas.draw()
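# Sketch of the episodic training loop that typically drives this setup and the
# plotPerformance helper above (mirroring PyBrain's cartpole NFQ example); the
# episode counts below are assumptions, not part of the original snippet.
performance = []
pf_fig = plt.figure()
while True:
    # one learning step after each episode
    experiment.doEpisodes(1)
    agent.learn()
    agent.reset()

    # evaluate with the non-learning test agent (these episodes are not used for training)
    experiment.agent = testagent
    r = mean([sum(x) for x in experiment.doEpisodes(5)])
    testagent.reset()
    experiment.agent = agent

    performance.append(r)
    plotPerformance(performance, pf_fig)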
from pybrain.rl.agents import LearningAgent
from pybrain.rl.learners.valuebased import NFQ, ActionValueNetwork
from pybrain.rl.experiments import EpisodicExperiment

from environment import Environment
from tasks import BalanceTask
from training import NFQTraining

task = BalanceTask()
action_value_function = ActionValueNetwork(task.outdim, task.nactions,
        name='BalanceNFQActionValueNetwork')
learner = NFQ()
learner.gamma = 0.9999
learner.explorer.epsilon = 0.9
task.discount = learner.gamma

agent = LearningAgent(action_value_function, learner)
performance_agent = LearningAgent(action_value_function, None)
experiment = EpisodicExperiment(task, agent)

tr = NFQTraining('balance_nfq', experiment, performance_agent)
tr.train(7000, performance_interval=1, n_performance_episodes=1,
        plotsave_interval=10, plot_action_history=True)
if len(sys.argv) < 2 or (int(sys.argv[1]) < 0 or int(sys.argv[1]) > 4):
    print 'Must supply a model type:\n\t1 = Uncert&Salience, 2 = Salience, 3 = Uncert, 4 = Activation!'
    sys.exit()
if len(sys.argv) < 3:
    print 'Must supply an output file!'
    sys.exit()

type = int(sys.argv[1])  # 1 = Uncert&Salience, 2 = Salience, 3 = Uncert, 4 = Activation

env = DistractorRatio()  # Create an instance of the D-R task

# Create an action/value neural net; state and action space sizes depend on the model type
if type == 1:
    module = ActionValueNetwork(99, 7)
else:
    module = ActionValueNetwork(51, 4)

learner = NFQ()
learner.offPolicy = False  # Disable off-policy learning
#learner.explorer = HumanExplorer()
learner.explorer.epsilon = 0.4
#learner.explorer.decay = 0.99

agent = HumanAgent(module, learner, type)       # Create an agent that learns with NFQ
testagent = HumanAgent(module, None, type)      # Create a testing agent
experiment = CustomEpisodicExperiment(env, agent)  # Put the agent in the environment

if len(sys.argv) == 4:
    print 'Loading saved net...'
    module.network = NetworkReader.readFrom(sys.argv[3])
from scipy import *
import sys, time

from pybrain.rl.learners.valuebased import ActionValueNetwork
from pybrain.rl.agents import LearningAgent
from pybrain.rl.learners import Q, SARSA, NFQ
from pybrain.rl.experiments import Experiment
from pybrain.rl.environments import Task

from tasktest import TestTask
from envtest import TestEnv

env = TestEnv()
task = TestTask(env)

controller = ActionValueNetwork(1, 2)
learner = NFQ()
agent = LearningAgent(controller, learner)

experiment = Experiment(task, agent)

while True:
    experiment.doInteractions(100)
    agent.learn()
    agent.reset()
    print("Cycle")
def q_learning_nfq(**args):
    # estimate
    best_score = 0
    best_turn = 1000
    best_agent = None
    score_list = []
    turn_list = []

    #for i in range(2):
    for i in range(50):
        # initialize the agent
        # rand = 1.0
        # learner = NFQ(maxEpochs=100)
        # learner._setExplorer(EpsilonGreedyExplorer(rand))
        controller = ActionValueNetwork(12, 4)
        learner = NFQ()
        agent = LearningAgent(controller, learner)

        # training
        print
        print "==========================="
        print 'before training'
        print_state(agent.module.getValue)

        training(agent, args)

        print 'after training'
        print_state(agent.module.getValue)

        agent.learner._setExplorer(EpsilonGreedyExplorer(0.3))
        score, turn = play(agent, 'neural', args, [2, 2])
        score_list.append(score)
        turn_list.append(turn)

        print
        print i, int(numpy.mean(score_list)), max(score_list), score, turn

        # if i % args['episodes'] == 0:
        #     try:
        #         agent.learn()
        #     except:
        #         pass
        #     finally:
        #         agent.reset()
        #     # rand = fit_greedy(i)
        #     # agent.learner._setExplorer(EpsilonGreedyExplorer(rand))
        #     # if not i == 0:
        #     #     import sys
        #     #     sys.exit()
        #     print i, int(numpy.mean(score_list)), max(score_list), score, turn

        if best_score < score or best_turn > turn:
            best_score = score
            best_turn = turn
            best_agent = agent

        with open(args['path'] + '/result.dump', 'w') as f:
            pickle.dump([score_list, turn_list, best_agent], f)

    print
    print "==========================="
    print 'best score : ', best_score
    print 'best turn  : ', best_turn
    print_state(agent.module.getValue)