def reset(self):
    """Return the agent to its initial state.

    Restores the starting temperature and exploration proportion, resets
    the wrapped learner, clears the pending (obs, action, reward, obs)
    record, and opens a new episode.
    """
    LearningAgent.reset(self)
    self._temperature = self.init_temperature
    self._expl_proportion = self.init_exploration
    self.learner.reset()
    self._oaro = None
    self.newEpisode()
def giveReward(self, r):
    """Record reward *r* and apply a one-step-delayed weight update.

    The learner is trained on the transition that ENDED at the current
    observation (i.e. the previous step's obs/action/reward plus the new
    obs).  The current step's triple is then cached so it can be used as
    the "previous" transition on the next call.
    """
    LearningAgent.giveReward(self, r)
    if self.previousobs is not None:
        self.learner._updateWeights(
            self.previousobs,
            self.previousaction,
            self.previousreward,
            self.lastobs,
        )
    # Cache the current step unconditionally so the first update can
    # happen on the following call.
    self.previousobs = self.lastobs
    self.previousaction = self.lastaction
    self.previousreward = self.lastreward
def __init__(self, module, learner=None):
    """Wire up the controller module and optional learner, then clear all
    per-episode bookkeeping (data, step counter, last transition, best
    reward seen)."""
    LearningAgent.__init__(self, module, learner)
    self.module = module
    self.learner = learner
    # Episode bookkeeping starts empty.
    self.data = None
    self.step = 0
    self.lastobs = None
    self.lastaction = None
    self.lastreward = None
    self.max_reward = 0.0
def __init__(self, module, learner=None):
    """Initialize the agent around *module* (the controller) and an
    optional *learner*; all episode-tracking attributes start cleared."""
    LearningAgent.__init__(self, module, learner)
    self.module = module
    self.learner = learner
    self.data = None
    self.step = 0
    # No transition has been seen yet.
    for attr in ('lastobs', 'lastaction', 'lastreward'):
        setattr(self, attr, None)
    self.max_reward = 0.0
class DeepQLearner(CompleteLearner):
    """Neural-fitted Q-learning (NFQ) wrapper around a PyBrain LearningAgent.

    Discrete actions are selected by an epsilon-greedy explorer over an
    ActionValueNetwork controller; the network is batch-trained every
    1000 environment steps.  ``alpha``/``gamma``/``qlambda`` are accepted
    for interface compatibility but are not forwarded to NFQ here.
    """

    def __init__(self, n_input, actions, alpha=0.5, gamma=0.99, qlambda=0.9,
                 explorer=None):
        """Build the action-value network and NFQ learning agent.

        :param n_input: dimensionality of the observation vector
        :param actions: list of action objects (each exposing ``.perform``
            and ``.function``)
        :param explorer: optional exploration strategy.  Defaults to a
            *fresh* ``EpsilonGreedyExplorer(epsilon=0.20, decay=1)``.
            Using ``None`` as the default fixes the mutable-default-argument
            bug where every instance shared (and jointly decayed) a single
            explorer object.
        """
        CompleteLearner.__init__(self, actions)
        if explorer is None:
            explorer = EpsilonGreedyExplorer(epsilon=0.20, decay=1)
        controller = ActionValueNetwork(dimState=n_input, numActions=len(actions))
        learner = NFQ()
        learner.explorer = explorer
        self.learning_agent = LearningAgent(controller, learner)

    def setAction(self):
        # The agent proposes a (possibly float) action index; round it and
        # look up the corresponding action object.
        self.chosenAction = self.actions[int(round(self.learning_agent.getAction()))]

    def learn(self):
        print('epsilon:' + str(self.learning_agent.learner.explorer.epsilon))
        # Batch-train every 1000 steps, then wipe the agent's episode
        # memory so the next batch starts from a clean slate.
        if self.t > 0 and self.t % 1000 == 0:
            self.learning_agent.learner.learn()
            self.learning_agent.lastobs = None
            self.learning_agent.lastaction = None
            self.learning_agent.lastreward = None
            self.learning_agent.history.clear()

    def setObservation(self, agent, environment):
        # environment.setObservation appears to populate self.observation
        # as a side effect -- TODO(review): confirm in the environment class.
        environment.setObservation(agent)
        self.t = environment.t
        self.learning_agent.integrateObservation(self.observation)
        print("observation =" + str(self.observation))

    def cycle(self, agent, environment):
        """One observe -> act -> reward -> learn step."""
        self.setObservation(agent, environment)
        self.setAction()
        self.performAction(agent, environment)
        self.setReward(environment)
        self.learn()

    def performAction(self, agent, environment):
        print('chosenAction=' + str(self.chosenAction.function.__name__))
        self.chosenAction.perform([agent, environment])

    def reset(self):
        # Nothing to reset between episodes; NFQ keeps its own history.
        pass

    def setReward(self, environment):
        """Fetch the task reward, accumulate it, and pass it to the agent."""
        self.r = environment.currentTask.reward_fun(environment.agent, environment)
        self.R += self.r
        self.learning_agent.giveReward(self.r)
def mlDriver(cv, stateTransfer, actionTransfer):
    """Run an endless NFQ train/reset loop for the Settlers agent.

    Builds the environment, task, restricted action-value network and NFQ
    learner, then repeatedly runs 10 episodes, trains, and resets.

    NOTE: converted the Python 2 ``print`` statements to function calls
    for consistency with the rest of the file; output is unchanged.
    """
    # Dimensionality of the state vector fed to the network
    # (may be less than what stateTransfer carries).
    stateDim = 352
    # Number of moves possible.
    numMoves = 361
    env = SettleEnv(cv, stateTransfer, actionTransfer)
    task = SettleTask(env)
    controller = RestrictedActionValueNetwork(stateDim, numMoves, env)
    learner = NFQ()
    learner.explorer = EpsilonHackedExplorer(env)
    agent = LearningAgent(controller, learner)
    experiment = EpisodicExperiment(task, agent)
    # Train forever: gather episodes, learn from them, reset, repeat.
    while True:
        experiment.doEpisodes(10)
        print("Done with experiments")
        agent.learn()
        print("Learned")
        agent.reset()
        print("Cycled")
def __init__(self):
    """Register this AI with the shared environment counter and build an
    ENAC-based learning agent wrapped in a planet experiment."""
    # Claim a unique id from the class-wide counter, then bump it.
    self.this_ai_count = planet_environment.ai_count
    self.finished = False
    planet_environment.ai_count += 1
    self.nextMove = (0, 0, 0)
    self.state = None
    # Policy network: inputs -> 100 hidden units -> 3 outputs,
    # trained with the ENAC policy-gradient learner.
    net = buildNetwork(INPUT_NEURON_COUNT, 100, 3)
    agent = LearningAgent(net, ENAC())
    self.experiment = planet_experiment(episodic_planet_task(self), agent)
    self.experiment.task.clipping = False
# NOTE(review): this chunk is truncated at the start -- the leading
# sys.exit(...) is presumably the body of a missing
# "if len(sys.argv) < 5:" guard (compare the complete variant later in
# this file).  Left byte-identical; do not run as-is.
# Flow: build a rendered CartPole environment + balance task, drive a
# 4-input/1-output linear net whose weights come from argv, run
# `episodes` episodes, then print the per-episode summed rewards and
# their mean (final line uses Python 2 print-statement syntax).
sys.exit('please give 4 parameters. run: "python play_catpole.py <p1> <p2> <p3> <p4>"\n') # create environment env = CartPoleEnvironment() env.setRenderer(CartPoleRenderer()) env.getRenderer().start() env.delay = (episodes == 1) # create task task = BalanceTask(env, epilen) # create controller network net = buildNetwork(4, 1, bias=False) # create agent and set parameters from command line agent = LearningAgent(net, None) agent.module._setParameters([float(sys.argv[1]), float(sys.argv[2]), float(sys.argv[3]), float(sys.argv[4])]) # create experiment experiment = EpisodicExperiment(task, agent) experiment.doEpisodes(episodes) # run environment ret = [] for n in range(agent.history.getNumSequences()): returns = agent.history.getSequence(n) reward = returns[2] ret.append( sum(reward, 0).item() ) # print results print ret, "mean:",mean(ret)
# NOTE(review): this chunk is truncated at BOTH ends -- it begins in the
# middle of a sys.exit(...) call (only the message string and closing
# paren survive) and stops mid-loop after "reward = returns[2]".
# Left byte-identical; not valid Python on its own.  The complete
# version of this script appears later in the file: rendered CartPole,
# fixed argv-supplied weights, learning disabled, episodes replayed and
# rewards summed.
'please give 4 parameters. run: "python play.py <p1> <p2> <p3> <p4>"\n' ) # create environment env = CartPoleEnvironment() env.setRenderer(CartPoleRenderer()) env.getRenderer().start() env.delay = (episodes == 1) # create task task = BalanceTask(env, epilen) # create controller network net = buildNetwork(4, 1, bias=False) # set parameters from command line # create agent agent = LearningAgent(net, None) agent.module._setParameters( array([ float(sys.argv[1]), float(sys.argv[2]), float(sys.argv[3]), float(sys.argv[4]) ])) agent.disableLearning() # create experiment experiment = EpisodicExperiment(task, agent) experiment.doEpisodes(episodes) ret = [] for n in range(agent.history.getNumSequences()): returns = agent.history.getSequence(n) reward = returns[2]
# Replay script: balance the cart-pole with a fixed linear policy whose
# four weights are supplied on the command line; learning is disabled.
# Requires exactly four numeric parameters.
if len(sys.argv) < 5:
    sys.exit('please give 4 parameters. run: "python play.py <p1> <p2> <p3> <p4>"\n')

# create environment (rendered; slow down only when replaying one episode)
env = CartPoleEnvironment()
env.setRenderer(CartPoleRenderer())
env.getRenderer().start()
env.delay = (episodes == 1)

# create task
task = BalanceTask(env, epilen)

# create controller network: 4 state inputs -> 1 action output, no bias
net = buildNetwork(4, 1, bias=False)

# create agent with the argv-supplied weights and no learner
agent = LearningAgent(net, None)
agent.module._setParameters(array([float(sys.argv[1]), float(sys.argv[2]),
                                   float(sys.argv[3]), float(sys.argv[4])]))
agent.disableLearning()

# run the experiment
experiment = EpisodicExperiment(task, agent)
experiment.doEpisodes(episodes)

# sum the reward sequence of every recorded episode
ret = []
for n in range(agent.history.getNumSequences()):
    returns = agent.history.getSequence(n)
    reward = returns[2]
    ret.append(sum(reward, 0).item())

# NOTE: converted from a Python 2 print statement for consistency with the
# rest of the file; separator spacing is identical.
print(ret, "mean:", mean(ret))
env.getRenderer().stop()
def __init__(self, learner):
    """Create a module-less learning agent.

    The previous observation is tracked separately so the learner can be
    fed one-step-delayed transitions.
    """
    LearningAgent.__init__(self, None, learner)
    self.learner = learner
    self.previousobs = None
def main():
    """Train and evaluate an RRL trading agent on market data.

    Phase 1: 100 in-sample training passes over the first 500 interactions
    (learner/agent/env reset between passes).  Phase 2: one out-of-sample
    online-learning pass over the remaining series.  Finally, performance
    statistics are printed and the parameter history, cumulative returns
    and action history are plotted.

    Cleanup vs. the previous revision: removed the dead locals
    (``foo``/``bar``/``benchmarkReturn`` were computed but never used),
    dropped stale commented-out parameter sets, and renamed the ``fix``
    typo to ``fig``.
    """
    inData = createDataset()
    env = MarketEnvironment(inData)
    task = MaximizeReturnTask(env)

    # Recurrent sign-output network: bias + market inputs -> out,
    # with a recurrent out -> out connection (previous position F(t-1)).
    numIn = min(env.worldState.shape)
    net = RecurrentNetwork()
    net.addInputModule(BiasUnit(name='bias'))
    net.addOutputModule(SignLayer(1, name='out'))
    net.addRecurrentConnection(FullConnection(net['out'], net['out'], name='c3'))
    net.addInputModule(LinearLayer(numIn, name='in'))
    net.addConnection(FullConnection(net['in'], net['out'], name='c1'))
    net.addConnection(FullConnection(net['bias'], net['out'], name='c2'))
    net.sortModules()
    # Fixed starting weights (kept from the original experiment).
    net._setParameters([1.15741417, 1.70427034, 1.05050831, -0.47303435,
                        -0.87220272, -1.44743793, 0.93697461, 2.77489952,
                        0.27374758])

    ts = env.ts
    learner = RRL(numIn + 2, ts)  # ENAC() #Q_LinFA(2,1)
    agent = LearningAgent(net, learner)
    exp = ContinuousExperiment(task, agent)

    # In-sample learning: repeat the same window, resetting each pass.
    in_sample_len = 500
    print("Before in sample {}".format(net._params))
    for _ in range(100):
        exp.doInteractionsAndLearn(in_sample_len)
        learner.reset()
        agent.reset()
        env.reset()

    # Out-of-sample, online learning over the rest of the series.
    print("Before oos {}".format(net._params))
    exp.doInteractionsAndLearn(len(ts) - 1)
    print("After oos {}".format(net._params))

    # Performance evaluation on the out-of-sample segment.
    dfIndex = inData['RETURNS'].index
    rf = 0  # inData['Fed Fund Target']
    outDataOOS = pE.outData(ts, env.actionHistory, dfIndex, startIndex=in_sample_len)
    sharpe_oos = pE.annualisedSharpe(outDataOOS['trading rets'], rf)
    drawDown_oos = pE.maximumDrawdown(outDataOOS['trading rets'])
    numOutperformedMonths_oos = pE.percentOfOutperformedMonths(
        outDataOOS['trading rets'], outDataOOS['ts'])
    # Convert cumulative log-return to a simple return.
    traderReturn = math.exp(outDataOOS['cum_log_rets'][-1]) - 1
    print("oos sharpe: {}, \noos drawdown: {} \noos percent outperformed months {}\noos trader return {}".format(sharpe_oos, drawDown_oos, numOutperformedMonths_oos, traderReturn))

    # Plot the evolution of every network parameter, labelled by input.
    paramHist = learner.paramHistory
    inData.rename(columns={'RETURNS': 'r(t-1)'}, inplace=True)
    lbs = insert(inData.columns.values, 0, 'Bias')
    lbs = append(lbs, 'F(t-1)')
    plt.figure(0)
    for i in range(len(net._params)):
        if i < 7:
            plt.plot(paramHist[i], label=lbs[i])
        else:
            plt.plot(paramHist[i], '--', label=lbs[i])
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, 1.1), ncol=3)
    plt.draw()

    # Cumulative performance vs. buy-and-hold, plus the action history.
    fig, axes = plt.subplots(nrows=2, ncols=1)
    plotFrame = outDataOOS[['cum_log_ts', 'cum_log_rets']]
    plotFrame.columns = ['Buy and Hold', 'Trading Agent']
    plotFrame.plot(ax=axes[0])
    outDataOOS['Action_Hist'].plot(ax=axes[1], color='r')
    plt.draw()
    plt.show()
def __init__(self, n_input, actions, alpha=0.5, gamma=0.99, qlambda=0.9,
             explorer=None):
    """Build an NFQ learning agent over a discrete action set.

    :param n_input: dimensionality of the observation/state vector
    :param actions: list of available action objects
    :param explorer: optional exploration strategy; defaults to a *fresh*
        ``EpsilonGreedyExplorer(epsilon=0.20, decay=1)``.  The ``None``
        sentinel fixes the mutable-default-argument bug where all
        instances shared (and jointly decayed) one explorer object.

    ``alpha``/``gamma``/``qlambda`` are accepted for interface
    compatibility but are not forwarded to the NFQ learner here.
    """
    CompleteLearner.__init__(self, actions)
    if explorer is None:
        explorer = EpsilonGreedyExplorer(epsilon=0.20, decay=1)
    controller = ActionValueNetwork(dimState=n_input, numActions=len(actions))
    learner = NFQ()
    learner.explorer = explorer
    self.learning_agent = LearningAgent(controller, learner)
def __init__(self, controller, learner):
    """Thin constructor: delegate straight to LearningAgent with the
    given controller module and learner."""
    LearningAgent.__init__(self, controller, learner)