def __init__(self, n_input, actions, file, epsilon=.05, kappa=.1, alpha=0.0002, horizon=100,
             gamma=0.98, qlambda=0.8, reward_as_input=False):
    self.epsilon = epsilon
    self.horizon = horizon
    self.alpha = alpha
    self.gamma = gamma
    self.qlambda = qlambda
    self.kappa = kappa
    CompleteLearner.__init__(self, actions, file)
    self.numActions = len(self.actions)
    self.laststate = None
    self.state = None
    self.dataset = {'state': [], 'action': [], 'reward': []}
    self.nn = None
    self.target_nn = None
    self.lastaction = None
    self.t = 0
    self.reward_as_input = reward_as_input
    # when reward_as_input is set, the previous reward is an extra input dimension
    self.n_input = n_input + 1 if self.reward_as_input else n_input
    self.next_advantages = [0 for i in range(self.numActions)]
    self.nn = self.buildLSTMNetwork()
    self.td_nn = self.buildTDnetwork()
    self.td_trainer = BackpropTrainer(self.td_nn)
    self.TDnext = [0]
def __init__(self, n_input, actions, file, states=None, epsilon=.05, alpha=0.02, horizon=100,
             gamma=0.95, qlambda=0.9, network=False):
    self.epsilon = epsilon
    self.horizon = horizon
    self.n_input = n_input
    self.alpha = alpha
    self.gamma = gamma
    self.qlambda = qlambda
    CompleteLearner.__init__(self, actions, file)
    self.numActions = len(self.actions)
    self.laststate = None
    self.state = None
    self.dataset = {'laststate': [], 'state': [], 'action': [], 'reward': []}
    self.nn = None
    self.target_nn = None
    self.t = 0
    self.updates = []
    if network:
        self.nn, self.params = self.initNetwork()
        self.target_nn = self.nn
        self.target_params = self.params
    else:
        self.states = states
        self.Q = {}
        for state in self.states:
            self.Q[state] = []
            for a in range(self.numActions):
                self.Q[state].append(self.gamma + random())
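# Hypothetical helper (not part of the class above): a minimal sketch of how an
# epsilon-greedy policy would read the tabular Q initialised above, where
# Q[state] is a per-action list of values. Imported as _random to avoid
# clashing with the module's `from random import random`.
import random as _random

def epsilon_greedy(Q, state, num_actions, epsilon=0.05):
    # explore with probability epsilon, otherwise act greedily
    if _random.random() < epsilon:
        return _random.randrange(num_actions)
    return max(range(num_actions), key=lambda a: Q[state][a])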
def __init__(self, task_features, use_task_bias, use_task_gain, n_inputs, trace_length, actions,
             file, episodic, loss=None, target_model=False, num_neurons=80, large_scale=False,
             terminal_known=False, recurrent=True, agent=None, intervals=[], params={}):
    self.terminal_states_known = terminal_known
    CompleteLearner.__init__(self, actions, file, episodic)
    self.init_variables()
    self.action_size = len(actions)
    if agent is None:
        self.agent = PPO_Learner.init_agent(n_inputs, actions, trace_length, episodic,
                                            task_features, use_task_bias, use_task_gain,
                                            num_neurons, params, large_scale, recurrent)
    self.state_size = PPO_Learner.set_state_size(n_inputs, trace_length, recurrent)
    self.continue_experiment(intervals)
    self.s_t = np.zeros(self.state_size)
    # self.model_objective = EWC_objective(lbda_task, learning_rate, batch_size, model, n_in,
    #                                      n_out, lbda, output_type=OutputType.linear, epochs=200,
    #                                      objective_type=ObjectiveType.EWC, task_weights=None)
    # self.target_model_objective = EWC_objective_linoutput(out, lbda)
    # agent.model.compile(loss=self.model_objective.objective)
    # agent.target_model.compile()
    print(self.__dict__)
    print(self.agent.__dict__)
def __init__(self, actions, n_inputs, file=''):
    CompleteLearner.__init__(self, actions, file)
    self.n_inputs = n_inputs
    self.n_actions = len(actions)
    self.actions = actions
    # one activation slot per sensory unit and per action unit
    self.units = [0.0 for i in self.sensoryIndexes() + self.actionIndexes()]
def __init__(self, n_input, actions, file, config_or_savefile=None, seed=None, states=None,
             alpha=0.02, horizon=100, gamma=0.95):
    # self.epsilon = epsilon
    self.horizon = horizon
    self.n_input = n_input
    self.alpha = alpha
    self.gamma = gamma
    CompleteLearner.__init__(self, actions, file)
    self.t = 0
    # Task is assumed to be imported at module level; config_or_savefile and
    # seed are forwarded from the caller.
    self.rpg = PolicyGradient(self, Task, config_or_savefile, seed, dt=None, load='best')
def setTime(self, t):
    increment = t - self.t
    if self.episodic:
        self.task_t += increment
    self.pols[self.current_pol].update_task_time(increment)
    CompleteLearner.setTime(self, t)
    if self.num_policies > 1:
        self.policy_chosen = False  # allow choose_policy again when time has passed
    if DEBUG_MODE:
        self.check_policy_variables()
def __init__(self, n_input, actions, file, states=None, epsilon=.05, alpha=0.02, horizon=100,
             gamma=0.95, qlambda=0.9, network=False, alpha_schedule=None, batch_size=1000,
             reward_as_input=False, stateful=False):
    self.epsilon = epsilon
    self.horizon = horizon
    self.batch_size = batch_size
    self.n_input = n_input
    self.alpha = alpha
    self.gamma = gamma
    self.qlambda = qlambda
    self.stateful = stateful
    CompleteLearner.__init__(self, actions, file)
    self.numActions = len(self.actions)
    self.laststate = None
    self.state = None
    self.reward_as_input = reward_as_input
    if self.reward_as_input:
        self.n_input += 1  # the previous reward becomes an extra input feature
    self.dataset = {'state': [], 'action': [], 'reward': []}
    self.nn = None
    self.target_nn = None
    self.t = 0
    self.states_batch = []
    self.targets_batch = []
    self.alpha_schedule = alpha_schedule
    if network:
        self.nn, self.params = self.initNetwork()
        self.target_nn = self.nn
        self.target_params = self.params
    else:
        self.states = states
        self.Q = {}
        for state in self.states:
            self.Q[state] = []
            for a in range(self.numActions):
                self.Q[state].append(self.gamma + random())
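# Hedged sketch of how the states_batch / targets_batch pair above could be
# flushed once batch_size samples have accumulated. maybe_train is a
# hypothetical helper, and nn.fit assumes a Keras-style API; neither is
# confirmed by this file.
import numpy as np

def maybe_train(learner):
    if len(learner.states_batch) >= learner.batch_size:
        X = np.array(learner.states_batch)
        y = np.array(learner.targets_batch)
        learner.nn.fit(X, y, verbose=0)  # one supervised pass over the batch
        learner.states_batch, learner.targets_batch = [], []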
def __init__(self, n_input, actions, file, states=None, epsilon=.05, alpha=0.02, horizon=100,
             gamma=0.95, qlambda=0.9, network=False):
    self.epsilon = epsilon
    self.horizon = horizon
    self.n_input = n_input
    self.alpha = alpha
    self.gamma = gamma
    self.qlambda = qlambda
    CompleteLearner.__init__(self, actions, file)
    self.numActions = len(self.actions)
    self.laststate = None
    self.state = None
    self.dataset = {'laststate': [], 'state': [], 'action': [], 'reward': []}
    self.nn = None
    self.target_nn = None
    self.t = 0
    if network:
        self.batch_size = BATCH_SIZE
        self.replay_memory = REPLAY_MEMORY
        self.update_freq = UPDATE_FREQ
        if self.replay_memory:
            self.replay_buffer = ReplayBuffer(self.replay_memory, batch_size=self.batch_size,
                                              horizon=self.horizon)
        self.nn, self.params = self.initNetwork()
        self.target_nn = self.nn
        self.target_params = self.params
    else:
        self.states = states
        self.Q = {}
        for state in self.states:
            self.Q[state] = []
            for a in range(self.numActions):
                self.Q[state].append(self.gamma + random())
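# The ReplayBuffer used above is defined elsewhere in the repo; the following
# is only a minimal sketch of the underlying idea (capacity-bounded storage
# with uniform mini-batch sampling), not its actual API.
import random
from collections import deque

class SimpleReplayBuffer:
    def __init__(self, capacity, batch_size):
        self.buffer = deque(maxlen=capacity)  # oldest transitions drop out first
        self.batch_size = batch_size

    def add(self, state, action, reward, next_state):
        self.buffer.append((state, action, reward, next_state))

    def sample(self):
        # uniform sampling breaks the temporal correlation between updates
        return random.sample(self.buffer, min(self.batch_size, len(self.buffer)))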
def __init__(self, n_input, actions, file, states=None, epsilon=.05, alpha=0.02, horizon=100,
             gamma=0.95, qlambda=0.9, network=False, alpha_schedule=None, num_candidate_nodes=8):
    self.epsilon = epsilon
    self.horizon = horizon
    self.n_input = n_input
    self.alpha = alpha
    self.gamma = gamma
    self.qlambda = qlambda
    CompleteLearner.__init__(self, actions, file)
    self.numActions = len(self.actions)
    self.laststate = None
    self.state = None
    self.dataset = {'laststate': [], 'state': [], 'action': [], 'reward': []}
    self.nn = None
    self.target_nn = None
    self.t = 0
    self.alpha_schedule = alpha_schedule
    if network:
        # cascade-correlation value network; input/output sizes follow the task,
        # and num_candidate_nodes is a constructor argument (its default of 8 is
        # an assumption)
        self.net = CascadeNet(self.n_input, self.numActions, num_candidate_nodes)
        self.net.learn_rate = 0.05
        self.net.momentum_coefficent = 0.0
        self.net.output_connection_dampening = 1.0
        self.net.use_quick_prop = True
    else:
        self.states = states
        self.Q = {}
        for state in self.states:
            self.Q[state] = []
            for a in range(self.numActions):
                self.Q[state].append(self.gamma + random())
def printDevelopmentAtari(self, frames):
    CompleteLearner.printDevelopmentAtari(self, frames)
    if self.t % self.policy_space_picture_freq == 0:
        if self.do_taskdrift():
            self.stats.develop()
            self.get_performance_diversity()
            self.stats.diversity.append(self.get_diversity())
            self.stats.provision.append([prov for prov in self.provision])
            for task in self.task_coords:
                self.stats.snapshot_taskcoords[task].append(self.task_coords[task])
                for pol in range(self.num_policies):
                    self.stats.mindist_performances[task][pol].append(self.avg_velocities[task][pol])
        else:
            if self.num_policies > 1:
                self.stats.diversity.append(self.get_diversity())
def __init__(self, episodic, actions, filename, pols, weights, decision_frequency=100,
             stepsize=0.10, init_provision=1.0, consumption=.10, reward_scale=.50,
             initialise_unseen=False, one_to_one=False, unadaptive=False,
             probability_rule=ProbabilityRule.epsilon):
    CompleteLearner.__init__(self, actions, filename)
    self.occurence_weights = weights
    self.pols = pols
    self.num_policies = len(pols)
    self.init_provision = init_provision * self.num_policies
    self.provision = [self.init_provision for _ in range(self.num_policies)]
    self.testing_ticks = 0
    self.consumption = consumption
    self.reward_scale = reward_scale * consumption * self.num_policies
    self.stepsize = stepsize
    self.decision_frequency = decision_frequency
    self.current_feature = None
    self.one_to_one = one_to_one
    self.unadaptive = unadaptive
    self.probability_rule = probability_rule
    if self.do_taskdrift():
        self.construct_policy_space()
    self.current_pol = 0  # convention
    self.policy_chosen = False
    self.episodic = episodic
    self.initialise_unseen = initialise_unseen
    if self.initialise_unseen:
        self.N_trainings = {}
    # self.offline_updates = offline_updates
    if self.episodic:
        self.task_t = 0
        self.task_R = 0.0
    if weights is not None:
        self.initialise_task_coords()
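# Hypothetical illustration of an epsilon probability rule over provisions (the
# class's actual rule lives behind ProbabilityRule and may differ): pick a
# policy in proportion to its remaining provision, exploring with epsilon.
# Assumes non-negative provisions.
import random

def choose_policy_sketch(provision, epsilon=0.05):
    if random.random() < epsilon:
        return random.randrange(len(provision))
    total = sum(provision)
    r = random.uniform(0.0, total)
    acc = 0.0
    for i, p in enumerate(provision):
        acc += p
        if r <= acc:
            return i
    return len(provision) - 1  # guard against floating-point round-off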
def __init__(self, n_input, actions, alpha=0.5, gamma=0.99, qlambda=0.9,
             explorer=EpsilonGreedyExplorer(epsilon=0.20, decay=1)):
    CompleteLearner.__init__(self, actions)
    # PyBrain NFQ setup: a state-action value network wrapped in a LearningAgent
    controller = ActionValueNetwork(dimState=n_input, numActions=len(actions))
    learner = NFQ()
    learner.explorer = explorer
    self.learning_agent = LearningAgent(controller, learner)
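# Hedged usage sketch for the PyBrain agent built above. `env` is a
# hypothetical stand-in exposing reset() and step(action) -> (obs, reward);
# the LearningAgent calls (integrateObservation / getAction / giveReward /
# learn) are standard PyBrain.
def run_episode(learner, env, steps=100):
    obs = env.reset()
    for _ in range(steps):
        learner.learning_agent.integrateObservation(obs)
        action = learner.learning_agent.getAction()
        obs, reward = env.step(action)
        learner.learning_agent.giveReward(reward)
    learner.learning_agent.learn()  # batch NFQ update from the stored episode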
def setReward(self, reward):
    CompleteLearner.setReward(self, reward)
    self.add_sample()
def setReward(self, reward):
    CompleteLearner.setReward(self, reward)
    self.pols[self.current_pol].update_task_reward(reward)
    if self.episodic:
        self.task_R += reward
def save_stats(self, filename):
    CompleteLearner.save_stats(self, filename)
    for pol in range(self.num_policies):
        self.pols[pol].save_stats(filename + 'pol%d' % (pol))