Example #1
def run_bbox(verbose=False):
    n_features = n_actions = max_time = -1

    if bbox.is_level_loaded():
        bbox.reset_level()
    else:
        bbox.load_level("../levels/train_level.data", verbose=1)
        n_features = bbox.get_num_of_features()
        n_actions = bbox.get_num_of_actions()
        max_time = bbox.get_max_time()

    av_table = ActionValueTable(n_features, n_actions)
    av_table.initialize(0.2)
    print(av_table._params)
    learner = Q(0.5, 0.1)
    learner._setExplorer(EpsilonGreedyExplorer(0.4))
    agent = LearningAgent(av_table, learner)
    environment = GameEnvironment()
    task = GameTask(environment)
    experiment = Experiment(task, agent)

    while environment.finish_flag:
        experiment.doInteractions(1)
        agent.learn()
 
    bbox.finish(verbose=1)
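Example #1 relies on a project-specific GameEnvironment/GameTask pair that is not shown. As a rough orientation only, a minimal PyBrain environment wrapper might look like the sketch below; the class body is an assumption, not the original project's code. It keeps finish_flag truthy while the level is still running, so the `while environment.finish_flag:` loop above stops once bbox.do_action() returns 0.

import interface as bbox  # Black Box Challenge client module (assumed import name)
from pybrain.rl.environments.environment import Environment

class GameEnvironment(Environment):
    """Hypothetical sketch of the wrapper used in Example #1 (not the original code)."""

    def __init__(self):
        self.finish_flag = 1  # stays truthy while the bbox level is running

    def getSensors(self):
        # current feature vector of the black-box environment
        return bbox.get_state()

    def performAction(self, action):
        # do_action() returns 0 once the level is over
        self.finish_flag = bbox.do_action(int(action[0]))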
Example #2
	def reset(self):
		#n = np.random.randint(0, self.grid_size-1, size=1)
		#m = np.random.randint(1, self.grid_size-2, size=1)
		if bbox.is_level_loaded():
			bbox.reset_level()
		else:
			bbox.load_level("../../../levels/train_level.data", verbose=1)
		self.state = bbox.get_state() #np.asarray([0, n, m])[np.newaxis]
Example #3
 def reset(self):
     #n = np.random.randint(0, self.grid_size-1, size=1)
     #m = np.random.randint(1, self.grid_size-2, size=1)
     if bbox.is_level_loaded():
         bbox.reset_level()
     else:
         bbox.load_level("../../../levels/train_level.data", verbose=1)
     self.state = bbox.get_state()  #np.asarray([0, n, m])[np.newaxis]
Example #4
def prepare_bbox():
    global n_features, n_actions
    if bbox.is_level_loaded():
        bbox.reset_level()
    else:
        bbox.load_level("../levels/test_level.data", verbose=1)
        n_features = bbox.get_num_of_features()
        n_actions = bbox.get_num_of_actions()
Example #5
	def is_won(self):
		#fruit_row, fruit_col, basket = self.state[0]
		final_score = bbox.get_score()
		bbox.reset_level() # bbox.finish(verbose=1)

		self.last_score = 0
		self.action_count = 0
		return final_score > 0 #fruit_row == self.grid_size-1 and abs(fruit_col - basket) <= 1
Example #6
    def is_won(self):
        #fruit_row, fruit_col, basket = self.state[0]
        final_score = bbox.get_score()
        bbox.reset_level()  # bbox.finish(verbose=1)

        self.last_score = 0
        self.action_count = 0
        return final_score > 0  #fruit_row == self.grid_size-1 and abs(fruit_col - basket) <= 1
Example #7
def prepare_bbox():
	global n_features, n_actions
	if bbox.is_level_loaded():
		bbox.reset_level()
	else:
		bbox.load_level("../levels/test_level.data", verbose=1)
		n_features = bbox.get_num_of_features()
		n_actions = bbox.get_num_of_actions() 
Example #8
def prepare_bbox():
    global n_f, n_a, max_time
 
    if bbox.is_level_loaded():
        bbox.reset_level()
    else:
        bbox.load_level("../levels/train_level.data", verbose=1)
        n_f = bbox.get_num_of_features()
        n_a = bbox.get_num_of_actions()
        max_time = bbox.get_max_time()
Example #9
def prepare_bbox():
    global n_features, n_actions, max_time

    # Reset environment to the initial state, just in case
    if bbox.is_level_loaded():
        bbox.reset_level()
    else:
        # Load the game level
        bbox.load_level("../levels/train_level.data", verbose=1)
        n_features = bbox.get_num_of_features()
        n_actions = bbox.get_num_of_actions()
        max_time = bbox.get_max_time()
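For context, the prepare_bbox() helpers above are normally paired with a small driver loop. A minimal sketch (assuming the Black Box Challenge API used throughout these examples, with `interface` as the import name and a placeholder always-zero policy) could be:

import interface as bbox  # assumed import name for the Black Box Challenge client

def run_bbox(verbose=False):
    prepare_bbox()
    has_next = 1
    while has_next:
        state = bbox.get_state()   # current feature vector (unused by this placeholder policy)
        action = 0                 # placeholder: always choose action 0
        has_next = bbox.do_action(action)
    if verbose:
        print("Final score:", bbox.get_score())
    bbox.finish(verbose=1)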
Example #10
def prepare_box():
    global n_features, n_actions, max_time

    # Reset the environment to the initial state, just in case
    if bbox.is_level_loaded():
        bbox.reset_level()
    else:
        # Load the game level
        bbox.load_level('levels/train_level.data', verbose=1)
        n_features = bbox.get_num_of_features()
        n_actions = bbox.get_num_of_actions()
        max_time = bbox.get_max_time()
Example #11
    def reset(self):
        if bbox.is_level_loaded():
            bbox.reset_level()
        else:
            bbox.load_level(self.level, verbose=1)
            self.n_features = bbox.get_num_of_features()
            self.n_actions = bbox.get_num_of_actions()
            self.max_time = bbox.get_max_time()

        self._steps = 0
        self._state = np.zeros((1, self.n_features))
        self._is_over = False
        self._prev_score = -float('inf')
        self._actions_log = []
Example #12
    def reset(self):
        if bbox.is_level_loaded():
            bbox.reset_level()
        else:
            bbox.load_level(self.level, verbose=1)
            self.n_features = bbox.get_num_of_features()
            self.n_actions = bbox.get_num_of_actions()
            self.max_time = bbox.get_max_time()

        self._steps = 0
        self._state = np.zeros((1, self.n_features))
        self._is_over = False
        self._prev_score = -float('inf')
        self._actions_log = []
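The reset() methods in Examples #11 and #12 only initialize bookkeeping; the environment class they belong to is not shown. A hypothetical step method (name and return convention assumed, not taken from the original project) that updates the same attributes might look like:

    def act(self, action):
        # Hypothetical counterpart to reset(): advance the level by one step.
        self._actions_log.append(action)
        is_running = bbox.do_action(action)        # returns 0 once the level ends
        self._is_over = not is_running
        self._steps += 1
        self._state = np.asarray(bbox.get_state()).reshape(1, self.n_features)
        score = bbox.get_score()
        # reward = score delta; the first step uses the raw score since _prev_score starts at -inf
        reward = score if self._prev_score == -float('inf') else score - self._prev_score
        self._prev_score = score
        return self._state, reward, self._is_over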
Example #13
File: fribot.py  Project: Pyro2266/FriBOT
def prepare_bbox():

    global n_features, n_actions, max_time, vectors, pool, num_of_vectors

    if bbox.is_level_loaded():
        bbox.reset_level()
    else:
        bbox.load_level("../levels/train_level.data", verbose=1)
        n_features = bbox.get_num_of_features()
        n_actions = bbox.get_num_of_actions()
        max_time = bbox.get_max_time()

    vectors = np.zeros((num_of_vectors, n_features), np.float32)
    print("preparing")
    pool = multiprocessing.Pool(processes=processes)
Example #14
def prepare_bbox():
    global n_features, n_actions, max_time
    ## TODO: Save the interactions with the environment as an output data frame
    global interaction_list
    interaction_list = []
    
    ## Reset the environment to initial state, just in case
    if bbox.is_level_loaded():
        bbox.reset_level()
    else:
        ## Load the game level
        bbox.load_level("../levels/train_level.data", verbose=True)
        n_features = bbox.get_num_of_features()
        n_actions = bbox.get_num_of_actions()
        max_time = bbox.get_max_time()
        
        ## The matrix that contains the output data frame
        states = ['state_'] * n_features
        state_list = [states[i] + str(i) for i in range(n_features)]
        header_list = state_list + ['reward', 'action']
        interaction_list.append(header_list)
Example #15
def prepare_bbox():
    '''
    Prepares the environment (learning/test data).
    '''
    
    global n_features
    global n_actions
    global max_time
    global q_function
    global epsilon
    global gamma
    global alpha
    global valid_actions
    global init_value
    
    if bbox.is_level_loaded():
        ## Reset the environment to initial state
        bbox.reset_level()
    else:
        ## Load the training/test data
        bbox.load_level('../levels/train_level.data', verbose=True)
        n_features = bbox.get_num_of_features()
        n_actions = bbox.get_num_of_actions()
        max_time = bbox.get_max_time()
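Example #15 only declares the Q-learning globals; the actual values are set elsewhere in that project. Purely as an illustration (every value below is an assumption, not the original settings), the initialization could look like:

# Hypothetical initialization of the globals declared in prepare_bbox() above
q_function = {}                          # maps a discretized state key to a list of Q-values, one per action
epsilon = 0.1                            # exploration rate for epsilon-greedy action selection
gamma = 0.99                             # discount factor
alpha = 0.5                              # learning rate
init_value = 0.0                         # default Q-value for unseen states
valid_actions = list(range(n_actions))   # actions reported by bbox.get_num_of_actions()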
Example #16
def run_bbox(verbose=False):
    prepare_bbox()
    # vector of the current state features
    input_var= T.tensor3('memory')
    input_var= T.reshape(input_var,(memtime,1,n_f+2))

    #Score after the agent makes its choice
    reality = T.vector('score_diffs')

    #Load net into the agent object
    agent=prepare_agent(input_var)

    #What the agent thinks its best choice is for this event
    evaluation = lasagne.layers.get_output(agent)[0]

    #how much the agent should be rewarded/punished
    reward = lasagne.objectives.squared_error(evaluation,reality)
    reward = reward.mean()

    #get the parameters for updating
    params = lasagne.layers.get_all_params(agent,trainable=True)

    #update the net with the error
    teach = lasagne.updates.nesterov_momentum(reward,params,learning_rate=0.01,momentum=0.9)

    #A function to get the agent's choice of what to try this time
    decide_fn = theano.function([input_var],evaluation)

    #function to do all of the stuff above
    train_fn = theano.function([input_var,reality], reward, updates=teach,on_unused_input='ignore')

    # time to check how long it takes to run
    start = time.time()
    for epoch in range(epochs):
        memory = np.zeros(shape=(memtime,1,n_f+2))
        e_time = time.time() #time for this epoch
        has_next = 1 #looping variable, state of bbox
        #initialize tracking variables
        consequence=error=0
        steps=0
        trust=0.00+.02*epoch
        good=0
        while has_next:
            #Updating memory matrix, forgetting a state, making room
            memory = forget(memory) 
            state = bbox.get_state()
            #get best action based on 100 step checkpoint method
            actuals = get_all_score_diffs(state)
            #upload new state, with no score or action chosen
            memory[0][0][:-2] = state
            if rand.random()>trust:
                action = rand.randint(0,n_a-1) #if trust is too low still, random action
            else:
                choices = decide_fn(memory) #Otherwise, let the agent decide. 
                action = np.argmax(choices) #pick action agent thinks is best


            if action == np.argmax(actuals):
                good = good+1
            #do it, and find out the consequences (if the score improved or went down)
            has_next = bbox.do_action(action)
            #find the consequence
            score = bbox.get_score()
            consequence=score-consequence
            #train on choices just made and memory
            memory[0][0][-2:]=[action,consequence]

            error += train_fn(memory,actuals) #train based on the score change

            #updating for next loop
            steps += 1

            #occasionally check in on progress
            if steps%10000==0:
                score = bbox.get_score()
                print ("Epoch: {}".format(epoch))
                print ("Steps: {}".format(steps))
                print ("   current trust: {}".format(trust))
                print ("   avg error: {}".format(error/steps))
                print ("   bad choices: {}%".format(100-float(good)/100))
                print ("   current score: {}".format(score))
                if trust<.95:
                    trust = trust+.02
                bbox.clear_all_checkpoints()
                ch=ra=good=0

        #report on model quality on previous epoch
        score = bbox.get_score()
        with open("epoch_data.txt","a") as f:
        	f.write("Epoch: {}    Final Score: {}    Average Error: {}    Time to Run: {} min\n".format(epoch,score,error/steps,(time.time()-e_time)/60))
        #save model parameters
        np.savez('model_LSTM_cost.npz', *lasagne.layers.get_all_param_values(agent))
        #reset box for next epoch
        if(epoch<epochs-1):
            bbox.reset_level()

    print ("Time to run: {} hours".format((time.time()-start)/3600))
    bbox.finish(verbose=1)
Example #17
def run_bbox(verbose=False):
    prepare_bbox()

    # vector of the current state features
    input_var= T.matrix('memory')
    input_var= T.reshape(input_var,(memtime,n_f+2))

    #Score after the agent makes its choice
    reality = T.scalar('consequence')

    #Load net into the agent object
    agent=prepare_agent(input_var)

    #What the agent thinks the best choice will be
    attempt = T.max(lasagne.layers.get_output(agent))

    #how much the agent should be rewarded/punished
    reward = lasagne.objectives.squared_error(attempt,reality)

    #get the parameters for updating
    params = lasagne.layers.get_all_params(agent,trainable=True)

    #update the net with the error
    teach = lasagne.updates.nesterov_momentum(reward,params,learning_rate=0.1,momentum=0.9)

    #function to do all of the stuff above I DON'T HAVE A TARGET??
    train_fn = theano.function([input_var,reality], reward, updates=teach,on_unused_input='ignore')

    # time to check how long it takes to run
    memory = np.zeros(shape=(memtime,n_f+2))
    start = time.time()
    scores_per_epoch = np.zeros(epochs)
    for epoch in range(epochs):
        e_time = time.time() #time for this epoch
        has_next = 1 #looping variable, state of bbox

        #initialize tracking variables
        consequence=0
        self_assessment=0
        steps=0
        trust=0.00
        while has_next:

            #Updating memory matrix, forgetting a state, making room
            memory = forget(memory) 
            state = bbox.get_state()
            #upload new state, with no score or action chosen
            memory[0][:-2] = state
            if rand.random()>trust:
                action = rand.randint(0,n_a-1) #if trust is too low still, random action
            else:
                choices = lasagne.layers.get_output(agent, memory).eval() #Otherwise, let the agent decide (evaluate the symbolic output to get concrete values).
                action = np.argmax(choices) #pick action agent thinks is best
            
            #do it, and find out the consequences (if the score improved or went down)
            has_next = bbox.do_action(action)
            consequence = bbox.get_score()-consequence 
            
            #train on choices just made and memory
            memory[0][-2:]=[action,consequence]
            train_fn(memory,consequence) #train based on the score change
            
            #updating for next loop
            self_assessment += consequence
            steps += 1

            #occasionally check in on progress
            if steps%10000==0:
                trust = trust+.01
                score = bbox.get_score()
                print ("Epoch: {}".format(epoch))
                print ("Steps: {}".format(steps))
                print ("   self assessment: {}".format(self_assessment))
                print ("   trust: {}".format(trust))
                print ("   current score: {}".format(score))
        #report on model quality on previous epoch
        score = bbox.get_score()
        print ("Epoch: {}".format(epoch))
        print ("Final Score: {}".format(score))
        print ("Time to Run: {} minutes".format((time.time()-e_time)/60))
        scores_per_epoch[epoch] = score

        #reset box for next epoch
        bbox.reset_level()

    print ("All scores per epoch: ")
    print (scores_per_epoch)
    print ("Time to run: {} hours".format((time.time()-start)/3600))
    np.savez('model_mem.npz', *lasagne.layers.get_all_param_values(agent))
    bbox.finish(verbose=1)
Example #18
File: q_learning.py  Project: lopuhin/bbot
def main():
    epsilon = .1  # exploration
    num_actions = 4
    input_size = 36
    hidden_size = 24
    activation = 'relu'
    max_memory = 2000
    batch_size = 50
    mini_epoch = 5
    epoch = 10

    model = Sequential()
    model.add(
        Dense(hidden_size, input_shape=[input_size], activation=activation))
    model.add(Dense(hidden_size, activation=activation))
    model.add(Dense(num_actions))
    model.compile('adam', 'mse')

    # model.load_weights('model.h5')

    # Define environment/game
    bbox.load_level('../levels/train_level.data', verbose=True)

    # Initialize experience replay object
    exp_replay = ExperienceReplay(max_memory=max_memory)

    # FIXME
    #states = np.fromfile('run_random/states', dtype=np.float32)\
    #    .reshape([1214494, 36])
    #scaler = preprocessing.StandardScaler()
    #scaler.fit(states)
    #with open('scaler.pkl', 'wb') as f:
    #    scaler = pickle.dump(scaler, f, protocol=-1)
    with open('scaler.pkl', 'rb') as f:
        scaler = pickle.load(f)

    # Train
    for e in range(epoch):
        loss = 0.
        bbox.reset_level()
        game_over = False
        # get initial input
        get_state = lambda: scaler.transform(np.array([bbox.get_state()]))[0]
        input_t = get_state()
        score = 0
        step = 0
        report_steps = 100

        while not game_over:
            step += 1
            input_tm1 = input_t
            # get next action
            if np.random.rand() <= epsilon:
                action = np.random.randint(0, num_actions, size=1)
            else:
                q = model.predict(np.array([input_tm1]))[0]
                action = np.argmax(q)

            # apply action, get rewards and new state
            game_over = not bbox.do_action(action)
            input_t = get_state()
            new_score = bbox.get_score()
            reward = new_score - score
            score = new_score

            # store experience
            exp_replay.remember([input_tm1, action, reward, input_t],
                                game_over)

            # adapt model
            for _ in range(mini_epoch):
                inputs, targets = exp_replay.get_batch(model,
                                                       batch_size=batch_size)
                loss += model.train_on_batch(inputs, targets)[0]

            if step % report_steps == 0:
                print('Step {:07d} | Loss {:.4f} | Score {}'.format(
                    step, loss / (report_steps * mini_epoch), score))
                loss = 0.

        print('Epoch {:03d}/{} | Score {}'.format(e, epoch - 1, score))

    # Save trained model weights
    model.save_weights('q_model.h5', overwrite=True)
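After training, the saved weights can be reused for a greedy rollout. The sketch below is an assumption built from calls already present in Example #18 and the test-level path used in Examples #4 and #7; it is not part of the original project.

def evaluate():
    # Greedy evaluation with the trained Q-network (sketch, not the original project's code)
    model.load_weights('q_model.h5')
    bbox.load_level('../levels/test_level.data', verbose=True)
    has_next = True
    while has_next:
        state = scaler.transform(np.array([bbox.get_state()]))[0]
        q = model.predict(np.array([state]))[0]
        has_next = bbox.do_action(int(np.argmax(q)))
    print('Test score: {}'.format(bbox.get_score()))
    bbox.finish(verbose=1)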