Example #1
def run_bbox(verbose=False):
    '''
    Runs the Blackbox challenge.
    '''
    
    has_next = True
    prepare_bbox()
    
    while has_next:
        ## Observe the current state variables
        state = bbox.get_state()
        state_tuple = get_state_tuple(state)
        ## Select the current action
        action = get_action(state_tuple, verbose=verbose, is_current=True)
        ## Get the current reward
        reward = bbox.get_score()
        print 'Reward = ' + str(reward)
        
        ## Retrieve the current Q-value
        current_q = q_function[state_tuple][action]
        print 'Current Q = ' + str(current_q)
        
        ## Observe the next state (assuming there always is)
        has_next = bbox.do_action(action)
        next_state = bbox.get_state()
        next_state_tuple = get_state_tuple(next_state)
        ## Get the best q_action in the new state
        next_action = get_action(next_state_tuple, verbose=verbose, is_current=False)    
        ## Get the new Q_value
        next_q = q_function[next_state_tuple][next_action]
        ## Update the Q-function
        q_function[state_tuple][action] = (1 - alpha) * current_q + alpha * (reward + gamma * next_q)
        print 'Updated Q = ' + str(q_function[state_tuple][action])
    
    bbox.finish(verbose=True)
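Example #1 leaves q_function, get_state_tuple and get_action undefined. Below is a minimal sketch of what such helpers could look like for tabular Q-learning with an epsilon-greedy policy; the bin count, epsilon and the defaultdict-based table are assumptions for illustration, not the original author's code.

import random
from collections import defaultdict

import numpy as np

n_actions = 4                              # the Blackbox challenge exposes 4 actions
n_bins = 10                                # assumed discretisation granularity
alpha, gamma, epsilon = 0.1, 0.99, 0.05    # assumed hyperparameters

# Tabular Q-function: discretised state tuple -> array of action values
q_function = defaultdict(lambda: np.zeros(n_actions))

def get_state_tuple(state):
    # Discretise the continuous state vector into a hashable tuple of bin indices
    return tuple(np.digitize(state, np.linspace(-1, 1, n_bins)))

def get_action(state_tuple, verbose=False, is_current=True):
    # Epsilon-greedy for the step being taken, purely greedy when scoring the next state
    if is_current and random.random() < epsilon:
        action = random.randrange(n_actions)
    else:
        action = int(np.argmax(q_function[state_tuple]))
    if verbose:
        print("chose action %d" % action)
    return action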
Example #2
def do_play(bot, params, levels, runs, prngs_seed, verbosity, **kwargs):
    """
    Evaluates the bot with params on some levels.
    """
    common_printoptions()
    prngs_seed = seed_prngs(prngs_seed)

    start = clock()
    bot_class = available_bots[bot]
    params_key, params = load_params(bot, params, verbosity)[:2]
    scores = odict()
    for level in levels:
        level = load_level(level, verbosity)
        scores[level['key']] = bot_class(level, params).evaluate(runs)
        finish(verbose=verbosity > 3)
    end = clock()

    return {'date': datetime.utcnow(),
            'bot': bot,
            'params_key': params_key,
            'levels': levels,
            'runs': runs,
            'scores': scores,
            'time': end - start,
            'prngs_seed': prngs_seed}
Example #3
def run_bbox(verbose=False):
    has_next = 1
    prepare_bbox()
    #vector of the current state features
    input_var= T.dvector('in_state')
    input_var= T.reshape(input_var,(1,n_features))
    #Load net into the agent object
    agent=prepare_agent(input_var)
    attempt = lasagne.layers.get_output(agent)
    #function to do all of the stuff above
    eval_fn = theano.function([input_var], attempt,on_unused_input='ignore')
    #time to check how long it takes to run
    start = time.time()
    error=0
    steps=0
    while has_next:
        state = bbox.get_state()
        r_state= np.reshape(state,(1,n_features))
        attempt = eval_fn(r_state)
        action = np.argmax(attempt)
        steps+=1
        if steps%10000==0:
            score = bbox.get_score()
            print ("Steps: {}".format(steps))
            print ("   training loss: {}".format(error/steps))
            print ("   current score: {}".format(score))
        has_next = bbox.do_action(action)
 
    print ("Time to run: {} seconds".format(time.time()-start))
    print ("{} steps total".format(steps))
    np.savez('model.npz', *lasagne.layers.get_all_param_values(agent))
    bbox.finish(verbose=1)
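Example #3 relies on a prepare_agent() helper that builds a Lasagne network over the reshaped state. A possible sketch is below; the layer sizes are assumptions, n_features and n_actions are the module-level globals the surrounding examples already use, and the optional weight loading mirrors the np.savez('model.npz', ...) call above.

import numpy as np
import lasagne

def prepare_agent(input_var, weights_file=None):
    # Small dense network: state features in, one output per action
    net = lasagne.layers.InputLayer(shape=(1, n_features), input_var=input_var)
    net = lasagne.layers.DenseLayer(net, num_units=64,
                                    nonlinearity=lasagne.nonlinearities.rectify)
    net = lasagne.layers.DenseLayer(net, num_units=n_actions,
                                    nonlinearity=lasagne.nonlinearities.linear)
    if weights_file is not None:
        # Restore parameters saved with np.savez, in the order Lasagne expects
        with np.load(weights_file) as f:
            values = [f['arr_%d' % i] for i in range(len(f.files))]
        lasagne.layers.set_all_param_values(net, values)
    return net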
Example #4
def run_bbox(verbose=False):
    if bbox.is_level_loaded():
        bbox.reset_level()
    else:
        bbox.load_level("../levels/train_level.data", verbose=1)
    # Query the level parameters after it is loaded (or reset), so they are
    # always set; the original only filled them in on the very first load.
    n_features = bbox.get_num_of_features()
    n_actions = bbox.get_num_of_actions()
    max_time = bbox.get_max_time()

    av_table = ActionValueTable(n_features, n_actions)
    av_table.initialize(0.2)
    print av_table._params
    learner = Q(0.5, 0.1)
    learner._setExplorer(EpsilonGreedyExplorer(0.4))
    agent = LearningAgent(av_table, learner)
    environment = GameEnvironment()
    task = GameTask(environment)
    experiment = Experiment(task, agent)

    while environment.finish_flag:
        experiment.doInteractions(1)
        agent.learn()
 
    bbox.finish(verbose=1)
Example #5
def do_play(bot, params, levels, runs, prngs_seed, verbosity, **kwargs):
    """
    Evaluates the bot with params on some levels.
    """
    common_printoptions()
    prngs_seed = seed_prngs(prngs_seed)

    start = clock()
    bot_class = available_bots[bot]
    params_key, params = load_params(bot, params, verbosity)[:2]
    scores = odict()
    for level in levels:
        level = load_level(level, verbosity)
        scores[level['key']] = bot_class(level, params).evaluate(runs)
        finish(verbose=verbosity > 3)
    end = clock()

    return {
        'date': datetime.utcnow(),
        'bot': bot,
        'params_key': params_key,
        'levels': levels,
        'runs': runs,
        'scores': scores,
        'time': end - start,
        'prngs_seed': prngs_seed
    }
Example #6
def run_bbox(verbose=False):
    '''
    Runs the Blackbox challenge.
    '''
    
    has_next = True
    
    ## Prepare the environment -- load the game level
    prepare_bbox()
    
    while has_next:
        ## Get the current environment state vector
        state = bbox.get_state()
        ## Choose an action to perform at the current state
        action = get_action_by_state(state, verbose=verbose)
        ## Function do_action(action) returns False if the level
        ## is finished; otherwise, it returns True
        has_next = bbox.do_action(action)
    
    ## Save the interactions as an output CSV file
    headers = interaction_list.pop(0)
    interaction_df = pd.DataFrame(interaction_list, columns=headers)
    datetime_int = int(calendar.timegm(time.gmtime()))
    out_filename = '../output/interaction_' + str(datetime_int) + '.csv'
    interaction_df.to_csv(out_filename, index=False)
    print 'Saved to file: ' + out_filename
    
    ## When submitting a solution, make sure to call finish(), which returns the sum of points
    ## obtained during the entire simulation. This number is used as the public leaderboard score
    bbox.finish(verbose=True)
Example #7
def run_bbox():
	f_35_penalty = 0.15; k = 0; w0 = 0.13
	bbox.load_level("levels/test_level.data", verbose=0)
	has_next = True; last_score = 0
	act = -1; act_len = 0; crit_len = 150
	predict = np.zeros(2); cum_sum = np.zeros(4)
	while has_next:
		last_act = act
		state = bbox.get_state()
		predict[:2] = np.dot(lr_coefs_1,state[:-1]) + lr_free_coefs_1

		if state[35] > 0:
			cum_sum[1] = predict[0] + k
			cum_sum[2] = -predict[0] + k
		elif state[35] < 0:
			cum_sum[1] = -predict[1] + k
			cum_sum[2] = predict[1] + k
		elif state[35] == 0:
			cum_sum[1] = predict[0] + k
			cum_sum[2] = predict[1] + k

		cum_sum[0] = (cum_sum[1]+cum_sum[2])/2 + k
		cum_sum[1]-=f_35_penalty*state[35]
		cum_sum[2]+=f_35_penalty*state[35]
		if act_len > crit_len: cum_sum[last_act]-=0.0078125
		act = (w0*(np.dot(lr_coefs_0,state) + lr_free_coefs_0)/6.366 + (1-w0)*cum_sum).argmax()

		has_next = bbox.do_action(act)
		if last_act==act: act_len+=1
		else: act_len = 0

	bbox.finish(verbose=1)
Example #8
def test_bot(bot, level, make_features):
    env = BBox(level)
    while env.has_next:
        if env.get_time() % 10000 == 0:
            print str(env.get_time()) + "\t" + str(env.get_score())
        action = bot.get_action(make_features(env))
        env.do_action(action)
    bbox.finish()
    print bbox.get_score()
Example #9
def test_bot(bot, level, make_features):
    env = BBox(level)
    while env.has_next:
        if env.get_time() % 10000 == 0:
            print str(env.get_time()) + "\t" + str(env.get_score())
        action = bot.get_action(make_features(env))
        env.do_action(action)
    bbox.finish()
    print bbox.get_score()
Example #10
def run_bbox():
	has_next = 1	
	prepare_bbox()
	load_regression_coefs("reg_coefs.txt")
	
	while has_next:
		state = bbox.get_state()
		action = get_action_by_state(state)
		has_next = bbox.do_action(action)
	
	bbox.finish(verbose=1)
Example #11
def run_bbox(verbose=False):
    has_next = 1
    
    prepare_bbox()

    while has_next:
        state = bbox.get_state()
        action = get_action_by_state(state)
        has_next = bbox.do_action(action)
 
    bbox.finish(verbose=1)
Example #12
def run_bbox(verbose=False):
    has_next = 1
    
    prepare_bbox()
 
    while has_next:
        state = bbox.get_state()
        action = get_action_by_state(state)
        has_next = bbox.do_action(action)
 
    bbox.finish(verbose=1)
Example #13
def run_bbox():
    has_next = 1

    prepare_bbox()
    load_regression_coefs("reg_coefs.txt")

    while has_next:
        state = bbox.get_state()
        action = get_action_by_state(state)
        has_next = bbox.do_action(action)

    bbox.finish(verbose=1)
Example #14
def run_bbox():
	global ensamble
	has_next = 1 
	prepare_bbox()  
	ensamble=Ensemble.NN_Ensemble(n_features,4,[[36,64,4],[16,4],[16,4],[36,64,4]],n_actions)  
	ensamble.read_weights("weights")
	
	while has_next: 
		state = bbox.get_state() 
		action = get_action_by_state(state)
		has_next = bbox.do_action(action)   
		if(bbox.get_time()%10000==0): 
			print(str(bbox.get_time())+" "+str(bbox.get_score()))
	bbox.finish(verbose=1)
Example #15
def run_bbox():
	has_next = 1
	
	prepare_bbox()

	while has_next:
		best_act = calc_best_action_using_checkpoint()	
		for _ in range(100):
			has_next = bbox.do_action(best_act)

		if bbox.get_time() % 10000 == 0:
			print ("time = %d, score = %f" % (bbox.get_time(), bbox.get_score()))
 
	bbox.finish(verbose=1)
Example #16
def run_bbox():
    global ensamble
    has_next = 1
    prepare_bbox()
    ensamble = Ensemble.NN_Ensemble(
        n_features, 4, [[36, 64, 4], [16, 4], [16, 4], [36, 64, 4]], n_actions)
    ensamble.read_weights("weights")

    while has_next:
        state = bbox.get_state()
        action = get_action_by_state(state)
        has_next = bbox.do_action(action)
        if (bbox.get_time() % 10000 == 0):
            print(str(bbox.get_time()) + " " + str(bbox.get_score()))
    bbox.finish(verbose=1)
Example #17
def run_bbox():
    has_next = 1

    prepare_bbox()

    while has_next:
        best_act = calc_best_action_using_checkpoint()
        for _ in range(100):
            has_next = bbox.do_action(best_act)

        if bbox.get_time() % 10000 == 0:
            print("time = %d, score = %f" %
                  (bbox.get_time(), bbox.get_score()))

    bbox.finish(verbose=1)
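Examples #15 and #17 choose an action with calc_best_action_using_checkpoint() and then repeat it for 100 steps. A sketch of such a lookup is given below, assuming the interface exposes create_checkpoint() and load_from_checkpoint() (Example #37 already calls clear_all_checkpoints(), so some checkpoint API is available); the 100-step lookahead mirrors the repeat count in the loop above.

import interface as bbox  # the bbox module used throughout these examples (assumed import)

n_actions = 4

def calc_best_action_using_checkpoint(lookahead=100):
    # Save the current environment state so each candidate action can be tried
    checkpoint_id = bbox.create_checkpoint()
    best_action, best_score = -1, -1e9
    for action in range(n_actions):
        for _ in range(lookahead):
            bbox.do_action(action)
        if bbox.get_score() > best_score:
            best_score = bbox.get_score()
            best_action = action
        # Rewind before trying the next candidate
        bbox.load_from_checkpoint(checkpoint_id)
    return best_action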
Example #18
def run_bbox(rnet_model, train_data,
             train_level=True, verbose=True):
  """
  Run a single session of the black box training or test environments
  :param rnet_model: model with a get_action(state) method
  :param train_data: a DataSet object used to buffer each state
  :param train_level: boolean, run the training level if True
  :param verbose: boolean, display additional information if True
  :return: float, the final session score
  """
  has_next = 1
  prepare_bbox(train_level)
  train_data.clear_buffer()

  while has_next:
    step_count = bbox.get_time()
    train_data.update_buffer(bbox.get_state())
    state = train_data.get_buffer()
    action = rnet_model.get_action(state)
    has_next = bbox.do_action(action)

    if step_count % 5000 == 0 and verbose:
      print ("time = %d, score = %f" % (step_count, bbox.get_score()))

  final_score = bbox.finish(verbose=1)
  return final_score
Example #19
def run_bbox(rnet_model, train_data, train_level=True, verbose=True):
    """
  Run a single session of the black box training or test environments
  :param rnet_model: model with a get_action(state) method
  :param train_data: a DataSet object used to buffer each state
  :param train_level: boolean, run the training level if True
  :param verbose: boolean, display additional information if True
  :return: float, the final session score
  """
    has_next = 1
    prepare_bbox(train_level)
    train_data.clear_buffer()

    while has_next:
        step_count = bbox.get_time()
        train_data.update_buffer(bbox.get_state())
        state = train_data.get_buffer()
        action = rnet_model.get_action(state)
        has_next = bbox.do_action(action)

        if step_count % 5000 == 0 and verbose:
            print("time = %d, score = %f" % (step_count, bbox.get_score()))

    final_score = bbox.finish(verbose=1)
    return final_score
Example #20
def run_bbox():
    
    start_time = time.time()
    
    has_next = 1
    
    prepare_bbox()
    coefs = load_regression_coefs("star 13-best_coefs_score=2980.401123046875_sigma=0.0010000000474974513_level=train_level.txt")
    state = np.ones(n_features + 1)
 
    while has_next:
        state[:-1] = bbox.get_state()
        action = get_action_by_state(state, coefs)
        has_next = bbox.do_action(action)
 
    bbox.finish(verbose=1)

    end_time = time.time()
    print(end_time - start_time)
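Examples #10, #13 and #20 delegate to load_regression_coefs() and get_action_by_state(). A minimal sketch of that linear baseline is below, assuming the file stores one row of coefficients per action with the free (bias) term in the last column, matching the extended state vector (features plus a trailing 1) built in Example #20; Examples #10 and #13 presumably keep the coefficients in a module-level global instead of passing them in.

import numpy as np

def load_regression_coefs(filename):
    # Expected shape: (n_actions, n_features + 1); last column is the free term
    return np.loadtxt(filename)

def get_action_by_state(state, coefs):
    # `state` already carries a trailing 1, so the dot product includes the bias
    return int(np.argmax(coefs.dot(state)))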
Example #21
def run_bbox(verbose=False):
    has_next = 1
    prepare_bbox()
    # vector of the current state features
    input_var= T.matrix('in_state')
    input_var= T.reshape(input_var,(1000,n_features))
    #vector of the scores for 100 of the same action
    target_var = T.matrix('scores')
    target_var = T.reshape(target_var,(1000,n_actions))
    #Load net into the agent object
    agent=prepare_agent(input_var)
    #what the agent thinks will happen if it does each action 100 times
    attempt = lasagne.layers.get_output(agent)
    #how much the agent was wrong, and should be punished
    punish = lasagne.objectives.squared_error(attempt,target_var)
    punish = punish.mean()
    #get the parameters for updating
    params = lasagne.layers.get_all_params(agent,trainable=True)
    #update the net with the error
    teach = lasagne.updates.nesterov_momentum(punish,params,learning_rate=0.001,momentum=0.9)
    #function to do all of the stuff above
    train_fn = theano.function([input_var, target_var], punish, updates=teach,on_unused_input='ignore')
    # time to check how long it takes to run
    start = time.time()

    states, scores, loops = load_dataset('Full.txt')
    for n in range(loops):
        error=0
        steps=0
        ins = states[n:n+15]
        out = scores[n:n+15]
        action = np.argmax(out[0])
        error = train_fn(ins,out)
        if n%10000==0:
            score = bbox.get_score()
            print ("Steps: {}".format(steps))
            print ("   training loss: {}".format(error))
            print ("   current score: {}".format(score))
        has_next = bbox.do_action(action)
 
    print ("Time to run: {} seconds".format(time.time()-start))
    np.savez('model.npz', *lasagne.layers.get_all_param_values(agent))
    bbox.finish(verbose=1)
Example #22
def run_bbox(verbose=False):
    has_next = 1

    # Prepare environment - load the game level
    prepare_bbox()

    while has_next:
        # Get current environment state
        state = bbox.get_state()

        # Choose an action to perform at current step
        action = get_action_by_state(state)

        # Perform chosen action
        # Function do_action(action) returns False if level is finished, otherwise returns True.
        has_next = bbox.do_action(action)

    # Finish the game simulation, print earned reward
    # While submitting solutions, make sure that you do call finish()
    bbox.finish(verbose=1)
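The skeleton in Example #22 (and its near-copies elsewhere in this list) calls prepare_bbox() and get_action_by_state() without showing them. A minimal filling of those two stubs is sketched below, following the load/reset pattern of Example #4; the constant action in get_action_by_state is a placeholder, not a real policy.

import interface as bbox  # assumed import, as in the other examples

n_features = n_actions = -1

def prepare_bbox():
    global n_features, n_actions
    if bbox.is_level_loaded():
        bbox.reset_level()
    else:
        bbox.load_level("../levels/train_level.data", verbose=1)
    n_features = bbox.get_num_of_features()
    n_actions = bbox.get_num_of_actions()

def get_action_by_state(state):
    return 0  # placeholder: always pick action 0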
Example #23
def run_bbox(verbose=False):
    bbox.load_level("../levels/train_level.data", verbose=True)

    states, actions, scores, rewards = [], [], [], []
    with open('utility_models.pkl', 'rb') as f:
        utility_models = pickle.load(f)

    step = 0
    has_next = 1
    while has_next:
        step += 1
        state = bbox.get_state()
        action = np.random.choice(n_actions)
        utilities = [m.predict([state]) for m in utility_models]
        action = np.argmax(utilities)
        # Do action and bookkeeping
        has_next = bbox.do_action(action)
        states.append(np.array(state))
        actions.append(action)
        score = bbox.get_score()
        rewards.append(score if not scores else (score - scores[-1]))
        scores.append(score)
        if verbose and step % 10000 == 0:
            print(step, score)

    i = 1
    get_outdir = 'run_{}'.format
    outdir = get_outdir(i)
    while os.path.exists(outdir):
        i += 1
        outdir = get_outdir(i)
    os.mkdir(outdir)
    print('saving to {}'.format(outdir))
    scores = np.array(scores, dtype=np.float32)
    scores.tofile(os.path.join(outdir, 'scores'))
    actions = np.array(actions, dtype=np.int8)
    actions.tofile(os.path.join(outdir, 'actions'))
    states = np.array(states, dtype=np.float32)
    states.tofile(os.path.join(outdir, 'states'))

    bbox.finish(verbose=True)
Example #24
def run_bbox(verbose=False):
    has_next = 1

    # Prepare environment - Load the game level
    prepare_bbox()

    while has_next:
        # Get current environment state
        state = bbox.get_state()

        # Choose an action to perform at current step
        action = get_action_by_state(state)

        # Perform chosen action
        # Function do_action(action) returns False if level is finished,
        # Otherwise returns True
        has_next = bbox.do_action(action)

    # Finish the game simulation, print earned reward
    # While submitting solutions make sure you do call finish()
    bbox.finish(verbose=1)
Example #25
def run_bbox(verbose=False):
    has_next = 1
    prepare_bbox()
    # vector of the current state features

    input_var= T.dvector('in_state')
    input_var= T.reshape(input_var,(memtime,n_f+2))

    #Load net into the agent object
    agent=prepare_agent(input_var)

    #What the agent thinks the best choice will be
    attempt = lasagne.layers.get_output(agent)[0]

    #function to do all of the stuff above
    test_fn = theano.function([input_var], attempt)
    # time to check how long it takes to run
    memory = np.zeros(shape=(memtime,n_f+2))
    start = time.time()
    consequence=0
    steps=0
    while has_next:
        memory = forget(memory)
        state = bbox.get_state()
        memory[0][:-2]=state
        choices = test_fn(memory)
        action = np.argmax(choices)
        has_next = bbox.do_action(action)
        score = bbox.get_score()
        consequence=score-consequence
        memory[0][-2:] = [action,consequence]
        steps+=1
        if steps%10000==0:
            score = bbox.get_score()
            print ("Steps: {}".format(steps))
            print ("   current score: {}".format(score))

    print ("Final Score: {}".format(score))
    print ("Time to run: {} seconds".format(time.time()-start))
    bbox.finish(verbose=1)
Example #26
def run_bbox(verbose=False):
    has_next = 1

    prepare_bbox()
    #vector of the current state features
    input_var= T.dvector('in_state')
    input_var= T.reshape(input_var,(1,n_features))
    #vector of the scores for 100 of the same action
    target_var = T.dvector('scores')
    target_var = T.reshape(target_var,(1,n_actions))
    #Load net into the agent object
    agent=prepare_agent(input_var)
    #what the agent thinks will happen if it does each action 100 times
    attempt = lasagne.layers.get_output(agent)
    #how much the agent was wrong, and should be punished
    punish = lasagne.objectives.squared_error(attempt,target_var)
    punish = punish.mean()
    #get the parameters for updating
    params = lasagne.layers.get_all_params(agent,trainable=True)
    #update the net with the error
    teach = lasagne.updates.nesterov_momentum(punish,params,learning_rate=.1,momentum=.9)
    #function to do all of the stuff above
    train_fn = theano.function([input_var, target_var], punish, updates=teach,on_unused_input='ignore')
    #time to check how long it takes to run
    start = time.time()
    while has_next:
        state = bbox.get_state()
        r_state= np.reshape(state,(1,n_features))
        scores = get_all_scores(state)
        r_scores = np.reshape(scores,(1,n_actions))
        action = np.argmax(scores)  # numeric argmax; T.argmax would return a symbolic variable
        error = train_fn(r_state,r_scores)
        print (error)
        has_next = bbox.do_action(action)
 
    print ("Time to run: {} seconds".format(time.time()-start))
    bbox.finish(verbose=1)
Example #27
def learn_bbox(rnet_model, train_data, update_inc=5000,
               lookup_inc=250, seed_data=False):
  """
  Add training instances to train_data from a single run-through of a
  bbox session.
  :param rnet_model: model object with get_lreg_action and get_action
    methods
  :param train_data: DataSet object used to buffer states and append
    new training instances
  :param update_inc: int, number of steps between each nnet model update
  :param lookup_inc: int, number of forward action lookup steps
  :param seed_data: boolean, if True, best_action is taken to be the action
    returned by the lreg model.
  :return: int, the number of action errors, or differences between
    actions produced by the rnet_model and the ideal or seed model.
  """
  has_next = 1
  error_count = 0
  rand_count = 0
  rand_idx = rand_n

  prepare_bbox()
  # For each new state in the session, add it to the data set's state
  # buffer so that historical states are included in a commit event
  train_data.clear_buffer()
  current_state = bbox.get_state()
  train_data.update_buffer(current_state)

  while has_next:
    # If all random values have been used, generate a new batch
    if rand_idx >= (rand_n-1):
      rand_vals = numpy.random.random_sample(size=(rand_n))
      rand_idx = 0

    step_count = bbox.get_time()
    # Get the next action from the model based on the current set of
    # buffered states
    action = rnet_model.get_action(train_data.get_buffer())

    # Every update_inc steps train the model's network with newly
    # acquired training data
    if step_count % update_inc == 0:
      rn_model.run_training(train_data, max_steps=update_nnet, restore=True)
      error_count = 0
      rand_count = 0
    # If the random value is less than or equal to the sample
    # probability, sample the current session state and determine the
    # best action, adding it to the training set if necessary
    elif rand_vals[rand_idx] <= sample_prob:
      if seed_data:
        best_action = rnet_model.get_lreg_action(current_state)
        score_delta = 0.1
      else:
        best_action, score_delta = action_lookup(rnet_model,
                                                 train_data, lookup_inc)
      if action != best_action:
        train_data.commit_buffer(best_action, score_delta)
        error_count += 1
      rand_count += 1
    # Add random variation to the session by performing a random action
    # if less than or equal to perturb probability
    if rand_vals[rand_idx+1] <= perturb_prob:
      action = numpy.random.randint(0,4)
      step_inc = numpy.random.randint(rand_min, rand_max)
      for _ in xrange(step_inc):
        has_next = bbox.do_action(action)
        current_state = bbox.get_state()
        train_data.update_buffer(current_state)
    else:
      has_next = bbox.do_action(action)
      current_state = bbox.get_state()
      train_data.update_buffer(current_state)

    rand_idx += 2
    if step_count % 5000 == 0:
      print ("time = %d, score = %f" % (step_count, bbox.get_score()))
      print ("errors = %d, samples = %d" % (error_count, rand_count))
      #rn_model.print_stats()

  bbox.finish(verbose=1)
  return error_count
Example #28
def finish():
    """Exits"""
    ci.finish()
    exit()
Example #29
#!/usr/bin/env python3

"""
A minimal bot player.

Loads the level and params and lets the bot act.
"""
from interface import (get_max_time, get_num_of_actions, get_num_of_features,
                       finish, load_level)
from numpy import get_include, load
from pyximport import install

install(setup_args={'include_dirs': get_include()}, reload_support=True)
from bot_wrapper import do_act

if __name__ == '__main__':
    load_level('../levels/train_level.data', verbose=1)
    level = {
        'steps': get_max_time(),
        'actions': get_num_of_actions(),
        'features': get_num_of_features()
    }
    params = dict(load('params.npz'))
    do_act(level, params)
    finish(verbose=1)
Example #30
def run_bbox(verbose=False):
    prepare_bbox()

    # vector of the current state features
    input_var= T.matrix('memory')
    input_var= T.reshape(input_var,(memtime,n_f+2))

    #Score after the agent makes its choice
    reality = T.scalar('consequence')

    #Load net into the agent object
    agent=prepare_agent(input_var)

    #What the agent thinks the best choice will be
    attempt = T.max(lasagne.layers.get_output(agent))

    #how much the agent should be rewarded/punished
    reward = lasagne.objectives.squared_error(attempt,reality)

    #get the parameters for updating
    params = lasagne.layers.get_all_params(agent,trainable=True)

    #update the net with the error
    teach = lasagne.updates.nesterov_momentum(reward,params,learning_rate=0.1,momentum=0.9)

    #compile the training function; the observed score change serves as the target
    train_fn = theano.function([input_var,reality], reward, updates=teach,on_unused_input='ignore')

    # time to check how long it takes to run
    memory = np.zeros(shape=(memtime,n_f+2))
    start = time.time()
    scores_per_epoch = np.zeros(epochs)
    for epoch in range(epochs):
        e_time = time.time() #time for this epoch
        has_next = 1 #looping variable, state of bbox

        #initialize tracking variables
        consequence=0
        self_assessment=0
        steps=0
        trust=0.00
        while has_next:

            #Updating memory matrix, forgetting a state, making room
            memory = forget(memory) 
            state = bbox.get_state()
            #upload new state, with no score or action chosen
            memory[0][:-2] = state
            if rand.random() > trust:
                action = rand.randint(0,n_a-1) #if trust is still too low, random action
            else:
                #Otherwise, let the agent decide: evaluate the net on the memory matrix
                choices = lasagne.layers.get_output(agent, memory).eval()
                action = np.argmax(choices) #pick action agent thinks is best
            
            #do it, and find out the consequences (if the score improved or went down)
            has_next = bbox.do_action(action)
            consequence = bbox.get_score()-consequence 
            
            #train on choices just made and memory
            memory[0][-2:]=[action,consequence]
            train_fn(memory,consequence) #train based on the score change
            
            #updating for next loop
            self_assessment += consequence
            steps += 1

            #occasionally check in on progress
            if steps%10000==0:
                trust = trust+.01
                score = bbox.get_score()
                print ("Epoch: {}".format(epoch))
                print ("Steps: {}".format(steps))
                print ("   self assessment: {}".format(self_assessment))
                print ("   trust: {}".format(trust))
                print ("   current score: {}".format(score))
        #report on model quality on previous epoch
        score = bbox.get_score()
        print ("Epoch: {}".format(epoch))
        print ("Final Score: {}".format(score))
        print ("Time to Run: {} minutes".format((time.time()-e_time)/60))
        scores_per_epoch[epoch] = score

        #reset box for next epoch
        bbox.reset_level()

    print ("All scores per epoch: ")
    print (scores_per_epoch)
    print ("Time to run: {} hours".format((time.time()-start)/3600))
    np.savez('model_mem.npz', *lasagne.layers.get_all_param_values(agent))
    bbox.finish(verbose=1)
Example #31
    def finish(self, verbose):
        bbox.finish(verbose=verbose)
Example #32
    keras_model.add(Dropout(0.5))
    keras_model.add(Dense(agent_env.n_actions, activation="softmax"))
    agent_model = KerasModel(keras_model)

    # experience memory
    agent_mem = ExperienceReplay(memory_length=memory_len)

    # compile agent
    agent = DiscreteAgent(agent_model,
                          agent_mem,
                          epsilon=lambda *args: epsilon)
    # SGD optimizer + MSE cost + MAX policy = Q-learning as we know it
    #agent.compile(optimizer=RMSprop(lr=0.001), loss='mse', policy_rule='max')
    agent.compile(optimizer=RMSprop(lr=0.001),
                  loss='categorical_crossentropy',
                  policy_rule='max')

    # train agent
    agent.learn(agent_env, epoch=epochs, batch_size=batch_size, gamma=gamma)

    # save trained model and weights
    pre = "model-04-slow"
    with open(pre + ".json", 'w') as f:
        json.dump(keras_model.to_json(), f)
    keras_model.save_weights(pre + ".h5", overwrite=True)

    # test agent
    #agent.play(agent_env, epoch=100)

    bbox.finish(verbose=1)
Example #33
def finish():
    interface.finish()
Example #34
            if np.random.rand() <= epsilon:
                action = np.random.randint(0, num_actions, size=1)[0]
            else:
                q = model.predict(input_tm1)
                action = np.argmax(q[0])

            # apply action, get rewards and new state
            input_t, reward, game_over = env.act(action)
            if reward >= 0.:
                win_cnt += 1

            # store experience
            exp_replay.remember([input_tm1, action, reward, input_t], game_over)

            # adapt model
            inputs, targets = exp_replay.get_batch(model, batch_size=batch_size)

            loss += model.train_on_batch(inputs, targets)[0]
        print("Epoch {:03d}/999 | Loss {:.4f} | Win count {}".format(e, loss, win_cnt))

    # save trained model and weights
    with open("model2.json", 'w') as f:
        json.dump(keras_model.to_json(), f)
    keras_model.save_weights("model2.h5", overwrite=True)

    # test agent
    #agent.play(agent_env, epoch=100)

    bbox.finish(verbose=1)

Example #35
#!/usr/bin/env python3
"""
A minimal bot player.

Loads the level and params and lets the bot act.
"""
from interface import (get_max_time, get_num_of_actions, get_num_of_features,
                       finish, load_level)
from numpy import get_include, load
from pyximport import install

install(setup_args={'include_dirs': get_include()}, reload_support=True)
from bot_wrapper import do_act

if __name__ == '__main__':
    load_level('../levels/train_level.data', verbose=1)
    level = {
        'steps': get_max_time(),
        'actions': get_num_of_actions(),
        'features': get_num_of_features()
    }
    params = dict(load('params.npz'))
    do_act(level, params)
    finish(verbose=1)
Example #36
def learn_bbox(rnet_model,
               train_data,
               update_inc=5000,
               lookup_inc=250,
               seed_data=False):
    """
  Add training instances to train_data from a single run-through of a
  bbox session.
  :param rnet_model: model object with get_lreg_action and get_action
    methods
  :param train_data: DataSet object used to buffer states and append
    new training instances
  :param update_inc: int, number of steps between each nnet model update
  :param lookup_inc: int, number of forward action lookup steps
  :param seed_data: boolean, if True, best_action is taken to be the action
    returned by the lreg model.
  :return: int, the number of action errors, or differences between
    actions produced by the rnet_model and the ideal or seed model.
  """
    has_next = 1
    error_count = 0
    rand_count = 0
    rand_idx = rand_n

    prepare_bbox()
    # For each new state in the session, add it to the data set's state
    # buffer so that historical states are included in a commit event
    train_data.clear_buffer()
    current_state = bbox.get_state()
    train_data.update_buffer(current_state)

    while has_next:
        # If all random values have been used, generate a new batch
        if rand_idx >= (rand_n - 1):
            rand_vals = numpy.random.random_sample(size=(rand_n))
            rand_idx = 0

        step_count = bbox.get_time()
        # Get the next action from the model based on the current set of
        # buffered states
        action = rnet_model.get_action(train_data.get_buffer())

        # Every update_inc steps train the model's network with newly
        # acquired training data
        if step_count % update_inc == 0:
            rn_model.run_training(train_data,
                                  max_steps=update_nnet,
                                  restore=True)
            error_count = 0
            rand_count = 0
        # If the random value is less than or equal to the sample
        # probability, sample the current session state and determine the
        # best action, adding it to the training set if necessary
        elif rand_vals[rand_idx] <= sample_prob:
            if seed_data:
                best_action = rnet_model.get_lreg_action(current_state)
                score_delta = 0.1
            else:
                best_action, score_delta = action_lookup(
                    rnet_model, train_data, lookup_inc)
            if action != best_action:
                train_data.commit_buffer(best_action, score_delta)
                error_count += 1
            rand_count += 1
        # Add random variation to the session by performing a random action
        # if less than or equal to perturb probability
        if rand_vals[rand_idx + 1] <= perturb_prob:
            action = numpy.random.randint(0, 4)
            step_inc = numpy.random.randint(rand_min, rand_max)
            for _ in xrange(step_inc):
                has_next = bbox.do_action(action)
                current_state = bbox.get_state()
                train_data.update_buffer(current_state)
        else:
            has_next = bbox.do_action(action)
            current_state = bbox.get_state()
            train_data.update_buffer(current_state)

        rand_idx += 2
        if step_count % 5000 == 0:
            print("time = %d, score = %f" % (step_count, bbox.get_score()))
            print("errors = %d, samples = %d" % (error_count, rand_count))
            #rn_model.print_stats()

    bbox.finish(verbose=1)
    return error_count
Example #37
def run_bbox(verbose=False):
    prepare_bbox()
    # vector of the current state features
    input_var= T.tensor3('memory')
    input_var= T.reshape(input_var,(memtime,1,n_f+2))

    #Score after the agent makes its choice
    reality = T.vector('score_diffs')

    #Load net into the agent object
    agent=prepare_agent(input_var)

    #What the agent thinks its best choice is for this event
    evaluation = lasagne.layers.get_output(agent)[0]

    #how much the agent should be rewarded/punished
    reward = lasagne.objectives.squared_error(evaluation,reality)
    reward = reward.mean()

    #get the parameters for updating
    params = lasagne.layers.get_all_params(agent,trainable=True)

    #update the net with the error
    teach = lasagne.updates.nesterov_momentum(reward,params,learning_rate=0.01,momentum=0.9)

    #A function to get the agent's choice of what to try this time
    decide_fn = theano.function([input_var],evaluation)

    #function to do all of the stuff above
    train_fn = theano.function([input_var,reality], reward, updates=teach,on_unused_input='ignore')

    # time to check how long it takes to run
    start = time.time()
    for epoch in range(epochs):
        memory = np.zeros(shape=(memtime,1,n_f+2))
        e_time = time.time() #time for this epoch
        has_next = 1 #looping variable, state of bbox
        #initialize tracking variables
        consequence=error=0
        steps=0
        trust=0.00+.02*epoch
        good=0
        while has_next:
            #Updating memory matrix, forgetting a state, making room
            memory = forget(memory) 
            state = bbox.get_state()
            #get best action based on 100 step checkpoint method
            actuals = get_all_score_diffs(state)
            #upload new state, with no score or action chosen
            memory[0][0][:-2] = state
            if rand.random()>trust:
                action = rand.randint(0,n_a-1) #if trust is too low still, random action
            else:
                choices = decide_fn(memory) #Otherwise, let the agent decide. 
                action = np.argmax(choices) #pick action agent thinks is best


            if action == np.argmax(actuals):
                good = good+1
            #do it, and find out the consequences (if the score improved or went down)
            has_next = bbox.do_action(action)
            #find the consequence (change in score)
            score = bbox.get_score()
            consequence=score-consequence
            #train on choices just made and memory
            memory[0][0][-2:]=[action,consequence]

            error += train_fn(memory,actuals) #train based on the score change

            #updating for next loop
            steps += 1

            #occasionally check in on progress
            if steps%10000==0:
                score = bbox.get_score()
                print ("Epoch: {}".format(epoch))
                print ("Steps: {}".format(steps))
                print ("   current trust: {}".format(trust))
                print ("   avg error: {}".format(error/steps))
                print ("   bad choices: {}%".format(100-float(good)/100))
                print ("   current score: {}".format(score))
                if trust<.95:
                    trust = trust+.02
                bbox.clear_all_checkpoints()
                ch=ra=good=0

        #report on model quality on previous epoch
        score = bbox.get_score()
        with open("epoch_data.txt","a") as f:
            f.write("Epoch: {}    Final Score: {}    Average Error: {}    Time to Run: {} min\n".format(epoch,score,error/steps,(time.time()-e_time)/60))
        #save model parameters
        np.savez('model_LSTM_cost.npz', *lasagne.layers.get_all_param_values(agent))
        #reset box for next epoch
        if(epoch<epochs-1):
            bbox.reset_level()

    print ("Time to run: {} hours".format((time.time()-start)/3600))
    bbox.finish(verbose=1)
Example #38
def finish():
    interface.finish()
Example #39
    def finish(self, verbose):
        bbox.finish(verbose=verbose)
Example #40
def run_bbox(verbose=False, epsilon=0.1, gamma=0.99, action_repeat=4, update_frequency=4, batchSize=32, buffer=100000, load_weights=False, save_weights=False):
    has_next = 1
    
    # Prepare environment - load the game level
    prepare_bbox()
    
    update_frequency_cntr = 0
    replay = []
    h=0
    if load_weights:
        model.load_weights('my_model_weights.h5')
        model_prim.load_weights('my_model_weights.h5')
    #stores tuples of (S, A, R, S')
 
    while has_next:
        # Get current environment state
        state = copy.copy(bbox.get_state())
        prev_reward = copy.copy(bbox.get_score())
        
        #Run the Q function on S to get predicted reward values on all the possible actions
        qval = model.predict(state.reshape(1,n_features), batch_size=1)
 
        # Choose an action to perform at current step
        if random.random() < epsilon: #choose random action or best action
            if random.random() < 0.5:
                action = np.random.randint(0,n_actions) #assumes 4 different actions
            else: # Use checkpoints to prime network with good actions
                action_range=50 #random.randint(1,200)
                action = calc_best_action_using_checkpoint(action_range=action_range)
                #for _ in range(action_range):
                #    has_next = bbox.do_action(action)
        else: #choose best action from Q(s,a) values
            action = (np.argmax(qval))


        # Perform chosen action, observe new state S'
        # Function do_action(action) returns False if level is finished, otherwise returns True.
        for a in range(action_repeat):
            has_next = bbox.do_action(action)
        new_state = copy.copy(bbox.get_state())
        reward = copy.copy(bbox.get_score()) - prev_reward
        #reward = 1.0 if reward > 0.0 else -1.0 #this gives better than random when combined with a small network

        #Experience replay storage
        if (len(replay) < buffer): #if buffer not filled, add to it
            replay.append((state, action, reward, new_state))
        else: #if buffer full, overwrite old values
            if (h < (buffer-1)):
                h += 1
            else:
                h = 0
            replay[h] = (state, action, reward, new_state)

            #randomly sample our experience replay memory
            minibatch = random.sample(replay, batchSize)
            X_train = []
            y_train = []
            for memory in minibatch:
                #Get max_Q(S',a)
                old_state, action, reward, new_state = memory
                old_qval = model.predict(old_state.reshape(1,n_features), batch_size=1)
                newQ = model.predict(new_state.reshape(1,n_features), batch_size=1)
                maxQ = np.max(newQ)
                y = np.zeros((1,n_actions))
                y[:] = old_qval[:]
                if has_next == 1: #non-terminal state
                    update = (reward + (gamma * maxQ))
                else: #terminal state
                    update = reward
                y[0][action] = update
                X_train.append(old_state)
                y_train.append(y.reshape(n_actions,))

            X_train = np.array(X_train)
            y_train = np.array(y_train)
            # update the weights of a copy of the network
            model_prim.fit(X_train, y_train, batch_size=batchSize, nb_epoch=1, verbose=0)
            if update_frequency_cntr >= update_frequency:
                prim_weights = model_prim.get_weights()
                print('model update')
                model.set_weights(prim_weights)
                update_frequency_cntr = 0
            update_frequency_cntr += 1

        if bbox.get_time() % 500000 == 0:
            print ("time = %d, score = %f" % (bbox.get_time(), bbox.get_score()))


    # Finish the game simulation, print earned reward and save weights
    if save_weights:
        model_prim.save_weights('my_model_weights.h5', overwrite=True)
    bbox.finish(verbose=1)
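Example #40 expects two globals, model and model_prim (the online network and the slowly refreshed copy). A sketch of how they might be built with the same Keras 1.x API the example already uses (nb_epoch, RMSprop) is below; the layer sizes and optimizer settings are assumptions.

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import RMSprop

def build_q_network(n_features, n_actions):
    # Fully connected Q-network: state features in, one Q-value per action out
    net = Sequential()
    net.add(Dense(128, input_dim=n_features, activation='relu'))
    net.add(Dense(128, activation='relu'))
    net.add(Dense(n_actions, activation='linear'))
    net.compile(optimizer=RMSprop(lr=0.001), loss='mse')
    return net

# n_features / n_actions come from the environment, as in the other examples
model = build_q_network(n_features, n_actions)
model_prim = build_q_network(n_features, n_actions)
model_prim.set_weights(model.get_weights())  # start both networks identical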
Example #41
def run_bbox(verbose=False):
    bbox.load_level("../levels/train_level.data", verbose=True)

    states, actions, scores, rewards = [], [], [], []
    utility_models = [
        SGDRegressor(learning_rate='constant',
                     #penalty='elasticnet',
                     ) for _ in range(n_actions)
    ]
    zero_utilities = np.zeros([n_actions])

    n_past_act = 1
    n_past_st = 0  # in addition to current
    discount = 0.9
    random_steps = 10000

    step = 0
    has_next = 1
    while has_next:
        step += 1
        state = bbox.get_state()
        utilities = zero_utilities
        # Choose action using current utility_models
        if step > random_steps:
            clf_state = np.concatenate(states[-n_past_st:] + [state]) \
                        if n_past_st else state
            try:
                utilities = np.array(
                    [m.predict([clf_state])[0] for m in utility_models])
            except NotFittedError:
                pass
        #utilities -= utilities.min()
        #p = None if np.isclose(utilities, 0).all() else \
        #    utilities / utilities.sum()
        if np.random.rand() < 0.1 or step <= random_steps:
            action = np.random.choice(n_actions)
        else:
            action = np.argmax(utilities)
        # Do action and bookkeeping
        has_next = bbox.do_action(action)
        states.append(np.array(state))
        actions.append(action)
        score = bbox.get_score()
        rewards.append(score if not scores else (score - scores[-1]))
        scores.append(score)
        # Train classifiers
        if len(rewards) >= n_past_act + n_past_st:
            total_reward = sum(r * np.power(discount, i)
                               for i, r in enumerate(rewards[-n_past_act:]))
            if n_past_act == 1:
                clf_state = np.concatenate(states[-(n_past_act + n_past_st):])
            else:
                clf_state = np.concatenate(
                    states[-(n_past_act + n_past_st):-n_past_act + 1])
            utility_models[actions[-n_past_act]].partial_fit([clf_state],
                                                             [total_reward])
        if verbose and step % 1000 == 0:
            print(step, score)

    i = 1
    get_outdir = 'run_{}'.format
    outdir = get_outdir(i)
    while os.path.exists(outdir):
        i += 1
        outdir = get_outdir(i)
    os.mkdir(outdir)
    print('saving to {}'.format(outdir))
    scores = np.array(scores, dtype=np.float32)
    scores.tofile(os.path.join(outdir, 'scores'))
    actions = np.array(actions, dtype=np.int8)
    actions.tofile(os.path.join(outdir, 'actions'))
    states = np.array(states, dtype=np.float32)
    states.tofile(os.path.join(outdir, 'states'))

    bbox.finish(verbose=True)
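Example #41 dumps states, actions and scores into a run_N directory, while Example #23 loads ready-made per-action models from utility_models.pkl. A hypothetical offline step connecting the two is sketched below; the file names match what Example #41 writes, but the one-step-reward target and the pickling step are assumptions.

import os
import pickle

import numpy as np
from sklearn.linear_model import SGDRegressor

def fit_utility_models(outdir, n_features, n_actions):
    # Read back the flat arrays written with tofile() in Example #41
    states = np.fromfile(os.path.join(outdir, 'states'),
                         dtype=np.float32).reshape(-1, n_features)
    actions = np.fromfile(os.path.join(outdir, 'actions'), dtype=np.int8)
    scores = np.fromfile(os.path.join(outdir, 'scores'), dtype=np.float32)
    rewards = np.diff(np.concatenate(([0.0], scores)))  # per-step reward

    # One regressor per action, trained on the immediate reward of that action
    models = [SGDRegressor(learning_rate='constant') for _ in range(n_actions)]
    for a in range(n_actions):
        mask = actions == a
        if mask.any():
            models[a].partial_fit(states[mask], rewards[mask])

    with open('utility_models.pkl', 'wb') as f:
        pickle.dump(models, f)
    return models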