Python QNetwork.get_loss_and_updates примеры использования

Язык программирования: Python

Пространство имен/Пакет: qnetwork

Класс/Тип: QNetwork

Метод/Функция: get_loss_and_updates

Примеров на hotexamples.com: 5

Python QNetwork.get_loss_and_updates - 5 примеров найдено. Это лучшие примеры Python кода для qnetwork.QNetwork.get_loss_and_updates, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

QNetwork(20)

parameters(7)

eval(4)

load_state_dict(4)

get_loss_and_updates(3)

state_dict(3)

train(3)

frozen_layers(2)

get_action(2)

get_extended_state(2)

cuda(1)

forward(1)

fprop(1)

get_params(1)

get_weight_magnitude(1)

Пример #1

Показать файл

Файл: test_nnet_on_mdps.py Проект: strategic-zjc/Data-Science-Term-Project

def test_nnet_numberline_mdp(n_episodes,
                             exploration_prob=0.9,
                             learning_rate=.0005,
                             target_freeze_period=500):
    """
    :description: trains a small QNetwork on a GridSearchMDP using
        epsilon-greedy action selection and returns the reward collected
        in each episode

    :type n_episodes: int
    :param n_episodes: number of episodes to run

    :type exploration_prob: float
    :param exploration_prob: initial probability of choosing a random
        action; it is lowered by a fixed amount after every step and is
        never allowed to drop below 0.25

    :type learning_rate: float
    :param learning_rate: learning rate handed to the QNetwork constructor

    :type target_freeze_period: int
    :param target_freeze_period: number of steps (counted across all
        episodes) between refreshes of the frozen copy of the layers;
        0 or None disables the refresh

    :returns: list with one entry per episode holding the sum of the
        step rewards, each scaled by the running discount
    """

    reduce_explore = 0.0001
    min_exploration_prob = 0.25
    size = 20.
    mdp = GridSearchMDP(size)
    actions = mdp.actions(mdp.startState())

    print(actions)

    # symbolic inputs of the training function
    features = T.dvector('features')
    action = T.lscalar('action')
    reward = T.dscalar('reward')
    next_features = T.dvector('next_features')

    n_vis = 2  # for chain mdp
    hidden_layer_1 = HiddenLayer(n_vis=n_vis,
                                 n_hid=len(actions),
                                 layer_name='hidden',
                                 activation='tanh')
    output_layer = OutputLayer(layer_name='out', activation='relu')
    layers = [hidden_layer_1, output_layer]
    mlp = QNetwork(layers,
                   discount=mdp.discount(),
                   learning_rate=learning_rate)
    loss, updates = mlp.get_loss_and_updates(features, action, reward,
                                             next_features)

    train_model = theano.function(
        [theano.Param(features, default=np.zeros(MAX_FEATURES_TEST)),
         theano.Param(action, default=0),
         theano.Param(reward, default=0),
         theano.Param(next_features, default=np.zeros(MAX_FEATURES_TEST))],
        outputs=loss,
        updates=updates,
        mode='FAST_RUN')

    def scaled_position(state):
        # the network is fed both state coordinates divided by mdp.n
        return [state[0] / mdp.n, state[1] / mdp.n]

    rewards = []
    counter = 0
    for episode in range(n_episodes):
        curDiscount = mdp.discount()
        totalReward = 0
        cur_state = mdp.startState()
        print(cur_state)
        while not mdp.isEnd(cur_state):
            counter += 1
            # refresh the frozen target layers with the period requested by
            # the caller (this used to be hard-coded to 1000 steps, which
            # silently ignored target_freeze_period)
            if target_freeze_period and counter % target_freeze_period == 0:
                mlp.frozen_layers = copy.deepcopy(mlp.layers)

            if counter % 100 == 0:
                print('cur_state: {}'.format(cur_state))

            if random.random() < exploration_prob:
                # explore: pick a random action
                chosen_action = random.choice(actions)
                action_index = actions.index(chosen_action)
            else:
                # exploit: pick the action with the largest network output
                action_index = T.argmax(
                    mlp.fprop(scaled_position(cur_state))).eval()
                chosen_action = actions[action_index]
                if counter % 100 == 0:
                    print('action: {}'.format(chosen_action))

            transitions = mdp.succAndProbReward(cur_state, chosen_action)
            if len(transitions) == 0:
                break

            # Choose a random transition
            i = sample([prob for newState, prob, _ in transitions])
            newState, prob, step_reward = transitions[i]

            # the reward is scaled by the running discount before it is
            # both accumulated and used for training
            step_reward *= curDiscount
            totalReward += step_reward
            curDiscount *= mdp.discount()

            # the action index (not the action itself) is what the
            # network is trained on
            train_model(scaled_position(cur_state), action_index,
                        step_reward, scaled_position(newState))

            cur_state = newState
            exploration_prob = max(exploration_prob - reduce_explore,
                                   min_exploration_prob)
        rewards.append(totalReward)

        print('*' * 30)
        print('episode: {} ended with score: {}'.format(episode, rewards[-1]))
        print('avg reward: {}'.format(np.mean(rewards[-25:])))
        print('explore: {}'.format(exploration_prob))
        print('*' * 30)
        print('\n')

    return rewards

Пример #2

Показать файл

Файл: simulate.py Проект: wulfebw/reinforcement_learning

def simulate_symbolic_online_RL_algorithm(mdp, num_episodes, max_iterations):
    """
    :description: trains a QNetwork on the given mdp with epsilon-greedy
        action selection, storing every transition in a replay memory and
        training on tuples sampled from it

    :type mdp: object
    :param mdp: the mdp to learn; this function uses its actions(),
        succAndProbReward(), start_state and discount members

    :type num_episodes: int
    :param num_episodes: number of episodes to run; the exploration
        probability and learning rate are decayed linearly over this many
        episodes towards MIN_EXPLORATION_PROB and MIN_LEARNING_RATE

    :type max_iterations: int
    :param max_iterations: maximum number of steps per episode

    :returns: tuple (total_rewards, total_losses) with one entry per episode
    """

    real_actions = mdp.actions(None)
    # indices into real_actions; the network works on these indices
    actions = np.arange(len(real_actions))

    # these theano variables are used to define the symbolic input of the network
    features = T.dvector('features')
    action = T.lscalar('action')
    reward = T.dscalar('reward')
    next_features = T.dvector('next_features')
    learning_rate_symbol = T.dscalar('learning_rate')

    h1 = HiddenLayer(n_vis=INPUT_DIM, n_hid=HIDDEN_DIM, layer_name='h1')
    h2 = HiddenLayer(n_vis=HIDDEN_DIM, n_hid=HIDDEN_DIM, layer_name='h2')
    h3 = HiddenLayer(n_vis=HIDDEN_DIM, n_hid=HIDDEN_DIM, layer_name='h3')
    h4 = HiddenLayer(n_vis=HIDDEN_DIM, n_hid=OUTPUT_DIM, layer_name='h4')

    layers = [h1, h2, h3, h4]
    learning_rate = 1e-2
    explorationProb = .4
    regularization_weight = 1e-5
    momentum_rate = 9e-1
    qnetwork = QNetwork(layers, discount=mdp.discount,
                        momentum_rate=momentum_rate,
                        regularization_weight=regularization_weight)

    exploration_reduction = (explorationProb - MIN_EXPLORATION_PROB) / num_episodes
    learning_rate_reduction = (learning_rate - MIN_LEARNING_RATE) / num_episodes

    # this call gets the symbolic output of the network along with the parameter updates
    loss, updates = qnetwork.get_loss_and_updates(
        features, action, reward, next_features, learning_rate_symbol)

    print('Building Training Function...')
    # this defines the theano symbolic function used to train the network
    # 1st argument is a list of inputs, here the symbolic variables above
    # 2nd argument is the symbolic output expected
    # 3rd argument is the dictionary of parameter updates
    # 4th argument is the compilation mode
    train_model = theano.function(
        [theano.Param(features, default=np.zeros(INPUT_DIM)),
         theano.Param(action, default=0),
         theano.Param(reward, default=0),
         # next_features is a state fed to the same network as features,
         # so its default must have the input dimension (it used to be
         # sized with HIDDEN_DIM)
         theano.Param(next_features, default=np.zeros(INPUT_DIM)),
         learning_rate_symbol],
        outputs=loss,
        updates=updates,
        mode='FAST_RUN')

    get_action = theano.function([features], qnetwork.get_action(features))

    total_rewards = []
    total_losses = []
    weight_magnitudes = []

    print('Starting Training...')
    replay_mem = replay_memory.ReplayMemory()
    for episode in range(num_episodes):

        state = np.array(mdp.start_state)
        total_reward = 0
        total_loss = 0
        # keep `iteration` defined for the discounting below even when the
        # loop body never runs (max_iterations == 0)
        iteration = 0
        for iteration in range(max_iterations):

            # epsilon-greedy selection of an action index
            if random.random() < explorationProb:
                action_index = random.choice(actions)
            else:
                action_index = get_action(state)

            real_action = real_actions[action_index]
            transitions = mdp.succAndProbReward(state, real_action)

            # no successors: the episode is over
            if len(transitions) == 0:
                break

            # Choose a random transition
            i = sample([prob for newState, prob, _ in transitions])
            newState, prob, step_reward = transitions[i]
            newState = np.array(newState)

            # the stored reward is clipped to [-1, 1]; the episode total
            # below uses the unclipped reward
            sars_tuple = (state, action_index, np.clip(step_reward, -1, 1), newState)
            replay_mem.store(sars_tuple)

            # train on more samples once the replay memory has filled up
            num_samples = 5 if replay_mem.isFull() else 1
            for _ in range(num_samples):
                random_train_tuple = replay_mem.sample()
                sample_state = random_train_tuple[0]
                sample_action = random_train_tuple[1]
                sample_reward = random_train_tuple[2]
                sample_new_state = random_train_tuple[3]

                total_loss += train_model(sample_state, sample_action,
                                          sample_reward, sample_new_state,
                                          learning_rate)

            total_reward += step_reward
            state = newState

        # linear decay of exploration and learning rate after every episode
        explorationProb -= exploration_reduction
        learning_rate -= learning_rate_reduction

        total_rewards.append(total_reward * mdp.discount ** iteration)
        total_losses.append(total_loss)
        weight_magnitude = qnetwork.get_weight_magnitude()
        weight_magnitudes.append(weight_magnitude)

        print('episode: {}\t\t loss: {}\t\t reward: {}\t\tweight magnitude: {}'.format(
            episode, round(total_loss, 2), total_reward, weight_magnitude))

    # return the list of rewards attained
    return total_rewards, total_losses

Пример #3

Показать файл

Файл: test_nnet_on_mdps.py Проект: switchfootsid/playing_atari

def test_nnet_numberline_mdp(n_episodes,  exploration_prob=0.9, learning_rate=.0005, target_freeze_period=500):
    """
    :description: trains a small QNetwork on a GridSearchMDP using
        epsilon-greedy action selection and returns the reward collected
        in each episode

    :type n_episodes: int
    :param n_episodes: number of episodes to run

    :type exploration_prob: float
    :param exploration_prob: initial probability of choosing a random
        action; it is lowered by a fixed amount after every step and is
        never allowed to drop below 0.25

    :type learning_rate: float
    :param learning_rate: learning rate handed to the QNetwork constructor

    :type target_freeze_period: int
    :param target_freeze_period: number of steps (counted across all
        episodes) between refreshes of the frozen copy of the layers;
        0 or None disables the refresh

    :returns: list with one entry per episode holding the sum of the
        step rewards, each scaled by the running discount
    """

    reduce_explore = 0.0001
    min_exploration_prob = 0.25
    size = 20.
    mdp = GridSearchMDP(size)
    actions = mdp.actions(mdp.startState())

    print(actions)

    # symbolic inputs of the training function
    features = T.dvector('features')
    action = T.lscalar('action')
    reward = T.dscalar('reward')
    next_features = T.dvector('next_features')

    n_vis = 2 # for chain mdp
    hidden_layer_1 = HiddenLayer(n_vis=n_vis, n_hid=len(actions), layer_name='hidden', activation='tanh')
    output_layer = OutputLayer(layer_name='out', activation='relu')
    layers = [hidden_layer_1, output_layer]
    mlp = QNetwork(layers, discount=mdp.discount(), learning_rate=learning_rate)
    loss, updates = mlp.get_loss_and_updates(features, action, reward, next_features)

    train_model = theano.function(
                    [theano.Param(features, default=np.zeros(MAX_FEATURES_TEST)),
                    theano.Param(action, default=0),
                    theano.Param(reward, default=0),
                    theano.Param(next_features, default=np.zeros(MAX_FEATURES_TEST))],
                    outputs=loss,
                    updates=updates,
                    mode='FAST_RUN')

    def scaled_position(state):
        # the network is fed both state coordinates divided by mdp.n
        return [state[0]/mdp.n, state[1]/mdp.n]

    rewards = []
    counter = 0
    for episode in range(n_episodes):
        curDiscount = mdp.discount()
        totalReward = 0
        cur_state = mdp.startState()
        print(cur_state)
        while not mdp.isEnd(cur_state):
            counter += 1
            # refresh the frozen target layers with the period requested by
            # the caller (this used to be hard-coded to 1000 steps, which
            # silently ignored target_freeze_period)
            if target_freeze_period and counter % target_freeze_period == 0:
                mlp.frozen_layers = copy.deepcopy(mlp.layers)

            if counter % 100 == 0:
                print('cur_state: {}'.format(cur_state))

            if random.random() < exploration_prob:
                # explore: pick a random action
                chosen_action = random.choice(actions)
                action_index = actions.index(chosen_action)
            else:
                # exploit: pick the action with the largest network output
                action_index = T.argmax(mlp.fprop(scaled_position(cur_state))).eval()
                chosen_action = actions[action_index]
                if counter % 100 == 0:
                    print('action: {}'.format(chosen_action))

            transitions = mdp.succAndProbReward(cur_state, chosen_action)
            if len(transitions) == 0:
                break

            # Choose a random transition
            i = sample([prob for newState, prob, _ in transitions])
            newState, prob, step_reward = transitions[i]

            # the reward is scaled by the running discount before it is
            # both accumulated and used for training
            step_reward *= curDiscount
            totalReward += step_reward
            curDiscount *= mdp.discount()

            # the action index (not the action itself) is what the
            # network is trained on
            train_model(scaled_position(cur_state), action_index, step_reward, scaled_position(newState))

            cur_state = newState
            exploration_prob = max(exploration_prob - reduce_explore, min_exploration_prob)
        rewards.append(totalReward)

        print('*' * 30)
        print('episode: {} ended with score: {}'.format(episode, rewards[-1]))
        print('avg reward: {}'.format(np.mean(rewards[-25:])))
        print('explore: {}'.format(exploration_prob))
        print('*' * 30)
        print('\n')

    return rewards

Пример #4

Показать файл

def train(gamepath, n_episodes,  display_screen,  record_weights,  reduce_exploration_prob_amount, n_frames_to_skip, exploration_prob, verbose, discount, learning_rate, load_weights, frozen_target_update_period, use_replay_mem):
    """
    :description: trains an agent to play a game 

    :type gamepath: string 
    :param gamepath: path to the binary of the game to be played

    :type n_episodes: int 
    :param n_episodes: number of episodes of the game on which to train

    display_screen : whether or not to display the screen of the game 
    
    record_weights : whether or not to save the weights of the network
    
    reduce_exploration_prob_amount : amount to reduce exploration prob each episode
                                     to not reduce exploration_prob set to 0
    
    n_frames_to_skip : how frequently to determine a new action to use
    
    exploration_prob : probability of choosing a random action
    
    verbose : whether or not to print information about the run periodically
    
    discount : discount factor used in learning 
    
    learning_rate : the scaling factor for the sgd update
    
    load_weights : whether or not to load weights for the network (set the files directly below)
    
    frozen_target_update_period : the number of episodes between resetting the target of the network

    use_replay_mem : whether or not to store each transition in a ReplayMemory and
                     train on tuples sampled from it instead of on the current transition

    :returns: list with the total reward of each episode
    """

    # load the ale interface to interact with
    ale = ALEInterface()
    ale.setInt('random_seed', 42)

    # display/recording settings, doesn't seem to work currently
    recordings_dir = './recordings/breakout/'
    # previously "USE_SDL"
    if display_screen:
        if sys.platform == 'darwin':
            import pygame
            pygame.init()
            ale.setBool('sound', False) # Sound doesn't work on OSX
            #ale.setString("record_screen_dir", recordings_dir);
        elif sys.platform.startswith('linux'):
            ale.setBool('sound', True)
        ale.setBool('display_screen', True)

    ale.loadROM(gamepath)
    ale.setInt("frame_skip", n_frames_to_skip)
    # real actions for breakout are [0,1,3,4]
    real_actions = ale.getMinimalActionSet()

    # use a list of actions [0,1,2,3] to index into the array of real actions
    actions = np.arange(len(real_actions))

    # these theano variables are used to define the symbolic input of the network
    features = T.dvector('features')
    action = T.lscalar('action')
    reward = T.dscalar('reward')
    next_features = T.dvector('next_features')

    # load weights by file name
    # currently must be loaded by individual hidden layers
    if load_weights:
        hidden_layer_1 = file_utils.load_model('weights/hidden0_replay.pkl')
        hidden_layer_2 = file_utils.load_model('weights/hidden1_replay.pkl')
    else:
        # defining the hidden layer network structure
        # the n_hid of a prior layer must equal the n_vis of a subsequent layer
        # for q-learning the output layer must be of len(actions)
        hidden_layer_1 = HiddenLayer(n_vis=NNET_INPUT_DIMENSION, 
            n_hid=NNET_INPUT_DIMENSION, layer_name='hidden1', activation='relu')
        hidden_layer_2 = HiddenLayer(n_vis=NNET_INPUT_DIMENSION, 
            n_hid=NNET_INPUT_DIMENSION, layer_name='hidden2', activation='relu')
    # the third hidden layer is always created fresh, even when weights are loaded
    hidden_layer_3 = HiddenLayer(n_vis=NNET_INPUT_DIMENSION, 
            n_hid=len(actions), layer_name='hidden3', activation='relu') 
    # the output layer is currently necessary when using tanh units in the
    # hidden layer in order to prevent a theano warning
    # currently the relu unit setting of the hidden and output layers is leaky w/ alpha=0.01
    output_layer = OutputLayer(layer_name='output', activation='relu')

    # pass a list of layers to the constructor of the network (here called "mlp")
    layers = [hidden_layer_1, hidden_layer_2, hidden_layer_3, output_layer]
    qnetwork = QNetwork(layers, discount=discount, learning_rate=learning_rate)

    # this call gets the symbolic output of the network
    # along with the parameter updates expected
    loss, updates = qnetwork.get_loss_and_updates(features, action, reward, next_features)

    # this defines the theano symbolic function used to train the network
    # 1st argument is a list of inputs, here the symbolic variables above
    # 2nd argument is the symbolic output expected
    # 3rd argument is the dictionary of parameter updates
    # 4th argument is the compilation mode
    train_model = theano.function(
                    [theano.Param(features, default=np.zeros(NNET_INPUT_DIMENSION)),
                    theano.Param(action, default=0),
                    theano.Param(reward, default=0),
                    theano.Param(next_features, default=np.zeros(NNET_INPUT_DIMENSION))],
                    outputs=loss,
                    updates=updates,
                    mode='FAST_RUN')

    sym_action = qnetwork.get_action(features)
    get_action = theano.function([features], sym_action)

    # some containers for collecting information about the training processes 
    rewards = []
    losses = []
    best_reward = 4
    sequence_examples = []
    sampled_examples = []

    # the preprocessor and feature extractor to use
    preprocessor = screen_utils.RGBScreenPreprocessor()
    feature_extractor = feature_extractors.NNetOpenCVBoundingBoxExtractor(max_features=MAX_FEATURES)

    if use_replay_mem:
        replay_mem = ReplayMemory()
    # main training loop, each episode is a full playthrough of the game
    for episode in range(n_episodes):

        # this implements the frozen target component of the network
        # by setting the frozen layers of the network to a copy of the current layers
        if episode % frozen_target_update_period == 0:
            qnetwork.frozen_layers = copy.deepcopy(qnetwork.layers)


        # some variables for collecting information about this particular run of the game
        # (from here on the names action/reward/loss/features/next_features hold
        # runtime values; the symbolic variables were only needed to compile
        # the theano functions above)
        total_reward = 0
        action = 1
        counter = 0
        reward = 0
        loss = 0
        previous_param_0 = None

        # lives is used for the reward heuristic of subtracting 1 from the reward 
        # when we lose a life
        lives = ale.lives()

        # the initial state of the screen and state
        screen = np.zeros((preprocessor.dim, preprocessor.dim, preprocessor.channels))
        state = { "screen" : screen, "objects" : None, "prev_objects": None, "features": np.zeros(MAX_FEATURES)}
        
        # start the actual play through of the game
        while not ale.game_over():
            counter += 1

            # get the current features, which is the representation of the state provided to 
            # the "agent" (here just the network directly)
            features = state["features"]

            # epsilon greedy action selection (note that exploration_prob is reduced by
            # reduce_exploration_prob_amount after every game)
            if random.random() < exploration_prob: 
                action = random.choice(actions)
            else:
                # to choose an action from the network, we fprop 
                # the current state and take the argmax of the output
                # layer (i.e., the action that corresponds to the 
                # maximum q value)
                action = get_action(features)

            # take the action and receive the reward
            reward += ale.act(real_actions[action])

            # reward heuristic: penalize the loss of a life by subtracting 1
            if ale.lives() < lives: 
                 lives = ale.lives()
                 reward -= 1


            # get the next screen, preprocess it, initialize the next state
            next_screen = ale.getScreenRGB()
            next_screen = preprocessor.preprocess(next_screen)
            next_state = {"screen": next_screen, "objects": None, "prev_objects": state["objects"]}

            # get the features for the next state
            next_features = feature_extractor(next_state, action=None)

            if use_replay_mem:
                sars_tuple = (features, action, reward, next_features)
                replay_mem.store(sars_tuple)
                num_samples = 5 if replay_mem.isFull() else 1
                for _ in range(num_samples):
                    random_train_tuple = replay_mem.sample()
                    loss += train_model(*random_train_tuple)

                # collect for pca; only the 100 most recent rows are kept
                # both feature sequences are converted with list() so that the
                # row is a true concatenation whether the extractor returns a
                # list or a numpy array (list + ndarray is not concatenation)
                sequence_examples.append(list(sars_tuple[0]) + [sars_tuple[1]]
                         + [sars_tuple[2]] + list(sars_tuple[3]))
                sequence_examples = sequence_examples[-100:]
                # only the last sampled tuple of this step is recorded
                sampled_examples.append(list(random_train_tuple[0]) + [random_train_tuple[1]]
                        + [random_train_tuple[2]] + list(random_train_tuple[3]))
                sampled_examples = sampled_examples[-100:]
            else:
                # call the train model function
                loss += train_model(features, action, reward, next_features)
            # prepare for the next loop through the game
            next_state["features"] = next_features
            state = next_state
                
            # weird counter value to avoid interaction with any other counter
            # loop that might be added, not necessary right now
            if verbose and counter % PRINT_TRAINING_INFO_PERIOD == 0:
                print('*' * 15 + ' training information ' + '*' * 15) 
                print('episode: {}'.format(episode))
                print('reward: \t{}'.format(reward))
                print('avg reward: \t{}'.format(np.mean(rewards)))
                print('avg reward (last 25): \t{}'.format(np.mean(rewards[-NUM_EPISODES_AVERAGE_REWARD_OVER:])))
                print('action: \t{}'.format(real_actions[action]))
                print('exploration prob: {}'.format(exploration_prob))
                
                param_info = [(p.eval(), p.name) for p in qnetwork.get_params()]
                for index, (val, name) in enumerate(param_info):
                    if previous_param_0 is None and index == 0:
                        previous_param_0 = val
                    print('parameter {} value: \n{}'.format(name, val))
                    if index == 0:
                        diff = val - previous_param_0
                        print('difference from previous param {}: \n{}'.format(name, diff))

                print('features: \t{}'.format(features))
                print('next_features: \t{}'.format(next_features))

                # the pca diagnostics need collected examples, which only exist
                # when the replay memory is in use; scaling an empty array fails
                if sequence_examples and sampled_examples:
                    scaled_sequence = preprocessing.scale(np.array(sequence_examples))
                    scaled_sampled = preprocessing.scale(np.array(sampled_examples))
                    pca = PCA()
                    _ = pca.fit_transform(scaled_sequence)
                    print('variance explained by first component for sequence: {}%'.format(
                        pca.explained_variance_ratio_[0] * 100))
                    _ = pca.fit_transform(scaled_sampled)
                    print('variance explained by first component for sampled: {}%'.format(
                        pca.explained_variance_ratio_[0] * 100))

                print('*' * 52)
                print('\n')

            # collect info and total reward and also reset the reward to 0 if we reach this point
            total_reward += reward
            reward = 0
        # collect stats from this game run    
        losses.append(loss)
        rewards.append(total_reward)
    
        # if we got a best reward, inform the user 
        if total_reward > best_reward:
            best_reward = total_reward
            print("best reward!: {}".format(total_reward))

        # record the weights if record_weights=True
        # must record the weights of the individual layers
        # only the first two hidden layers are saved here
        if episode != 0 and episode % RECORD_WEIGHTS_PERIOD == 0 and record_weights:
            file_utils.save_rewards(rewards)
            file_utils.save_model(qnetwork.layers[0], 'weights/hidden0_{}.pkl'.format(episode))
            file_utils.save_model(qnetwork.layers[1], 'weights/hidden1_{}.pkl'.format(episode))

        # reduce exploration policy over time
        if exploration_prob > MINIMUM_EXPLORATION_EPSILON:
            exploration_prob -= reduce_exploration_prob_amount
        
        # inform user of how the episode went and reset the game
        print('episode: {} ended with score: {}\tloss: {}'.format(episode, rewards[-1], losses[-1]))
        ale.reset_game()

    # return the list of rewards attained
    return rewards

Пример #5

Показать файл

def simulate_symbolic_online_RL_algorithm(mdp, num_episodes, max_iterations):
    """
    :description: trains a QNetwork on the given mdp with epsilon-greedy
        action selection, storing every transition in a replay memory and
        training on tuples sampled from it

    :type mdp: object
    :param mdp: the mdp to learn; this function uses its actions(),
        succAndProbReward(), start_state and discount members

    :type num_episodes: int
    :param num_episodes: number of episodes to run; the exploration
        probability and learning rate are decayed linearly over this many
        episodes towards MIN_EXPLORATION_PROB and MIN_LEARNING_RATE

    :type max_iterations: int
    :param max_iterations: maximum number of steps per episode

    :returns: tuple (total_rewards, total_losses) with one entry per episode
    """

    real_actions = mdp.actions(None)
    # indices into real_actions; the network works on these indices
    actions = np.arange(len(real_actions))

    # these theano variables are used to define the symbolic input of the network
    features = T.dvector('features')
    action = T.lscalar('action')
    reward = T.dscalar('reward')
    next_features = T.dvector('next_features')
    learning_rate_symbol = T.dscalar('learning_rate')

    h1 = HiddenLayer(n_vis=INPUT_DIM, n_hid=HIDDEN_DIM, layer_name='h1')
    h2 = HiddenLayer(n_vis=HIDDEN_DIM, n_hid=HIDDEN_DIM, layer_name='h2')
    h3 = HiddenLayer(n_vis=HIDDEN_DIM, n_hid=HIDDEN_DIM, layer_name='h3')
    h4 = HiddenLayer(n_vis=HIDDEN_DIM, n_hid=OUTPUT_DIM, layer_name='h4')

    layers = [h1, h2, h3, h4]
    learning_rate = 1e-2
    explorationProb = .4
    regularization_weight = 1e-5
    momentum_rate = 9e-1
    qnetwork = QNetwork(layers,
                        discount=mdp.discount,
                        momentum_rate=momentum_rate,
                        regularization_weight=regularization_weight)

    exploration_reduction = (explorationProb -
                             MIN_EXPLORATION_PROB) / num_episodes
    learning_rate_reduction = (learning_rate -
                               MIN_LEARNING_RATE) / num_episodes

    # this call gets the symbolic output of the network along with the parameter updates
    loss, updates = qnetwork.get_loss_and_updates(features, action, reward,
                                                  next_features,
                                                  learning_rate_symbol)

    print('Building Training Function...')
    # this defines the theano symbolic function used to train the network
    # 1st argument is a list of inputs, here the symbolic variables above
    # 2nd argument is the symbolic output expected
    # 3rd argument is the dictionary of parameter updates
    # 4th argument is the compilation mode
    train_model = theano.function(
        [
            theano.Param(features, default=np.zeros(INPUT_DIM)),
            theano.Param(action, default=0),
            theano.Param(reward, default=0),
            # next_features is a state fed to the same network as features,
            # so its default must have the input dimension (it used to be
            # sized with HIDDEN_DIM)
            theano.Param(next_features, default=np.zeros(INPUT_DIM)),
            learning_rate_symbol
        ],
        outputs=loss,
        updates=updates,
        mode='FAST_RUN')

    get_action = theano.function([features], qnetwork.get_action(features))

    total_rewards = []
    total_losses = []
    weight_magnitudes = []

    print('Starting Training...')
    replay_mem = replay_memory.ReplayMemory()
    for episode in range(num_episodes):

        state = np.array(mdp.start_state)
        total_reward = 0
        total_loss = 0
        # keep `iteration` defined for the discounting below even when the
        # loop body never runs (max_iterations == 0)
        iteration = 0
        for iteration in range(max_iterations):

            # epsilon-greedy selection of an action index
            if random.random() < explorationProb:
                action_index = random.choice(actions)
            else:
                action_index = get_action(state)

            real_action = real_actions[action_index]
            transitions = mdp.succAndProbReward(state, real_action)

            # no successors: the episode is over
            if len(transitions) == 0:
                break

            # Choose a random transition
            i = sample([prob for newState, prob, _ in transitions])
            newState, prob, step_reward = transitions[i]
            newState = np.array(newState)

            # the stored reward is clipped to [-1, 1]; the episode total
            # below uses the unclipped reward
            sars_tuple = (state, action_index, np.clip(step_reward, -1, 1),
                          newState)
            replay_mem.store(sars_tuple)

            # train on more samples once the replay memory has filled up
            num_samples = 5 if replay_mem.isFull() else 1
            for _ in range(num_samples):
                random_train_tuple = replay_mem.sample()
                sample_state = random_train_tuple[0]
                sample_action = random_train_tuple[1]
                sample_reward = random_train_tuple[2]
                sample_new_state = random_train_tuple[3]

                total_loss += train_model(sample_state, sample_action,
                                          sample_reward, sample_new_state,
                                          learning_rate)

            total_reward += step_reward
            state = newState

        # linear decay of exploration and learning rate after every episode
        explorationProb -= exploration_reduction
        learning_rate -= learning_rate_reduction

        total_rewards.append(total_reward * mdp.discount**iteration)
        total_losses.append(total_loss)
        weight_magnitude = qnetwork.get_weight_magnitude()
        weight_magnitudes.append(weight_magnitude)

        print('episode: {}\t\t loss: {}\t\t reward: {}\t\tweight magnitude: {}'.format(
            episode, round(total_loss, 2), total_reward, weight_magnitude))

    # return the list of rewards attained
    return total_rewards, total_losses