Example #1
def runIsolation(FEN, features):
    
    # make it black to play ("b") and complete the FEN with the
    # castling, en passant, and move-counter fields.
    FEN = FEN + " " + "b" + " - - 0 1"
    board = chess.Board(FEN)
#    expectVal = syzygy.probe_wdl(board)
    print(board)
    print(features(board))
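A note on the FEN string built above: python-chess expects six space-separated fields (piece placement, side to move, castling rights, en passant square, halfmove clock, fullmove number). A minimal hedged sketch with a hypothetical position:

import chess

# "b" makes it Black to play; use "w" for White to play.
fen = "8/8/8/4k3/8/8/4K3/4Q3" + " b - - 0 1"
board = chess.Board(fen)
print(board.turn == chess.BLACK)  # True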
Example #2
def main(opts):

    param = parse_parameters(opts)  # get parameters from command

    xtrain, xdev, test = read_datsets(
        param)  # load the datasets as lists of document objects
    feats = features(
        xtrain
    )  # creating an object from the class features to initialize important global variables such as lexicons and training ds
    #select_features(xtrain, feats)  # feature selection and importance

    train_pipeline = construct_pipeline(xtrain, feats, param)
    model_file = train_model(xtrain, train_pipeline)  # training the model

    dev_pipeline = construct_pipeline(xdev, feats, param)
    tested_dev = test_model(xdev, 'dev', dev_pipeline,
                            model_file)  # testing the model with the dev ds

    test_pipeline = construct_pipeline(test, feats, param)
    tested_test = test_model(test, 'test', test_pipeline,
                             model_file)  # testing the model with the test ds

    logging.info('evaluating the model using dev ds ...')
    evaluate_model(tested_dev,
                   param['classification'])  # evaluating the model on the dev ds
    logging.info('evaluating the model using test ds ...')
    evaluate_model(tested_test,
                   param['classification'])  # evaluating the model on the test ds
Example #3
def main(arguments):
    # param = parse_parameters() # get parameters from command
    display_params(arguments)

    datasets = [read_datsets(x, arguments['multi']) for x in arguments['input']] # loading datasets as lists of document objects
    features_list = [x for x in ['tfidf', 'char_grams', 'lexical', 'style', 'readability', 'nela'] if arguments[x]]

    maxabs_scaler = MaxAbsScaler()

    features_instance = features(datasets[0])

    for i in range(len(datasets)):
        X = compute_features(datasets[i], features_instance,
                             tfidf=arguments['tfidf'],
                             char_grams=arguments['char_grams'],
                             lexical=arguments['lexical'],
                             style=arguments['style'],
                             readability=arguments['readability'],
                             nela=arguments['nela']
                             )
        if i == 0:  # It is the first iteration and we assume this is training
            X = maxabs_scaler.fit_transform(X)
        else:
            X = maxabs_scaler.transform(X)

        dump_feature_file(X, get_output_file_name(arguments['input'][i], features_list) )
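The scaler above is fitted only on the first dataset (assumed to be the training set) and then applied unchanged to the rest. A minimal sketch of that fit-once, transform-many pattern with scikit-learn's MaxAbsScaler:

import numpy as np
from sklearn.preprocessing import MaxAbsScaler

scaler = MaxAbsScaler()
X_train = np.array([[1.0, -2.0], [2.0, 4.0]])
X_dev = np.array([[0.5, 8.0]])

X_train_scaled = scaler.fit_transform(X_train)  # learn per-feature max-abs on train only
X_dev_scaled = scaler.transform(X_dev)          # reuse the same scaling on dev/test
print(X_train_scaled)
print(X_dev_scaled)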
Example #4
def main(opts):
    list_sources_in_ds('../data/train.dist.converted.txt')
    now = datetime.datetime.now().strftime("%I:%M:%S on %p-%B-%d-%Y")
    logging.info("experiment started at " + now)
    param = parse_parameters(opts)  # get parameters from command
    selected_sources = param['sources'].split(',')
    prop_sources, nonprop_sources = list_sources_in_ds(param['train'])
    random_sources = nonprop_sources.keys()

    create_dataset(param['train'], selected_sources, random_sources,
                   param['new'], param['fix'])
    logging.info('a new training dataset created at :' + param['new'])

    new_train, dev, test = read_new_datsets(
        param)  # load the datasets as lists of document objects
    feats = features(
        new_train
    )  # creating an object from the class features to initialize important global variables such as lexicons and training ds

    train_pipeline = construct_pipeline(new_train, feats, param)
    model_file = train_model(new_train, train_pipeline)  # training the model
    logging.info('Training finished ')
    dev_pipeline = construct_pipeline(dev, feats, param)
    tested_dev = test_model(dev, 'dev', dev_pipeline,
                            model_file)  # testing the model with the dev ds

    test_pipeline = construct_pipeline(test, feats, param)
    tested_test = test_model(test, 'test', test_pipeline, model_file)

    logging.info('evaluating the model on dev ds ...')
    custom_evaluate(tested_dev, selected_sources)
    logging.info('evaluating the model on test ds ...')
    custom_evaluate(tested_test, selected_sources)
Example #5
def train_model(ds_file, param):
    logging.info('████████████████  𝕋 ℝ 𝔸 𝕀 ℕ 𝕀 ℕ 𝔾  ████████████████')
    train = load_myds(ds_file)
    feats = features(train)
    features_pipeline = construct_pipeline(
        train, feats, param
    )  # call the methods that extract features to initialize transformers

    model = LogisticRegression(
        penalty='l2', class_weight='balanced'
    )  # maximum-entropy classifier with L2 regularization
    logging.info("Computing features")
    X = features_pipeline.transform(
        [doc.text for doc in train]
    )  # calling transform method of each transformer in the features pipeline to transform data into vectors of features
    pickle.dump(X, open("../data/model/transformed_train.pickle", "wb"))
    X = maxabs_scaler.fit_transform(X)  # maxabs_scaler is assumed to be a module-level MaxAbsScaler
    Y = [doc.gold_label for doc in train]
    logging.info('fitting the model according to given data ...')
    model.fit(X, Y)
    now = datetime.datetime.now().strftime("%I:%M%S%p-%B-%d-%Y")
    model_file_name = '../data/model/' + now + 'maxentr_model.pkl'
    joblib.dump(model, model_file_name)  # persist the trained model
    logging.info('model pickled at : ' + model_file_name)
    return model_file_name
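train_model persists the classifier with joblib.dump and returns the file name. A self-contained round-trip sketch of reloading and predicting (with a toy model standing in for the pipeline output above):

import joblib
from sklearn.linear_model import LogisticRegression

# Toy model in place of the maximum-entropy model trained above.
clf = LogisticRegression().fit([[0.0], [1.0]], [0, 1])
joblib.dump(clf, "maxentr_model.pkl")       # what train_model does with model_file_name
restored = joblib.load("maxentr_model.pkl")
print(restored.predict([[0.7]]))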
Example #6
def feat_dict(pos_feat, text):
    """
    Geeft het dictionary van alle features toegepast in een text.
    """
    feats = {}
    # materialize the n-grams: nltk's ngrams() returns a one-shot generator
    bigrams = list(ngrams(word_tokenize(text), 2))
    trigrams = list(ngrams(word_tokenize(text), 3))

    for feat in pos_feat:
        feats[feat] = features(feat, text, bigrams, [], [])
    return feats
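nltk's ngrams() returns a one-shot generator, which is why the snippet above materializes it with list() before reusing it across features. A quick illustration:

from nltk import word_tokenize
from nltk.util import ngrams

tokens = word_tokenize("the quick brown fox")
bigrams = list(ngrams(tokens, 2))
print(bigrams)  # [('the', 'quick'), ('quick', 'brown'), ('brown', 'fox')]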
Example #7
def pca(wvd, face):
    data = array(list(features(wvd, face)))[3:]
    zscore = lambda v: (v - v.mean()) / v.std()
    for i in xrange(data.shape[1]):
        data[:, i] = zscore(data[:, i])
    eval, evec = eig(cov(data.T))
    idx = argsort(eval)  # ascending
    proj = lambda i: dot(evec[:, i], data.T)
    return map(
        proj,
        reversed(idx))  # descending (first corresponds to largest eigenvalue)
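For comparison, the same steps (z-score each column, eigendecompose the covariance, project onto eigenvectors in order of descending eigenvalue) as a Python 3 / NumPy-only sketch, separate from the original module:

import numpy as np

def pca_projections(data):
    # z-score each column, then eigendecompose the covariance matrix
    z = (data - data.mean(axis=0)) / data.std(axis=0)
    evals, evecs = np.linalg.eigh(np.cov(z.T))
    order = np.argsort(evals)[::-1]              # descending eigenvalues
    return [evecs[:, i].dot(z.T) for i in order]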
Example #8
def formal_df(ts, df):
    timestamp = df['timestamp'].values
    label = df['label'].values
    data = []
    f = features()
    for i in range(1, len(label)):
        if i % 2000 == 0:
            print(i * 100.0 / len(label), "%")
        tmp = f.get_features(ts, timestamp[i])
        tmp.append(label[i])
        data.append(tmp)
    return data
Example #9
def classify(wvd, face, minscore=1090, minlen=122):
    data = array(list(features(wvd, face)))
    for row in data:
        fid = row[1]
        wid = row[2]
        w = wvd[fid][wid]
        l = integrate_path_length(w)
        s = median_score(w)
        if l > minlen or s > minscore:
            row[0] = 1
        else:
            row[0] = 0
    return data
Example #10
def autotraj(wvd, face, data=None):
    """ 
  Uses kmeans to partition whisker segments into two sets: 
  
  class 1. high scoring and long
  class 2. low scoring and short

  The median number of class 1 segments in a frame is expected to correspond
  with the number of interesting whiskers; that is, the trajectories worth
  following.

  Following classification, a simple scheme is used to label segments in frames
  with the correct number of class 1 segments.

  If `data` is not provided, it will be computed from the whisker segments.
  The `data` table is an array with a row for each whisker segment consisting of
  a number of columns (3 + number of measurements).  The first column is a
  classification label, the second is the frame id, and the third is the 
  whisker id.  The `classification label` is overwritten here.

  Returns: traj,data

    'traj': a trajectories dictionary (see ui.whiskerdata.load_trajectories)
    'data': a table of shape measurements for each whisker segment

  Example:

    >>> import summary
    >>> w,movie = summary.load('data/my_movie.seq', 'data/my_movie[heal].whiskers')
    >>> traj,data = summary.autotraj(w, face='left')
    >>> summary.plot_summary_data(w,traj,data)

  """
    if data is None:
        data = array(list(features(wvd, face)))
    traj = _simpletraj(data, face)
    return traj, data
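_simpletraj and the k-means step are not shown in this snippet. A hedged sketch of the two-class partition the docstring describes (long, high-scoring segments vs short, low-scoring ones), here using scikit-learn's KMeans on hypothetical (score, length) columns:

import numpy as np
from sklearn.cluster import KMeans

# Hypothetical (median score, path length) measurements, one row per whisker segment.
measurements = np.array([[1500.0, 200.0], [90.0, 15.0], [1400.0, 180.0], [80.0, 12.0]])
labels = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(measurements)
print(labels)  # one cluster = long/high-scoring segments, the other = short/low-scoring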
Example #11
def main(algorithm):
    """
    Generate a new game
    The function below generates a new chess board with King, Queen and Enemy King pieces randomly assigned so that they
    do not cause any threats to each other.
    s: a size_board x size_board matrix filled with zeros and three numbers:
    1 = location of the King
    2 = location of the Queen
    3 = location of the Enemy King
    p_k2: 1x2 vector specifying the location of the Enemy King, the first number represents the row and the second
    number the column
    p_k1: same as p_k2 but for the King
    p_q1: same as p_k2 but for the Queen
    """
    s, p_k2, p_k1, p_q1 = generate_game(size_board)
    # print("matrix ->",s, "p_k2 ->",p_k2, "p_k1 ->", p_k1, "p_q1 ->",p_q1)
    """
    Possible actions for the Queen are the eight directions (down, up, right, left, up-right, down-left, up-left, 
    down-right) multiplied by the number of squares that the Queen can cover in one movement which equals the size of 
    the board - 1
    """
    possible_queen_a = (s.shape[0] - 1) * 8
    """
    Possible actions for the King are the eight directions (down, up, right, left, up-right, down-left, up-left, 
    down-right)
    """
    possible_king_a = 8

    # Total number of actions for Player 1 = actions of King + actions of Queen
    N_a = possible_king_a + possible_queen_a
    """
    Possible actions of the King
    This functions returns the locations in the chessboard that the King can go
    dfK1: a size_board x size_board matrix filled with 0 and 1.
          1 = locations that the king can move to
    a_k1: a 8x1 vector specifying the allowed actions for the King (marked with 1): 
          down, up, right, left, down-right, down-left, up-right, up-left
    """
    dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s)
    # print("1 = locations that the king can move to ->",dfK1,"|", "a_k1: a 8x1 vector specifying the allowed actions for the King ->", a_k1)
    """
    Possible actions of the Queen
    Same as the above function but for the Queen. Here we have 8*(size_board-1) possible actions as explained above
    """
    dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s)
    """
    Possible actions of the Enemy King
    Same as the above function but for the Enemy King. Here we have 8 possible actions as explained above
    """
    dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1)
    """
    Compute the features
    x is a Nx1 vector computing a number of input features based on which the network should adapt its weights  
    with board size of 4x4 this N=50
    """
    x = features(p_q1, p_k1, p_k2, dfK2, s, check)

    n_input_layer = 50  # Number of neurons of the input layer. Moves enemy king can make, checked
    n_hidden_layer = 200  # Number of neurons of the hidden layer
    n_output_layer = 32  # Number of neurons of the output layer.

    W1 = np.random.uniform(0, 1, (n_hidden_layer, n_input_layer))
    W1 = np.divide(W1,
                   np.matlib.repmat(np.sum(W1, 1)[:, None], 1, n_input_layer))

    W2 = np.random.uniform(0, 1, (n_output_layer, n_hidden_layer))
    W2 = np.divide(W2,
                   np.matlib.repmat(np.sum(W2, 1)[:, None], 1, n_hidden_layer))
    # print(W1, W2)
    bias_W1 = np.ones((n_hidden_layer, ))
    bias_W2 = np.ones((n_output_layer, ))

    # Network Parameters
    epsilon_0 = 0.2  #epsilon for the e-greedy policy
    beta = 0.00005  #epsilon discount factor
    gamma = 0.85  #SARSA Learning discount factor
    eta = 0.0035  #learning rate
    N_episodes = 100000  #Number of games, each game ends when we have a checkmate or a draw
    alpha = 1 / 10000
    if algorithm == "sarsa":
        sarsa = 1
        qlearning = 0
    else:
        sarsa = 0
        qlearning = 1
    ###  Training Loop  ###

    # Directions: down, up, right, left, down-right, down-left, up-right, up-left
    # Each row specifies a direction,
    # e.g. for down we need to add +1 to the current row and +0 to current column
    map = np.array([[1, 0], [-1, 0], [0, 1], [0, -1], [1, 1], [1, -1], [-1, 1],
                    [-1, -1]])

    R_save = np.zeros([N_episodes, 1])
    N_moves_save = np.zeros([N_episodes, 1])

    if_Q_next = 0

    for n in range(N_episodes):
        epsilon_f = epsilon_0 / (
            1 + beta * n
        )  # epsilon decays per episode to reduce the probability of exploring
        checkmate = 0  # 0 = not a checkmate, 1 = checkmate
        draw = 0  # 0 = not a draw, 1 = draw
        i = 1  # counter for movements
        # Generate a new game
        s, p_k2, p_k1, p_q1 = generate_game(size_board)

        # Possible actions of the King
        dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s)
        # Possible actions of the Queen
        dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s)
        # Possible actions of the enemy king
        dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1)

        while checkmate == 0 and draw == 0:

            # Player 1

            # Actions & allowed_actions
            a = np.concatenate([np.array(a_q1), np.array(a_k1)])
            allowed_a = np.where(a > 0)[0]

            # Computing Features
            x = features(p_q1, p_k1, p_k2, dfK2, s, check)

            Q, out1 = Q_values(x, W1, W2, bias_W1, bias_W2)

            if np.unique(Q).size == 1 or (int(np.random.rand() < epsilon_f)):
                a_agent = random.choice(allowed_a)
            else:
                Q2 = Q
                done = 0
                while done == 0:
                    move = Q2.argmax()
                    if move in allowed_a:
                        a_agent = move
                        done = 1
                    else:
                        Q2[move] = 0

            picked_action = [0] * 32
            picked_action[a_agent] = 1

            # Player 1 makes the action
            if a_agent < possible_queen_a:
                direction = int(np.ceil((a_agent + 1) / (size_board - 1))) - 1
                steps = a_agent - direction * (size_board - 1) + 1

                s[p_q1[0], p_q1[1]] = 0
                mov = map[direction, :] * steps
                s[p_q1[0] + mov[0], p_q1[1] + mov[1]] = 2
                p_q1[0] = p_q1[0] + mov[0]
                p_q1[1] = p_q1[1] + mov[1]

            else:
                direction = a_agent - possible_queen_a
                steps = 1

                s[p_k1[0], p_k1[1]] = 0
                mov = map[direction, :] * steps
                s[p_k1[0] + mov[0], p_k1[1] + mov[1]] = 1
                p_k1[0] = p_k1[0] + mov[0]
                p_k1[1] = p_k1[1] + mov[1]

            # Compute the allowed actions for the new position

            # Possible actions of the King
            dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s)
            # Possible actions of the Queen
            dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s)
            # Possible actions of the enemy king
            dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s,
                                                     p_k1)

            # Player 2
            # Check for draw or checkmate
            if np.sum(dfK2) == 0 and dfQ1_[p_k2[0], p_k2[1]] == 1:
                # King 2 has no freedom and it is checked
                # Checkmate and collect reward
                checkmate = 1
                R = 1  # Reward is 1 when checkmate

                if if_Q_next:
                    x = x.reshape(1, -1)
                    out1 = out1.reshape(1, -1)
                    Q = Q.reshape(1, -1)

                    if not sarsa:
                        target = R + gamma * max(Q_next)
                    else:
                        target = R

                    di = (target - Q) * picked_action
                    dj = np.dot(di, W2)

                    W1 += (eta * np.dot(x.T, np.dot(di, W2))).T
                    W2 += (eta * np.dot(out1.T, di)).T
                    bias_W1 += eta * np.dot(di, W2)[0]
                    bias_W2 += eta * di[0]

                if checkmate:
                    break

            elif np.sum(dfK2) == 0 and dfQ1_[p_k2[0], p_k2[1]] == 0:
                # King 2 has no freedom but it is not checked
                draw = 1
                R = 0.1

                if if_Q_next:
                    x = x.reshape(1, -1)
                    out1 = out1.reshape(1, -1)
                    Q = Q.reshape(1, -1)
                    if not sarsa:
                        target = R + gamma * max(Q_next)
                    else:
                        target = R

                    di = (target - Q) * picked_action
                    dj = np.dot(di, W2)

                    W1 += (eta * np.dot(x.T, np.dot(di, W2))).T
                    W2 += (eta * np.dot(out1.T, di)).T
                    bias_W1 += eta * np.dot(di, W2)[0]
                    bias_W2 += eta * di[0]

                if draw:
                    break
            else:
                # Move enemy King randomly to a safe location
                allowed_enemy_a = np.where(a_k2 > 0)[0]
                a_help = int(
                    np.ceil(np.random.rand() * allowed_enemy_a.shape[0]) - 1)
                a_enemy = allowed_enemy_a[a_help]

                direction = a_enemy
                steps = 1

                s[p_k2[0], p_k2[1]] = 0
                mov = map[direction, :] * steps
                s[p_k2[0] + mov[0], p_k2[1] + mov[1]] = 3

                p_k2[0] = p_k2[0] + mov[0]
                p_k2[1] = p_k2[1] + mov[1]

            # Possible actions of the King
            dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s)
            # Possible actions of the Queen
            dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s)
            # Possible actions of the enemy king
            dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s,
                                                     p_k1)
            # Compute features
            x_next = features(p_q1, p_k1, p_k2, dfK2, s, check)
            # Compute Q-values for the discounted factor
            Q_next, _ = Q_values(x_next, W1, W2, bias_W1, bias_W2)

            if_Q_next = True

            if not check or draw:
                x = x.reshape(1, -1)
                out1 = out1.reshape(1, -1)
                Q = Q.reshape(1, -1)
                if not sarsa:
                    target = R + gamma * max(Q_next)
                else:
                    target = R + gamma * Q_next

                di = (target - Q) * picked_action
                dj = np.dot(di, W2)

                W1 += (eta * np.dot(x.T, np.dot(di, W2))).T
                W2 += (eta * np.dot(out1.T, di)).T
                bias_W1 += eta * np.dot(di, W2)[0]
                bias_W2 += eta * di[0]

            i += 1
        R_save[n, :] = ((1 - alpha) * R_save[n - 1, :]) + (alpha * R)
        N_moves_save[n, :] = (
            (1 - alpha) * N_moves_save[n - 1, :]) + (alpha * i)
    return N_moves_save, R_save
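Q_values is called throughout these training loops but never defined in the snippets. A minimal sketch under the assumptions stated in the comments (one ReLU hidden layer, a linear output giving one Q value per action); this is an illustration, not the assignment's reference implementation:

import numpy as np

def Q_values(x, W1, W2, bias_W1, bias_W2):
    # Hidden layer: rectified linear activation.
    out1 = np.maximum(0.0, np.dot(W1, x) + bias_W1)
    # Output layer: one (here linear) Q value per action.
    Q = np.dot(W2, out1) + bias_W2
    return Q, out1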
Example #12
def main():
    """
    Generate a new game
    The function below generates a new chess board with King, Queen and Enemy King pieces randomly assigned so that they
    do not cause any threats to each other.
    s: a size_board x size_board matrix filled with zeros and three numbers:
    1 = location of the King
    2 = location of the Queen
    3 = location of the Enemy King
    p_k2: 1x2 vector specifying the location of the Enemy King, the first number represents the row and the second
    number the column
    p_k1: same as p_k2 but for the King
    p_q1: same as p_k2 but for the Queen
    """
    s, p_k2, p_k1, p_q1 = generate_game(size_board)
    """
    Possible actions for the Queen are the eight directions (down, up, right, left, up-right, down-left, up-left, 
    down-right) multiplied by the number of squares that the Queen can cover in one movement which equals the size of 
    the board - 1
    """
    possible_queen_a = (s.shape[0] - 1) * 8
    """
    Possible actions for the King are the eight directions (down, up, right, left, up-right, down-left, up-left, 
    down-right)
    """
    possible_king_a = 8

    # Total number of actions for Player 1 = actions of King + actions of Queen
    N_a = possible_king_a + possible_queen_a
    """
    Possible actions of the King
    This functions returns the locations in the chessboard that the King can go
    dfK1: a size_board x size_board matrix filled with 0 and 1.
          1 = locations that the king can move to
    a_k1: a 8x1 vector specifying the allowed actions for the King (marked with 1): 
          down, up, right, left, down-right, down-left, up-right, up-left
    """
    dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s)
    """
    Possible actions of the Queen
    Same as the above function but for the Queen. Here we have 8*(size_board-1) possible actions as explained above
    """
    dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s)
    """
    Possible actions of the Enemy King
    Same as the above function but for the Enemy King. Here we have 8 possible actions as explained above
    """
    dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1)
    """
    Compute the features
    x is a Nx1 vector computing a number of input features based on which the network should adapt its weights  
    with board size of 4x4 this N=50
    """
    x = features(p_q1, p_k1, p_k2, dfK2, s, check)
    """
    Initialization
    Define the size of the layers and initialization
    FILL THE CODE
    Define the network, the number of the nodes of the hidden layer should be 200, you should know the rest. The weights 
    should be initialised according to a uniform distribution and rescaled by the total number of connections between 
    the considered two layers. For instance, if you are initializing the weights between the input layer and the hidden 
    layer each weight should be divided by (n_input_layer x n_hidden_layer), where n_input_layer and n_hidden_layer 
    refer to the number of nodes in the input layer and the number of nodes in the hidden layer respectively. The biases
     should be initialized with zeros.
    """
    n_input_layer = 50  # Number of neurons of the input layer. TODO: Change this value
    n_hidden_layer = 200  # Number of neurons of the hidden layer
    n_output_layer = 32  # Number of neurons of the output layer. TODO: Change this value accordingly
    """
    TODO: Define the w weights between the input and the hidden layer and the w weights between the hidden layer and the 
    output layer according to the instructions. Define also the biases.
    """
    # Weights matrix, connecting input neurons (state) to hidden layers (actions). Initially random

    # W1 is rescaled by the total number of connections between the considered two layers
    W1 = np.random.uniform(0, 1, (n_hidden_layer, n_input_layer))
    W1 = np.divide(W1,
                   np.matlib.repmat(np.sum(W1, 1)[:, None], 1, n_input_layer))

    # W2 is rescaled by the total number of connections between the considered two layers
    W2 = np.random.uniform(0, 1, (n_output_layer, n_hidden_layer))
    W2 = np.divide(W2,
                   np.matlib.repmat(np.sum(W2, 1)[:, None], 1, n_hidden_layer))

    # bias is set to zero
    bias_W1 = np.zeros((n_hidden_layer, ))
    bias_W2 = np.zeros((n_output_layer, ))

    # YOUR CODES ENDS HERE

    # Network Parameters
    epsilon_0 = 0.2  #epsilon for the e-greedy policy
    beta = 0.00005  # 0.005     #epsilon discount factor
    gamma = 0.85  #0.15      #SARSA Learning discount factor
    eta = 0.0035  #0.0035      #learning rate
    N_episodes = 1000  #Number of games, each game ends when we have a checkmate or a draw
    rmsprop = False
    ###  Training Loop  ###
    # variable setup for the RMSprop calculation
    W2_average = 0
    W1_average = 0
    W2_bias_average = 0
    W1_bias_average = 0
    eta_w2 = 0
    eta_w1 = 0
    eta_bias1 = 0
    eta_bias2 = 0

    # Directions: down, up, right, left, down-right, down-left, up-right, up-left
    # Each row specifies a direction,
    # e.g. for down we need to add +1 to the current row and +0 to current column
    map = np.array([[1, 0], [-1, 0], [0, 1], [0, -1], [1, 1], [1, -1], [-1, 1],
                    [-1, -1]])

    # THE FOLLOWING VARIABLES COULD CONTAIN THE REWARDS PER EPISODE AND THE
    # NUMBER OF MOVES PER EPISODE, FILL THEM IN THE CODE ABOVE FOR THE
    # LEARNING. OTHER WAYS TO DO THIS ARE POSSIBLE, THIS IS A SUGGESTION ONLY.

    # R_save = np.zeros([N_episodes, 1])
    #N_moves_save = np.zeros([N_episodes, 1])
    R_save = np.zeros([N_episodes])
    N_moves_save = np.zeros([N_episodes])
    alpha = 0.0001  # for exponential moving average
    # END OF SUGGESTIONS

    for n in range(N_episodes):
        epsilon_f = epsilon_0 / (
            1 + beta * n
        )  # epsilon decays per episode to reduce the probability of exploring
        checkmate = 0  # 0 = not a checkmate, 1 = checkmate
        draw = 0  # 0 = not a draw, 1 = draw
        i = 1  # counter for movements

        # Generate a new game
        s, p_k2, p_k1, p_q1 = generate_game(size_board)

        # Possible actions of the King
        dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s)
        # Possible actions of the Queen
        dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s)
        # Possible actions of the enemy king
        dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1)

        moves_game = 0  # to store moves per game

        while checkmate == 0 and draw == 0:
            R = 0  # Reward

            # Player 1

            # Actions & allowed_actions
            a = np.concatenate([np.array(a_q1), np.array(a_k1)])
            allowed_a = np.where(a > 0)[0]

            # Computing Features
            x = features(p_q1, p_k1, p_k2, dfK2, s, check)

            # FILL THE CODE
            # Enter inside the Q_values function and fill it with your code.
            # You need to compute the Q values as output of your neural
            # network. You can change the input of the function by adding other
            # data, but the input of the function is suggested.
            Q, out1 = Q_values(x, W1, W2, bias_W1, bias_W2)
            """
            YOUR CODE STARTS HERE
            
            FILL THE CODE
            Implement epsilon greedy policy by using the vector a and a_allowed vector: be careful that the action must
            be chosen from the a_allowed vector. The index of this action must be remapped to the index of the vector a,
            containing all the possible actions. Create a vector called a_agent that contains the index of the action 
            chosen. For instance, if a_allowed = [8, 16, 32] and you select the third action, a_agent=32 not 3.
            """

            #eps-greedy policy implementation
            greedy = (np.random.rand() > epsilon_f)
            if greedy:
                #a_agent = allowed_a[np.take(Q, allowed_a).tolist().index(max(np.take(Q, allowed_a).tolist()))]
                a_agent = allowed_a[np.argmax(np.take(
                    Q, allowed_a))]  #pick the best action
            else:
                a_agent = np.random.choice(allowed_a)  #pick random action

            #THE CODE ENDS HERE.

            # Player 1 makes the action
            if a_agent < possible_queen_a:
                direction = int(np.ceil((a_agent + 1) / (size_board - 1))) - 1
                steps = a_agent - direction * (size_board - 1) + 1

                s[p_q1[0], p_q1[1]] = 0
                mov = map[direction, :] * steps
                s[p_q1[0] + mov[0], p_q1[1] + mov[1]] = 2
                p_q1[0] = p_q1[0] + mov[0]
                p_q1[1] = p_q1[1] + mov[1]
                # One more move is made from player 1
                moves_game += 1
            else:
                direction = a_agent - possible_queen_a
                steps = 1

                s[p_k1[0], p_k1[1]] = 0
                mov = map[direction, :] * steps
                s[p_k1[0] + mov[0], p_k1[1] + mov[1]] = 1
                p_k1[0] = p_k1[0] + mov[0]
                p_k1[1] = p_k1[1] + mov[1]
                # One more move is made from player 1
                moves_game += 1
            # Compute the allowed actions for the new position

            # Possible actions of the King
            dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s)
            # Possible actions of the Queen
            dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s)
            # Possible actions of the enemy king
            dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s,
                                                     p_k1)

            # Player 2

            # Check for draw or checkmate
            if np.sum(dfK2) == 0 and dfQ1_[p_k2[0], p_k2[1]] == 1:
                # King 2 has no freedom and it is checked
                # Checkmate and collect reward
                checkmate = 1
                R = 1  # Reward for checkmate
                """
                FILL THE CODE
                Update the parameters of your network by applying backpropagation and Q-learning. You need to use the 
                rectified linear function as activation function (see supplementary materials). Exploit the Q value for 
                the action made. You computed previously Q values in the Q_values function. Be careful: this is the last 
                iteration of the episode, the agent gave checkmate.
                """

                # Backpropagation
                # calculate the delta and update the weights and bias for W2;
                # the if statements implement the Heaviside step (ReLU derivative)
                out1delta = 0.0  # placeholder; overwritten below if the hidden layer is active
                # Backpropagation for the output layer
                if ((np.dot(W2[a_agent], out1)) > 0):
                    out2delta = (R - Q[a_agent])
                    W2[a_agent] += (eta * out2delta * out1)
                    bias_W2 += (eta * out2delta)
                    # backpropagation for the hidden layer
                    if (np.sum(np.dot(W1, x)) > 0):
                        out1delta = np.dot(W2[a_agent], out2delta)
                        W1 += (eta * np.outer(out1delta, x))
                        bias_W1 += (eta * out1delta)

                # It is checkmate, plot rewards and moves per game (exponential moving average), alpha = 0.0001
                if n > 0:
                    R_save[n] = ((1 - alpha) * R_save[n - 1]) + (alpha * R)
                else:
                    R_save[n] = R

                if n > 0:
                    N_moves_save[n] = ((1 - alpha) * N_moves_save[n - 1]) + (
                        alpha * moves_game)
                else:
                    N_moves_save[n] = moves_game

                # THE CODE ENDS HERE
                if checkmate:
                    break

            elif np.sum(dfK2) == 0 and dfQ1_[p_k2[0], p_k2[1]] == 0:
                # King 2 has no freedom but it is not checked
                draw = 1
                R = 0.1
                """
                FILL THE CODE
                Update the parameters of your network by applying backpropagation and Q-learning. You need to use the 
                rectified linear function as activation function (see supplementary materials). Exploit the Q value for 
                the action made. You computed previously Q values in the Q_values function. Be careful: this is the last 
                iteration of the episode, it is a draw.
                """

                out1delta = 0.0  # placeholder; overwritten below if the hidden layer is active
                # Backpropagation, same as above
                if ((np.dot(W2[a_agent], out1)) > 0):
                    out2delta = (R - Q[a_agent])
                    W2[a_agent] += (eta * out2delta * out1)
                    bias_W2 += (eta * out2delta)
                    if (np.sum(np.dot(W1, x)) > 0):
                        out1delta = np.dot(W2[a_agent], out2delta)
                        W1 += (eta * np.outer(out1delta, x))
                        bias_W1 += (eta * out1delta)

                # It is draw, plot rewards and moves per game (exponential moving average), alpha = 0.0001
                if n > 0:
                    R_save[n] = ((1 - alpha) * R_save[n - 1]) + (alpha * R)
                else:
                    R_save[n] = R

                if n > 0:
                    N_moves_save[n] = ((1 - alpha) * N_moves_save[n - 1]) + (
                        alpha * moves_game)
                else:
                    N_moves_save[n] = moves_game

                # YOUR CODE ENDS HERE
                if draw:
                    break

            else:
                # Move enemy King randomly to a safe location
                allowed_enemy_a = np.where(a_k2 > 0)[0]
                a_help = int(
                    np.ceil(np.random.rand() * allowed_enemy_a.shape[0]) - 1)
                a_enemy = allowed_enemy_a[a_help]

                direction = a_enemy
                steps = 1

                s[p_k2[0], p_k2[1]] = 0
                mov = map[direction, :] * steps
                s[p_k2[0] + mov[0], p_k2[1] + mov[1]] = 3

                p_k2[0] = p_k2[0] + mov[0]
                p_k2[1] = p_k2[1] + mov[1]
                # one more move
                moves_game += 1

            # Update the parameters

            # Possible actions of the King
            dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s)
            # Possible actions of the Queen
            dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s)
            # Possible actions of the enemy king
            dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s,
                                                     p_k1)
            # Compute features
            x_next = features(p_q1, p_k1, p_k2, dfK2, s, check)
            # Compute Q-values for the discounted factor
            Q_next, _ = Q_values(x_next, W1, W2, bias_W1, bias_W2)
            """
            FILL THE CODE
            Update the parameters of your network by applying backpropagation and Q-learning. You need to use the 
            rectified linear function as activation function (see supplementary materials). Exploit the Q value for 
            the action made. You computed previously Q values in the Q_values function. Be careful: this is not the last 
            iteration of the episode, the match continues.
            """
            # error cost function
            # error = 0.5*((R- (gamma*np.max(Q_next))-Q[a_agent])**2)
            # print(error)

            # FOR SARSA - reapply the epsilon-greedy policy for Q_next
            # a_new = np.concatenate([np.array(a_q1), np.array(a_k1)])
            # allowed_a = np.where(a_new > 0)[0]

            ##eps-greedy policy implementation
            # greedy = (np.random.rand() > epsilon_f)
            # if greedy:
            #     #a_agent = allowed_a[np.take(Q, allowed_a).tolist().index(max(np.take(Q, allowed_a).tolist()))]
            #     next_agent = allowed_a[np.argmax(np.take(Q, allowed_a))]  #pick the best action
            # else:
            #     next_agent = np.random.choice(allowed_a)                  #pick random action

            # out1delta = np.dtype(np.complex128) # to prevent overflow issues
            # #Backpropagation, same as above but this time the next Q-value is considered.
            # #RMSprop is activated when it is set to True
            # if ((np.dot(W2[a_agent], out1)) > 0):
            #     out2delta = (R - Q[a_agent] + gamma * np.max(Q_next))
            #     #FOR SARSA
            #     # out2delta = (R - Q[a_agent] + gamma * Q_next(next_agent))
            #     out1delta = np.dot(W2[a_agent], out2delta)
            #     W1_d = np.outer(x, out1delta)
            #     W2_d = np.outer(out1, out2delta)
            #     if rmsprop:
            #         alpha_rms = 0.9  # a recommended value
            #         # The calculation of RMSProp
            #         W2_average = (alpha_rms * W2_average) +(1.0 - alpha_rms) * (W2_d)**2
            #         W1_average = (alpha_rms * W1_average) +(1.0 - alpha_rms) * (W1_d)**2
            #         W2_bias_average = (alpha_rms * W2_bias_average) +(1.0 - alpha_rms) * (out2delta)**2
            #         W1_bias_average = (alpha_rms * W1_bias_average) +(1.0 - alpha_rms) * (out1delta)**2
            #         # applying different learning rates
            #         eta_w2 = eta/ np.sqrt(W2_average[a_agent])
            #         eta_w1 = eta / np.sqrt(W1_average)
            #         eta_bias2 = eta/ W2_bias_average
            #         eta_bias1 = eta/ W1_bias_average
            #     W2[a_agent] += (eta_w2 * out2delta * out1)
            #     bias_W2 += (eta_bias2 * out2delta)
            #     #backpropagation for the hidden layer
            #     if (np.sum(np.dot(W1, x)) > 0):
            #         # W1 += np.outer(x, out1delta).T * eta_w1.T
            #         # bias_W1 += (eta_bias1 * out1delta)
            # else: # without rmsprop, just normal backpropagation as before is applied
            out1delta = 0.0  # placeholder; overwritten below if the hidden layer is active
            if ((np.dot(W2[a_agent], out1)) > 0):
                out2delta = (R - Q[a_agent] + gamma * np.max(Q_next))
                W2[a_agent] += (eta * out2delta * out1)
                bias_W2 += (eta * out2delta)
                if (np.sum(np.dot(W1, x)) > 0):
                    out1delta = np.dot(W2[a_agent], out2delta)
                    W1 += (eta * np.outer(out1delta, x))
                    bias_W1 += (eta * out1delta)

            # match continues, so one more move
            moves_game += 1

            # YOUR CODE ENDS HERE
            i += 1

    fontSize = 12

    plt.plot(R_save)
    plt.xlabel('Nth Game', fontsize=fontSize)
    plt.ylabel('Rewards per Game (Default)', fontsize=fontSize)
    plt.show()

    plt.plot(N_moves_save)
    plt.xlabel('Nth Game', fontsize=fontSize)
    plt.ylabel('Moves per Game (Default)', fontsize=fontSize)
    plt.show()
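These training loops switch between Q-learning and SARSA only in how the bootstrap target is built. A compact hedged sketch of that difference, with R, gamma, Q_next and the next chosen action named as in the loops above:

import numpy as np

def td_target(R, gamma, Q_next, a_next=None, terminal=False, sarsa=False):
    # Terminal transitions (checkmate or draw) do not bootstrap: the target is the reward.
    if terminal:
        return R
    if sarsa:
        # SARSA: bootstrap on the Q value of the action actually chosen next.
        return R + gamma * Q_next[a_next]
    # Q-learning: bootstrap on the greedy (maximal) next Q value.
    return R + gamma * np.max(Q_next)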
Example #13
def main():
    """
    Generate a new game
    The function below generates a new chess board with King, Queen and Enemy King pieces randomly assigned so that they
    do not cause any threats to each other.
    s: a size_board x size_board matrix filled with zeros and three numbers:
    1 = location of the King
    2 = location of the Queen
    3 = location of the Enemy King
    p_k2: 1x2 vector specifying the location of the Enemy King, the first number represents the row and the second
    number the column
    p_k1: same as p_k2 but for the King
    p_q1: same as p_k2 but for the Queen
    """
    s, p_k2, p_k1, p_q1 = generate_game(size_board)
    """
    Possible actions for the Queen are the eight directions (down, up, right, left, up-right, down-left, up-left, 
    down-right) multiplied by the number of squares that the Queen can cover in one movement which equals the size of 
    the board - 1
    """
    possible_queen_a = (s.shape[0] - 1) * 8
    """
    Possible actions for the King are the eight directions (down, up, right, left, up-right, down-left, up-left, 
    down-right)
    """
    possible_king_a = 8

    # Total number of actions for Player 1 = actions of King + actions of Queen
    N_a = possible_king_a + possible_queen_a
    """
    Possible actions of the King
    This functions returns the locations in the chessboard that the King can go
    dfK1: a size_board x size_board matrix filled with 0 and 1.
          1 = locations that the king can move to
    a_k1: a 8x1 vector specifying the allowed actions for the King (marked with 1): 
          down, up, right, left, down-right, down-left, up-right, up-left
    """
    dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s)
    """
    Possible actions of the Queen
    Same as the above function but for the Queen. Here we have 8*(size_board-1) possible actions as explained above
    """
    dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s)
    """
    Possible actions of the Enemy King
    Same as the above function but for the Enemy King. Here we have 8 possible actions as explained above
    """
    dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1)
    """
    Compute the features
    x is a Nx1 vector computing a number of input features based on which the network should adapt its weights  
    with board size of 4x4 this N=50
    """
    x = features(p_q1, p_k1, p_k2, dfK2, s, check)
    """
    Initialization
    Define the size of the layers and initialization
    FILL THE CODE
    Define the network, the number of the nodes of the hidden layer should be 200, you should know the rest. The weights 
    should be initialised according to a uniform distribution and rescaled by the total number of connections between 
    the considered two layers. For instance, if you are initializing the weights between the input layer and the hidden 
    layer each weight should be divided by (n_input_layer x n_hidden_layer), where n_input_layer and n_hidden_layer 
    refer to the number of nodes in the input layer and the number of nodes in the hidden layer respectively. The biases
     should be initialized with zeros.
    """
    #change this to 0 for only q learning or 1 for q learning and sarsa
    sarsa = 0
    n_input_layer = 3 * (
        size_board * size_board
    ) + 2  # Number of neurons of the input layer. TODO: Change this value
    n_hidden_layer = 200  # Number of neurons of the hidden layer
    n_output_layer = N_a  # Number of neurons of the output layer. TODO: Change this value accordingly
    """
    TODO: Define the w weights between the input and the hidden layer and the w weights between the hidden layer and the 
    output layer according to the instructions. Define also the biases.
    """
    import numpy.matlib

    # weights between input layer and hidden layer
    W1 = np.random.uniform(0, 1, (n_hidden_layer, n_input_layer))
    W1 = np.divide(W1,
                   np.matlib.repmat(np.sum(W1, 1)[:, None], 1, n_input_layer))
    # weights between hidden layer and output layer
    W2 = np.random.uniform(0, 1, (n_output_layer, n_hidden_layer))
    W2 = np.divide(W2,
                   np.matlib.repmat(np.sum(W2, 1)[:, None], 1, n_hidden_layer))

    # bias for hidden layer
    bias_W1 = np.zeros(n_hidden_layer, )
    bias_W1 = bias_W1.reshape(n_hidden_layer, 1)
    # bias for output layer
    bias_W2 = np.zeros(n_output_layer, )
    bias_W2 = bias_W2.reshape(n_output_layer, 1)

    # YOUR CODES ENDS HERE

    # Network Parameters
    epsilon_0 = 0.2  #epsilon for the e-greedy policy
    beta = 0.00005  #epsilon discount factor
    gamma = 0.85  #SARSA Learning discount factor
    eta = 0.0035  #learning rate
    N_episodes = 100000  #Number of games, each game ends when we have a checkmate or a draw

    ###  Training Loop  ###

    # Directions: down, up, right, left, down-right, down-left, up-right, up-left
    # Each row specifies a direction,
    # e.g. for down we need to add +1 to the current row and +0 to current column
    map = np.array([[1, 0], [-1, 0], [0, 1], [0, -1], [1, 1], [1, -1], [-1, 1],
                    [-1, -1]])

    # THE FOLLOWING VARIABLES COULD CONTAIN THE REWARDS PER EPISODE AND THE
    # NUMBER OF MOVES PER EPISODE, FILL THEM IN THE CODE ABOVE FOR THE
    # LEARNING. OTHER WAYS TO DO THIS ARE POSSIBLE, THIS IS A SUGGESTION ONLY.

    R_save = np.zeros([N_episodes, 1])
    N_moves_save = np.zeros([N_episodes, 1])
    R_save_sarsa = np.zeros([N_episodes, 1])
    N_moves_save_sarsa = np.zeros([N_episodes, 1])
    if sarsa:
        runs = 2
    else:
        runs = 1

    # END OF SUGGESTIONS
    # loop needed to produce figures comparing the two methods
    for run in range(runs):
        if sarsa and (run == 0):
            sarsa = 1
        else:
            # must reset weights and biases to run again for different method

            # weights between input layer and hidden layer
            W1 = np.random.uniform(0, 1, (n_hidden_layer, n_input_layer))
            W1 = np.divide(
                W1, np.matlib.repmat(np.sum(W1, 1)[:, None], 1, n_input_layer))
            # weights between hidden layer and output layer
            W2 = np.random.uniform(0, 1, (n_output_layer, n_hidden_layer))
            W2 = np.divide(
                W2,
                np.matlib.repmat(np.sum(W2, 1)[:, None], 1, n_hidden_layer))

            # bias for hidden layer
            bias_W1 = np.zeros(n_hidden_layer, )
            bias_W1 = bias_W1.reshape(n_hidden_layer, 1)
            # bias for output layer
            bias_W2 = np.zeros(n_output_layer, )
            bias_W2 = bias_W2.reshape(n_output_layer, 1)
            sarsa = 0
        for n in range(N_episodes):
            epsilon_f = epsilon_0 / (
                1 + beta * n
            )  # epsilon decays per episode to reduce the probability of exploring
            checkmate = 0  # 0 = not a checkmate, 1 = checkmate
            draw = 0  # 0 = not a draw, 1 = draw
            i = 1  # counter for movements

            # Generate a new game
            s, p_k2, p_k1, p_q1 = generate_game(size_board)

            # Possible actions of the King
            dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s)
            # Possible actions of the Queen
            dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s)
            # Possible actions of the enemy king
            dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s,
                                                     p_k1)

            while checkmate == 0 and draw == 0:
                R = 0  # Reward

                # Player 1

                # Actions & allowed_actions
                a = np.concatenate([np.array(a_q1), np.array(a_k1)])
                allowed_a = np.where(a > 0)[0]

                # Computing Features
                x = features(p_q1, p_k1, p_k2, dfK2, s, check)

                # FILL THE CODE
                # Enter inside the Q_values function and fill it with your code.
                # You need to compute the Q values as output of your neural
                # network. You can change the input of the function by adding other
                # data, but the input of the function is suggested.

                Q, out1 = Q_values(x, W1, W2, bias_W1, bias_W2)
                """
                YOUR CODE STARTS HERE
                
                FILL THE CODE
                Implement epsilon greedy policy by using the vector a and a_allowed vector: be careful that the action must
                be chosen from the a_allowed vector. The index of this action must be remapped to the index of the vector a,
                containing all the possible actions. Create a vector called a_agent that contains the index of the action 
                chosen. For instance, if a_allowed = [8, 16, 32] and you select the third action, a_agent=32 not 3.
                """

                a_agent = 1  # CHANGE THIS VALUE BASED ON YOUR CODE TO USE EPSILON GREEDY POLICY

                eGreedy = int(np.random.rand() < epsilon_f)
                # if egreedy then random, else use optimal move
                if eGreedy:
                    index = np.random.randint(len(allowed_a))
                    a_agent = allowed_a[index]
                else:
                    # get highest q value for an action which is allowed
                    opt_action = max([Q[j] for j in allowed_a])
                    a_agent = np.where(Q == opt_action)[0][0]

                #THE CODE ENDS HERE.

                # Player 1 makes the action
                if a_agent < possible_queen_a:
                    direction = int(np.ceil(
                        (a_agent + 1) / (size_board - 1))) - 1
                    steps = a_agent - direction * (size_board - 1) + 1

                    s[p_q1[0], p_q1[1]] = 0
                    mov = map[direction, :] * steps
                    s[p_q1[0] + mov[0], p_q1[1] + mov[1]] = 2
                    p_q1[0] = p_q1[0] + mov[0]
                    p_q1[1] = p_q1[1] + mov[1]

                else:
                    direction = a_agent - possible_queen_a
                    steps = 1

                    s[p_k1[0], p_k1[1]] = 0
                    mov = map[direction, :] * steps
                    s[p_k1[0] + mov[0], p_k1[1] + mov[1]] = 1
                    p_k1[0] = p_k1[0] + mov[0]
                    p_k1[1] = p_k1[1] + mov[1]

                # Compute the allowed actions for the new position

                # Possible actions of the King
                dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s)
                # Possible actions of the Queen
                dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s)
                # Possible actions of the enemy king
                dfK2, a_k2, check = degree_freedom_king2(
                    dfK1, p_k2, dfQ1_, s, p_k1)

                # Player 2

                # Check for draw or checkmate
                if np.sum(dfK2) == 0 and dfQ1_[p_k2[0], p_k2[1]] == 1:
                    # King 2 has no freedom and it is checked
                    # Checkmate and collect reward
                    checkmate = 1
                    R = 1  # Reward for checkmate
                    """
                    FILL THE CODE
                    Update the parameters of your network by applying backpropagation and Q-learning. You need to use the 
                    rectified linear function as activation function (see supplementary materials). Exploit the Q value for 
                    the action made. You computed previously Q values in the Q_values function. Be careful: this is the last 
                    iteration of the episode, the agent gave checkmate.
                    """

                    # target reward
                    target = R
                    # Backpropagation: output layer -> hidden layer

                    # rectified output
                    rectOutput = np.zeros((n_output_layer, 1))
                    rectOutput[a_agent, 0] = 1
                    # update q-value
                    Qdelta = (target - Q) * rectOutput
                    # update weights
                    W2 = W2 + (eta * np.outer(Qdelta, out1))
                    bias_W2 = bias_W2 + (eta * Qdelta)

                    # Backpropagation: hidden -> input layer
                    #rectified output
                    rectOutput2 = np.zeros((n_hidden_layer, 1))
                    for j in range(0, len(out1)):
                        rectOutput2[int(out1[j][0]), 0] = 1
                    #update q value
                    out1delta = np.dot(W2.T, Qdelta) * rectOutput2
                    #update weights
                    W1 = W1 + (eta * np.outer(out1delta, x))
                    bias_W1 = bias_W1 + (eta * out1delta)

                    # THE CODE ENDS HERE

                    if checkmate:
                        break

                elif np.sum(dfK2) == 0 and dfQ1_[p_k2[0], p_k2[1]] == 0:
                    # King 2 has no freedom but it is not checked
                    draw = 1
                    R = 0.1
                    """
                    FILL THE CODE
                    Update the parameters of your network by applying backpropagation and Q-learning. You need to use the 
                    rectified linear function as activation function (see supplementary materials). Exploit the Q value for 
                    the action made. You computed previously Q values in the Q_values function. Be careful: this is the last 
                    iteration of the episode, it is a draw.
                    """

                    target = R
                    # Backpropagation: output layer -> hidden layer
                    #rectified output
                    rectOutput = np.zeros((n_output_layer, 1))
                    rectOutput[a_agent, 0] = 1
                    #update q
                    Qdelta = (target - Q) * rectOutput
                    #update weights and biases
                    W2 = W2 + (eta * np.outer(Qdelta, out1))
                    bias_W2 = bias_W2 + (eta * Qdelta)

                    # Backpropagation: hidden -> input layer
                    # rectified output
                    rectOutput2 = np.zeros((n_hidden_layer, 1))
                    for j in range(0, len(out1)):
                        rectOutput2[int(out1[j][0]), 0] = 1
                    #update q
                    out1delta = np.dot(W2.T, Qdelta) * rectOutput2
                    # update weights and biases
                    W1 = W1 + (eta * np.outer(out1delta, x))
                    bias_W1 = bias_W1 + (eta * out1delta)

                    # YOUR CODE ENDS HERE

                    if draw:
                        break

                else:
                    # Move enemy King randomly to a safe location
                    allowed_enemy_a = np.where(a_k2 > 0)[0]
                    a_help = int(
                        np.ceil(np.random.rand() * allowed_enemy_a.shape[0]) -
                        1)
                    a_enemy = allowed_enemy_a[a_help]

                    direction = a_enemy
                    steps = 1

                    s[p_k2[0], p_k2[1]] = 0
                    mov = map[direction, :] * steps
                    s[p_k2[0] + mov[0], p_k2[1] + mov[1]] = 3

                    p_k2[0] = p_k2[0] + mov[0]
                    p_k2[1] = p_k2[1] + mov[1]

                # Update the parameters

                # Possible actions of the King
                dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s)
                # Possible actions of the Queen
                dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s)
                # Possible actions of the enemy king
                dfK2, a_k2, check = degree_freedom_king2(
                    dfK1, p_k2, dfQ1_, s, p_k1)
                # Compute features
                x_next = features(p_q1, p_k1, p_k2, dfK2, s, check)
                # Compute Q-values for the discounted factor
                Q_next, _ = Q_values(x_next, W1, W2, bias_W1, bias_W2)
                """
                FILL THE CODE
                Update the parameters of your network by applying backpropagation and Q-learning. You need to use the 
                rectified linear function as activation function (see supplementary materials). Exploit the Q value for 
                the action made. You computed previously Q values in the Q_values function. Be careful: this is not the last 
                iteration of the episode, the match continues.
                """
                # if statement for next action using sarsa and q learning
                if sarsa:
                    eGreedy = int(np.random.rand() < epsilon_f)
                    #if egreedy then random, else use optimal move
                    if eGreedy:
                        index = np.random.randint(len(allowed_a))
                        a_agent = allowed_a[index]
                    else:
                        # get highest q value for an action which is allowed
                        opt_action = max([Q[j] for j in allowed_a])
                        a_agent = np.where(Q == opt_action)[0][0]
                    # target according to sarsa
                    target = R + (gamma * (Q_next[a_agent]))
                    # rectified output for specified action
                    rectOutput = np.zeros((n_output_layer, 1))
                    rectOutput[a_agent, 0] = 1
                else:
                    target = R + (gamma * max(Q_next))
                    rectOutput = np.zeros((n_output_layer, 1))
                    rectOutput[a_agent, 0] = 1

                # Backpropagation: output layer -> hidden layer
                # update q
                Qdelta = (target - Q) * rectOutput
                # update weights and biases
                W2 = W2 + (eta * np.outer(Qdelta, out1))
                bias_W2 = bias_W2 + (eta * Qdelta)

                # Backpropagation: hidden -> input layer
                # derivative of the ReLU hidden activations: 1 where out1 > 0, else 0
                rectOutput2 = np.heaviside(out1, 0)

                # hidden-layer delta
                out1delta = np.dot(W2.T, Qdelta) * rectOutput2
                #update weights and biases
                W1 = W1 + (eta * np.outer(out1delta, x))
                bias_W1 = bias_W1 + (eta * out1delta)

                # YOUR CODE ENDS HERE
                i += 1
            # code for exponential moving average
            alpha = 1 / 10000
            if sarsa:
                R_save_sarsa[n, :] = (
                    (1 - alpha) * R_save_sarsa[n - 1, :]) + (alpha * R)
                N_moves_save_sarsa[n, :] = (
                    (1 - alpha) * N_moves_save_sarsa[n - 1, :]) + (alpha * i)
            else:
                R_save[n, :] = ((1 - alpha) * R_save[n - 1, :]) + (alpha * R)
                N_moves_save[n, :] = (
                    (1 - alpha) * N_moves_save[n - 1, :]) + (alpha * i)

    # plot
    plt.subplot(211)
    plt.xlabel('number of games')
    plt.ylabel('EMA of reward')
    plt.title('Q-learning reward')
    plt.locator_params(axis='y', nbins=10, tight=True)
    plt.plot(R_save)
    if np.any(R_save_sarsa):
        plt.plot(R_save_sarsa, color='red')

    plt.subplot(212)
    plt.xlabel('number of games')
    plt.ylabel('EMA of moves')
    plt.title('Q-learning moves')
    plt.locator_params(axis='y', nbins=10, tight=True)
    plt.plot(N_moves_save)
    if np.any(N_moves_save_sarsa):
        plt.plot(N_moves_save_sarsa, color='red')

    plt.tight_layout()
    plt.savefig('figure.png')
    plt.show()
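
# Several of the chess examples in this listing call a Q_values helper that is defined
# elsewhere in the assignment skeleton. A minimal sketch of a compatible forward pass,
# assuming numpy is imported as np, W1 of shape (n_hidden, n_input), W2 of shape
# (n_output, n_hidden), a ReLU hidden layer and a linear output layer (the name, shapes
# and linear output are assumptions, not the original implementation):
def q_values_sketch(x, W1, W2, bias_W1, bias_W2):
    # hidden layer: affine transform followed by ReLU
    out1 = np.maximum(np.dot(W1, x).reshape(-1, 1) + np.reshape(bias_W1, (-1, 1)), 0)
    # output layer: one Q-value per action
    Q = np.dot(W2, out1) + np.reshape(bias_W2, (-1, 1))
    return Q, out1
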
Example #14
0
def train():
    if not os.path.isfile(train_data_pickle):
        # training data
        train_features, train_labels = features(['fold0', 'fold1', 'fold2'])
        traindata = TrainData(train_features, train_labels)
        with open(train_data_pickle, mode='wb') as f:
            pickle.dump(traindata, f)
    else:
        print("loading: %s" % (train_data_pickle))
        with open(train_data_pickle, mode='rb') as f:
            traindata = pickle.load(f)
            train_features = traindata.train_inputs
            train_labels = traindata.train_targets

    if not os.path.isfile(test_data_pickle):
        test_features, test_labels = features(['fold3'])
        testdata = TestData(test_features, test_labels)
        with open(test_data_pickle, mode='wb') as f:
            pickle.dump(testdata, f)
    else:
        print("loading: %s" % (test_data_pickle))
        with open(test_data_pickle, mode='rb') as f:
            testdata = pickle.load(f)
            test_features = testdata.test_inputs
            test_labels = testdata.test_targets

    # TODO change to use train and test
    train_labels = one_hot_encode(train_labels)
    test_labels = one_hot_encode(test_labels)

    # random train and test sets.
    train_test_split = np.random.rand(len(train_features)) < 0.70
    train_x = train_features[train_test_split]
    train_y = train_labels[train_test_split]
    test_x = train_features[~train_test_split]
    test_y = train_labels[~train_test_split]

    n_dim = train_features.shape[1]
    print("input dim: %s" % (n_dim))

    # create placeholder
    X = tf.placeholder(tf.float32, [None, n_dim])
    Y = tf.placeholder(tf.float32, [None, FLAGS.num_classes])
    # build graph
    logits = model.inference(X, n_dim)

    weights = tf.all_variables()
    saver = tf.train.Saver(weights)

    # create loss
    loss = model.loss(logits, Y)
    tf.scalar_summary('loss', loss)

    accuracy = model.accuracy(logits, Y)
    tf.scalar_summary('test accuracy', accuracy)

    # train operation
    train_op = model.train_op(loss)

    # variable initializer
    init = tf.initialize_all_variables()

    # get Session
    sess = tf.Session()

    # summary merge and writer
    merged = tf.merge_all_summaries()
    train_writer = tf.train.SummaryWriter(FLAGS.summaries_dir)
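    # Note: tf.all_variables, tf.scalar_summary, tf.merge_all_summaries, tf.train.SummaryWriter
    # and tf.initialize_all_variables are pre-1.0 TensorFlow names, so this snippet only runs
    # against an old TensorFlow release (or after porting to the newer tf.summary / tf.compat.v1 APIs).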

    # initialize
    sess.run(init)

    for step in xrange(MAX_STEPS):

        t_pred = sess.run(tf.argmax(logits, 1), feed_dict={X: train_features})
        t_true = sess.run(tf.argmax(train_labels, 1))
        print("train samples pred: %s" % t_pred[:30])
        print("train samples target: %s" % t_true[:30])
        print('Train accuracy: ',
              sess.run(accuracy, feed_dict={
                  X: train_x,
                  Y: train_y
              }))
        for epoch in xrange(training_epochs):
            summary, logits_val, _, loss_val = sess.run(
                [merged, logits, train_op, loss],
                feed_dict={
                    X: train_x,
                    Y: train_y
                })
        train_writer.add_summary(summary, step)

        print("step:%d, loss: %s" % (step, loss_val))
        y_pred = sess.run(tf.argmax(logits, 1), feed_dict={X: test_x})
        y_true = sess.run(tf.argmax(test_y, 1))
        print("test samples pred: %s" % y_pred[:10])
        print("test samples target: %s" % y_true[:10])
        accuracy_val = sess.run([accuracy], feed_dict={X: test_x, Y: test_y})
        # print('Test accuracy: ', accuracy_val)
        # train_writer.add_summary(accuracy_val, step)
        p, r, f, s = precision_recall_fscore_support(y_true,
                                                     y_pred,
                                                     average='micro')
        print("F-score: %s" % f)

        if step % 1000 == 0:
            saver.save(sess, FLAGS.ckpt_dir, global_step=step)
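
# train() above relies on a one_hot_encode helper that is not shown in this listing. A minimal
# sketch of what it is assumed to do, mapping integer class labels to one-hot rows (numpy
# assumed imported as np; the name and exact behaviour are assumptions):
def one_hot_encode_sketch(labels, num_classes=None):
    labels = np.asarray(labels, dtype=int)
    if num_classes is None:
        num_classes = int(labels.max()) + 1
    # one row per sample, with a single 1 in the column of that sample's class
    encoded = np.zeros((labels.shape[0], num_classes))
    encoded[np.arange(labels.shape[0]), labels] = 1
    return encoded
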
Example #15
0
def main(N_episodes, type=None, gamma=0.85, beta=0.00005, seed=None):
    if seed is not None:
        np.random.seed(seed)
    """
    Generate a new game
    The function below generates a new chess board with King, Queen and Enemy King pieces randomly assigned so that they
    do not cause any threats to each other.
    s: a size_board x size_board matrix filled with zeros and three numbers:
    1 = location of the King
    2 = location of the Queen
    3 = location of the Enemy King
    p_k2: 1x2 vector specifying the location of the Enemy King, the first number represents the row and the second
    number the column
    p_k1: same as p_k2 but for the King
    p_q1: same as p_k2 but for the Queen
    """
    s, p_k2, p_k1, p_q1 = generate_game(size_board)
    """
    Possible actions for the Queen are the eight directions (down, up, right, left, up-right, down-left, up-left, 
    down-right) multiplied by the number of squares that the Queen can cover in one movement which equals the size of 
    the board - 1
    """
    possible_queen_a = (s.shape[0] - 1) * 8
    """
    Possible actions for the King are the eight directions (down, up, right, left, up-right, down-left, up-left, 
    down-right)
    """
    possible_king_a = 8

    # Total number of actions for Player 1 = actions of King + actions of Queen
    N_a = possible_king_a + possible_queen_a
    """
    Possible actions of the King
    This functions returns the locations in the chessboard that the King can go
    dfK1: a size_board x size_board matrix filled with 0 and 1.
          1 = locations that the king can move to
    a_k1: a 8x1 vector specifying the allowed actions for the King (marked with 1): 
          down, up, right, left, down-right, down-left, up-right, up-left
    """
    dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s)
    """
    Possible actions of the Queen
    Same as the above function but for the Queen. Here we have 8*(size_board-1) possible actions as explained above
    """
    dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s)
    """
    Possible actions of the Enemy King
    Same as the above function but for the Enemy King. Here we have 8 possible actions as explained above
    """
    dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1)
    """
    Compute the features
    x is a Nx1 vector computing a number of input features based on which the network should adapt its weights  
    with board size of 4x4 this N=50
    """
    x = features(p_q1, p_k1, p_k2, dfK2, s, check)
    """
    Initialization
    Define the size of the layers and initialization
    FILL THE CODE
    Define the network, the number of the nodes of the hidden layer should be 200, you should know the rest. The weights 
    should be initialised according to a uniform distribution and rescaled by the total number of connections between 
    the considered two layers. For instance, if you are initializing the weights between the input layer and the hidden 
    layer each weight should be divided by (n_input_layer x n_hidden_layer), where n_input_layer and n_hidden_layer 
    refer to the number of nodes in the input layer and the number of nodes in the hidden layer respectively. The biases
     should be initialized with zeros.
    """
    n_input_layer = x.shape[
        0]  # Number of neurons of the input layer. TODO: Change this value
    n_hidden_layer = 200  # Number of neurons of the hidden layer
    n_output_layer = N_a  # Number of neurons of the output layer. TODO: Change this value accordingly
    """
    TODO: Define the w weights between the input and the hidden layer and the w weights between the hidden layer and the 
    output layer according to the instructions. Define also the biases.
    """

    W1 = np.random.uniform(0, 1, (n_hidden_layer, n_input_layer))
    W1 = np.divide(W1,
                   np.matlib.repmat(np.sum(W1, 1)[:, None], 1, n_input_layer))

    W2 = np.random.uniform(0, 1, (n_output_layer, n_hidden_layer))
    W2 = np.divide(W2,
                   np.matlib.repmat(np.sum(W2, 1)[:, None], 1, n_hidden_layer))

    bias_W1 = np.ones((n_hidden_layer, ))
    bias_W2 = np.ones((n_output_layer, ))

    # YOUR CODES ENDS HERE

    # Network Parameters
    epsilon_0 = 0.2  #epsilon for the e-greedy policy
    # beta = 0.00005    #epsilon discount factor
    # gamma = 0.85      #SARSA Learning discount factor
    eta = 0.0035  #learning rate
    # N_episodes = 100  #Number of games, each game ends when we have a checkmate or a draw

    ###  Training Loop  ###

    # Directions: down, up, right, left, down-right, down-left, up-right, up-left
    # Each row specifies a direction,
    # e.g. for down we need to add +1 to the current row and +0 to current column
    map = np.array([[1, 0], [-1, 0], [0, 1], [0, -1], [1, 1], [1, -1], [-1, 1],
                    [-1, -1]])

    # THE FOLLOWING VARIABLES COULD CONTAIN THE REWARDS PER EPISODE AND THE
    # NUMBER OF MOVES PER EPISODE, FILL THEM IN THE CODE ABOVE FOR THE
    # LEARNING. OTHER WAYS TO DO THIS ARE POSSIBLE, THIS IS A SUGGESTION ONLY.

    R_save = np.zeros([N_episodes])
    N_moves_save = np.zeros([N_episodes])

    R_save_exp = np.zeros([N_episodes])
    N_moves_save_exp = np.zeros([N_episodes])

    error = np.zeros([N_episodes])
    errors = np.zeros([N_episodes])
    errors_E = np.zeros([N_episodes])

    win = False

    # END OF SUGGESTIONS

    for n in range(N_episodes):
        epsilon_f = epsilon_0 / (
            1 + beta * n
        )  # epsilon is discounted per episode to reduce the probability of exploring
        checkmate = 0  # 0 = not a checkmate, 1 = checkmate
        draw = 0  # 0 = not a draw, 1 = draw
        alpha = 1 / 10000
        i = 1  # counter for movements
        # print(n)
        # Generate a new game
        s, p_k2, p_k1, p_q1 = generate_game(size_board)

        # Possible actions of the King
        dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s)
        # Possible actions of the Queen
        dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s)
        # Possible actions of the enemy king
        dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1)

        print(n)

        while checkmate == 0 and draw == 0:
            # print(i)
            R = 0  # Reward

            # Player 1

            # Actions & allowed_actions
            a = np.concatenate([np.array(a_q1), np.array(a_k1)])
            allowed_a = np.where(a > 0)[0]

            # Computing Features
            x = features(p_q1, p_k1, p_k2, dfK2, s, check)

            # FILL THE CODE
            # Enter inside the Q_values function and fill it with your code.
            # You need to compute the Q values as output of your neural
            # network. You can change the input of the function by adding other
            # data, but the input of the function is suggested.
            Q, out1 = Q_values(x, W1, W2, bias_W1, bias_W2)
            """
            YOUR CODE STARTS HERE
            
            FILL THE CODE
            Implement epsilon greedy policy by using the vector a and a_allowed vector: be careful that the action must
            be chosen from the a_allowed vector. The index of this action must be remapped to the index of the vector a,
            containing all the possible actions. Create a vector called a_agent that contains the index of the action 
            chosen. For instance, if a_allowed = [8, 16, 32] and you select the third action, a_agent=32 not 3.
            """

            allowed_q = Q[allowed_a]
            # print(np.random.randint(allowed_a.shape[0]))
            a_agent = allowed_a[np.argmax(allowed_q)] if not (
                np.random.rand() < epsilon_f) else allowed_a[np.random.randint(
                    allowed_a.shape[0])]
            # a_agent = 0
            # if np.random.rand() > epsilon_0:
            #     a_agent = allowed_a[np.argmax(allowed_q)]
            # else:
            #     a_agent = allowed_a[np.random.randint(allowed_a.shape[0])]
            #     print(1)

            # CHANGE THIS VALUE BASED ON YOUR CODE TO USE EPSILON GREEDY POLICY

            #THE CODE ENDS HERE.

            # Player 1 makes the action
            if a_agent < possible_queen_a:
                direction = int(np.ceil((a_agent + 1) / (size_board - 1))) - 1
                steps = a_agent - direction * (size_board - 1) + 1
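                # e.g. with size_board = 4: a_agent = 7 gives direction = ceil(8/3) - 1 = 2
                # (right in the map above) and steps = 7 - 2*3 + 1 = 2, i.e. move the queen
                # two squares to the right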

                s[p_q1[0], p_q1[1]] = 0
                mov = map[direction, :] * steps
                s[p_q1[0] + mov[0], p_q1[1] + mov[1]] = 2
                p_q1[0] = p_q1[0] + mov[0]
                p_q1[1] = p_q1[1] + mov[1]

            else:
                direction = a_agent - possible_queen_a
                steps = 1

                s[p_k1[0], p_k1[1]] = 0
                mov = map[direction, :] * steps
                s[p_k1[0] + mov[0], p_k1[1] + mov[1]] = 1
                p_k1[0] = p_k1[0] + mov[0]
                p_k1[1] = p_k1[1] + mov[1]

            # Compute the allowed actions for the new position

            # Possible actions of the King
            dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s)
            # Possible actions of the Queen
            dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s)
            # Possible actions of the enemy king
            dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s,
                                                     p_k1)

            # Player 2

            # Check for draw or checkmate
            if np.sum(dfK2) == 0 and dfQ1_[p_k2[0], p_k2[1]] == 1:
                # King 2 has no freedom and it is checked
                # Checkmate and collect reward
                checkmate = 1
                R = 1  # Reward for checkmate
                win = True
                """
                FILL THE CODE
                Update the parameters of your network by applying backpropagation and Q-learning. You need to use the 
                rectified linear function as activation function (see supplementary materials). Exploit the Q value for 
                the action made. You computed previously Q values in the Q_values function. Be careful: this is the last 
                iteration of the episode, the agent gave checkmate.
                """

                # Backpropagation: output layer -> hidden layer
                out2delta = (R - Q[a_agent]) * np.heaviside(Q, 0)
                W2[a_agent] += (eta * np.outer(out2delta, out1))[a_agent]
                bias_W2[a_agent] += (eta * out2delta)[a_agent]

                # Backpropagation: hidden layer -> input layer
                out1delta = np.dot(out2delta, W2) * np.heaviside(out1, 0)
                W1 += eta * np.outer(out1delta, x)
                bias_W1 += eta * out1delta

                errors_E[n] = errors_E[n] / i + (
                    (1 - alpha) * errors_E[n - 1] + alpha *
                    (R - Q[a_agent])**2) / i if n > 0 else (R - Q[a_agent])**2
                errors[n] = errors[n] / i + (
                    ((R - Q[a_agent])**2 + n * errors[n - 1]) /
                    (n + 1)) / i if n > 0 else (R - Q[a_agent])**2
                error[n] = error[n] / i + ((R - Q[a_agent]) *
                                           (R - Q[a_agent])) / i
                # THE CODE ENDS HERE

                if checkmate:
                    break

            elif np.sum(dfK2) == 0 and dfQ1_[p_k2[0], p_k2[1]] == 0:
                # King 2 has no freedom but it is not checked
                draw = 1
                R = 0.1

                win = False
                """
                FILL THE CODE
                Update the parameters of your network by applying backpropagation and Q-learning. You need to use the 
                rectified linear function as activation function (see supplementary materials). Exploit the Q value for 
                the action made. You computed previously Q values in the Q_values function. Be careful: this is the last 
                iteration of the episode, it is a draw.
                """

                # Backpropagation: output layer -> hidden layer
                out2delta = (R - Q[a_agent]) * np.heaviside(Q, 0)
                W2[a_agent] += (eta * np.outer(out2delta, out1))[a_agent]
                bias_W2[a_agent] += (eta * out2delta)[a_agent]

                # Backpropagation: hidden layer -> input layer
                out1delta = np.dot(out2delta, W2) * np.heaviside(out1, 0)
                W1 += eta * np.outer(out1delta, x)
                bias_W1 += eta * out1delta

                errors_E[n] = errors_E[n] / i + (
                    (1 - alpha) * errors_E[n - 1] + alpha *
                    (R - Q[a_agent])**2) / i if n > 0 else (R - Q[a_agent])**2
                errors[n] = errors[n] / i + (
                    ((R - Q[a_agent])**2 + n * errors[n - 1]) /
                    (n + 1)) / i if n > 0 else (R - Q[a_agent])**2
                error[n] = error[n] / i + ((R - Q[a_agent]) *
                                           (R - Q[a_agent])) / i
                # YOUR CODE ENDS HERE

                if draw:
                    break

            else:
                # Move enemy King randomly to a safe location
                allowed_enemy_a = np.where(a_k2 > 0)[0]
                a_help = int(
                    np.ceil(np.random.rand() * allowed_enemy_a.shape[0]) - 1)
                a_enemy = allowed_enemy_a[a_help]

                direction = a_enemy
                steps = 1

                s[p_k2[0], p_k2[1]] = 0
                mov = map[direction, :] * steps
                s[p_k2[0] + mov[0], p_k2[1] + mov[1]] = 3

                p_k2[0] = p_k2[0] + mov[0]
                p_k2[1] = p_k2[1] + mov[1]

            # Update the parameters
            # Possible actions of the King
            dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s)
            # Possible actions of the Queen
            dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s)
            # Possible actions of the enemy king
            dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s,
                                                     p_k1)
            # Compute features
            x_next = features(p_q1, p_k1, p_k2, dfK2, s, check)
            # Compute Q-values for the discounted factor
            Q_next, _ = Q_values(x_next, W1, W2, bias_W1, bias_W2)
            """
            FILL THE CODE
            Update the parameters of your network by applying backpropagation and Q-learning. You need to use the 
            rectified linear function as activation function (see supplementary materials). Exploit the Q value for 
            the action made. You computed previously Q values in the Q_values function. Be careful: this is not the last 
            iteration of the episode, the match continues.
            """

            a_new = np.concatenate([np.array(a_q1), np.array(a_k1)])
            allowed_a_new = np.where(a_new > 0)[0]

            allowed_q_new = Q_next[allowed_a_new]
            # print(np.random.randint(allowed_a.shape[0]))
            # a_agent = allowed_a_new[np.argmax(allowed_q_new)]

            # If the agent is using SARSA, then use the Epsilon greedy policy else use max
            if type == "SARSA":
                a_agent = allowed_a_new[np.argmax(allowed_q_new)] if not (
                    np.random.rand() < epsilon_f) else allowed_a_new[
                        np.random.randint(allowed_a_new.shape[0])]
                t = R + gamma * Q_next[a_agent]
            else:
                t = R + gamma * np.max(allowed_q_new)

            # Backpropagation: output layer -> hidden layer
            out2delta = (t - Q[a_agent]) * np.heaviside(Q, 0)
            W2[a_agent] += (eta * np.outer(out2delta, out1))[a_agent]
            bias_W2[a_agent] += (eta * out2delta)[a_agent]

            # Backpropagation: hidden layer -> input layer
            out1delta = np.dot(out2delta, W2) * np.heaviside(out1, 0)
            W1 += eta * np.outer(out1delta, x)
            bias_W1 += eta * out1delta

            errors_E[n] += (1 - alpha) * errors_E[n - 1] + alpha * (
                t - Q[a_agent])**2 if n > 0 else (t - Q[a_agent])**2
            errors[n] += ((t - Q[a_agent])**2 + n * errors[n - 1]) / (
                n + 1) if n > 0 else (t - Q[a_agent])**2
            error[n] += (t - Q[a_agent]) * (t - Q[a_agent])
            # YOUR CODE ENDS HERE
            i += 1

        # Save the number of moves and Reward averages
        R_save[n] = (R + n * R_save[n - 1]) / (n + 1) if n > 0 else R
        N_moves_save[n] = (i + n * N_moves_save[n - 1]) / (n +
                                                           1) if n > 0 else i

        R_save_exp[n] = (1 - alpha) * R_save_exp[n -
                                                 1] + alpha * R if n > 0 else R
        N_moves_save_exp[n] = (
            1 - alpha) * N_moves_save_exp[n - 1] + alpha * i if n > 0 else i

        # Save result
        results = dict()
        results["Reward_SMA"] = R_save[n]
        results["Moves_SMA"] = N_moves_save[n]
        results["Reward_EMA"] = R_save_exp[n]
        results["Moves_EMA"] = N_moves_save_exp[n]
        results["Loss"] = error[n]
        results["Loss_SMA"] = errors[n]
        results["Loss_EMA"] = errors_E[n]
        results["outcome"] = win

        # Save data as a row in a csv file named according to the experiment
        if type == "gamma":
            out_root = "Results/" + type + "-" + str(gamma) + "results.csv"
        elif type == "beta":
            out_root = "Results/" + type + "-" + str(beta) + "results.csv"
        elif type == "SARSA":
            out_root = "Results/" + type + "results.csv"
        else:
            out_root = "Results/results.csv"
        file_exists = os.path.isfile(out_root)
        with open(out_root, "a+") as f:
            fieldnames = [
                'Reward_SMA', 'Moves_SMA', 'Reward_EMA', 'Moves_EMA', "Loss",
                "Loss_SMA", "Loss_EMA", 'outcome'
            ]
            w = csv.DictWriter(f, fieldnames=fieldnames)
            if not file_exists:
                w.writeheader()  # file doesn't exist yet, write a header
            w.writerow(results)
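
# A hedged usage sketch for the parameterised main above (the call pattern follows the
# signature def main(N_episodes, type=None, gamma=0.85, beta=0.00005, seed=None); the
# specific values are illustrative only):
#   for g in (0.75, 0.85, 0.95):
#       main(N_episodes=1000, type="gamma", gamma=g, seed=42)
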
def main():
    """
    Generate a new game
    The function below generates a new chess board with King, Queen and Enemy King pieces randomly assigned so that they
    do not cause any threats to each other.
    s: a size_board x size_board matrix filled with zeros and three numbers:
    1 = location of the King
    2 = location of the Queen
    3 = location of the Enemy King
    p_k2: 1x2 vector specifying the location of the Enemy King, the first number represents the row and the second
    number the column
    p_k1: same as p_k2 but for the King
    p_q1: same as p_k2 but for the Queen
    """
    s, p_k2, p_k1, p_q1 = generate_game(size_board)
    """
    Possible actions for the Queen are the eight directions (down, up, right, left, up-right, down-left, up-left,
    down-right) multiplied by the number of squares that the Queen can cover in one movement which equals the size of
    the board - 1
    """
    possible_queen_a = (s.shape[0] - 1) * 8
    """
    Possible actions for the King are the eight directions (down, up, right, left, up-right, down-left, up-left,
    down-right)
    """
    possible_king_a = 8

    # Total number of actions for Player 1 = actions of King + actions of Queen
    N_a = possible_king_a + possible_queen_a
    """
    Possible actions of the King
    This functions returns the locations in the chessboard that the King can go
    dfK1: a size_board x size_board matrix filled with 0 and 1.
          1 = locations that the king can move to
    a_k1: a 8x1 vector specifying the allowed actions for the King (marked with 1):
          down, up, right, left, down-right, down-left, up-right, up-left
    """
    dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s)
    """
    Possible actions of the Queen
    Same as the above function but for the Queen. Here we have 8*(size_board-1) possible actions as explained above
    """
    dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s)
    """
    Possible actions of the Enemy King
    Same as the above function but for the Enemy King. Here we have 8 possible actions as explained above
    """
    dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1)
    """
    Compute the features
    x is a Nx1 vector computing a number of input features based on which the network should adapt its weights
    with board size of 4x4 this N=50
    """
    x = features(p_q1, p_k1, p_k2, dfK2, s, check)
    """
    Initialization
    Define the size of the layers and initialization
    FILL THE CODE
    Define the network, the number of the nodes of the hidden layer should be 200, you should know the rest. The weights
    should be initialised according to a uniform distribution and rescaled by the total number of connections between
    the considered two layers. For instance, if you are initializing the weights between the input layer and the hidden
    layer each weight should be divided by (n_input_layer x n_hidden_layer), where n_input_layer and n_hidden_layer
    refer to the number of nodes in the input layer and the number of nodes in the hidden layer respectively. The biases
     should be initialized with zeros.
    """
    n_input_layer = 52  # Number of neurons of the input layer. TODO: Change this value
    n_hidden_layer = 200  # Number of neurons of the hidden layer
    n_output_layer = 32  # Number of neurons of the output layer. TODO: Change this value accordingly
    """
    TODO: Define the w weights between the input and the hidden layer and the w weights between the hidden layer and the
    output layer according to the instructions. Define also the biases.
    """

    W1 = np.random.normal(scale=0.1, size=(n_input_layer, n_hidden_layer))
    W2 = np.random.normal(scale=0.1, size=(n_hidden_layer, n_output_layer))

    bias_W1 = np.zeros((1, n_hidden_layer))
    bias_W2 = np.zeros((1, n_output_layer))
    # YOUR CODES ENDS HERE

    # Network Parameters
    epsilon_0 = 0.2  #epsilon for the e-greedy policy
    beta = 0.00005  #epsilon discount factor
    gamma = 0.85  #SARSA Learning discount factor
    eta = 0.0035  #learning rate
    N_episodes = 100000  #Number of games, each game ends when we have a checkmate or a draw

    ###  Training Loop  ###

    # Directions: down, up, right, left, down-right, down-left, up-right, up-left
    # Each row specifies a direction,
    # e.g. for down we need to add +1 to the current row and +0 to current column
    map = np.array([[1, 0], [-1, 0], [0, 1], [0, -1], [1, 1], [1, -1], [-1, 1],
                    [-1, -1]])

    # THE FOLLOWING VARIABLES COULD CONTAIN THE REWARDS PER EPISODE AND THE
    # NUMBER OF MOVES PER EPISODE, FILL THEM IN THE CODE ABOVE FOR THE
    # LEARNING. OTHER WAYS TO DO THIS ARE POSSIBLE, THIS IS A SUGGESTION ONLY.

    R_save = np.zeros([N_episodes, 1])
    N_moves_save = np.zeros([N_episodes, 1])

    # END OF SUGGESTIONS

    for n in range(N_episodes):
        #print(n,W1,"W2",W2)

        epsilon_f = epsilon_0 / (
            1 + beta * n
        )  # epsilon is discounted per episode to reduce the probability of exploring

        checkmate = 0  # 0 = not a checkmate, 1 = checkmate
        draw = 0  # 0 = not a draw, 1 = draw
        i = 1  # counter for movements

        # Generate a new game
        s, p_k2, p_k1, p_q1 = generate_game(size_board)

        # Possible actions of the King
        # :return: dfK1: Degrees of Freedom of King 1, a_k1: Allowed actions for King 1, dfK1_: Squares the King1 is threatening
        dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s)
        # Possible actions of the Queen
        # :return: dfQ1: Degrees of Freedom of the Queen, a_q1: Allowed actions for the Queen, dfQ1_: Squares the Queen is threatening

        dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s)
        # Possible actions of the enemy king
        dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1)

        while checkmate == 0 and draw == 0:
            R = 0  # Reward

            # Player 1

            # Actions & allowed_actions
            # Directions (down, up, right, left, down-right, down-left, up-right, up-left)
            # in which Player 1's king and queen can move
            a = np.concatenate([np.array(a_q1), np.array(a_k1)])

            # Index positions of each allowed action in the action vector a
            allowed_a = np.where(a > 0)[0]

            # Computing Features
            x = features(p_q1, p_k1, p_k2, dfK2, s, check)

            # FILL THE CODE
            # Enter inside the Q_values function and fill it with your code.
            # You need to compute the Q values as output of your neural
            # network. You can change the input of the function by adding other
            # data, but the input of the function is suggested.

            #x = np.array([x[0:16],x[16:32],x[32:48],np.asarray(x[48]),np.asarray(x[49])])
            Q, secondWB, firstRelu, firstWB = Q_values(x, W1, W2, bias_W1,
                                                       bias_W2)
            """
            YOUR CODE STARTS HERE

            FILL THE CODE
            Implement epsilon greedy policy by using the vector a and a_allowed vector: be careful that the action must
            be chosen from the a_allowed vector. The index of this action must be remapped to the index of the vector a,
            containing all the possible actions. Create a vector called a_agent that contains the index of the action
            chosen. For instance, if a_allowed = [8, 16, 32] and you select the third action, a_agent=32 not 3.
            """

            # Highest-Q action from the network that is actually allowed
            predictedMove = 0
            sortedOutputs = np.argsort(Q)[:, ::-1]  # action indices sorted by descending Q
            for topProb in sortedOutputs[0]:
                if topProb in allowed_a:
                    predictedMove = topProb
                    break

            #Exploration vs exploitation
            eGreedy = 0
            eGreedy = int(
                np.random.rand() < epsilon_f
            )  # with probability epsilon choose action at random if epsilon=0 then always choose Greedy
            if eGreedy:
                a_agent = np.random.choice(
                    allowed_a
                )  # if epsilon > 0 (e-Greedy, chose at random with probability epsilon) choose one at random
            else:
                # exploit: take the allowed action with the highest Q-value from the network
                a_agent = predictedMove

            #THE CODE ENDS HERE.

            # Player 1 makes the action
            if a_agent < possible_queen_a:
                direction = int(np.ceil((a_agent + 1) / (size_board - 1))) - 1
                steps = a_agent - direction * (size_board - 1) + 1

                s[p_q1[0], p_q1[1]] = 0
                mov = map[direction, :] * steps
                s[p_q1[0] + mov[0], p_q1[1] + mov[1]] = 2
                p_q1[0] = p_q1[0] + mov[0]
                p_q1[1] = p_q1[1] + mov[1]
                N_moves_save[n - 1, 0] += 1

            else:
                direction = a_agent - possible_queen_a
                steps = 1

                s[p_k1[0], p_k1[1]] = 0
                mov = map[direction, :] * steps
                s[p_k1[0] + mov[0], p_k1[1] + mov[1]] = 1
                p_k1[0] = p_k1[0] + mov[0]
                p_k1[1] = p_k1[1] + mov[1]
                N_moves_save[n - 1, 0] += i

            # Compute the allowed actions for the new position

            # Possible actions of the King
            dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s)
            # Possible actions of the Queen
            dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s)
            # Possible actions of the enemy king
            dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s,
                                                     p_k1)

            # Player 2

            # Check for draw or checkmate
            if np.sum(dfK2) == 0 and dfQ1_[p_k2[0], p_k2[1]] == 1:
                # King 2 has no freedom and it is checked
                # Checkmate and collect reward
                checkmate = 1
                R = 1  # Reward for checkmate

                R_save[n - 1, 0] = R
                """
                FILL THE CODE
                Update the parameters of your network by applying backpropagation and Q-learning. You need to use the
                rectified linear function as activation function (see supplementary materials). Exploit the Q value for
                the action made. You computed previously Q values in the Q_values function. Be careful: this is the last
                iteration of the episode, the agent gave checkmate.
                """

                # ReLU derivative
                def dReLU(input):
                    return 1. * (input > 0)
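                # 1. * (input > 0) is the ReLU derivative (0 at and below zero, 1 above),
                # matching the np.heaviside(z, 0) used by the other chess examples here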

                newQ = Q.copy()

                # apply reward to q value
                newQ[0][a_agent] = R

                #backpropagation
                dL2o = Q - newQ
                dU2 = dReLU(secondWB)

                #Second layer
                gL2 = np.dot(firstRelu.T, dU2 * dL2o)
                dL2b = dL2o * dU2

                #First layer
                dL1o = np.dot(dL2o, W2.T)
                dU1 = dReLU(firstWB)

                # reshape the feature vector into an (n_input, 1) column
                newArray = np.asarray(x, dtype=float).reshape(-1, 1)

                gL1 = np.dot(newArray, dU1 * dL1o)
                dL1b = dL1o * dU1

                #Update weights and biases
                W1 -= eta * gL1
                bias_W1 -= eta * dL1b.sum(axis=0)

                W2 -= eta * gL2
                bias_W2 -= eta * dL2b.sum(axis=0)

                # THE CODE ENDS HERE

                if checkmate:
                    break

            elif np.sum(dfK2) == 0 and dfQ1_[p_k2[0], p_k2[1]] == 0:
                # King 2 has no freedom but it is not checked
                draw = 1
                R = 0.1

                R_save[n - 1, 0] += R
                """
                FILL THE CODE
                Update the parameters of your network by applying backpropagation and Q-learning. You need to use the
                rectified linear function as activation function (see supplementary materials). Exploit the Q value for
                the action made. You computed previously Q values in the Q_values function. Be careful: this is the last
                iteration of the episode, it is a draw.
                """

                # ReLU derivative
                def dReLU(input):
                    return 1. * (input > 0)

                newQ = Q.copy()

                # apply reward to q value
                newQ[0][a_agent] = R

                #backpropagation
                dL2o = Q - newQ
                dU2 = dReLU(secondWB)

                #Second layer
                gL2 = np.dot(firstRelu.T, dU2 * dL2o)
                dL2b = dL2o * dU2

                #First layer
                dL1o = np.dot(dL2o, W2.T)
                dU1 = dReLU(firstWB)

                # reshape the feature vector into an (n_input, 1) column
                newArray = np.asarray(x, dtype=float).reshape(-1, 1)

                gL1 = np.dot(newArray, dU1 * dL1o)
                dL1b = dL1o * dU1

                #Update weights and biases
                W1 -= eta * gL1
                bias_W1 -= eta * dL1b.sum(axis=0)

                W2 -= eta * gL2
                bias_W2 -= eta * dL2b.sum(axis=0)

                # YOUR CODE ENDS HERE

                if draw:
                    break
            else:
                # Move enemy King randomly to a safe location
                allowed_enemy_a = np.where(a_k2 > 0)[0]
                a_help = int(
                    np.ceil(np.random.rand() * allowed_enemy_a.shape[0]) - 1)
                a_enemy = allowed_enemy_a[a_help]

                direction = a_enemy
                steps = 1

                s[p_k2[0], p_k2[1]] = 0
                mov = map[direction, :] * steps
                s[p_k2[0] + mov[0], p_k2[1] + mov[1]] = 3

                p_k2[0] = p_k2[0] + mov[0]
                p_k2[1] = p_k2[1] + mov[1]
                N_moves_save[n - 1, 0] += i
            # Update the parameters

            # Possible actions of the King
            dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s)
            # Possible actions of the Queen
            dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s)
            # Possible actions of the enemy king
            dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s,
                                                     p_k1)
            # Compute features
            x_next = features(p_q1, p_k1, p_k2, dfK2, s, check)
            # Compute Q-values for the discounted factor
            Q_next, _, _, _ = Q_values(x_next, W1, W2, bias_W1, bias_W2)
            """
            FILL THE CODE
            Update the parameters of your network by applying backpropagation and Q-learning. You need to use the
            rectified linear function as activation function (see supplementary materials). Exploit the Q value for
            the action made. You computed previously Q values in the Q_values function. Be careful: this is not the last
            iteration of the episode, the match continues.
            """

            # Uncomment this to use SARSA algorithm
            #Max Qvalue from the network that the player can move
            #predictedMove = 0
            #sortedOutputs = np.argsort(Q)[::-1]
            #for topProb in sortedOutputs[0]:
            #    if topProb in allowed_a:
            #        predictedMove = topProb
            #        break;

            #Exploration vs exploitation
            #eGreedy = 0
            #eGreedy = int(np.random.rand() < epsilon_f)  # with probability epsilon choose action at random if epsilon=0 then always choose Greedy
            #if eGreedy:
            #    a_agent = np.random.choice(allowed_a)  # if epsilon > 0 (e-Greedy, chose at random with probability epsilon) choose one at random
            #else:

            #    a_agent = predictedMove # will result will be Qvalue outputted from network

            # ReLU derivative
            def dReLU(input):
                return 1. * (input > 0)

            newQ = Q.copy()
            modelPred = Q_next

            # Q-learning target for the action taken: R + gamma * max_a' Q_next(a')
            newQ[0][a_agent] = R + gamma * np.max(modelPred)

            #backpropagation
            dL2o = Q - newQ
            dU2 = dReLU(secondWB)

            #Second layer
            gL2 = np.dot(firstRelu.T, dU2 * dL2o)
            dL2b = dL2o * dU2

            #First layer
            dL1o = np.dot(dL2o, W2.T)
            dU1 = dReLU(firstWB)

            # reshape the feature vector into an (n_input, 1) column
            newArray = np.asarray(x, dtype=float).reshape(-1, 1)

            gL1 = np.dot(newArray, dU1 * dL1o)
            dL1b = dL1o * dU1

            W1 -= eta * gL1
            bias_W1 -= eta * dL1b.sum(axis=0)

            W2 -= eta * gL2
            bias_W2 -= eta * dL2b.sum(axis=0)

            # YOUR CODE ENDS HERE

            i += 1

    fontSize = 18
    repetitions = 1  # should be integer, greater than 0; for statistical reasons
    totalRewards = np.zeros((repetitions, N_episodes))
    totalMoves = np.zeros((repetitions, N_episodes))

    totalRewards[0, :] = R_save.T
    totalMoves[0, :] = N_moves_save.T
    print(totalRewards.mean())

    # flatten the final feature vector into a column (not used below)
    newArray2 = np.asarray(x, dtype=float).reshape(-1, 1)

    # Exponentially weighted moving average with alpha input
    def ewma(v, a):

        # Conform to array
        v = np.array(v)
        t = v.size

        # initialise matrix with 1-alpha
        # and a matrix to increase the weights
        wU = np.ones(shape=(t, t)) * (1 - a)
        p = np.vstack([np.arange(i, i - t, -1) for i in range(t)])

        # Produce new weight matrix
        z = np.tril(wU**p, 0)

        # return Exponentially moved average
        return np.dot(z, v[::np.newaxis].T) / z.sum(axis=1)

    # Plot the average reward as a function of the number of trials --> the average has to be performed over the episodes
    plt.figure()
    means = np.mean(ewma(totalRewards, 0.0001), axis=0)
    errors = 2 * np.std(
        ewma(totalRewards, 0.0001), axis=0
    )  # errorbars equal twice the standard deviation across repetitions

    plt.plot(np.arange(N_episodes), means)
    plt.xlabel('Episode', fontsize=fontSize)
    plt.ylabel('Average Reward', fontsize=fontSize)
    plt.axis((-(N_episodes / 10.0), N_episodes, -0.1, 1.1))
    plt.tick_params(axis='both', which='major', labelsize=14)
    plt.show()

    plt.figure()
    means2 = np.mean(totalMoves, axis=0)
    errors = 2 * np.std(
        ewma(totalMoves, 0.0001), axis=0
    )  # errorbars equal twice the standard deviation across repetitions

    plt.plot(np.arange(N_episodes), means2)
    plt.xlabel('Episode', fontsize=fontSize)
    plt.ylabel('Moves', fontsize=fontSize)
    plt.axis((-(N_episodes / 10.0), N_episodes, -0.1, 1.1))
    plt.tick_params(axis='both', which='major', labelsize=14)
    plt.show()
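
# The main above unpacks Q_values(x, W1, W2, bias_W1, bias_W2) into four values
# (Q, secondWB, firstRelu, firstWB). A minimal sketch of a forward pass with that return
# shape, assuming numpy is imported as np, row-vector activations, W1 of shape
# (n_input, n_hidden) and W2 of shape (n_hidden, n_output) as initialised above
# (the name and details are assumptions, not the original implementation):
def q_values_four_outputs_sketch(x, W1, W2, bias_W1, bias_W2):
    x_row = np.asarray(x, dtype=float).reshape(1, -1)
    firstWB = np.dot(x_row, W1) + bias_W1       # hidden pre-activation
    firstRelu = np.maximum(firstWB, 0)          # hidden ReLU activation
    secondWB = np.dot(firstRelu, W2) + bias_W2  # output pre-activation
    Q = np.maximum(secondWB, 0)                 # rectified Q-values, one per action
    return Q, secondWB, firstRelu, firstWB
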
def main():
    """
    Generate a new game
    The function below generates a new chess board with King, Queen and Enemy King pieces randomly assigned so that they
    do not cause any threats to each other.
    s: a size_board x size_board matrix filled with zeros and three numbers:
    1 = location of the King
    2 = location of the Queen
    3 = location of the Enemy King
    p_k2: 1x2 vector specifying the location of the Enemy King, the first number represents the row and the second
    number the column
    p_k1: same as p_k2 but for the King
    p_q1: same as p_k2 but for the Queen
    """
    s, p_k2, p_k1, p_q1 = generate_game(size_board)

    """
    Possible actions for the Queen are the eight directions (down, up, right, left, up-right, down-left, up-left, 
    down-right) multiplied by the number of squares that the Queen can cover in one movement which equals the size of 
    the board - 1
    """
    possible_queen_a = (s.shape[0] - 1) * 8
    """
    Possible actions for the King are the eight directions (down, up, right, left, up-right, down-left, up-left, 
    down-right)
    """
    possible_king_a = 8

    # Total number of actions for Player 1 = actions of King + actions of Queen
    N_a = possible_king_a + possible_queen_a

    """
    Possible actions of the King
    This functions returns the locations in the chessboard that the King can go
    dfK1: a size_board x size_board matrix filled with 0 and 1.
          1 = locations that the king can move to
    a_k1: a 8x1 vector specifying the allowed actions for the King (marked with 1): 
          down, up, right, left, down-right, down-left, up-right, up-left
    """
    dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s)
    """
    Possible actions of the Queen
    Same as the above function but for the Queen. Here we have 8*(size_board-1) possible actions as explained above
    """
    dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s)
    """
    Possible actions of the Enemy King
    Same as the above function but for the Enemy King. Here we have 8 possible actions as explained above
    """
    dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1)

    """
    Compute the features
    x is a Nx1 vector computing a number of input features based on which the network should adapt its weights  
    with board size of 4x4 this N=50
    """
    x = features(p_q1, p_k1, p_k2, dfK2, s, check)

    """
    Initialization
    Define the size of the layers and initialization
    FILL THE CODE
    Define the network, the number of the nodes of the hidden layer should be 200, you should know the rest. The weights 
    should be initialised according to a uniform distribution and rescaled by the total number of connections between 
    the considered two layers. For instance, if you are initializing the weights between the input layer and the hidden 
    layer each weight should be divided by (n_input_layer x n_hidden_layer), where n_input_layer and n_hidden_layer 
    refer to the number of nodes in the input layer and the number of nodes in the hidden layer respectively. The biases
     should be initialized with zeros.
    """
    n_input_layer = 50  # Number of neurons of the input layer. TODO: Change this value
    n_hidden_layer = 200  # Number of neurons of the hidden layer
    n_output_layer = 32  # Number of neurons of the output layer. TODO: Change this value accordingly

    """
    TODO: Define the w weights between the input and the hidden layer and the w weights between the hidden layer and the 
    output layer according to the instructions. Define also the biases.
    """

    w_input_hidden = np.random.rand(n_hidden_layer,n_input_layer)/(n_input_layer * n_hidden_layer)
    normW1 = np.sqrt(np.diag(w_input_hidden.dot(w_input_hidden.T)))
    normW1 = normW1.reshape(n_hidden_layer, -1)
    w_input_hidden = w_input_hidden/normW1
    
    w_hidden_output = np.random.rand(n_output_layer,n_hidden_layer)/(n_hidden_layer * n_output_layer)
    normW2 = np.sqrt(np.diag(w_hidden_output.dot(w_hidden_output.T)))
    normW2 = normW2.reshape(n_output_layer, -1)
    w_hidden_output = w_hidden_output/normW2
    
    bias_W1 = np.zeros((n_hidden_layer))
    bias_W2 = np.zeros((n_output_layer))


    # YOUR CODES ENDS HERE

    # Network Parameters
    epsilon_0 = 0.2   #epsilon for the e-greedy policy
    beta = 0.00005    #epsilon discount factor
    gamma = 0.85      #SARSA Learning discount factor
    eta = 0.0035      #learning rate
    N_episodes = 40000 #Number of games, each game ends when we have a checkmate or a draw
    alpha = 1/10000
    ###  Training Loop  ###

    # Directions: down, up, right, left, down-right, down-left, up-right, up-left
    # Each row specifies a direction, 
    # e.g. for down we need to add +1 to the current row and +0 to current column
    map = np.array([[1, 0],
                    [-1, 0],
                    [0, 1],
                    [0, -1],
                    [1, 1],
                    [1, -1],
                    [-1, 1],
                    [-1, -1]])
    
    # THE FOLLOWING VARIABLES COULD CONTAIN THE REWARDS PER EPISODE AND THE
    # NUMBER OF MOVES PER EPISODE, FILL THEM IN THE CODE ABOVE FOR THE
    # LEARNING. OTHER WAYS TO DO THIS ARE POSSIBLE, THIS IS A SUGGESTION ONLY.    

#    R_save = np.zeros([N_episodes, 1])
    R_save = np.zeros([N_episodes+1, 1])
    N_moves_save = np.zeros([N_episodes+1, 1])
    
    # END OF SUGGESTIONS
    

    for n in tqdm(range(N_episodes)):
#    for n in (range(N_episodes)):
        epsilon_f = epsilon_0 / (1 + beta * n)  # epsilon is discounted per episode to reduce the probability of exploring
        checkmate = 0  # 0 = not a checkmate, 1 = checkmate
        draw = 0  # 0 = not a draw, 1 = draw
        i = 1  # counter for movements

        # Generate a new game
        s, p_k2, p_k1, p_q1 = generate_game(size_board)

        # Possible actions of the King
        dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s)
        # Possible actions of the Queen
        dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s)
        # Possible actions of the enemy king
        dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1)
        
        
        Start = np.array([np.random.randint(size_board),np.random.randint(size_board)])   #random start
        s_start = np.ravel_multi_index(Start,dims=(size_board,size_board),order='F')      #conversion in single index
        s_index = s_start

        while checkmate == 0 and draw == 0:
            R = 0  # Reward

            # Player 1

            # Actions & allowed_actions
            a = np.concatenate([np.array(a_q1), np.array(a_k1)])
            allowed_a = np.where(a > 0)[0]
#            print(a)
#            print(allowed_a)
            # Computing Features
            x = features(p_q1, p_k1, p_k2, dfK2, s, check)

            # FILL THE CODE 
            # Enter inside the Q_values function and fill it with your code.
            # You need to compute the Q values as output of your neural
            # network. You can change the input of the function by adding other
            # data, but the input of the function is suggested. 
            
#            states_matrix = np.eye(size_board*size_board)
    
#            input_matrix = states_matrix[:,s_index].reshape((size_board*size_board),1)
            
            Q, out1 = Q_values(x, w_input_hidden, w_hidden_output, bias_W1, bias_W2)
#            print(Q)
#            print(np.argsort(-Q))
#            print(len(Q))
            """
            YOUR CODE STARTS HERE
            
            FILL THE CODE
            Implement epsilon greedy policy by using the vector a and a_allowed vector: be careful that the action must
            be chosen from the a_allowed vector. The index of this action must be remapped to the index of the vector a,
            containing all the possible actions. Create a vector called a_agent that contains the index of the action 
            chosen. For instance, if a_allowed = [8, 16, 32] and you select the third action, a_agent=32 not 3.
            """
            
            greedy = (np.random.rand() > epsilon_f)
            
            if greedy:
#                a_agent = np.random.choice(allowed_a)

                max_sort = np.argsort(-Q)
                # walk the actions in decreasing Q order and take the first allowed one;
                # a separate loop variable keeps the move counter i from being clobbered
                for q_idx in max_sort:
                    if q_idx in allowed_a:
                        a_agent = q_idx
                        break
                    else:
                        a_agent = np.random.choice(allowed_a)
#                if np.argmax(Q) in allowed_a:
#                    a_agent = np.argmax(Q)
#                else:
#                a_agent = np.argmax(Q)
                    
            else:
                a_agent = np.random.choice(allowed_a)
#                a_agent = a.index(a_agent)
                
#            if action in allowed_a:
#                a_agent = 

#            a_agent = 1  # CHANGE THIS VALUE BASED ON YOUR CODE TO USE EPSILON GREEDY POLICY
            
            #THE CODE ENDS HERE. 

#            print(a_agent)

            # Player 1 makes the action
            if a_agent < possible_queen_a:
                direction = int(np.ceil((a_agent + 1) / (size_board - 1))) - 1
                steps = a_agent - direction * (size_board - 1) + 1

                s[p_q1[0], p_q1[1]] = 0
                mov = map[direction, :] * steps
                s[p_q1[0] + mov[0], p_q1[1] + mov[1]] = 2
                p_q1[0] = p_q1[0] + mov[0]
                p_q1[1] = p_q1[1] + mov[1]

            else:
                direction = a_agent - possible_queen_a
                steps = 1

                s[p_k1[0], p_k1[1]] = 0
                mov = map[direction, :] * steps
                s[p_k1[0] + mov[0], p_k1[1] + mov[1]] = 1
                p_k1[0] = p_k1[0] + mov[0]
                p_k1[1] = p_k1[1] + mov[1]

            # Compute the allowed actions for the new position

            # Possible actions of the King
            dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s)
            # Possible actions of the Queen
            dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s)
            # Possible actions of the enemy king
            dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1)

            # Player 2

            # Check for draw or checkmate
            if np.sum(dfK2) == 0 and dfQ1_[p_k2[0], p_k2[1]] == 1:
                # King 2 has no freedom and it is checked
                # Checkmate and collect reward
                checkmate = 1
                R = 1  # Reward for checkmate
                t = R + (gamma * max(Q))

                """
                FILL THE CODE
                Update the parameters of your network by applying backpropagation and Q-learning. You need to use the 
                rectified linear function as activation function (see supplementary materials). Exploit the Q value for 
                the action made. You computed previously Q values in the Q_values function. Be careful: this is the last 
                iteration of the episode, the agent gave checkmate.
                """

                deltaOut = (t-Q) * np.heaviside(Q, 0)
                w_hidden_output += eta * np.outer(deltaOut, out1)
                bias_W2 = bias_W2 + eta * deltaOut
                
                deltaHid = np.dot(deltaOut,w_hidden_output) * np.heaviside(out1, 0)
                w_input_hidden = w_input_hidden + eta * np.outer(deltaHid, x)
                bias_W1 = bias_W1 + eta * deltaHid
                
                R_save[n+1, 0] = alpha * R + (1-alpha) * R_save[n, 0]
                N_moves_save[n+1, 0] = alpha * i + (1-alpha) * N_moves_save[n, 0]
                # THE CODE ENDS HERE

                if checkmate:
                    break

            elif np.sum(dfK2) == 0 and dfQ1_[p_k2[0], p_k2[1]] == 0:
                # King 2 has no freedom but it is not checked
                draw = 1
                R = 0.1
#                print(Q)
                t = R + (gamma * max(Q))
                
                """
                FILL THE CODE
                Update the parameters of your network by applying backpropagation and Q-learning. You need to use the 
                rectified linear function as activation function (see supplementary materials). Exploit the Q value for 
                the action made. You computed previously Q values in the Q_values function. Be careful: this is the last 
                iteration of the episode, it is a draw.
                """

                deltaOut = (t-Q) * np.heaviside(Q, 0)
                w_hidden_output += eta * np.outer(deltaOut, out1)
                bias_W2 = bias_W2 + eta * deltaOut
                
                deltaHid = np.dot(deltaOut,w_hidden_output) * np.heaviside(out1, 0)
                w_input_hidden = w_input_hidden + eta * np.outer(deltaHid, x)
                bias_W1 = bias_W1 + eta * deltaHid
                
                R_save[n+1, 0] = alpha * R + (1-alpha) * R_save[n, 0]
                N_moves_save[n+1, 0] = alpha * i + (1-alpha) * N_moves_save[n, 0]

                # YOUR CODE ENDS HERE
                

                if draw:
                    break

            else:
                # Move enemy King randomly to a safe location
                allowed_enemy_a = np.where(a_k2 > 0)[0]
                a_help = int(np.ceil(np.random.rand() * allowed_enemy_a.shape[0]) - 1)
                a_enemy = allowed_enemy_a[a_help]

                direction = a_enemy
                steps = 1

                s[p_k2[0], p_k2[1]] = 0
                mov = map[direction, :] * steps
                s[p_k2[0] + mov[0], p_k2[1] + mov[1]] = 3

                p_k2[0] = p_k2[0] + mov[0]
                p_k2[1] = p_k2[1] + mov[1]

            # Update the parameters

            # Possible actions of the King
            dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s)
            # Possible actions of the Queen
            dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s)
            # Possible actions of the enemy king
            dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1)
            # Compute features
            x_next = features(p_q1, p_k1, p_k2, dfK2, s, check)
            # Compute Q-values for the discounted factor
#            Q_next = Q_values(x_next, W1, W2, bias_W1, bias_W2)
            Q_next, demon = Q_values(x_next, w_input_hidden, w_hidden_output, bias_W1, bias_W2)
            t = R + (gamma * max(Q_next))
            """
            FILL THE CODE
            Update the parameters of your network by applying backpropagation and Q-learning. You need to use the 
            rectified linear function as activation function (see supplementary materials). Exploit the Q value for 
            the action made. You computed previously Q values in the Q_values function. Be careful: this is not the last 
            iteration of the episode, the match continues.
            """

            # Backpropagate with the pre-update weights, using the features x of the
            # state in which the action was taken (not x_next)
            deltaOut = (t - Q) * np.heaviside(Q, 0)
            deltaHid = np.dot(deltaOut, w_hidden_output) * np.heaviside(out1, 0)

            w_hidden_output += eta * np.outer(deltaOut, out1)
            bias_W2 += eta * deltaOut  # accumulate rather than overwrite the bias
            w_input_hidden += eta * np.outer(deltaHid, x)
            bias_W1 += eta * deltaHid

            # YOUR CODE ENDS HERE
            i += 1
#        print(R)
        R_save[n+1, 0] = alpha * R + (1-alpha) * R_save[n, 0]
        N_moves_save[n+1, 0] = alpha * i + (1-alpha) * N_moves_save[n, 0]
    
    return R_save, N_moves_save
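

# The snippets in this listing repeatedly call Q_values(...) without showing its body
# ("Enter inside the Q_values function and fill it with your code"). As a hedged,
# illustrative sketch only: a forward pass consistent with the call
# Q, out1 = Q_values(x, W1, W2, bias_W1, bias_W2) used below, assuming numpy is
# imported as np, W1 has shape (n_hidden, n_input), W2 has shape (n_output, n_hidden),
# and the rectified linear function is used in both layers. The real assignment code
# may differ.
import numpy as np

def Q_values_sketch(x, W1, W2, bias_W1, bias_W2):
    # Hidden layer: linear combination followed by ReLU
    out1 = np.maximum(np.dot(W1, x) + bias_W1, 0)
    # Output layer: one ReLU unit per action, giving the Q value of each action
    Q = np.maximum(np.dot(W2, out1) + bias_W2, 0)
    return Q, out1
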
def main():
    """
    Generate a new game
    The function below generates a new chess board with King, Queen and Enemy King pieces randomly assigned so that they
    do not cause any threats to each other.
    s: a size_board x size_board matrix filled with zeros and three numbers:
    1 = location of the King
    2 = location of the Queen
    3 = location of the Enemy King
    p_k2: 1x2 vector specifying the location of the Enemy King, the first number represents the row and the second
    number the column
    p_k1: same as p_k2 but for the King
    p_q1: same as p_k2 but for the Queen
    """
    s, p_k2, p_k1, p_q1 = generate_game(size_board)

    """
    Possible actions for the Queen are the eight directions (down, up, right, left, up-right, down-left, up-left, 
    down-right) multiplied by the number of squares that the Queen can cover in one movement which equals the size of 
    the board - 1
    """
    possible_queen_a = (s.shape[0] - 1) * 8
    """
    Possible actions for the King are the eight directions (down, up, right, left, up-right, down-left, up-left, 
    down-right)
    """
    possible_king_a = 8

    # Total number of actions for Player 1 = actions of King + actions of Queen
    N_a = possible_king_a + possible_queen_a

    """
    Possible actions of the King
    This function returns the locations on the chessboard that the King can move to
    dfK1: a size_board x size_board matrix filled with 0 and 1.
          1 = locations that the king can move to
    a_k1: an 8x1 vector specifying the allowed actions for the King (marked with 1): 
          down, up, right, left, down-right, down-left, up-right, up-left
    """
    dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s)
    """
    Possible actions of the Queen
    Same as the above function but for the Queen. Here we have 8*(size_board-1) possible actions as explained above
    """
    dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s)
    """
    Possible actions of the Enemy King
    Same as the above function but for the Enemy King. Here we have 8 possible actions as explained above
    """
    dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1)

    """
    Compute the features
    x is a Nx1 vector computing a number of input features based on which the network should adapt its weights  
    with board size of 4x4 this N=50
    """
    x = features(p_q1, p_k1, p_k2, dfK2, s, check)

    """
    Initialization
    Define the size of the layers and initialization
    FILL THE CODE
    Define the network, the number of the nodes of the hidden layer should be 200, you should know the rest. The weights 
    should be initialised according to a uniform distribution and rescaled by the total number of connections between 
    the considered two layers. For instance, if you are initializing the weights between the input layer and the hidden 
    layer each weight should be divided by (n_input_layer x n_hidden_layer), where n_input_layer and n_hidden_layer 
    refer to the number of nodes in the input layer and the number of nodes in the hidden layer respectively. The biases
     should be initialized with zeros.
    """
    n_input_layer = 50  # Number of neurons of the input layer.
    n_hidden_layer = 200  # Number of neurons of the hidden layer
    n_output_layer = 32  # Number of neurons of the output layer.

    """
    TODO: Define the w weights between the input and the hidden layer and the w weights between the hidden layer and the 
    output layer according to the instructions. Define also the biases.
    """
    
    
    # Initialise weights from a uniform distribution and normalise each row so it sums to 1

    W1 = np.random.uniform(0, 1, (n_hidden_layer, n_input_layer))
    W1 /= W1.sum(axis=1, keepdims=True)

    W2 = np.random.uniform(0, 1, (n_output_layer, n_hidden_layer))
    W2 /= W2.sum(axis=1, keepdims=True)
    

    # initialises biases with zeros
    
    bias_W1=np.zeros((n_hidden_layer,))
    bias_W2=np.zeros((n_output_layer,))


    # YOUR CODES ENDS HERE

    # Network Parameters
    epsilon_0 = 0.2   #epsilon for the e-greedy policy
    beta = 0.00005    #epsilon discount factor
    gamma = 0.85      #SARSA Learning discount factor
    eta = 0.0035      #learning rate
    Alpha = 0.0001
    N_episodes = 50000 #Number of games, each game ends when we have a checkmate or a draw

    ###  Training Loop  ###

    # Directions: down, up, right, left, down-right, down-left, up-right, up-left
    # Each row specifies a direction, 
    # e.g. for down we need to add +1 to the current row and +0 to current column
    map = np.array([[1, 0],
                    [-1, 0],
                    [0, 1],
                    [0, -1],
                    [1, 1],
                    [1, -1],
                    [-1, 1],
                    [-1, -1]])
    
    # THE FOLLOWING VARIABLES COULD CONTAIN THE REWARDS PER EPISODE AND THE
    # NUMBER OF MOVES PER EPISODE, FILL THEM IN THE CODE ABOVE FOR THE
    # LEARNING. OTHER WAYS TO DO THIS ARE POSSIBLE, THIS IS A SUGGESTION ONLY.    

    
    #variables to track the moves per game and reward per game
    R_save = np.zeros([N_episodes])
    N_moves_save = np.zeros([N_episodes])


    Average_Rewards = np.zeros([N_episodes])
    Average_moves = np.zeros([N_episodes])

    for n in range(N_episodes):
        epsilon_f = epsilon_0 / (1 + beta * n)  # epsilon decays each episode so the agent explores less over time
        checkmate = 0  # 0 = not a checkmate, 1 = checkmate
        draw = 0  # 0 = not a draw, 1 = draw
        i = 1  # counter for movements

        # Generate a new game
        s, p_k2, p_k1, p_q1 = generate_game(size_board)

        # Possible actions of the King
        dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s)
        # Possible actions of the Queen
        dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s)
        # Possible actions of the enemy king
        dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1)

        #variable to store number of moves in a game
        Moves_Counter = 0

        while checkmate == 0 and draw == 0:
            R = 0  # Reward

            # Player 1

            # Actions & allowed_actions
            a = np.concatenate([np.array(a_q1), np.array(a_k1)])
            allowed_a = np.where(a > 0)[0]

            # Computing Features
            x = features(p_q1, p_k1, p_k2, dfK2, s, check)

            # FILL THE CODE 
            # Enter inside the Q_values function and fill it with your code.
            # You need to compute the Q values as output of your neural
            # network. You can change the input of the function by adding other
            # data, but the input of the function is suggested. 
            Q, out1 = Q_values(x, W1, W2, bias_W1, bias_W2)

            """
            YOUR CODE STARTS HERE
            
            FILL THE CODE
            Implement epsilon greedy policy by using the vector a and a_allowed vector: be careful that the action must
            be chosen from the a_allowed vector. The index of this action must be remapped to the index of the vector a,
            containing all the possible actions. Create a vector called a_agent that contains the index of the action 
            chosen. For instance, if a_allowed = [8, 16, 32] and you select the third action, a_agent=32 not 3.
            """
            
            
            # create a list to hold the Q values of the allowed actions
            Possible_Action = []

            # epsilon-greedy policy implementation
            Greedy = int(np.random.rand() > epsilon_f)
            if Greedy:
                # collect the Q values of the allowed actions
                # (loop variable renamed so the move counter i is not clobbered)
                for idx in allowed_a:
                    Possible_Action.append(Q[idx])

                # index of the highest Q value among the allowed actions
                best = Possible_Action.index(max(Possible_Action))
                # map that index back onto the full action vector
                action = allowed_a[best]
            else:
                # pick a random allowed action
                action = np.random.choice(allowed_a)

            # selects action as that chosen by epsilon greedy
            a_agent = action  
            #THE CODE ENDS HERE. 


            # Player 1 makes the action
            if a_agent < possible_queen_a:
                direction = int(np.ceil((a_agent + 1) / (size_board - 1))) - 1
                steps = a_agent - direction * (size_board - 1) + 1

                s[p_q1[0], p_q1[1]] = 0
                mov = map[direction, :] * steps
                s[p_q1[0] + mov[0], p_q1[1] + mov[1]] = 2
                p_q1[0] = p_q1[0] + mov[0]
                p_q1[1] = p_q1[1] + mov[1]

            else:
                direction = a_agent - possible_queen_a
                steps = 1

                s[p_k1[0], p_k1[1]] = 0
                mov = map[direction, :] * steps
                s[p_k1[0] + mov[0], p_k1[1] + mov[1]] = 1
                p_k1[0] = p_k1[0] + mov[0]
                p_k1[1] = p_k1[1] + mov[1]
                
                #increments move counter
                Moves_Counter += 1

            # Compute the allowed actions for the new position

            # Possible actions of the King
            dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s)
            # Possible actions of the Queen
            dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s)
            # Possible actions of the enemy king
            dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1)

            # Player 2

            # Check for draw or checkmate
            if np.sum(dfK2) == 0 and dfQ1_[p_k2[0], p_k2[1]] == 1:
                # King 2 has no freedom and it is checked
                # Checkmate and collect reward
                checkmate = 1
                R = 1  # Reward for checkmate

                """
                FILL THE CODE
                Update the parameters of your network by applying backpropagation and Q-learning. You need to use the 
                rectified linear function as activation function (see supplementary materials). Exploit the Q value for 
                the action made. You computed previously Q values in the Q_values function. Be careful: this is the last 
                iteration of the episode, the agent gave checkmate.
                """
                
                # Backpropagation: output layer -> hidden layer (TD error on the chosen action)
                out2delta = (R - Q[a_agent]) * np.heaviside(Q[a_agent], 0)
                # Backpropagation: hidden layer -> input layer (use the pre-update weights)
                out1delta = np.dot(W2[a_agent], out2delta) * np.heaviside(out1, 0)

                # Step in the direction that moves Q[a_agent] towards the target
                W2[a_agent] = W2[a_agent] + (eta * out2delta * out1)
                bias_W2[a_agent] = bias_W2[a_agent] + (eta * out2delta)
                W1 = W1 + (eta * np.outer(out1delta, x))
                bias_W1 = bias_W1 + (eta * out1delta)

                #set the reward for the game
                R_save[n] = R
                
                #calculate the running average of the reward per game
                Average_Rewards[n] = np.mean(R_save[:n + 1])  # include the current episode (avoids an empty slice at n = 0)
                
                #increments move counter
                Moves_Counter += 1
                
                #set the number of moves for the game
                N_moves_save[n] = Moves_Counter
                #calculate the running average of the moves per game
                Average_moves[n] = np.mean(N_moves_save[:n + 1])  # include the current episode
                
                #calculate the exponential moving average of the reward
                if n > 0:
                    R_save[n] = ((1-Alpha) * R_save[n-1]) + (Alpha*R_save[n])
                    
                # THE CODE ENDS HERE

                if checkmate:
                    break

            elif np.sum(dfK2) == 0 and dfQ1_[p_k2[0], p_k2[1]] == 0:
                # King 2 has no freedom but it is not checked
                draw = 1
                R = 0.1

                """
                FILL THE CODE
                Update the parameters of your network by applying backpropagation and Q-learning. You need to use the 
                rectified linear function as activation function (see supplementary materials). Exploit the Q value for 
                the action made. You computed previously Q values in the Q_values function. Be careful: this is the last 
                iteration of the episode, it is a draw.
                """
                
                
                # Backpropagation: output layer -> hidden layer (TD error on the chosen action)
                out2delta = (R - Q[a_agent]) * np.heaviside(Q[a_agent], 0)
                # Backpropagation: hidden layer -> input layer (use the pre-update weights)
                out1delta = np.dot(W2[a_agent], out2delta) * np.heaviside(out1, 0)

                # Step in the direction that moves Q[a_agent] towards the target
                W2[a_agent] = W2[a_agent] + (eta * out2delta * out1)
                bias_W2[a_agent] = bias_W2[a_agent] + (eta * out2delta)
                W1 = W1 + (eta * np.outer(out1delta, x))
                bias_W1 = bias_W1 + (eta * out1delta)
                

                #set the reward for the game
                R_save[n] = R
                
                #calculate the running average of the reward per game
                Average_Rewards[n] = np.mean(R_save[:n + 1])  # include the current episode (avoids an empty slice at n = 0)
                
                #increments move counter
                Moves_Counter += 1
                
                #set the number of moves for the game
                N_moves_save[n] = Moves_Counter
                #calculate the running average of the moves per game
                Average_moves[n] = np.mean(N_moves_save[:n + 1])  # include the current episode
                
                #calculate the exponential moving average of the reward
                if n > 0:
                    R_save[n] = ((1-Alpha) * R_save[n-1]) + (Alpha*R_save[n])

                # YOUR CODE ENDS HERE
                if draw:
                    break

            else:
                # Move enemy King randomly to a safe location
                allowed_enemy_a = np.where(a_k2 > 0)[0]
                a_help = int(np.ceil(np.random.rand() * allowed_enemy_a.shape[0]) - 1)
                a_enemy = allowed_enemy_a[a_help]

                direction = a_enemy
                steps = 1

                s[p_k2[0], p_k2[1]] = 0
                mov = map[direction, :] * steps
                s[p_k2[0] + mov[0], p_k2[1] + mov[1]] = 3

                p_k2[0] = p_k2[0] + mov[0]
                p_k2[1] = p_k2[1] + mov[1]


            # Update the parameters

            # Possible actions of the King
            dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s)
            # Possible actions of the Queen
            dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s)
            # Possible actions of the enemy king
            dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1)
            # Compute features
            x_next = features(p_q1, p_k1, p_k2, dfK2, s, check)
            # Compute Q-values for the discounted factor
            Q_next, _ = Q_values(x_next, W1, W2, bias_W1, bias_W2)

            """
            FILL THE CODE
            Update the parameters of your network by applying backpropagation and Q-learning. You need to use the 
            rectified linear function as activation function (see supplementary materials). Exploit the Q value for 
            the action made. You computed previously Q values in the Q_values function. Be careful: this is not the last 
            iteration of the episode, the match continues.
            """
            
            #increments move counter
            Moves_Counter += 1
            
            #set new actions and allowed actions
            SARSA_a = np.concatenate([np.array(a_q1), np.array(a_k1)])
            allowed_a = np.where(SARSA_a > 0)[0]
            
            # create a list to hold the Q values of the allowed actions
            Possible_Action = []

            # epsilon-greedy policy implementation
            Greedy = int(np.random.rand() > epsilon_f)
            if Greedy:
                # collect the Q values of the allowed actions
                # (loop variable renamed so the move counter i is not clobbered)
                for idx in allowed_a:
                    Possible_Action.append(Q[idx])

                # index of the highest Q value among the allowed actions
                best = Possible_Action.index(max(Possible_Action))
                # map that index back onto the full action vector
                action = allowed_a[best]
            else:
                # pick a random allowed action
                action = np.random.choice(allowed_a)

            # selects new action as that chosen by epsilon greedy
            a_agent = action    
            
            # Backpropagation: output layer -> hidden layer (TD error with the bootstrapped target)
            out2delta = (R + (gamma * np.max(Q_next)) - Q[a_agent]) * np.heaviside(Q[a_agent], 0)
            # Backpropagation: hidden layer -> input layer (use the pre-update weights)
            out1delta = np.dot(W2[a_agent], out2delta) * np.heaviside(out1, 0)

            # Step in the direction that moves Q[a_agent] towards the target
            W2[a_agent] = W2[a_agent] + (eta * out2delta * out1)
            bias_W2[a_agent] = bias_W2[a_agent] + (eta * out2delta)
            W1 = W1 + (eta * np.outer(out1delta, x))
            bias_W1 = bias_W1 + (eta * out1delta)

            # YOUR CODE ENDS HERE
            i += 1
    
    fontSize = 18

    print("Results for SARSA learning:")
    
    print("running average of the number of moves per game:")
    
    # plots the running average of the number of moves per game
    plt.plot(Average_moves)
    #set axis labels
    plt.xlabel('Number of episodes', fontsize = fontSize)
    plt.ylabel('Average Moves Per Game', fontsize = fontSize)
    #display plot
    plt.show()
    
    print("running average of the reward per game:")
    
    #plot running average of rewards
    #plt.plot(Average_Rewards)
    
    # plots the exponential moving average of the reward per game
    plt.plot(R_save)
    #set axis labels
    plt.xlabel('Number of episodes', fontsize = fontSize)
    plt.ylabel('Average Reward Per Game', fontsize = fontSize)
    #display plot
    plt.show()
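

# The "FILL THE CODE" blocks above all describe the same rule: a temporal-difference
# error on the action actually taken, backpropagated through ReLU activations.
# A hedged, self-contained sketch of that rule (a hypothetical helper, not part of
# the original assignment code), assuming W1 has shape (n_hidden, n_input) and W2
# has shape (n_output, n_hidden):
import numpy as np

def td_update_sketch(W1, W2, bias_W1, bias_W2, x, out1, Q, a_agent, R, eta,
                     gamma=0.85, Q_next=None):
    # Terminal step: the target is just the reward; otherwise bootstrap on the next state
    target = R if Q_next is None else R + gamma * np.max(Q_next)
    # TD error only on the output unit of the chosen action (ReLU derivative)
    delta_out = np.zeros_like(Q)
    delta_out[a_agent] = (target - Q[a_agent]) * np.heaviside(Q[a_agent], 0)
    # Backpropagate to the hidden layer before the weights are modified
    delta_hid = np.dot(W2.T, delta_out) * np.heaviside(out1, 0)
    # Gradient steps that move Q[a_agent] towards the target
    W2 += eta * np.outer(delta_out, out1)
    bias_W2 += eta * delta_out
    W1 += eta * np.outer(delta_hid, x)
    bias_W1 += eta * delta_hid
    return W1, W2, bias_W1, bias_W2
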
def main():
    """
    Generate a new game
    The function below generates a new chess board with King, Queen and Enemy King pieces randomly assigned so that they
    do not cause any threats to each other.
    s: a size_board x size_board matrix filled with zeros and three numbers:
    1 = location of the King
    2 = location of the Queen
    3 = location of the Enemy King
    p_k2: 1x2 vector specifying the location of the Enemy King, the first number represents the row and the second
    number the column
    p_k1: same as p_k2 but for the King
    p_q1: same as p_k2 but for the Queen
    """
    s, p_k2, p_k1, p_q1 = generate_game(size_board)
    """
    Possible actions for the Queen are the eight directions (down, up, right, left, up-right, down-left, up-left, 
    down-right) multiplied by the number of squares that the Queen can cover in one movement which equals the size of 
    the board - 1
    """
    possible_queen_a = (s.shape[0] - 1) * 8
    """
    Possible actions for the King are the eight directions (down, up, right, left, up-right, down-left, up-left, 
    down-right)
    """
    possible_king_a = 8

    # Total number of actions for Player 1 = actions of King + actions of Queen
    N_a = possible_king_a + possible_queen_a
    """
    Possible actions of the King
    This function returns the locations on the chessboard that the King can move to
    dfK1: a size_board x size_board matrix filled with 0 and 1.
          1 = locations that the king can move to
    a_k1: an 8x1 vector specifying the allowed actions for the King (marked with 1): 
          down, up, right, left, down-right, down-left, up-right, up-left
    """
    dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s)
    """
    Possible actions of the Queen
    Same as the above function but for the Queen. Here we have 8*(size_board-1) possible actions as explained above
    """
    dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s)
    """
    Possible actions of the Enemy King
    Same as the above function but for the Enemy King. Here we have 8 possible actions as explained above
    """
    dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1)
    """
    Compute the features
    x is a Nx1 vector computing a number of input features based on which the network should adapt its weights  
    with board size of 4x4 this N=50
    """
    x = features(p_q1, p_k1, p_k2, dfK2, s, check)
    """
    Initialization
    Define the size of the layers and initialization
    FILL THE CODE
    Define the network, the number of the nodes of the hidden layer should be 200, you should know the rest. The weights 
    should be initialised according to a uniform distribution and rescaled by the total number of connections between 
    the considered two layers. For instance, if you are initializing the weights between the input layer and the hidden 
    layer each weight should be divided by (n_input_layer x n_hidden_layer), where n_input_layer and n_hidden_layer 
    refer to the number of nodes in the input layer and the number of nodes in the hidden layer respectively. The biases
     should be initialized with zeros.
    """
    n_input_layer = 50  # Number of neurons of the input layer. TODO: Change this value
    n_hidden_layer = 200  # Number of neurons of the hidden layer
    n_output_layer = 32  # Number of neurons of the output layer. TODO: Change this value accordingly
    """
    TODO: Define the w weights between the input and the hidden layer and the w weights between the hidden layer and the 
    output layer according to the instructions. Define also the biases.
    """

    W1 = np.random.rand(n_input_layer, n_hidden_layer) / float(
        n_input_layer * n_hidden_layer)
    W2 = np.random.rand(n_hidden_layer, n_output_layer) / float(
        n_hidden_layer * n_output_layer)
    bias_W1 = np.zeros(n_hidden_layer)
    bias_W2 = np.zeros(n_output_layer)

    # YOUR CODES ENDS HERE

    # Network Parameters
    epsilon_0 = 0.2  #epsilon for the e-greedy policy
    beta = 0.00005  #epsilon discount factor
    gamma = 0.85  #SARSA Learning discount factor
    eta = 0.0035  #learning rate
    N_episodes = 100000  #Number of games, each game ends when we have a checkmate or a draw

    ###  Training Loop  ###

    # Directions: down, up, right, left, down-right, down-left, up-right, up-left
    # Each row specifies a direction,
    # e.g. for down we need to add +1 to the current row and +0 to current column
    map = np.array([[1, 0], [-1, 0], [0, 1], [0, -1], [1, 1], [1, -1], [-1, 1],
                    [-1, -1]])

    # THE FOLLOWING VARIABLES COULD CONTAIN THE REWARDS PER EPISODE AND THE
    # NUMBER OF MOVES PER EPISODE, FILL THEM IN THE CODE ABOVE FOR THE
    # LEARNING. OTHER WAYS TO DO THIS ARE POSSIBLE, THIS IS A SUGGESTION ONLY.

    R_save = np.zeros([N_episodes, 1])
    N_moves_save = np.zeros([N_episodes, 1])

    # END OF SUGGESTIONS

    c = 1  # counter for games
    moves = list()
    rewards = list()
    for n in range(N_episodes):
        next_computed = False

        if c % 1000 == 0:
            print(c)

        epsilon_f = epsilon_0 / (
            1 + beta * n
        )  # epsilon decays each episode so the agent explores less over time
        checkmate = 0  # 0 = not a checkmate, 1 = checkmate
        draw = 0  # 0 = not a draw, 1 = draw
        i = 1  # counter for movements

        # Generate a new game
        s, p_k2, p_k1, p_q1 = generate_game(size_board)

        # Possible actions of the King
        dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s)
        # Possible actions of the Queen
        dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s)
        # Possible actions of the enemy king
        dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1)

        while checkmate == 0 and draw == 0:
            R = 0  # Reward

            # Player 1

            # Actions & allowed_actions
            a = np.concatenate([np.array(a_q1), np.array(a_k1)])
            allowed_a = np.where(a > 0)[0]

            # Computing Features
            x = features(p_q1, p_k1, p_k2, dfK2, s, check)

            # FILL THE CODE
            # Enter inside the Q_values function and fill it with your code.
            # You need to compute the Q values as output of your neural
            # network. You can change the input of the function by adding other
            # data, but the input of the function is suggested.
            Q, out1 = Q_values(x, W1, W2, bias_W1, bias_W2)
            """
            YOUR CODE STARTS HERE
            
            FILL THE CODE
            Implement epsilon greedy policy by using the vector a and a_allowed vector: be careful that the action must
            be chosen from the a_allowed vector. The index of this action must be remapped to the index of the vector a,
            containing all the possible actions. Create a vector called a_agent that contains the index of the action 
            chosen. For instance, if a_allowed = [8, 16, 32] and you select the third action, a_agent=32 not 3.
            """

            possible_moves = Q[allowed_a]

            ### Comment out the implementation you don't want to use. ###

            # Implementation of Q-Learning
            eGreedy = int(np.random.rand() < epsilon_f)
            if eGreedy:
                ind = np.random.randint(len(possible_moves))
                a_agent = allowed_a[ind]
            else:
                ind = possible_moves.argmax()
                a_agent = allowed_a[ind]

            # # Implementation of SARSA
            # ind = np.random.randint(len(possible_moves))
            # a_agent = allowed_a[ind]

            action_chosen = [0] * 32
            action_chosen[a_agent] = 1

            #THE CODE ENDS HERE.
            # Player 1 makes the action
            if a_agent < possible_queen_a:
                direction = int(np.ceil((a_agent + 1) / (size_board - 1))) - 1
                steps = a_agent - direction * (size_board - 1) + 1

                s[p_q1[0], p_q1[1]] = 0
                mov = map[direction, :] * steps
                s[p_q1[0] + mov[0], p_q1[1] + mov[1]] = 2
                p_q1[0] = p_q1[0] + mov[0]
                p_q1[1] = p_q1[1] + mov[1]

            else:
                direction = a_agent - possible_queen_a
                steps = 1

                s[p_k1[0], p_k1[1]] = 0
                mov = map[direction, :] * steps
                s[p_k1[0] + mov[0], p_k1[1] + mov[1]] = 1
                p_k1[0] = p_k1[0] + mov[0]
                p_k1[1] = p_k1[1] + mov[1]

            # Compute the allowed actions for the new position

            # Possible actions of the King
            dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s)
            # Possible actions of the Queen
            dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s)
            # Possible actions of the enemy king
            dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s,
                                                     p_k1)

            # Player 2

            # Check for draw or checkmate
            if np.sum(dfK2) == 0 and dfQ1_[p_k2[0], p_k2[1]] == 1:
                # King 2 has no freedom and it is checked
                # Checkmate and collect reward
                checkmate = 1
                c += 1
                R = 1  # Reward for checkmate
                """
                FILL THE CODE
                Update the parameters of your network by applying backpropagation and Q-learning. You need to use the 
                rectified linear function as activation function (see supplementary materials). Exploit the Q value for 
                the action made. You computed previously Q values in the Q_values function. Be careful: this is the last 
                iteration of the episode, the agent gave checkmate.
                """
                if next_computed:
                    x = x.reshape(1, -1)
                    out1 = out1.reshape(1, -1)
                    Q = Q.reshape(1, -1)
                    d_i = (
                        (R + gamma * Q_next.max()) - Q) * H(Q) * action_chosen
                    d_j = np.dot(d_i, W2.T) * H(out1)

                    delta_weight_i = eta * np.dot(out1.T, d_i)
                    delta_bias_i = eta * d_i[0]
                    delta_weight_j = eta * np.dot(x.T, d_j)
                    delta_bias_j = eta * d_j[0]

                    W2 = W2 + delta_weight_i
                    bias_W2 = bias_W2 + delta_bias_i
                    W1 = W1 + delta_weight_j
                    bias_W1 = bias_W1 + delta_bias_j

                if checkmate:
                    break

            elif np.sum(dfK2) == 0 and dfQ1_[p_k2[0], p_k2[1]] == 0:
                # King 2 has no freedom but it is not checked
                draw = 1
                c += 1
                R = 0.1
                """
                FILL THE CODE
                Update the parameters of your network by applying backpropagation and Q-learning. You need to use the 
                rectified linear function as activation function (see supplementary materials). Exploit the Q value for 
                the action made. You computed previously Q values in the Q_values function. Be careful: this is the last 
                iteration of the episode, it is a draw.
                """
                if next_computed:
                    x = x.reshape(1, -1)
                    out1 = out1.reshape(1, -1)
                    Q = Q.reshape(1, -1)
                    d_i = (
                        (R + gamma * Q_next.max()) - Q) * H(Q) * action_chosen
                    d_j = np.dot(d_i, W2.T) * H(out1)

                    delta_weight_i = eta * np.dot(out1.T, d_i)
                    delta_bias_i = eta * d_i[0]
                    delta_weight_j = eta * np.dot(x.T, d_j)
                    delta_bias_j = eta * d_j[0]

                    W2 += delta_weight_i
                    bias_W2 += delta_bias_i
                    W1 += delta_weight_j
                    bias_W1 += delta_bias_j

                # YOUR CODE ENDS HERE

                if draw:
                    break

            else:
                # Move enemy King randomly to a safe location
                allowed_enemy_a = np.where(a_k2 > 0)[0]
                a_help = int(
                    np.ceil(np.random.rand() * allowed_enemy_a.shape[0]) - 1)
                a_enemy = allowed_enemy_a[a_help]

                direction = a_enemy
                steps = 1

                s[p_k2[0], p_k2[1]] = 0
                mov = map[direction, :] * steps
                s[p_k2[0] + mov[0], p_k2[1] + mov[1]] = 3

                p_k2[0] = p_k2[0] + mov[0]
                p_k2[1] = p_k2[1] + mov[1]

            # Update the parameters

            # Possible actions of the King
            dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s)
            # Possible actions of the Queen
            dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s)
            # Possible actions of the enemy king
            dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s,
                                                     p_k1)
            # Compute features
            x_next = features(p_q1, p_k1, p_k2, dfK2, s, check)
            # Compute Q-values for the discounted factor
            Q_next, _ = Q_values(x_next, W1, W2, bias_W1, bias_W2)

            next_computed = True
            """
            FILL THE CODE
            Update the parameters of your network by applying backpropagation and Q-learning. You need to use the 
            rectified linear function as activation function (see supplementary materials). Exploit the Q value for 
            the action made. You computed previously Q values in the Q_values function. Be careful: this is not the last 
            iteration of the episode, the match continues.
            """

            if not check or draw:
                x = x.reshape(1, -1)
                out1 = out1.reshape(1, -1)
                Q = Q.reshape(1, -1)
                d_i = ((R + gamma * Q_next.max()) - Q) * H(Q) * action_chosen
                d_j = np.dot(d_i, W2.T) * H(out1)

                delta_weight_i = eta * np.dot(out1.T, d_i)
                delta_bias_i = eta * d_i[0]
                delta_weight_j = eta * np.dot(x.T, d_j)
                delta_bias_j = eta * d_j[0]

                W2 = W2 + delta_weight_i
                bias_W2 = bias_W2 + delta_bias_i
                W1 = W1 + delta_weight_j
                bias_W1 = bias_W1 + delta_bias_j

            # YOUR CODE ENDS HERE
            i += 1
        moves.append(i)
        rewards.append(R)

    # Compute moving averages over a sliding window
    mv_am = list()
    mv_rewards = list()
    for i, item in enumerate(rewards):
        if i > 250 and i < len(rewards) - 250:
            average_r = 0
            average_mo = 0
            for j in range(-250, 250):
                average_mo += moves[i + j]
                average_r += rewards[i + j]
            average_mo /= 500
            average_r /= 500
            mv_am.append(average_mo)
            mv_rewards.append(average_r)
    f, axarr = plt.subplots(1, 2, figsize=(20, 10))

    axarr[0].plot(range(0, len(mv_am)), mv_am)
    axarr[0].set_title("Moving average: Moves")
    axarr[1].plot(range(0, len(mv_rewards)), mv_rewards)
    axarr[1].set_title("Moving average: Rewards")

    for i in range(0, 2):
        plt.setp(axarr[i].get_xticklabels(), fontsize=16)
        plt.setp(axarr[i].get_yticklabels(), fontsize=16)
    plt.tight_layout()
    plt.show()

    # Print results to a file so that we can read and plot together
    result_string_moves = ""
    result_string_rewards = ""
    for i, item in enumerate(mv_am):
        result_string_moves += str(item) + ","
        result_string_rewards += str(mv_rewards[i]) + ","
    result_string_moves += "\n"
    result_string_rewards += "\n"

    with open("results.txt", "w") as f:
        f.write(result_string_moves + result_string_rewards)
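

# The function above applies H(...) to Q values and hidden activations without
# defining it in this snippet; it presumably plays the role of the ReLU derivative
# (Heaviside step). A minimal sketch under that assumption; the real helper may be
# defined elsewhere in the original file:
import numpy as np

def H(z):
    # 1 where the ReLU was active, 0 elsewhere
    return np.heaviside(z, 0)
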
Example #20
0
def main():
    """
    Generate a new game
    The function below generates a new chess board with King, Queen and Enemy King pieces randomly assigned so that they
    do not cause any threats to each other.
    s: a size_board x size_board matrix filled with zeros and three numbers:
    1 = location of the King
    2 = location of the Queen
    3 = location of the Enemy King
    p_k2: 1x2 vector specifying the location of the Enemy King, the first number represents the row and the second
    number the column
    p_k1: same as p_k2 but for the King
    p_q1: same as p_k2 but for the Queen
    """
    s, p_k2, p_k1, p_q1 = generate_game(size_board)
    """
    Possible actions for the Queen are the eight directions (down, up, right, left, up-right, down-left, up-left, 
    down-right) multiplied by the number of squares that the Queen can cover in one movement which equals the size of 
    the board - 1
    """
    possible_queen_a = (s.shape[0] - 1) * 8
    """
    Possible actions for the King are the eight directions (down, up, right, left, up-right, down-left, up-left, 
    down-right)
    """
    possible_king_a = 8

    # Total number of actions for Player 1 = actions of King + actions of Queen
    N_a = possible_king_a + possible_queen_a
    """
    Possible actions of the King
    This function returns the locations on the chessboard that the King can move to
    dfK1: a size_board x size_board matrix filled with 0 and 1.
          1 = locations that the king can move to
    a_k1: an 8x1 vector specifying the allowed actions for the King (marked with 1): 
          down, up, right, left, down-right, down-left, up-right, up-left
    """
    dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s)
    """
    Possible actions of the Queen
    Same as the above function but for the Queen. Here we have 8*(size_board-1) possible actions as explained above
    """
    dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s)
    """
    Possible actions of the Enemy King
    Same as the above function but for the Enemy King. Here we have 8 possible actions as explained above
    """
    dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1)
    """
    Compute the features
    x is a Nx1 vector computing a number of input features based on which the network should adapt its weights  
    with board size of 4x4 this N=50
    """
    x = features(p_q1, p_k1, p_k2, dfK2, s, check)
    """
    Initialization
    Define the size of the layers and initialization
    FILL THE CODE
    Define the network, the number of the nodes of the hidden layer should be 200, you should know the rest. The weights 
    should be initialised according to a uniform distribution and rescaled by the total number of connections between 
    the considered two layers. For instance, if you are initializing the weights between the input layer and the hidden 
    layer each weight should be divided by (n_input_layer x n_hidden_layer), where n_input_layer and n_hidden_layer 
    refer to the number of nodes in the input layer and the number of nodes in the hidden layer respectively. The biases
     should be initialized with zeros.
    """
    n_input_layer = len(x)  # Number of neurons of the input layer.
    n_hidden_layer = 200  # Number of neurons of the hidden layer
    n_output_layer = N_a  # Number of neurons of the output layer.
    """
    TODO: Define the w weights between the input and the hidden layer and the w weights between the hidden layer and the 
    output layer according to the instructions. Define also the biases.
    """
    # Initialise weights and biases
    W1 = np.random.uniform(0, 1, (n_hidden_layer, n_input_layer))
    W1 /= (n_input_layer * n_hidden_layer)
    W2 = np.random.uniform(0, 1, (n_output_layer, n_hidden_layer))
    W2 /= (n_hidden_layer * n_output_layer)
    bias_W1 = np.zeros(n_hidden_layer)[:, np.newaxis]
    bias_W2 = np.zeros(n_output_layer)[:, np.newaxis]

    # YOUR CODES ENDS HERE

    # Network Parameters
    epsilon_0 = 0.2  #epsilon for the e-greedy policy
    beta = 0.00005  #epsilon discount factor
    gamma = 0.85  #SARSA Learning discount factor
    eta = 0.0035  #learning rate
    N_episodes = 100000  #Number of games, each game ends when we have a checkmate or a draw
    alpha = 1 / 10000  #Moving average discount factor
    sarsa = False  #Set to true for SARSA
    rmsprop = True  #Set to true for RMSprop

    # RMSprop Parameters
    if rmsprop:
        eta = 0.0001
        rmsprop_gamma = 0.9
        rmsprop_eps = 1e-8

    # Initialise RMSProp gradient accumulations
    if rmsprop:
        avg_W1 = np.zeros_like(W1)
        avg_W2 = np.zeros_like(W2)
        avg_bias_W1 = np.zeros_like(bias_W1)
        avg_bias_W2 = np.zeros_like(bias_W2)

    ###  Training Loop  ###

    # Directions: down, up, right, left, down-right, down-left, up-right, up-left
    # Each row specifies a direction,
    # e.g. for down we need to add +1 to the current row and +0 to current column
    map = np.array([[1, 0], [-1, 0], [0, 1], [0, -1], [1, 1], [1, -1], [-1, 1],
                    [-1, -1]])

    # THE FOLLOWING VARIABLES COULD CONTAIN THE REWARDS PER EPISODE AND THE
    # NUMBER OF MOVES PER EPISODE, FILL THEM IN THE CODE ABOVE FOR THE
    # LEARNING. OTHER WAYS TO DO THIS ARE POSSIBLE, THIS IS A SUGGESTION ONLY.

    R_save = np.zeros([N_episodes, 1])
    N_moves_save = np.zeros([N_episodes, 1])
    l2_norm = np.zeros([N_episodes, 1])

    # END OF SUGGESTIONS

    for n in range(N_episodes):
        if n % 10000 == 0:
            print(n)
        epsilon_f = epsilon_0 / (
            1 + beta * n
        )  # epsilon decays each episode so the agent explores less over time
        checkmate = 0  # 0 = not a checkmate, 1 = checkmate
        draw = 0  # 0 = not a draw, 1 = draw
        i = 1  # counter for movements

        # Generate a new game
        s, p_k2, p_k1, p_q1 = generate_game(size_board)

        # Possible actions of the King
        dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s)
        # Possible actions of the Queen
        dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s)
        # Possible actions of the enemy king
        dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1)

        while checkmate == 0 and draw == 0:
            R = 0  # Reward

            # Player 1

            # Actions & allowed_actions
            a = np.concatenate([np.array(a_q1), np.array(a_k1)])
            allowed_a = np.where(a > 0)[0]

            # Computing Features
            x = features(p_q1, p_k1, p_k2, dfK2, s, check)

            # FILL THE CODE
            # Enter inside the Q_values function and fill it with your code.
            # You need to compute the Q values as output of your neural
            # network. You can change the input of the function by adding other
            # data, but the input of the function is suggested.
            Q, out1 = Q_values(x, W1, W2, bias_W1, bias_W2)
            """
            YOUR CODE STARTS HERE
            
            FILL THE CODE
            Implement epsilon greedy policy by using the vector a and a_allowed vector: be careful that the action must
            be chosen from the a_allowed vector. The index of this action must be remapped to the index of the vector a,
            containing all the possible actions. Create a vector called a_agent that contains the index of the action 
            chosen. For instance, if a_allowed = [8, 16, 32] and you select the third action, a_agent=32 not 3.
            """

            # epsilon-greedy policy
            a_agent = epsilon_greedy(epsilon_f, Q, allowed_a)
            #THE CODE ENDS HERE.

            # Player 1 makes the action
            if a_agent < possible_queen_a:
                direction = int(np.ceil((a_agent + 1) / (size_board - 1))) - 1
                steps = a_agent - direction * (size_board - 1) + 1

                s[p_q1[0], p_q1[1]] = 0
                mov = map[direction, :] * steps
                s[p_q1[0] + mov[0], p_q1[1] + mov[1]] = 2
                p_q1[0] = p_q1[0] + mov[0]
                p_q1[1] = p_q1[1] + mov[1]

            else:
                direction = a_agent - possible_queen_a
                steps = 1

                s[p_k1[0], p_k1[1]] = 0
                mov = map[direction, :] * steps
                s[p_k1[0] + mov[0], p_k1[1] + mov[1]] = 1
                p_k1[0] = p_k1[0] + mov[0]
                p_k1[1] = p_k1[1] + mov[1]

            # Compute the allowed actions for the new position

            # Possible actions of the King
            dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s)
            # Possible actions of the Queen
            dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s)
            # Possible actions of the enemy king
            dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s,
                                                     p_k1)

            # Player 2

            # Check for draw or checkmate
            if np.sum(dfK2) == 0 and dfQ1_[p_k2[0], p_k2[1]] == 1:
                # King 2 has no freedom and it is checked
                # Checkmate and collect reward
                checkmate = 1
                R = 1  # Reward for checkmate
                """
                FILL THE CODE
                Update the parameters of your network by applying backpropagation and Q-learning. You need to use the 
                rectified linear function as activation function (see supplementary materials). Exploit the Q value for 
                the action made. You computed previously Q values in the Q_values function. Be careful: this is the last 
                iteration of the episode, the agent gave checkmate.
                """

                if rmsprop:
                    W2_delta = (R - Q[a_agent]) * np.heaviside(Q, 0)
                    W1_delta = np.heaviside(out1, 0) * np.dot(W2.T, W2_delta)

                    W2_grad = np.outer(W2_delta, out1)
                    W1_grad = np.outer(W1_delta, x)

                    # W2
                    avg_W2 = rmsprop_gamma * avg_W2 + (
                        1 - rmsprop_gamma) * np.power(W2_grad, 2)
                    W2 += eta * W2_grad / np.sqrt(avg_W2 + rmsprop_eps)

                    # Bias W2
                    avg_bias_W2 = rmsprop_gamma * avg_bias_W2 + (
                        1 - rmsprop_gamma) * np.power(W2_delta, 2)
                    bias_W2 += eta * W2_delta / np.sqrt(avg_bias_W2 +
                                                        rmsprop_eps)

                    # W1
                    avg_W1 = rmsprop_gamma * avg_W1 + (
                        1 - rmsprop_gamma) * np.power(W1_grad, 2)
                    W1 += eta * W1_grad / np.sqrt(avg_W1 + rmsprop_eps)

                    # Bias W1
                    avg_bias_W1 = rmsprop_gamma * avg_bias_W1 + (
                        1 - rmsprop_gamma) * np.power(W1_delta, 2)
                    bias_W1 += eta * W1_delta / np.sqrt(avg_bias_W1 +
                                                        rmsprop_eps)

                else:
                    W2_delta = (R - Q[a_agent]) * np.heaviside(Q, 0)
                    W1_delta = np.heaviside(out1, 0) * np.dot(W2.T, W2_delta)

                    W2 += eta * np.outer(W2_delta, out1)
                    bias_W2 += eta * W2_delta

                    W1 += eta * np.outer(W1_delta, x)
                    bias_W1 += eta * W1_delta

                # THE CODE ENDS HERE

                if checkmate:
                    break

            elif np.sum(dfK2) == 0 and dfQ1_[p_k2[0], p_k2[1]] == 0:
                # King 2 has no freedom but it is not checked
                draw = 1
                R = 0.1
                """
                FILL THE CODE
                Update the parameters of your network by applying backpropagation and Q-learning. You need to use the 
                rectified linear function as activation function (see supplementary materials). Exploit the Q value for 
                the action made. You computed previously Q values in the Q_values function. Be careful: this is the last 
                iteration of the episode, it is a draw.
                """

                if rmsprop:
                    W2_delta = (R - Q[a_agent]) * np.heaviside(Q, 0)
                    W1_delta = np.heaviside(out1, 0) * np.dot(W2.T, W2_delta)

                    W2_grad = np.outer(W2_delta, out1)
                    W1_grad = np.outer(W1_delta, x)

                    # W2
                    avg_W2 = rmsprop_gamma * avg_W2 + (
                        1 - rmsprop_gamma) * np.power(W2_grad, 2)
                    W2 += eta * W2_grad / np.sqrt(avg_W2 + rmsprop_eps)

                    # Bias W2
                    avg_bias_W2 = rmsprop_gamma * avg_bias_W2 + (
                        1 - rmsprop_gamma) * np.power(W2_delta, 2)
                    bias_W2 += eta * W2_delta / np.sqrt(avg_bias_W2 +
                                                        rmsprop_eps)

                    # W1
                    avg_W1 = rmsprop_gamma * avg_W1 + (
                        1 - rmsprop_gamma) * np.power(W1_grad, 2)
                    W1 += eta * W1_grad / np.sqrt(avg_W1 + rmsprop_eps)

                    # Bias W1
                    avg_bias_W1 = rmsprop_gamma * avg_bias_W1 + (
                        1 - rmsprop_gamma) * np.power(W1_delta, 2)
                    bias_W1 += eta * W1_delta / np.sqrt(avg_bias_W1 +
                                                        rmsprop_eps)

                else:
                    W2_delta = (R - Q[a_agent]) * np.heaviside(Q, 0)
                    W1_delta = np.heaviside(out1, 0) * np.dot(W2.T, W2_delta)

                    W2 += eta * np.outer(W2_delta, out1)
                    bias_W2 += eta * W2_delta

                    W1 += eta * np.outer(W1_delta, x)
                    bias_W1 += eta * W1_delta

                if draw:
                    break

            else:
                # Move enemy King randomly to a safe location
                allowed_enemy_a = np.where(a_k2 > 0)[0]
                a_help = int(
                    np.ceil(np.random.rand() * allowed_enemy_a.shape[0]) - 1)
                a_enemy = allowed_enemy_a[a_help]

                direction = a_enemy
                steps = 1

                s[p_k2[0], p_k2[1]] = 0
                mov = map[direction, :] * steps
                s[p_k2[0] + mov[0], p_k2[1] + mov[1]] = 3

                p_k2[0] = p_k2[0] + mov[0]
                p_k2[1] = p_k2[1] + mov[1]

            # Update the parameters

            # Possible actions of the King
            dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s)
            # Possible actions of the Queen
            dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s)
            # Possible actions of the enemy king
            dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s,
                                                     p_k1)
            # Compute features
            x_next = features(p_q1, p_k1, p_k2, dfK2, s, check)
            # Compute Q-values for the discounted factor
            Q_next, _ = Q_values(x_next, W1, W2, bias_W1, bias_W2)
            # Compute the allowed actions from the next state
            a = np.concatenate([np.array(a_q1), np.array(a_k1)])
            allowed_a = np.where(a > 0)[0]
            """
            FILL THE CODE
            Update the parameters of your network by applying backpropagation and Q-learning. You need to use the 
            rectified linear function as activation function (see supplementary materials). Exploit the Q value for 
            the action made. You computed previously Q values in the Q_values function. Be careful: this is not the last 
            iteration of the episode, the match continues.
            """
            if sarsa:
                # if SARSA, choose next action based on policy
                next_Q_value = Q_next[epsilon_greedy(epsilon_f, Q_next,
                                                     allowed_a)]
            else:
                # if Q-learning choose action with maximum Q value
                next_Q_value = max(Q_next[allowed_a])

            if rmsprop:
                W2_delta = (R + gamma * next_Q_value -
                            Q[a_agent]) * np.heaviside(Q, 0)
                W1_delta = np.heaviside(out1, 0) * np.dot(W2.T, W2_delta)

                W2_grad = np.outer(W2_delta, out1)
                W1_grad = np.outer(W1_delta, x)

                # W2
                avg_W2 = rmsprop_gamma * avg_W2 + (
                    1 - rmsprop_gamma) * np.power(W2_grad, 2)
                W2 += eta * W2_grad / np.sqrt(avg_W2 + rmsprop_eps)

                # Bias W2
                avg_bias_W2 = rmsprop_gamma * avg_bias_W2 + (
                    1 - rmsprop_gamma) * np.power(W2_delta, 2)
                bias_W2 += eta * W2_delta / np.sqrt(avg_bias_W2 + rmsprop_eps)

                # W1
                avg_W1 = rmsprop_gamma * avg_W1 + (
                    1 - rmsprop_gamma) * np.power(W1_grad, 2)
                W1 += eta * W1_grad / np.sqrt(avg_W1 + rmsprop_eps)

                # Bias W1
                avg_bias_W1 = rmsprop_gamma * avg_bias_W1 + (
                    1 - rmsprop_gamma) * np.power(W1_delta, 2)
                bias_W1 += eta * W1_delta / np.sqrt(avg_bias_W1 + rmsprop_eps)

            else:
                W2_delta = (R + gamma * next_Q_value -
                            Q[a_agent]) * np.heaviside(Q, 0)
                W1_delta = np.heaviside(out1, 0) * np.dot(W2.T, W2_delta)

                W2 += eta * np.outer(W2_delta, out1)
                bias_W2 += eta * W2_delta

                W1 += eta * np.outer(W1_delta, x)
                bias_W1 += eta * W1_delta

            # YOUR CODE ENDS HERE
            i += 1

        # Save the reward per episode and number of moves per episode
        if n == 0:
            N_moves_save[n, 0] = 80
            R_save[n, 0] = 0
        else:
            N_moves_save[n,
                         0] = alpha * i + (1 - alpha) * N_moves_save[n - 1, 0]
            R_save[n, 0] = alpha * R + (1 - alpha) * R_save[n - 1, 0]

        #l2_norm[n, 0] = np.linalg.norm(W2, ord=2)

    # Save the reward per episode in a file
    with open('rewards.pickle', 'wb') as file:
        pickle.dump(R_save, file)
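

# The example above calls epsilon_greedy(epsilon_f, Q, allowed_a) without showing it.
# A minimal sketch consistent with how it is used here (it must return an index into
# the full action vector, not into allowed_a), assuming Q is a 1-D array of Q values;
# this is an illustrative guess, not the original helper:
import numpy as np

def epsilon_greedy_sketch(epsilon, Q, allowed_a):
    if np.random.rand() < epsilon:
        # explore: pick a random allowed action
        return np.random.choice(allowed_a)
    # exploit: pick the allowed action with the highest Q value
    return allowed_a[np.argmax(Q[allowed_a])]
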
Example #21
0
# -*- coding: utf-8 -*-
"""
Created on Sun Sep 25 23:55:33 2016

@author: Agus
"""
import pickle
from features import *
from algorithm import *
import time
# The reducers probably don't need to be imported separately if algorithm is imported, but just in case
from sklearn.decomposition import PCA

# Compute the features
df = features()
#df = pd.concat([df.iloc[:60, :], df.iloc[71910:, :]], ignore_index=True)  # Omit this line for the real run

# Prepare the data for classification
X = df.iloc[:, 1:].values
Y = df['class']


# Training: dimensionality reduction
print('Random forest algorithm with PCA')
print('PCA training')

Cant_Atributos = len(df.columns) - 1
components = int(220)

pca = PCA(n_components=components, copy=False)  # copy expects the boolean False, not the string 'False'
start_time = time.time()
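
# The snippet above is cut off right after start_time is recorded. A hedged sketch
# of how the PCA step would typically continue (pca, X and start_time come from the
# snippet; everything else is illustrative, not the original code):
X_reduced = pca.fit_transform(X)
print('PCA training took %.2f seconds' % (time.time() - start_time))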
Example #22
0
def main():
    """
    Generate a new game
    The function below generates a new chess board with King, Queen and Enemy King pieces randomly assigned so that they
    do not cause any threats to each other.
    s: a size_board x size_board matrix filled with zeros and three numbers:
    1 = location of the King
    2 = location of the Queen
    3 = location of the Enemy King
    p_k2: 1x2 vector specifying the location of the Enemy King, the first number represents the row and the second
    number the column
    p_k1: same as p_k2 but for the King
    p_q1: same as p_k2 but for the Queen
    """
    s, p_k2, p_k1, p_q1 = generate_game(size_board)

    """
    Possible actions for the Queen are the eight directions (down, up, right, left, up-right, down-left, up-left, 
    down-right) multiplied by the number of squares that the Queen can cover in one movement which equals the size of 
    the board - 1
    """
    possible_queen_a = (s.shape[0] - 1) * 8
    """
    Possible actions for the King are the eight directions (down, up, right, left, up-right, down-left, up-left, 
    down-right)
    """
    possible_king_a = 8

    # Total number of actions for Player 1 = actions of King + actions of Queen
    N_a = possible_king_a + possible_queen_a
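
    # Worked example (added note, not in the original): for a 4x4 board,
    # possible_queen_a = (4 - 1) * 8 = 24 and possible_king_a = 8, so
    # N_a = 24 + 8 = 32, which matches n_output_layer below.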

    """
    Possible actions of the King
    This functions returns the locations in the chessboard that the King can go
    dfK1: a size_board x size_board matrix filled with 0 and 1.
          1 = locations that the king can move to
    a_k1: an 8x1 vector specifying the allowed actions for the King (marked with 1):
          down, up, right, left, down-right, down-left, up-right, up-left
    """
    dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s)
    """
    Possible actions of the Queen
    Same as the above function but for the Queen. Here we have 8*(size_board-1) possible actions as explained above
    """
    dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s)
    """
    Possible actions of the Enemy King
    Same as the above function but for the Enemy King. Here we have 8 possible actions as explained above
    """
    dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1)

    """
    Compute the features
    x is an Nx1 vector of input features based on which the network should adapt its weights;
    with a board size of 4x4, N = 50
    """
    x = features(p_q1, p_k1, p_k2, dfK2, s, check)

    """
    Initialization
    Define the size of the layers and initialization
    FILL THE CODE
    Define the network, the number of the nodes of the hidden layer should be 200, you should know the rest. The weights 
    should be initialised according to a uniform distribution and rescaled by the total number of connections between 
    the considered two layers. For instance, if you are initializing the weights between the input layer and the hidden 
    layer each weight should be divided by (n_input_layer x n_hidden_layer), where n_input_layer and n_hidden_layer 
    refer to the number of nodes in the input layer and the number of nodes in the hidden layer respectively. The biases
     should be initialized with zeros.
    """
    n_input_layer = 50    # Number of neurons of the input layer (N = 50 features for a 4x4 board)
    n_hidden_layer = 200  # Number of neurons of the hidden layer
    n_output_layer = 32   # Number of neurons of the output layer (one per possible action)

    """
    TODO: Define the w weights between the input and the hidden layer and the w weights between the hidden layer and the 
    output layer according to the instructions. Define also the biases.
    """

    W1 = np.random.rand(n_input_layer, n_hidden_layer) / (n_input_layer * n_hidden_layer)
    W2 = np.random.rand(n_hidden_layer, n_output_layer) / (n_hidden_layer * n_output_layer)
    # W1 = np.random.rand(n_input_layer, n_hidden_layer)
    # W2 = np.random.rand(n_hidden_layer, n_output_layer)

    bias_W1 = np.zeros(n_hidden_layer)
    bias_W2 = np.zeros(n_output_layer)
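
    # Added note: with a 4x4 board these shapes are W1 = 50x200, W2 = 200x32,
    # bias_W1 = 200 zeros and bias_W2 = 32 zeros. np.random.rand draws from a
    # uniform distribution on [0, 1), and each weight is divided by the total
    # number of connections between its two layers, as the instructions above require.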


    # YOUR CODE ENDS HERE

    # Network Parameters
    epsilon_0 = 0.2      # epsilon for the epsilon-greedy policy
    beta = 0.00005       # decay factor for epsilon
    gamma = 0.85         # discount factor for future rewards
    eta = 0.0035         # learning rate
    N_episodes = 100000  # number of games; each game ends with a checkmate or a draw
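
    # A hedged sketch (not part of the original file) of what the Q_values forward
    # pass, filled in elsewhere in the assignment, might look like -- assuming ReLU
    # activations in both layers, consistent with the np.heaviside derivatives used
    # in the weight updates below:
    #
    #     def Q_values(x, W1, W2, bias_W1, bias_W2):
    #         out1 = np.maximum(0, np.dot(W1.T, x) + bias_W1)   # hidden layer, 200 ReLU units
    #         Q = np.maximum(0, np.dot(W2.T, out1) + bias_W2)   # output layer, 32 Q-values
    #         return Q, out1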

    ###  Training Loop  ###

    # Directions: down, up, right, left, down-right, down-left, up-right, up-left
    # Each row specifies a direction,
    # e.g. for down we need to add +1 to the current row and +0 to current column
    map = np.array([[1, 0],
                    [-1, 0],
                    [0, 1],
                    [0, -1],
                    [1, 1],
                    [1, -1],
                    [-1, 1],
                    [-1, -1]])

    # THE FOLLOWING VARIABLES COULD CONTAIN THE REWARDS PER EPISODE AND THE
    # NUMBER OF MOVES PER EPISODE, FILL THEM IN THE CODE ABOVE FOR THE
    # LEARNING. OTHER WAYS TO DO THIS ARE POSSIBLE, THIS IS A SUGGESTION ONLY.

    R_save = []
    N_moves_save = []

    last_100_r = []
    last_100_moves = []

    # END OF SUGGESTIONS


    for n in range(N_episodes):
        epsilon_f = epsilon_0 / (1 + beta * n)  # epsilon is discounted every episode so the agent explores less over time
        checkmate = 0  # 0 = not a checkmate, 1 = checkmate
        draw = 0  # 0 = not a draw, 1 = draw
        i = 1  # counter for movements

        # Generate a new game
        s, p_k2, p_k1, p_q1 = generate_game(size_board)

        # Possible actions of the King
        dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s)
        # Possible actions of the Queen
        dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s)
        # Possible actions of the enemy king
        dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1)

        # Running statistics: moving averages over the most recent episodes
        last_100_moves.append(np.mean(N_moves_save[-100:]) if N_moves_save else 0)
        last_100_r.append(np.mean(R_save[-100:]) if R_save else 0)



        print(n, last_100_moves[-1], last_100_r[-1])

        if n % 5000 == 0:
            plt.plot(last_100_moves[1000:])
            plt.show()
            plt.plot(last_100_r)
            plt.show()

        # End of running statistics

        while checkmate == 0 and draw == 0:
            R = 0  # Reward

            # Player 1

            # Actions & allowed_actions
            a = np.concatenate([np.array(a_q1), np.array(a_k1)])
            allowed_a = np.where(a > 0)[0]

            # Computing Features
            x = features(p_q1, p_k1, p_k2, dfK2, s, check)

            # FILL THE CODE
            # Enter inside the Q_values function and fill it with your code.
            # You need to compute the Q values as output of your neural
            # network. You can change the input of the function by adding other
            # data, but the input of the function is suggested.
            Q, out1 = Q_values(x, W1, W2, bias_W1, bias_W2)

            """
            YOUR CODE STARTS HERE
            
            FILL THE CODE
            Implement the epsilon-greedy policy using the vector a and the allowed_a vector: be careful that the action
            must be chosen from the allowed_a vector. The index of this action must be remapped to the index of the
            vector a, containing all the possible actions. Create a variable called a_agent that contains the index of
            the action chosen. For instance, if allowed_a = [8, 16, 32] and you select the third action, a_agent = 32, not 3.
            """
            available_Qs = np.take(Q, allowed_a)
            a_max = np.argmax(available_Qs)
            action_max = allowed_a[a_max]


            p = np.random.rand()
            if p < epsilon_f:
                a_agent = np.random.choice(allowed_a)
            else:
                a_agent = action_max

            #THE CODE ENDS HERE.


            # Player 1 makes the action
            if a_agent < possible_queen_a:
                direction = int(np.ceil((a_agent + 1) / (size_board - 1))) - 1
                steps = a_agent - direction * (size_board - 1) + 1
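
                # Worked example (added note): on a 4x4 board, size_board - 1 = 3, so
                # a_agent = 7 gives direction = ceil(8 / 3) - 1 = 2 (right) and
                # steps = 7 - 2 * 3 + 1 = 2, i.e. move the Queen two squares to the right.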

                s[p_q1[0], p_q1[1]] = 0
                mov = map[direction, :] * steps
                s[p_q1[0] + mov[0], p_q1[1] + mov[1]] = 2
                p_q1[0] = p_q1[0] + mov[0]
                p_q1[1] = p_q1[1] + mov[1]

            else:
                direction = a_agent - possible_queen_a
                steps = 1

                s[p_k1[0], p_k1[1]] = 0
                mov = map[direction, :] * steps
                s[p_k1[0] + mov[0], p_k1[1] + mov[1]] = 1
                p_k1[0] = p_k1[0] + mov[0]
                p_k1[1] = p_k1[1] + mov[1]

            # Compute the allowed actions for the new position

            # Possible actions of the King
            dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s)
            # Possible actions of the Queen
            dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s)
            # Possible actions of the enemy king
            dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1)

            # Player 2

            # Check for draw or checkmate
            if np.sum(dfK2) == 0 and dfQ1_[p_k2[0], p_k2[1]] == 1:
                # King 2 has no freedom and it is checked
                # Checkmate and collect reward
                checkmate = 1
                R = 1  # Reward for checkmate

                """
                FILL THE CODE
                Update the parameters of your network by applying backpropagation and Q-learning. You need to use the 
                rectified linear function as activation function (see supplementary materials). Exploit the Q value for 
                the action made. You computed previously Q values in the Q_values function. Be careful: this is the last 
                iteration of the episode, the agent gave checkmate.
                """

                R_save.append(R)
                N_moves_save.append(i)

                target_q = R

                # Backpropagation: output layer -> hidden layer
                delta_output = (target_q - Q[a_agent]) * np.heaviside(Q[a_agent], 0)

                delta_weights_out = eta * delta_output * out1
                delta_biases_out = eta * delta_output

                W2[:, a_agent] += delta_weights_out
                bias_W2[a_agent] += delta_biases_out

                # Backpropagation: hidden layer -> input layer

                delta_output_2 = np.zeros(n_output_layer)
                delta_output_2[a_agent] = delta_output

                delta_hidden = np.heaviside(out1, 0) * np.dot(W2, delta_output_2)

                delta_hidden_weights = eta * np.outer(x, delta_hidden)
                delta_hidden_biases = eta * delta_hidden

                W1 += delta_hidden_weights
                bias_W1 += delta_hidden_biases

                # THE CODE ENDS HERE

                if checkmate:
                    break

            elif np.sum(dfK2) == 0 and dfQ1_[p_k2[0], p_k2[1]] == 0:
                # King 2 has no freedom but it is not checked
                draw = 1
                R = 0.1

                """
                FILL THE CODE
                Update the parameters of your network by applying backpropagation and Q-learning. You need to use the 
                rectified linear function as activation function (see supplementary materials). Exploit the Q value for 
                the action made. You computed previously Q values in the Q_values function. Be careful: this is the last 
                iteration of the episode, it is a draw.
                """
                R_save.append(R)
                N_moves_save.append(i)

                target_q = R

                # Backpropagation: output layer -> hidden layer
                delta_output = (target_q - Q[a_agent]) * np.heaviside(Q[a_agent], 0)

                # delta_weights_out = eta * np.outer(delta_output, out1)
                delta_weights_out = eta * delta_output * out1
                delta_biases_out = eta * delta_output

                W2[:, a_agent] += delta_weights_out
                bias_W2[a_agent] += delta_biases_out

                # Backpropagation: hidden layer -> input layer

                delta_output_2 = np.zeros(n_output_layer)
                delta_output_2[a_agent] = delta_output

                delta_hidden = np.heaviside(out1, 0) * np.dot(W2, delta_output_2)

                delta_hidden_weights = eta * np.outer(x, delta_hidden)
                delta_hidden_biases = eta * delta_hidden

                W1 += delta_hidden_weights
                bias_W1 += delta_hidden_biases

                # YOUR CODE ENDS HERE

                if draw:
                    break

            else:
                # Move enemy King randomly to a safe location
                allowed_enemy_a = np.where(a_k2 > 0)[0]
                a_help = int(np.ceil(np.random.rand() * allowed_enemy_a.shape[0]) - 1)
                a_enemy = allowed_enemy_a[a_help]

                direction = a_enemy
                steps = 1

                s[p_k2[0], p_k2[1]] = 0
                mov = map[direction, :] * steps
                s[p_k2[0] + mov[0], p_k2[1] + mov[1]] = 3

                p_k2[0] = p_k2[0] + mov[0]
                p_k2[1] = p_k2[1] + mov[1]

            # Update the parameters

            # Possible actions of the King
            dfK1, a_k1, _ = degree_freedom_king1(p_k1, p_k2, p_q1, s)
            # Possible actions of the Queen
            dfQ1, a_q1, dfQ1_ = degree_freedom_queen(p_k1, p_k2, p_q1, s)
            # Possible actions of the enemy king
            dfK2, a_k2, check = degree_freedom_king2(dfK1, p_k2, dfQ1_, s, p_k1)
            # Compute features
            x_next = features(p_q1, p_k1, p_k2, dfK2, s, check)
            # Compute Q-values for the discounted factor
            Q_next, _ = Q_values(x_next, W1, W2, bias_W1, bias_W2)

            """
            FILL THE CODE
            Update the parameters of your network by applying backpropagation and Q-learning. You need to use the 
            rectified linear function as activation function (see supplementary materials). Exploit the Q value for 
            the action made. You computed previously Q values in the Q_values function. Be careful: this is not the last 
            iteration of the episode, the match continues.
            """
            a = np.concatenate([np.array(a_q1), np.array(a_k1)])
            allowed_a = np.where(a > 0)[0]

            available_Qs_next = np.take(Q_next, allowed_a)
            a_max_next = np.argmax(available_Qs_next)
            action_max_next = allowed_a[a_max_next]

            # error = (1 / 2) * (R + gamma * Q_next[action_max_next] - Q[a_agent]) ** 2
            target_q = (R + gamma * Q_next[action_max_next])
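
            # Added note: target_q is the Q-learning target R + gamma * max over the
            # allowed actions of Q in the next state; the TD error (target_q - Q[a_agent])
            # below is backpropagated through the ReLU network, with np.heaviside
            # acting as the ReLU derivative.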

            # Backpropagation: output layer -> hidden layer
            delta_output = (target_q - Q[a_agent]) * np.heaviside(Q[a_agent], 0)

            # delta_weights_out = eta * np.outer(delta_output, out1)
            delta_weights_out = eta * delta_output * out1
            delta_biases_out = eta * delta_output

            W2[:, a_agent] += delta_weights_out
            bias_W2[a_agent] += delta_biases_out

            # Backpropagation: hidden layer -> input layer

            delta_output_2 = np.zeros(n_output_layer)
            delta_output_2[a_agent] = delta_output

            delta_hidden = np.heaviside(out1, 0) * np.dot(W2, delta_output_2)

            delta_hidden_weights = eta * np.outer(x, delta_hidden)
            delta_hidden_biases = eta * delta_hidden

            W1 += delta_hidden_weights
            bias_W1 += delta_hidden_biases

            # YOUR CODE ENDS HERE
            i += 1