# Example 1
def train_with_goals(noise=0, iterations=10000, learning_rate=0.1):
    """Train an ElmanGoalNet with goal units on the PNAS 2018 task.

    noise: std-dev of gaussian noise added to the context at each step.
    iterations: number of training episodes.
    learning_rate: learning rate assigned to the model.
    Returns the trained model.
    """
    model = nn.ElmanGoalNet(size_hidden=15, size_observation=9, size_action=8,
                            size_goal1=2, size_goal2=0)
    model.learning_rate = learning_rate
    model.L2_regularization = 0.
    num_episodes = iterations

    # Rolling averages, used only for console monitoring.
    avg_loss = 0.
    avg_actions = 0.
    avg_full_sequence = 0.

    report_points = {3 ** n for n in range(50)}  # early-episode report schedule

    for episode in range(num_episodes):
        # Draw a sequence (and its associated goal) per the task probabilities.
        seq_idx = utils.idx_from_probabilities(pnas2018task.sequence_probabilities)
        goal = pnas2018task.goals[seq_idx]
        sequence = pnas2018task.seqs[seq_idx]
        inputs = utils.liststr_to_onehot(sequence[:-1], pnas2018task.all_inputs)
        targets = utils.liststr_to_onehot(sequence[1:], pnas2018task.all_outputs)
        model.action = np.zeros((1, model.size_action), dtype=np.float32)

        # Forward pass, recorded on the tape for training afterwards.
        with tf.GradientTape() as tape:
            # Context starts at zero each episode; goal unit clamped to target goal.
            model.context = np.zeros((1, model.size_hidden), dtype=np.float32)
            model.goal1 = goal[0]
            for step in range(len(targets)):
                model.action = np.zeros((1, model.size_action), dtype=np.float32)
                # Inject gaussian noise into the recurrent context.
                model.context += np.float32(
                    np.random.normal(0., noise, size=(1, model.size_hidden)))
                model.feedforward(inputs[step].reshape(1, -1))

            # Per-episode correctness statistics on the winner-take-all actions.
            choices = np.array(model.h_action_wta).reshape((-1, len(targets[0])))
            ratios = scripts.evaluate([choices], [targets])
            loss, _ = model.train_obsolete(targets, goal, None, tape)

        # Rolling averages: fast adaptation early, slow 0.001 rate afterwards.
        speed = 2. / (episode + 2) if episode < 1000 else 0.001
        avg_loss = utils.rolling_avg(avg_loss, loss, speed)
        avg_actions = utils.rolling_avg(avg_actions, ratios[0], speed)
        avg_full_sequence = utils.rolling_avg(avg_full_sequence, ratios[0] == 1, speed)

        # Periodic console report.
        if (episode < 1000 and episode in report_points) or episode % 1000 == 0 \
                or episode + 1 == num_episodes:
            print("{0}: avg loss={1}, \tactions={2}, \tfull_sequence={3}".format(
                episode, avg_loss, avg_actions, avg_full_sequence))
    return model
# Example 2
def train(model=None, noise=0., iterations=5000, l1reg=0.0, l2reg= 0.0, algorithm=nn.SGD,
          size_hidden=15, learning_rate=None, loss_type='cross_entropy',
          initial_context=pnas2018.ZEROS):
    """Train a goal-free ElmanGoalNet on the reward task.

    model: network to train; a fresh one is constructed when None.
    noise: std-dev of gaussian noise added to the context at each step.
    learning_rate: overrides the model's learning rate unless None.
    loss_type: pnas2018.MSE, pnas2018.CROSS_ENTROPY, or anything else for
        the model's own train() routine.
    Returns (model, rolling average of full-sequence accuracy).
    """
    if model is None:
        # Size the network to the reward task's input/output alphabets.
        model = nn.ElmanGoalNet(size_hidden=size_hidden, size_observation=len(rewardtask.all_inputs),
                                size_action=len(rewardtask.all_outputs), size_goal1=0, size_goal2=0,
                                algorithm=algorithm, initialization="normal")
    if learning_rate is not None:  # otherwise keep the model's own learning rate
        model.learning_rate = learning_rate
    model.L1_regularization = l1reg
    model.L2_regularization = l2reg
    num_episodes = iterations

    # Rolling performance trackers.
    avg_loss = 0.
    avg_actions = 0.
    avg_sequence = 0.

    report_points = {3 ** n for n in range(50)}

    for episode in range(num_episodes):
        model.new_episode(initial_context=initial_context)
        seq_idx = utils.idx_from_probabilities(rewardtask.sequence_probabilities)
        sequence = rewardtask.seqs[seq_idx]
        inputs = utils.liststr_to_onehot(sequence[:-1], rewardtask.all_inputs)
        targets = utils.liststr_to_onehot(sequence[1:], rewardtask.all_outputs)

        # Forward pass on a persistent tape (the training routine may need it).
        with tf.GradientTape(persistent=True) as tape:
            for step in range(len(targets)):
                model.action = np.zeros((1, model.size_action), dtype=np.float32)
                model.context += np.float32(np.random.normal(0., noise, size=(1, model.size_hidden)))
                model.feedforward(inputs[step].reshape(1, -1))

            # Per-episode correctness statistics.
            choices = np.array(model.h_action_wta).reshape((-1, len(targets[0])))
            ratios = scripts.evaluate([choices], [targets])

            # Pick the training routine matching the requested loss type.
            if loss_type == pnas2018.MSE:
                loss, _ = model.train_MSE(targets, None, None, tape)
            elif loss_type == pnas2018.CROSS_ENTROPY:
                loss, _ = model.train_obsolete(targets, None, None, tape)
            else:
                loss, _ = model.train(tape, targets)
        del tape  # release the persistent tape explicitly

        # Rolling averages: fast early adaptation, then a slow fixed rate.
        speed = 2. / (episode + 2) if episode < 1000 else 0.001
        avg_loss = utils.rolling_avg(avg_loss, loss, speed)
        avg_actions = utils.rolling_avg(avg_actions, ratios[0], speed)
        avg_sequence = utils.rolling_avg(avg_sequence, ratios[0] == 1, speed)  # whole sequence correct?

        # Periodic console report.
        if (episode < 1000 and episode in report_points) or episode % 1000 == 0 \
                or episode + 1 == num_episodes:
            print(
                "{0}: avg loss={1}, \tactions={2}, \tfull_sequence={3}".format(
                    episode, avg_loss, avg_actions, avg_sequence))
    return model, avg_sequence
# Example 3
def train_predictive_net(model=None, iterations=5000, learning_rate=0.1, algorithm=nn.RMSPROP, hidden_units=15):
    """Train a PredictiveNet that outputs next actions and next-observation
    predictions on the PNAS 2018 task.

    model: network to train; a fresh PredictiveNet is built when None.
    iterations: number of training episodes.
    learning_rate: learning rate assigned to the model.
    algorithm: optimizer used when constructing a new network.
    hidden_units: hidden-layer size when constructing a new network.
    Returns the trained model.
    """
    if model is None:
        model = nn.PredictiveNet(size_hidden=hidden_units, algorithm=algorithm,
                                 size_observation=len(pnas2018task.all_inputs),
                                 size_action=len(pnas2018task.all_outputs))
    num_episodes = iterations
    model.learning_rate = learning_rate

    # Rolling averages, used only for console monitoring.
    rng_avg_loss = 0.
    rng_avg_actions = 0.
    rng_avg_full_seq = 0.
    rng_avg_preds = 0.

    for episode in range(num_episodes):
        seqid = utils.idx_from_probabilities(pnas2018task.sequence_probabilities)
        sequence = pnas2018task.seqs[seqid]
        inputs = utils.liststr_to_onehot(sequence[:-1], pnas2018task.all_inputs)
        action_targets = utils.liststr_to_onehot(sequence[1:], pnas2018task.all_outputs)
        prediction_targets = utils.liststr_to_onehot(sequence[1:], pnas2018task.all_inputs)
        model.action = np.zeros((1, model.size_action), dtype=np.float32)
        model.prediction_linear = np.zeros((1, model.size_observation), dtype=np.float32)  # initial prediction = 0
        # Run the network; the context starts at zero every episode.
        with tf.GradientTape() as tape:
            model.context = np.zeros((1, model.size_hidden), dtype=np.float32)
            for i in range(len(action_targets)):
                model.action = np.zeros((1, model.size_action), dtype=np.float32)
                observation = inputs[i].reshape(1, -1)
                model.feedforward(observation)

            # Correctness statistics for the winner-take-all actions and predictions.
            tchoices = np.array(model.h_action_wta).reshape((-1, len(action_targets[0])))    # reshape to (x, 8)
            ratios = scripts.evaluate([tchoices], [action_targets])
            tpreds = np.array(model.h_prediction_wta).reshape((-1, len(prediction_targets[0])))
            ratios_predictions = scripts.evaluate([tpreds], [prediction_targets])

            # Train on both heads. NOTE: targets and predictions are identical for this task!!!
            loss, _ = model.train(tape, [action_targets, prediction_targets])

        # Monitor progress using rolling averages (fast early, 0.001 after episode 1000).
        speed = 2. / (episode + 2) if episode < 1000 else 0.001
        rng_avg_loss = utils.rolling_avg(rng_avg_loss, loss, speed)
        rng_avg_actions = utils.rolling_avg(rng_avg_actions, ratios[0], speed)
        rng_avg_preds = utils.rolling_avg(rng_avg_preds, ratios_predictions[0], speed)
        rng_avg_full_seq = utils.rolling_avg(rng_avg_full_seq, ratios[0] == 1, speed)  # whole action sequence correct ?
        # Display on the console at regular intervals.
        # FIX: gradient statistics (grad_avg / grad_max) were previously
        # computed here but never printed; the dead computation was removed.
        if (episode < 1000 and episode in [3 ** n for n in range(50)]) or episode % 1000 == 0 \
                or episode + 1 == num_episodes:
            print("{0}: avg loss={1}, \tactions={2}, \tfull_seq={3}, \tpredictions={4}".format(
                    episode, rng_avg_loss, rng_avg_actions, rng_avg_full_seq, rng_avg_preds))

    return model
# Example 4
def train_with_goals(model=None,
                     mse=False,
                     learning_rate=0.1,
                     noise=0.,
                     iterations=5000,
                     l2reg=0.0,
                     algorithm=nn.SGD,
                     hidden_units=15,
                     reg_strength=0.,
                     reg_increase="square"):
    """Train an ElmanGoalNet with goal units, adding structural weight
    regularization that encourages a goal/action split in the hidden layer.

    model: network to train; a fresh ElmanGoalNet is built when None.
    mse: when True, the learning rate is forced to 0.5.
    noise: std-dev of gaussian noise added to the context at every step.
    reg_strength, reg_increase: forwarded to
        pnashierarchy.weight_regularization_calculator.
    Returns the trained model.
    """
    num_goals = 2
    if model is None:
        model = nn.ElmanGoalNet(size_hidden=hidden_units,
                                algorithm=algorithm,
                                size_observation=len(all_inputs),
                                size_action=len(all_inputs),
                                size_goal1=num_goals,
                                size_goal2=0)
    num_episodes = iterations
    model.learning_rate = 0.5 if mse else learning_rate
    model.L2_regularization = l2reg

    # Rolling averages, used only for console monitoring.
    rng_avg_loss = 0.
    rng_avg_actions = 0.
    rng_avg_goals = 0.

    for episode in range(num_episodes):
        # Sample a sequence with fixed probabilities 60% / 20% / 20%.
        decider = np.random.uniform()
        if decider < 0.6:
            seqid = 0
        elif decider < 0.8:
            seqid = 1
        else:
            seqid = 2

        sequence = seqs[seqid]
        goal = goals[seqid]
        inputs = utils.liststr_to_onehot(sequence[:-1], all_inputs)
        targets = utils.liststr_to_onehot(sequence[1:], all_outputs)
        # FIX: removed the unused local `targets_goal1` (dead assignment).
        model.action = np.zeros((1, model.size_action), dtype=np.float32)
        # Run the network, recording on the tape.
        with tf.GradientTape() as tape:
            # Context starts at zero; clamp the goal unit to the target goal.
            model.context = np.zeros((1, model.size_hidden), dtype=np.float32)
            model.goal1 = goal[0]
            for i in range(len(targets)):
                model.action = np.zeros((1, model.size_action),
                                        dtype=np.float32)
                # Inject gaussian noise into the recurrent context.
                model.context += np.float32(
                    np.random.normal(0., noise, size=(1, model.size_hidden)))
                observation = inputs[i].reshape(1, -1)
                model.feedforward(observation)

            # Per-episode correctness statistics on winner-take-all actions.
            tchoices = np.array(model.h_action_wta).reshape(
                (-1, len(targets[0])))
            ratios = scripts.evaluate([tchoices], [targets])

            cols = model.size_hidden
            # Structural regularization of the hidden-layer weights:
            # recurrent hidden-to-hidden connections.
            extra_loss = pnashierarchy.weight_regularization_calculator(
                model.hidden_layer.w, [0, model.size_hidden], [0, cols],
                reg_strength,
                reg_type="recurrent",
                reg_increase=reg_increase)
            # Previous goal to hidden; rows sit after the hidden units,
            # 9 observation units, and the action units.
            extra_loss += pnashierarchy.weight_regularization_calculator(
                model.hidden_layer.w, [
                    model.size_hidden + 9 + model.size_action,
                    model.size_hidden + 9 + model.size_action + num_goals
                ], [0, cols],
                reg_strength,
                reg_type="input_left",
                reg_increase=reg_increase)

            # Output-side regularization (output-left/right deliberately
            # switched relative to input-side, per the original comment).
            # Hidden to next action.
            extra_loss += pnashierarchy.weight_regularization_calculator(
                model.action_layer.w, [0, model.size_hidden],
                [0, model.size_action],
                reg_strength,
                reg_type="output_right",
                reg_increase=reg_increase)
            # Hidden to next goal.
            # NOTE(review): the column range is [0, model.size_action] even
            # though goal1_layer has num_goals outputs -- confirm intended.
            extra_loss += pnashierarchy.weight_regularization_calculator(
                model.goal1_layer.w, [0, model.size_hidden],
                [0, model.size_action],
                reg_strength,
                reg_type="output_left",
                reg_increase=reg_increase)

            loss, _ = model.train_obsolete(targets, goal, None, tape,
                                           extra_loss)
        # Monitor progress using rolling averages (fast early, slow later).
        speed = 2. / (
            episode + 2
        ) if episode < 1000 else 0.001
        rng_avg_loss = utils.rolling_avg(rng_avg_loss, loss, speed)
        rng_avg_actions = utils.rolling_avg(rng_avg_actions, ratios[0], speed)
        rng_avg_goals = utils.rolling_avg(
            rng_avg_goals, ratios[0] == 1,
            speed)  # whole action sequence correct ?
        # Display on the console at regular intervals.
        if (episode < 1000 and episode in [3 ** n for n in range(50)]) or episode % 1000 == 0 \
                or episode + 1 == num_episodes:
            print(
                "{0}: avg loss={1}, \tactions={2}, \tfull_sequence={3}".format(
                    episode, rng_avg_loss, rng_avg_actions, rng_avg_goals))
    return model
# Example 5
def train(model=None,
          mse=False,
          noise=0.,
          iterations=5000,
          l2reg=0.0,
          learning_rate=0.1,
          algorithm=nn.SGD,
          hidden_units=15):
    """Train a goal-free ElmanGoalNet and report gradient statistics.

    model: network to train; a fresh one is constructed when None.
    mse: use the MSE training routine instead of cross-entropy.
    noise: std-dev of gaussian noise added to the context each step.
    Returns the trained model.
    """
    if model is None:
        model = nn.ElmanGoalNet(size_hidden=hidden_units,
                                algorithm=algorithm,
                                size_observation=len(all_inputs),
                                size_action=len(all_inputs),
                                size_goal1=0,
                                size_goal2=0)
    num_episodes = iterations
    model.learning_rate = learning_rate
    model.L2_regularization = l2reg

    # Rolling performance trackers.
    avg_loss = 0.
    avg_actions = 0.
    avg_full_seq = 0.

    report_points = {3 ** n for n in range(50)}

    for episode in range(num_episodes):
        seq_idx = utils.idx_from_probabilities(sequence_probabilities)
        sequence = seqs[seq_idx]
        inputs = utils.liststr_to_onehot(sequence[:-1], all_inputs)
        targets = utils.liststr_to_onehot(sequence[1:], all_outputs)
        model.action = np.zeros((1, model.size_action), dtype=np.float32)

        # Forward pass recorded on the tape; context starts at zero.
        with tf.GradientTape() as tape:
            model.context = np.zeros((1, model.size_hidden), dtype=np.float32)
            for step in range(len(targets)):
                model.action = np.zeros((1, model.size_action), dtype=np.float32)
                model.context += np.float32(
                    np.random.normal(0., noise, size=(1, model.size_hidden)))
                model.feedforward(inputs[step].reshape(1, -1))

            # Per-episode correctness statistics.
            choices = np.array(model.h_action_wta).reshape((-1, len(targets[0])))
            ratios = scripts.evaluate([choices], [targets])

            # Train with the requested loss, keeping gradients for reporting.
            trainer = model.train_MSE if mse else model.train_obsolete
            loss, gradients = trainer(targets, None, None, tape)

        # Rolling averages: fast early adaptation, then a slow fixed rate.
        speed = 2. / (episode + 2) if episode < 1000 else 0.001
        avg_loss = utils.rolling_avg(avg_loss, loss, speed)
        avg_actions = utils.rolling_avg(avg_actions, ratios[0], speed)
        avg_full_seq = utils.rolling_avg(avg_full_seq, ratios[0] == 1, speed)  # whole sequence correct?

        # Periodic console report, including gradient magnitude statistics.
        if (episode < 1000 and episode in report_points) or episode % 1000 == 0 \
                or episode + 1 == num_episodes:
            # Mean and max of |gradient| across all trainable tensors.
            abs_sums = [tf.reduce_sum(tf.abs(g)).numpy() for g in gradients]
            sizes = [tf.size(g).numpy() for g in gradients]
            grad_avg = sum(abs_sums) / sum(sizes)
            grad_max = max(tf.reduce_max(tf.abs(g)).numpy() for g in gradients)
            print(
                "{0}: avg loss={1}, \tactions={2}, \tfull_seq={3}, \tgrad_avg={4}, \tgrad_max={5}"
                .format(episode, avg_loss, avg_actions,
                        avg_full_seq, grad_avg, grad_max))

    return model
# Example 6
def train_supervised_teacoffeeenv(model, environment, num_episodes):
    """Supervised (teacher-forced) training on the tea/coffee environment.

    Cycles deterministically through a fixed goal list.  At each step the
    model's previous-step action/goal inputs are clamped to the *target*
    values, and the environment is advanced with the target action rather
    than the model's own choice.  Progress is printed periodically; nothing
    is returned.
    """
    env = environment

    # Rolling averages, used only for console monitoring.
    rng_avg_loss = 0.
    rng_avg_actions = 0.
    rng_avg_goals = 0.
    rng_avg_goal1 = 0.
    rng_avg_goal2 = 0.
    rng_avg_action1 = 0.  # accuracy on the first action of the sequence only

    goal_list = [
        "g_1_make_coffee", "g_1_make_tea", "g_2_add_grounds", "g_2_add_cream",
        "g_2_add_sugar", "g_2_drink", "g_2_dip_teabag"
    ]
    for episode in range(num_episodes):
        goal = goal_list[episode % len(goal_list)]  # Cycle through the goals
        targets = tce.target_list[goal]  # Get the target actions and goals
        env.state = state.State(tce.TeaCoffeeData())  # Reinitialize the state
        env.state.current.set_field(goal, 1.)  # Set the goal as active

        # run the network
        with tf.GradientTape() as tape:
            # Initialize context with random/uniform values.
            #model.context = np.zeros((1, model.size_hidden))
            model.context = np.float32(
                np.random.uniform(0.01, 0.99, (1, model.size_hidden)))
            targets_onehot = [[], [], []]  # actions, goal1s, goal2s
            for i, target in enumerate(targets):
                # Set up the input to be the correct actions and goals
                # (teacher forcing: targets[i] is an (action, goal1, goal2) triple).
                targets_onehot[0].append(
                    utils.str_to_onehot(targets[i][0],
                                        tce.TeaCoffeeData.actions_list))
                targets_onehot[1].append(
                    utils.str_to_onehot(targets[i][1],
                                        tce.TeaCoffeeData.goals1_list))
                targets_onehot[2].append(
                    utils.str_to_onehot(targets[i][2],
                                        tce.TeaCoffeeData.goals2_list))
                model.action, model.goal1, model.goal2 = [
                    targets_onehot[j][-1] for j in range(3)
                ]

                observation = env.observe()
                model.feedforward(observation)
                env.do_action(
                    target[0]
                )  # Transition the MDP according to the *target* action, not the chosen action!

            # Get some statistics about what was correct and what wasn't
            ratios = evaluate(
                [model.h_action_wta, model.h_goal1_wta, model.h_goal2_wta],
                targets_onehot)
            rng_avg_action1 = utils.rolling_avg(
                rng_avg_action1,
                ratio_correct([model.h_action_wta[0]], [targets_onehot[0][0]]),
                2. / (episode + 2) if episode < 1000 else 0.001)
            # Train model, record loss.
            # NOTE(review): other training loops unpack (loss, _) from
            # train_obsolete -- confirm the return type used here.
            loss = model.train_obsolete(targets_onehot[0], targets_onehot[1],
                                        targets_onehot[2], tape)

            # Monitor progress using rolling averages.
            speed = 2. / (
                episode + 2
            ) if episode < 1000 else 0.001  # enables more useful evaluations for early trials
            rng_avg_loss = utils.rolling_avg(rng_avg_loss, loss, speed)
            rng_avg_actions = utils.rolling_avg(rng_avg_actions, ratios[0],
                                                speed)
            rng_avg_goals = utils.rolling_avg(
                rng_avg_goals, ratios[0] == 1,
                speed)  # whole action sequence correct ?
            rng_avg_goal1 = utils.rolling_avg(rng_avg_goal1, ratios[1], speed)
            rng_avg_goal2 = utils.rolling_avg(rng_avg_goal2, ratios[2], speed)
            # Display on the console at regular intervals
            if (episode < 1000 and episode in [3**n for n in range(50)]) or episode % 1000 == 0 \
                               or episode+1 == num_episodes:
                print(
                    "{0}: avg loss={1}, \tactions={2}, \tfull_sequence={3}\tgoal1={4}\tgoal2={5}\tfirst_action={6}"
                    .format(episode, rng_avg_loss, rng_avg_actions,
                            rng_avg_goals, rng_avg_goal1, rng_avg_goal2,
                            rng_avg_action1))
# Example 7
def train_supervised(model, env, num_episodes):
    """Supervised (teacher-forced) training on a sequence environment.

    Each episode draws a random sequence from the environment, clamps the
    model's previous-step action/goal inputs to the target values, and
    advances the environment with the *target* action rather than the
    model's own choice.  Progress is printed periodically; nothing is
    returned.

    Fixes two defects in the previous version: model.goal2 was never set
    (model.goal1 was assigned twice), and targets_onehot was used without
    ever being built, which raised NameError.
    """
    # Rolling averages, used only for console monitoring.
    rng_avg_loss = 0.
    rng_avg_actions = 0.
    rng_avg_goals = 0.
    rng_avg_goal1 = 0.
    rng_avg_goal2 = 0.
    rng_avg_action1 = 0.  # accuracy on the first action of the sequence only

    for episode in range(num_episodes):
        # run the network
        with tf.GradientTape() as tape:
            # Initialize model context with zeros.
            model.context = np.zeros((1, model.size_hidden))

            # Initialize environment with a random sequence
            sequence = random.choice(env.sequences)
            sequence.initialize(
            )  # This is a bit weird: it's the environment that is getting initialized, not the sequence

            # Collect the targets as we go so they can be evaluated/trained on below.
            targets_onehot = [[], [], []]  # actions, goal1s, goal2s
            for target in sequence.targets:
                # Set up the input to be the correct actions and goals
                model.action = target.action_one_hot
                model.goal1 = target.goal1_one_hot if target.goal1_one_hot is not None else zeros
                # FIX: the original assigned goal1 twice; the second
                # assignment must drive goal2.
                model.goal2 = target.goal2_one_hot if target.goal2_one_hot is not None else zeros
                targets_onehot[0].append(model.action)
                targets_onehot[1].append(model.goal1)
                targets_onehot[2].append(model.goal2)

                observation = env.observe()
                model.feedforward(observation)
                # NOTE(review): target[0] is assumed to yield the action
                # label, as in the tea/coffee variant -- confirm the target
                # object supports indexing.
                env.do_action(
                    target[0]
                )  # Transition the MDP according to the *target* action, not the chosen action!

            # Get some statistics about what was correct and what wasn't
            ratios = evaluate(
                [model.h_action_wta, model.h_goal1_wta, model.h_goal2_wta],
                targets_onehot)
            rng_avg_action1 = utils.rolling_avg(
                rng_avg_action1,
                ratio_correct([model.h_action_wta[0]], [targets_onehot[0][0]]),
                2. / (episode + 2) if episode < 1000 else 0.001)
            # Train model, record loss.
            loss = model.train_obsolete(targets_onehot[0], targets_onehot[1],
                                        targets_onehot[2], tape)

            # Monitor progress using rolling averages.
            speed = 2. / (
                episode + 2
            ) if episode < 1000 else 0.001  # enables more useful evaluations for early trials
            rng_avg_loss = utils.rolling_avg(rng_avg_loss, loss, speed)
            rng_avg_actions = utils.rolling_avg(rng_avg_actions, ratios[0],
                                                speed)
            rng_avg_goals = utils.rolling_avg(
                rng_avg_goals, ratios[0] == 1,
                speed)  # whole action sequence correct ?
            rng_avg_goal1 = utils.rolling_avg(rng_avg_goal1, ratios[1], speed)
            rng_avg_goal2 = utils.rolling_avg(rng_avg_goal2, ratios[2], speed)
            # Display on the console at regular intervals
            if (episode < 1000 and episode in [3**n for n in range(50)]) or episode % 1000 == 0 \
                               or episode+1 == num_episodes:
                print(
                    "{0}: avg loss={1}, \tactions={2}, \tfull_sequence={3}\tgoal1={4}\tgoal2={5}\tfirst_action={6}"
                    .format(episode, rng_avg_loss, rng_avg_actions,
                            rng_avg_goals, rng_avg_goal1, rng_avg_goal2,
                            rng_avg_action1))
# Example 8
def train_hierarchical_nogoals(noise=0,
                               iterations=10000,
                               learning_rate=0.1,
                               reg_strength=0.001,
                               reg_increase="linear"):
    """Train a goal-free ElmanGoalNet on the PNAS 2018 task with structural
    weight regularization on the recurrent and action-output weights.

    noise: std-dev of gaussian noise added to the recurrent context each step.
    iterations: number of training episodes.
    learning_rate: learning rate assigned to the model.
    reg_strength, reg_increase: forwarded to
        utils.weight_regularization_calculator.
    Returns the trained model.
    """
    model = nn.ElmanGoalNet(size_hidden=15,
                            size_observation=9,
                            size_action=8,
                            size_goal1=0,
                            size_goal2=0)
    num_episodes = iterations
    model.learning_rate = learning_rate
    model.L2_regularization = 0.

    # Rolling averages, used only for console monitoring.
    rng_avg_loss = 0.
    rng_avg_actions = 0.
    rng_avg_goals = 0.

    for episode in range(num_episodes):
        # new_episode() resets per-episode state (context is NOT re-zeroed here,
        # unlike the goal-based variant above).
        model.new_episode()
        seqid = utils.idx_from_probabilities(
            pnas2018task.sequence_probabilities)

        #goal = pnas2018task.goals[seqid]
        sequence = pnas2018task.seqs[seqid]
        inputs = utils.liststr_to_onehot(sequence[:-1],
                                         pnas2018task.all_inputs)
        targets = utils.liststr_to_onehot(sequence[1:],
                                          pnas2018task.all_outputs)
        model.action = np.zeros((1, model.size_action), dtype=np.float32)
        # run the network
        with tf.GradientTape() as tape:
            # Initialize context with random/uniform values.
            #model.context = np.zeros((1, model.size_hidden), dtype=np.float32)
            #model.goal1 = np.zeros_like(goal[0])
            for i in range(len(targets)):
                #model.action = np.zeros((1, model.size_action), dtype=np.float32)
                # Add noise
                model.context += np.float32(
                    np.random.normal(0., noise, size=(1, model.size_hidden)))
                observation = inputs[i].reshape(1, -1)
                model.feedforward(observation)

            # Get some statistics about what was correct and what wasn't
            tchoices = np.array(model.h_action_wta).reshape(
                (-1, len(targets[0])))
            ratios = scripts.evaluate([tchoices], [targets])
            # Train model, record loss.
            cols = model.size_hidden
            # Regularization in the hidden layer weights
            # Recurrent hidden to hidden connections
            extra_loss = utils.weight_regularization_calculator(
                model.hidden_layer.w, [0, model.size_hidden], [0, cols],
                reg_strength,
                reg_type="recurrent",
                reg_increase=reg_increase)
            # Prev action to hidden
            #extra_loss += weight_regularization_calculator(model.hidden_layer.w,
            #                                                     [model.size_hidden+9, model.size_hidden+9+model.size_action], [0, cols],
            #                                                     reg_strength, reg_type="input_right", reg_increase=reg_increase)
            # Prev goal to hidden
            #extra_loss += weight_regularization_calculator(model.hidden_layer.w,
            #                                                     [model.size_hidden+9+model.size_action, model.size_hidden+9+model.size_action+2], [0, cols],
            #                                                     reg_strength, reg_type="input_left", reg_increase=reg_increase)

            #Regularization in the output layers (goals and actions) weights
            # hidden to next action
            extra_loss += utils.weight_regularization_calculator(
                model.action_layer.w, [0, model.size_hidden],
                [0, model.size_action],
                reg_strength,
                reg_type="output_right",
                reg_increase=reg_increase)

            # Hidden to next goal
            #extra_loss += weight_regularization_calculator(model.goal1_layer.w,
            #                                                    [0, model.size_hidden], [0, model.size_action],
            #                                                     reg_strength, reg_type="output_left", reg_increase=reg_increase)

            # Regularization of the observation (only goes to the action side)
            #extra_loss += weight_regularization_calculator(model.hidden_layer.w,
            #                                                     [model.size_hidden, model.size_hidden+model.size_observation],
            #                                                     [0, cols],
            #                                                     reg_strength, reg_type="input_right", reg_increase=reg_increase)

            # Cross-entropy training with the structural penalty added on.
            loss, _ = model.train_obsolete(targets, None, None, tape,
                                           extra_loss)
            #if(episode%100 == 0):
            #    print(loss.numpy()-extra_loss.numpy(), extra_loss.numpy())
        # Monitor progress using rolling averages.
        speed = 2. / (
            episode + 2
        ) if episode < 1000 else 0.001  # enables more useful evaluations for early trials
        rng_avg_loss = utils.rolling_avg(rng_avg_loss, loss, speed)
        rng_avg_actions = utils.rolling_avg(rng_avg_actions, ratios[0], speed)
        rng_avg_goals = utils.rolling_avg(
            rng_avg_goals, ratios[0] == 1,
            speed)  # whole action sequence correct ?
        # Display on the console at regular intervals
        if (episode < 1000 and episode in [3 ** n for n in range(50)]) or episode % 1000 == 0 \
                or episode + 1 == num_episodes:
            print(
                "{0}: avg loss={1}, \tactions={2}, \tfull_sequence={3}".format(
                    episode, rng_avg_loss, rng_avg_actions, rng_avg_goals))
    return model