Example #1
    # Symbolic input for the one-hot encoded actions (length num_actions)
    act_input = tf.keras.Input((num_actions,), name='actions_input')
    if MODEL == 'DRL':
        # Create an environment
        ENV = Environment(input_train, y_train)

        # Actor-Critic

        # Train: the critic takes the state inputs plus a one-hot action
        # and is trained with mean-squared error on the Q-values
        critic_model = Network(num_actions, embeddings_matrix, maxlen=maxlen)
        _ = critic_model(inputs=[inputs, act_input])  # call once so the layers are built
        critic_model.compile(loss='mse', optimizer='adam')

        # Decision: a second model that shares the critic's layers and exposes
        # the intermediate 'q_outputs' layer for scoring actions
        actor_q_model = tf.keras.Model(
            inputs=critic_model.input,
            outputs=critic_model.get_layer('q_outputs').output)

        if MODE == 'train':

            def train(samples):
                # Wait until a full batch of transitions has been collected
                if len(samples) < BATCH_SIZE:
                    return

                # Unpack transitions: (state, action, old Q-value, reward, next state)
                samples = np.array(samples)
                states, actions, old_q, rewards, next_states = zip(*samples)
                states = np.array(states)
                actions = np.array(actions).reshape(-1, 1)
                old_q = np.array(old_q).reshape(-1, 1)
                rewards = np.array(rewards).reshape(-1, 1)

                # One-hot encode the actions for the critic's action input
                actions_one_hot = tf.keras.utils.to_categorical(
                    actions, num_actions)

                # Blend the stored Q-value with the new estimate using step size ALPHA
                q_estimate = (1 - ALPHA) * old_q + ALPHA * rewards.reshape(