# Keras placeholder for the chosen-action vector fed into the critic network.
# NOTE(review): assumes `num_actions`, `tf`, `np`, and the constants used below
# (MODEL, MODE, BATCH_SIZE, ALPHA) are defined earlier in this file — confirm.
act_input = tf.keras.Input((num_actions, ), name='actions_input')

if MODEL == 'DRL':
    # Create an environment
    # Presumably wraps the training corpus for episodic interaction — verify
    # against the Environment class definition.
    ENV = Environment(input_train, y_train)

    # Actor-Critic
    # Train
    # Critic: scores (state, action) pairs; trained with MSE on Q targets.
    critic_model = Network(num_actions, embeddings_matrix, maxlen=maxlen)
    # Build the model once so its layers/inputs exist before compiling.
    # NOTE(review): `inputs` here must be the state input tensor defined
    # earlier in the file — confirm it is in scope.
    _ = critic_model(inputs=[inputs, act_input])
    critic_model.compile(loss='mse', optimizer='adam')

    # Decision
    # Actor view: shares the critic's weights but exposes the raw per-action
    # Q-values layer ('q_outputs') so actions can be selected by argmax.
    actor_q_model = tf.keras.Model(
        inputs=critic_model.input,
        outputs=critic_model.get_layer('q_outputs').output)

    if MODE == 'train':
        def train(samples):
            """One critic update from a replay buffer.

            samples: iterable of (state, action, old_q, reward, next_state)
            tuples — presumably appended by the acting loop elsewhere in
            this file; verify the tuple order against the caller.
            Returns None; updates `critic_model` in place (continuation of
            this function lies beyond this chunk).
            """
            # Skip the update until enough experience has been collected.
            if len(samples) < BATCH_SIZE:
                return
            samples = np.array(samples)
            # Transpose the batch: one sequence per tuple field.
            states, actions, old_q, rewards, next_states = zip(*samples)
            # Column vectors (-1, 1) so arithmetic below broadcasts per-sample.
            states, actions, old_q, rewards = np.array(states), np.array(actions).reshape(-1, 1),\
                np.array(old_q).reshape(-1, 1), np.array(rewards).reshape(-1, 1)
            # One-hot action mask matching the critic's `actions_input`.
            actions_one_hot = tf.keras.utils.to_categorical(
                actions, num_actions)
            # Soft Q target: blend old estimate with observed reward signal
            # (ALPHA is the learning-rate-style mixing coefficient).
            # NOTE: source chunk is truncated mid-expression here; the
            # remainder of this statement and of train() is outside this view.
            q_estimate = (1 - ALPHA) * old_q + ALPHA * rewards.reshape(