import numpy as np
import tensorflow as tf
from keras import backend as K
# Database, Environment, ActorCritic, parse_cmd_args and the argus config
# dict come from this project's own modules.


def main():
    # try:
    parse_cmd_args()
    sess = tf.Session()
    K.set_session(sess)
    db = Database()
    env = Environment(db, argus)
    actor_critic = ActorCritic(env, sess,
                               learning_rate=argus['learning_rate'],
                               train_min_size=argus['train_min_size'],
                               size_mem=argus['maxlen_mem'],
                               size_predict_mem=argus['maxlen_predict_mem'])

    num_trials = argus['num_trial']  # ?
    # trial_len = 500  # ?
    # ntp
    env.preheat()

    # First iteration: take one bootstrap step so the replay memory is non-empty.
    cur_state = env._get_obs()  # np.array (inner_metric + sql)
    cur_state = cur_state.reshape((1, env.state.shape[0]))
    # action = env.action_space.sample()
    action = env.fetch_action()  # np.array
    action_2 = action.reshape((1, env.action_space.shape[0]))  # for memory
    # Apply the action -> wait for a steady state -> return the reward.
    new_state, reward, done, _ = env.step(action, 0, 1)
    new_state = new_state.reshape((1, env.state.shape[0]))
    reward_np = np.array([reward])
    print("0-shape-")
    print(new_state.shape)
    actor_critic.remember(cur_state, action_2, reward_np, new_state, done)
    actor_critic.train()  # len < 32, useless
    cur_state = new_state

    for i in range(num_trials):
        # env.render()
        cur_state = cur_state.reshape((1, env.state.shape[0]))
        action, isPredicted = actor_critic.act(cur_state)
        print(action)
        action_2 = action.reshape((1, env.action_space.shape[0]))  # for memory
        # action.tolist()  # to execute
        new_state, reward, done, _ = env.step(action, isPredicted, i + 1)
        new_state = new_state.reshape((1, env.state.shape[0]))
        reward_np = np.array([reward])
        print("%d-shape-" % i)
        print(new_state.shape)
        actor_critic.remember(cur_state, action_2, reward_np, new_state, done)
        actor_critic.train()
        cur_state = new_state
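    # The remember()/train() calls follow a standard experience-replay loop:
    # each (state, action, reward, next_state, done) transition is appended
    # to a bounded memory (capped by maxlen_mem / maxlen_predict_mem), and
    # train() presumably fits the actor and critic on mini-batches only once
    # the memory holds at least train_min_size samples -- hence the
    # "len < 32, useless" note on the first call above.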
    '''
    sess = tf.Session()
    K.set_session(sess)
    db = Database(argus)  # connector knobs metric
    env = Environment(db, argus)
    actor_critic = ActorCritic(env, sess,
                               learning_rate=argus['learning_rate'],
                               train_min_size=argus['train_min_size'],
                               size_mem=argus['maxlen_mem'],
                               size_predict_mem=argus['maxlen_predict_mem'])

    num_trials = argus['num_trial']  # ?
    # trial_len = 500  # ?
    # ntp
    env.preheat()

    # First iteration
    cur_state = env._get_obs()  # np.array (inner_metric + sql)
    cur_state = cur_state.reshape((1, env.state.shape[0]))
    # action = env.action_space.sample()
    action = env.fetch_action()  # np.array
    action_2 = action.reshape((1, env.action_space.shape[0]))  # for memory
    # Apply the action -> wait for a steady state -> return the reward.
    new_state, reward, done, score, _ = env.step(action, 0, 1)
    new_state = new_state.reshape((1, env.state.shape[0]))
    reward_np = np.array([reward])
    print("0-shape")
    print(new_state.shape)
    actor_critic.remember(cur_state, action_2, reward_np, new_state, done)
    actor_critic.train()  # len < 32, useless
    cur_state = new_state

    predicted_rewardList = []