Example #1
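
# Hierarchical setup: a meta-level DQN (MetaAgent) picks an option (intent)
# each step, and a pre-trained controller DQN (option_agent) executes the
# primitive actions for that option until it completes.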
def main():
    epsilon = 1
    env = MetaEnvMulti()  # TODO
    EPISODES = args['episodes']
    a = str(datetime.now()).split('.')[0]

    MetaAgent = DQNAgent(state_size=META_STATE_SIZE,
                         action_size=META_OPTION_SIZE,
                         hiddenLayers=[75],
                         dropout=args['dropout'],
                         activation='relu',
                         loadname=None,
                         saveIn=False,
                         learningRate=args['learning_rate'],
                         discountFactor=args['discount_factor'])

    filename = args['save_folder']
    if 'meta_weights' not in args:
        filename = "{}{}_Meta_HiddenLayers_{}_Dropout_{}_LearningRate_{}_Gamma_{}_Activation_{}_Episode_{}_single_nn_policy{}.h5".format(
            filename, a, str(MetaAgent.hiddenLayers), str(MetaAgent.dropout),
            str(MetaAgent.learning_rate), str(MetaAgent.gamma),
            MetaAgent.activation, str(EPISODES), args['note_file'])
    else:
        filename = filename + args['meta_weights']

    # See if the user has given a hidden layer configuration for option_agent
    nodes_hidden = [75]  # default value
    if 'controller_hidden' in args:
        controller_hidden_config = args['controller_hidden']
        # extract the layer sizes from the underscore-separated string
        nodes_hidden = [
            int(node) for node in controller_hidden_config.split('_')
        ]
    # Load the controller agent (a single shared policy in this case)
    option_agent: DQNAgent = DQNAgent(
        state_size=CONTROLLER_STATE_SIZE,
        action_size=CONTROLLER_ACTION_SIZE,
        hiddenLayers=nodes_hidden,
        dropout=0.000,
        activation='relu',
        loadname=None,
        saveIn=False,
        learningRate=0.05,
        discountFactor=0.7,
        epsilon=0.0)  # Not making an agent for the user-based actions
    # Load the weights for all the controller policies
    option_agent.load(args['controller_weights'])
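    # Note: the controller is used purely for inference in this script
    # (epsilon=0 and no replay updates); only the meta policy is trained.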

    visits = np.zeros([META_OPTION_SIZE])  # Number of visits of each intent type
    batch_size = 64
    track = []
    i = 0
    no_controller_breaks = 0
    # configuration file recording all experiment details along with the output file names
    config_file = '{}{}.txt'.format(args['config_folder'], a)

    with open(config_file, 'w') as fil:
        fil.write(str(args))
        fil.write('\n')
        fil.write("meta_policy_file : {}".format(filename))

    for episode in range(EPISODES):  # Episode
        running_meta_reward = 0
        [confidence_state, intent_state] = env.reset()
        done = False  # Running the meta policy
        while not done:  # Meta-policy episode loop
            # print("Round Meta : {}".format(episode))  # Probably not required

            all_options = env.constrain_options()
            state = np.concatenate([confidence_state, intent_state])

            state = state.reshape(
                [1, META_STATE_SIZE])  # Converted to the appropriate shape
            meta_start_state = state.copy()

            option = MetaAgent.act(state, all_options, epsilon=epsilon)
            next_confidence_state = env.meta_step_start(
                option)  # begin the option and get the updated confidence state
            meta_reward = 0
            print("The state : {}\nThe option : {}".format(
                meta_start_state, option))
            if option == 5:  # the user-agent option
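                # No learned controller for user actions; the environment
                # resolves this option in meta_step_end2 below.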
                pass
            else:
                #############################################################
                # HERE COMES THE PART FOR CONTROLLER EXECUTION
                option_completed = False
                # make a one hot goal vector
                goal_vector = utils.one_hot(option, NO_INTENTS)
                i_ = 0
                controller_state = np.concatenate(
                    [next_confidence_state, goal_vector])
                controller_state = controller_state.reshape(
                    1, CONTROLLER_STATE_SIZE)
                while not option_completed:
                    opt_actions = range(
                        CONTROLLER_ACTION_SIZE
                    )  # Currently this is the whole primitive-action space
                    action = option_agent.act(
                        controller_state, all_act=opt_actions,
                        epsilon=0)  # epsilon=0 gives a purely greedy controller
                    next_confidence_state, _, option_completed = env.controller_step(
                        option, action)
                    next_controller_state = np.concatenate(
                        [next_confidence_state, goal_vector])
                    next_controller_state = np.reshape(
                        next_controller_state, [1, CONTROLLER_STATE_SIZE])
                    # No need to store controller transitions in replay memory;
                    # the controller policy is not trained here
                    controller_state = next_controller_state
                    i_ += 1
                    if i_ > args['break_controller_loop']:
                        no_controller_breaks += 1
                        break

                ###############################################

            confidence_state, next_confidence_state, intent_state, meta_reward, done = env.meta_step_end2(
                option)

            meta_end_state = np.concatenate(
                [next_confidence_state, intent_state])
            meta_end_state = meta_end_state.reshape([1, META_STATE_SIZE])
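            # Store the meta-level transition and update the exploration rate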
            epsilon = MetaAgent.observe(
                (meta_start_state, option, meta_reward, meta_end_state, done),
                epsilon=epsilon)
            print("The next meta state : {}\n The reward : {}\nEpsilon : {}".
                  format(meta_end_state, meta_reward, epsilon))
            if MetaAgent.memory.tree.total() > batch_size:
                MetaAgent.replay()
                MetaAgent.rem_rew(meta_reward)
            i += 1
            running_meta_reward = running_meta_reward + meta_reward
            if i % 100 == 0:  # compute and log progress metrics every 100 time steps
                avr_rew = MetaAgent.avg_rew()
                track.append([
                    str(i) + " " + str(avr_rew) + " " + str(episode) + " " +
                    str(epsilon)
                ])
                with open("results_" + a + "_.txt", 'w') as fi:
                    for j in range(0, len(track)):
                        line = track[j]
                        fi.write(str(line).strip("[]''") + "\n")
            # print(track)
            if done:
                print(
                    "episode: {}/{}, score: {}, e's: {}\nNumber of Controller breaks : {}"
                    .format(episode, EPISODES, running_meta_reward, epsilon,
                            no_controller_breaks))

                print("The state is : ", meta_end_state)
                break

            confidence_state = next_confidence_state

        if episode % 200 == 0:
            print("Episodes : {}".format(episode))
            # Saving the progress
            print("Saving")
            # convert this to save model for each policy
            MetaAgent.save(filename)
            # agent.saveController(fileController)
            sleep(0.2)
            print("Done Saving You can Now Quit")
            sleep(1)
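
Example #2

# Flat (non-hierarchical) DQN baseline: a single agent acts directly in
# FlatEnv over the full action space, with no meta/controller split.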
def main():

    epsilon = args['epsilon']
    env = FlatEnv()
    EPISODES = args['episodes']
    a = str(datetime.now()).split('.')[0]
    hidden_layers = [int(i) for i in args['hidden_layers'].split('_')]
    Agent = DQNAgent(state_size=STATES_SIZE,
                     action_size=ACTION_SIZE,
                     hiddenLayers=hidden_layers,
                     dropout=args['dropout'],
                     activation='relu',
                     loadname=None,
                     saveIn=False,
                     learningRate=args['learning_rate'],
                     discountFactor=args['discount_factor'])
    filename = args['save_folder']

    filename = "{}{}_Flat_HiddenLayers_{}_Dropout_{}_LearningRate_{}_Gamma_{}_Activation_{}_Episode_{}_Flat_rl_policy_{}.h5".format(
        filename, a, str(Agent.hiddenLayers), str(Agent.dropout),
        str(Agent.learning_rate), str(Agent.gamma), Agent.activation,
        str(EPISODES), args['note_file'])

    batch_size = args['batch_size']
    track = []
    i = 0
    # configuration file recording all experiment details along with the output file names
    config_file = '{}{}.txt'.format(args['config_folder'], a)

    with open(config_file, 'w') as fil:
        fil.write(str(args))
        fil.write('\n')
        fil.write("Flat Policy File : {}".format(filename))

    for episode in range(EPISODES):  # Episode
        running_reward = 0
        [confidence_state, intent_state] = env.reset()
        done = False
        while not done:

            state = np.concatenate([confidence_state, intent_state])

            state = state.reshape(
                [1, STATES_SIZE])  # Converted to the appropriate shape
            bcolors.printblue("The State : {}".format(state))
            # here the "option" refers to the consolidated intent space of this iteration
            intent_set_completed = False
            i_ = 0
            while not intent_set_completed:
                all_actions = env.constrain_actions()
                action = Agent.act(
                    state, all_act=all_actions,
                    epsilon=epsilon)  # pass epsilon for epsilon-greedy selection
                confidence_state, intent_state, reward, intent_set_completed, done = env.step(
                    action)  # a normal (primitive) environment step
                next_state = np.concatenate([confidence_state, intent_state])
                next_state = np.reshape(next_state, [1, STATES_SIZE])
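                # Store the transition and update the exploration rate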
                epsilon = Agent.observe(
                    (state, action, reward, next_state, intent_set_completed),
                    epsilon=epsilon)
                if Agent.memory.tree.total() > batch_size:
                    Agent.replay()
                Agent.rem_rew(reward)
                running_reward += reward
                i += 1  # step counter used for the periodic logging below
                if i % 100 == 0:
                    avr_rew = Agent.avg_rew()
                    track.append([
                        str(i) + " " + str(avr_rew) + " " + str(episode) +
                        " " + str(epsilon)
                    ])
                    with open("results_" + a + "_.txt", 'w') as fi:
                        for j in range(0, len(track)):
                            line = track[j]
                            fi.write(str(line).strip("[]''") + "\n")
                # print(track)
                if intent_set_completed:
                    print("Moving to the next set of itnent : {}".format(
                        intent_state))
                    break
                state = next_state
            ##############################################
            if done:
                bcolors.printgreen(
                    "episode: {}/{}, Reward: {}, e's: {}".format(
                        episode,
                        EPISODES,
                        running_reward,
                        epsilon,
                    ))

                print("The state is : ", state)
                break

        if episode % 200 == 0:
            print("Episodes : {}".format(episode))
            # Saving the progress
            print("Saving")
            # convert this to save model for each policy
            Agent.save(filename)
            print("Done Saving You can Now Quit")
            sleep(0.5)