Example #1
                    help='Save the experiment at the end')
    args = parser.parse_args()
    try:
        N = int(args.training)
    except ValueError:
        if args.training == 'full':
            N = 60000
        else:
            print("Couldn't parse the number of training samples; using 10000")
            N = 10000
    max_epochs = 15
    max_epochs_ft = args.epochs
    if args.debug:
        N = N // 10
        max_epochs = 10
        max_epochs_ft = max(10, max_epochs_ft)

    model = Experience(N, name=args.name, disp=args.img, 
                          noise=args.noise)
    if not model.exists:
        model.pretrain(epochs=max_epochs, lr=0.1)
        model.save()
    print('Pretraining done\n')

    model.fine_tune(epochs=max_epochs_ft, lr=0.05,
                    dropout=args.dropout, lcost=args.lcost)
    
    model.eval_perf()

    model.save()
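
The listing starts in the middle of the argparse setup, so the parser itself is not shown. As a hypothetical reconstruction (option names, types, and defaults are guesses based only on the attributes the snippet reads), it might look like this:

import argparse

# Hypothetical parser; flag names and defaults are assumptions inferred from
# the attributes accessed above (args.training, args.epochs, args.debug, ...).
parser = argparse.ArgumentParser(description='Pretrain and fine-tune an Experience model')
parser.add_argument('--training', default='10000',
                    help="Number of training samples, or 'full' for 60000")
parser.add_argument('--epochs', type=int, default=30,
                    help='Number of fine-tuning epochs')
parser.add_argument('--name', default=None, help='Name used to save/load the model')
parser.add_argument('--img', action='store_true', help='Display images while training')
parser.add_argument('--noise', type=float, default=0.0, help='Input noise level')
parser.add_argument('--dropout', type=float, default=0.0, help='Dropout rate for fine-tuning')
parser.add_argument('--lcost', type=float, default=0.0, help='Regularization cost weight')
parser.add_argument('--debug', action='store_true', help='Run a reduced debug configuration')
parser.add_argument('--save', action='store_true',
                    help='Save the experiment at the end')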
Example #2
    def run_episode(self):
        #print('resetting\n')
        self.env.reset()
        done = False
        experiences = []
        time_count = 0

        while (not done) and time_count <= Config.TIME_MAX:

            # very first few frames
            if len(self.env.current_state) == 0:
                reward, done, p_mask, region_now = self.env.step(
                    np.random.randint(0, 4, 1))
                #print('reset \n')
                #self.env.step(5)  #0 == NOOP
                if Config.PLAY_MODE and not done:
                    draw_gif_sequences_test(time_count,
                                            region_now,
                                            self.env.img_name,
                                            save_boolean=1)
                elif Config.PLAY_MODE and done:
                    drawing_gif(self.env.img_name)

                time_count += 1

                continue

            #print('shape: ' + str(self.env.current_state) + '\n')
            prediction, value = self.predict(self.env.current_state)
            action = self.select_action(prediction)

            if time_count < Config.TIME_MAX:
                #action = 5
                reward, done, p_mask, region_now = self.env.step(action)
                exp = Experience(self.env.previous_state, action, reward,
                                 reward, done, p_mask)
                experiences.append(exp)

            else:
                reward, done, p_mask, region_now = self.env.step(4)
                exp = Experience(self.env.previous_state, 4, reward, reward,
                                 done, p_mask)
                experiences.append(exp)

            if Config.PLAY_MODE and not done:
                draw_gif_sequences_test(time_count,
                                        region_now,
                                        self.env.img_name,
                                        save_boolean=1)
            elif Config.PLAY_MODE and done:
                drawing_gif(self.env.img_name)

            if done:
                # done is True inside this branch, so terminal_reward is always 0
                terminal_reward = 0 if done else value
                updated_exps = ProcessAgent._accumulate_rewards(
                    experiences, self.discount_factor, terminal_reward)
                x_, r_, r0_, a_, p_mask_ = self.convert_data(updated_exps)
                #print('time: ' + str(time_count) + ', done: ' + str(done) + ', p_mask: ' + str(p_mask) + ', reward: ' + str(reward_all) + ', action:' + str(action) + ' ,a_:' + str(a_.shape) + '\n')
                # keep the last experience for the next batch
                experiences = []
                yield x_, r_, r0_, a_, p_mask_, self.env.img_name

            time_count += 1
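
Example #2 hands its buffered experiences to ProcessAgent._accumulate_rewards together with the discount factor and a terminal reward (0 here, since the yield only happens once done is True), but that helper is not part of the listing. A minimal sketch of the usual n-step discounted-return pass, assuming a mutable reward field on Experience and not claiming to be the project's actual implementation:

def accumulate_rewards(experiences, discount_factor, terminal_reward):
    # Hypothetical sketch: walk the buffer backwards and replace each stored
    # reward with the discounted return G_t = r_t + gamma * G_{t+1},
    # bootstrapped from terminal_reward (0 at episode end, otherwise a value estimate).
    reward_sum = terminal_reward
    for exp in reversed(experiences):
        reward_sum = exp.reward + discount_factor * reward_sum
        exp.reward = reward_sum
    return experiences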
Example #3
        for x in range(LEARNING_SAMPLE_SIZE):
            picked.append(self.storage[random.randint(0, self.index - 1)])
        return picked

    def print_storage(self, range_lower=None, range_upper=None):
        # For testing purposes
        if range_lower is None and range_upper is None:
            print(self.storage)
        else:
            print(self.storage[range_lower:range_upper])


if __name__ == "__main__":
    # Perform tests
    ds = ReplayMemory()
    for x in range(0, 3):
        y = Experience(x, x, x, x)
        ds.store(y)
    ds.print_storage(0, 4)
    print(ds.get_random(32))

# Numpy array results, n = 30,000,000:
# real	0m13.407s
# user	0m13.078s
# sys	0m0.279s

# Python list results, n = 30,000,000:
# real	0m15.228s
# user	0m14.727s
# sys	0m0.499s
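
The timing comments above compare NumPy-backed storage against a plain Python list for 30,000,000 insertions (the real/user/sys lines look like output of the shell's time command), but the harness that produced them is not shown. A rough sketch of how a comparable measurement could be reproduced with time.perf_counter; everything here besides the ReplayMemory/Experience names is an assumption:

import time

def benchmark(store_fn, n=30_000_000):
    # Hypothetical micro-benchmark: time n insertions of a dummy 4-tuple
    # standing in for Experience(x, x, x, x).
    start = time.perf_counter()
    for i in range(n):
        store_fn((i, i, i, i))
    return time.perf_counter() - start

buffer = []
print('python list:', benchmark(buffer.append), 's')
# The array-backed version would be timed the same way, e.g.:
# print('ReplayMemory:', benchmark(ReplayMemory().store), 's')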
Example #4
    def run_episode(self):
        # Initialize
        self.env.reset()
        game_over   = False
        experiences = [[] for i in range(Config.MAX_NUM_AGENTS_IN_ENVIRONMENT)]
        updated_exps = [None for i in range(Config.MAX_NUM_AGENTS_IN_ENVIRONMENT)]
        updated_leftover_exps = [None for i in range(Config.MAX_NUM_AGENTS_IN_ENVIRONMENT)]
        time_counts  = np.zeros((Config.MAX_NUM_AGENTS_IN_ENVIRONMENT))
        reward_sum_logger  = np.zeros((Config.MAX_NUM_AGENTS_IN_ENVIRONMENT))
        which_agents_done_and_trained  = np.full((Config.MAX_NUM_AGENTS_IN_ENVIRONMENT), False, dtype=bool)

        while not game_over:
            # Initial step
            # if self.env.current_state is None:
            #     if Config.DEBUG: print('[ DEBUG ] ProcessAgent::Initial step')
            #     self.env.step(-1, self.pid, self.count)# Action 0 corresponds to null action
            #     # self.count += 1
            #     continue

            actions = {}
            predictions = np.empty((Config.MAX_NUM_AGENTS_IN_ENVIRONMENT,Config.NUM_ACTIONS))
            values = np.empty((Config.MAX_NUM_AGENTS_IN_ENVIRONMENT))

            for i, agent_observation in enumerate(self.env.latest_observations):
                # print("Agent: {}. Obs: {}".format(i, agent_observation))
                is_agent_running_ga3c = agent_observation[0]
                # print("is_agent_running_ga3c: {}".format(is_agent_running_ga3c))
                if not is_agent_running_ga3c:
                    continue

                # Prediction
                # print("[ProcessAgent]", "i:", i, "agent_observation:", agent_observation)
                prediction, value = self.predict(agent_observation)

                # Select action
                action = self.select_action(prediction)

                predictions[i] = prediction
                values[i] = value
                actions[i] = action

                # print("action", actions[i])
            # print("actions:", actions)
            # Take action --> Receive reward, done (and also store self.env.previous_state for access below)
            rewards, game_over, infos = self.env.step([actions], self.pid, self.count)

            rewards = rewards[0] # Only use 1 env from VecEnv
            if Config.TRAIN_SINGLE_AGENT:
                rewards = np.expand_dims(rewards, axis=0) # Make the single agent's reward look like a list of agents' rewards

            which_agents_done = infos[0]['which_agents_done']
            which_agents_learning = infos[0]['which_agents_learning']
            num_agents_running_ga3c = np.sum(list(which_agents_learning.values()))

            # print("which_agents_done: {}".format(which_agents_done))
            # print("which_agents_learning: {}".format(which_agents_learning))

            for i in which_agents_learning.keys():
                # Loop through all feedback from environment (which may not be equal to Config.MAX_NUM_AGENTS)
                if not which_agents_learning[i]:
                    continue

                # Reward
                reward_sum_logger[i] += rewards[i]

                prediction = predictions[i]
                value = values[i]
                action = actions[i]
                reward = rewards[i]
                done = which_agents_done[i]
                # Add to experience

                exp = Experience(self.env.previous_state[0,i,:],
                                 action, prediction, reward, done)

                experiences[i].append(exp)

                # If episode is done
                # Config.TIME_MAX controls how often data is yielded/sent back to the for loop in the run(). 
                # It is used to ensure, for games with long episodes, that data is sent back to the trainers sufficiently often
                # The shorter Config.TIME_MAX is, the more often the data queue is updated 
                if which_agents_done[i] or (time_counts[i] == Config.TIME_MAX and not which_agents_done_and_trained[i]):
                    if which_agents_done[i]:
                        terminal_reward = 0
                        which_agents_done_and_trained[i] = True
                    else:
                        terminal_reward = value
                    updated_exps[i], updated_leftover_exps[i] = self._accumulate_rewards(experiences[i], self.discount_factor, terminal_reward, which_agents_done[i])

                    x_, r_, a_ = self.convert_to_nparray(updated_exps[i])# NOTE if Config::USE_AUDIO == False, audio_ is None
                    yield x_, r_, a_, reward_sum_logger[i] / num_agents_running_ga3c # sends back data without quitting the current fcn

                    reward_sum_logger[i] = 0.0 # NOTE total_reward_logger in self.run() accumulates reward_sum_logger, so it is correct to reset it here 

                    if updated_leftover_exps[i] is not None:
                        #  terminal_reward = 0
                        x_, r_, a_ = self.convert_to_nparray(updated_leftover_exps[i]) # NOTE if Config::USE_AUDIO == False, audio_ is None
                        yield x_, r_, a_, reward_sum_logger[i] # TODO minor figure out what to send back in terms of rnn_state. Technically should be rnn_state[-1].

                    # Reset the tmax count
                    time_counts[i] = 0

                    # Keep the last experience for the next batch
                    experiences[i] = [experiences[i][-1]]

                time_counts[i] += 1
            self.count += 1
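
The comment block about Config.TIME_MAX notes that run_episode() is a generator so that run() can pull partial batches out of long episodes. As a rough, hypothetical sketch of that consumer side (queue and flag names such as training_q, episode_log_q, and exit_flag are assumptions, not the project's actual attributes):

def run(self):
    # Hypothetical consumer loop: every yield from run_episode() delivers a
    # partial batch (observations, returns, actions, logged reward) that can be
    # pushed to the trainers without waiting for the episode to end.
    while self.exit_flag.value == 0:
        total_reward = 0.0
        total_length = 0
        for x_, r_, a_, reward_sum in self.run_episode():
            total_reward += reward_sum
            total_length += len(r_)
            self.training_q.put((x_, r_, a_))
        self.episode_log_q.put((total_reward, total_length))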
Example #5
    def run_episode(self):
        self.env.reset()
        game_done = False
        experiences = []
        time_count = 0
        frame_count = 0
        reward_sum_logger = 0.0

        if Config.USE_OPTIONS:
            self.option_terminated = True

        if Config.USE_RNN:
            # input states for prediction
            rnn_state = CustomLayers.RNNInputStateHandler.get_rnn_dict(
                init_with_zeros=True,
                n_lstm_layers_total=self.model.n_lstm_layers_total)

            # input states for training
            init_rnn_state = CustomLayers.RNNInputStateHandler.get_rnn_dict(
                init_with_zeros=True,
                n_lstm_layers_total=self.model.n_lstm_layers_total)
        else:
            rnn_state = None
            init_rnn_state = None

        if self.id == 0 and self.is_option_tracker_on:
            self.option_tracker._reset_tracker(vis_count)

        while not game_done:
            # Initial step (used to ensure frame_q is full before trying to grab a current_state for prediction)
            if Config.USE_AUDIO and (self.env.current_state[0] is None
                                     and self.env.current_state[1] is None):
                self.env.step(0)  # Action 0 corresponds to null action
                continue
            elif self.env.current_state is None:
                self.env.step(0)  # Action 0 corresponds to null action
                continue

            if self.is_option_tracker_on:
                agt_loc = self.env.game.agent_loc

            # Option prediction
            if Config.USE_OPTIONS:
                if self.option_terminated:
                    i_option = 0  # NOTE Fake option input
                    prediction_dict = self.predict(self.env.current_state,
                                                   rnn_state, i_option)
                    i_option = self.select_option(
                        prediction_dict
                    )  # NOTE Select option correctly in here
            else:
                i_option = None

            # Primitive action prediction (for option and non-option cases)
            if self.id == 0:
                print("frame_count {}, i_option: {}".format(
                    frame_count, i_option))
            prediction_dict = self.predict(self.env.current_state, rnn_state,
                                           i_option)

            # Update rnn_state
            if Config.USE_RNN:
                rnn_state = prediction_dict['rnn_state_out']

            # Visualize train process or test process
            if self.id == 0:
                if Config.USE_ATTENTION:
                    self.vis_attention_i.append(prediction_dict['attn'][0])
                    self.vis_attention_a.append(prediction_dict['attn'][1])
                else:
                    self.vis_attention_i = None
                    self.vis_attention_a = None

                self.env.visualize_env(self.vis_attention_i,
                                       self.vis_attention_a, vis_count)

            # Select action
            i_action = self.select_action(prediction_dict)

            # Take action --> Receive reward, game_done (and also store self.env.previous_state for access below)
            reward, game_done = self.env.step(i_action)
            reward = np.clip(reward, Config.REWARD_MIN, Config.REWARD_MAX)

            if Config.USE_OPTIONS:
                reward -= float(
                    self.option_terminated
                ) * self.model.option_cost_delib * float(frame_count > 1)
                self.option_terminated = prediction_dict['option_term_probs'][
                    i_option] > np.random.rand()
            reward_sum_logger += reward

            # Add to experience
            if Config.USE_AUDIO:
                exp = Experience(self.env.previous_state[0],
                                 self.env.previous_state[1], i_action,
                                 i_option, reward, game_done)
            else:
                exp = Experience(self.env.previous_state, None, i_action,
                                 i_option, reward, game_done)
            experiences.append(exp)

            # Plot option trajectories
            if self.id == 0 and self.is_option_tracker_on:
                self.option_tracker._update_tracker(agt_loc, i_option,
                                                    self.option_terminated)
                self.option_tracker._plot_tracker()

            # Config.TIME_MAX controls how often data is yielded/sent back to the for loop in the run().
            # It is used to ensure, for games with long episodes, that data is sent back to the trainers sufficiently often
            # The shorter Config.TIME_MAX is, the more often the data queue is updated
            if game_done or time_count == Config.TIME_MAX:  # or self.option_terminated:
                if Config.USE_OPTIONS:
                    if self.option_terminated:
                        value = prediction_dict[
                            'option_v_model'] - self.model.option_cost_delib * float(
                                frame_count > 1)
                    else:
                        value = prediction_dict['option_q_model'][i_option]
                    terminal_reward = 0 if game_done else value
                else:
                    terminal_reward = 0 if game_done else prediction_dict[
                        'v']  # Ref: A3C Algorithm S2 (n-step q-learning)

                updated_exps, updated_leftover_exp = ProcessAgent._accumulate_rewards(
                    experiences, self.discount_factor, terminal_reward,
                    game_done)
                x_, audio_, r_, a_, o_ = self.convert_to_nparray(updated_exps)
                yield x_, audio_, r_, a_, o_, init_rnn_state, reward_sum_logger

                reward_sum_logger = 0.0  # NOTE total_reward_logger in self.run() accumulates reward_sum_logger so reset here

                if updated_leftover_exp is not None:
                    x_, audio_, r_, a_, o_ = self.convert_to_nparray(
                        updated_leftover_exp)
                    yield x_, audio_, r_, a_, o_, init_rnn_state, reward_sum_logger

                # Reset the tmax count
                time_count = 0

                # Keep the last experience for the next batch
                experiences = [experiences[-1]]

                if Config.USE_RNN:
                    init_rnn_state = rnn_state

            time_count += 1
            frame_count += 1
Example #6
    def logArrival(self, simtime, stageId):
        self._experience.update({stageId: Experience(stageId, simtime)})
        self._currLocation = stageId
        self._totalWaitTime = math.nan
        self._totalSystemTime = math.nan
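
None of the listings include the Experience definition itself, and its constructor clearly differs between projects (two positional arguments in the final logArrival fragment, four in Example #3, six in Example #5). As a purely hypothetical illustration, a container matching the six fields constructed in Example #5 could be a small dataclass:

from dataclasses import dataclass
from typing import Any, Optional

@dataclass
class Experience:
    # Hypothetical field layout matching the positional arguments in Example #5:
    # Experience(state, audio, action, option, reward, done).
    state: Any
    audio: Optional[Any]
    action: int
    option: Optional[int]
    reward: float
    done: bool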