Example #1
        def init_variables(self, info):
            # Here you have the information of the game (virtual init() in random_walk.cpp)
            # List: game_time, goal, number_of_robots, penalty_area, codewords,
            #       robot_height, robot_radius, max_linear_velocity, field, team_info,
            #       {rating, name}, axle_length, resolution, ball_radius
            # self.game_time = info['game_time']
            self.field = info['field']
            self.robot_size = 2 * info['robot_radius']
            self.goal = info['goal']
            self.max_linear_velocity = info['max_linear_velocity']
            self.number_of_robots = info['number_of_robots']
            self.end_of_frame = False
            self.cur_my = []
            self.cur_ball = []

            self.state_dim = 2  # relative ball
            self.history_size = 2  # frame history size
            self.action_dim = 2  # 2

            self.arglist = Argument()
            self.state_shape = (self.state_dim * self.history_size,
                                )  # state dimension
            self.act_space = [Discrete(self.action_dim * 2 + 1)]
            self.trainers = MADDPGAgentTrainer('agent_moving',
                                               self.mlp_model,
                                               self.state_shape,
                                               self.act_space,
                                               0,
                                               self.arglist,
                                               local_q_func=False)

            # for tensorboard
            self.summary_placeholders, self.update_ops, self.summary_op = \
                                                            self.setup_summary()
            self.summary_writer = \
                tf.summary.FileWriter('summary/moving_test', U.get_session().graph)

            U.initialize()

            # Load previous results, if necessary
            if self.arglist.load_dir == "":
                self.arglist.load_dir = self.arglist.save_dir
            if self.arglist.restore:
                print('Loading previous state... %s' % self.arglist.load_dir)
                U.load_state(self.arglist.load_dir)

            self.saver = tf.train.Saver(max_to_keep=1100)

            self.state = np.zeros([self.state_dim * self.history_size
                                   ])  # histories
            self.train_step = 216000
            self.wheels = np.zeros(self.number_of_robots * 2)
            self.action = np.zeros(self.action_dim * 2 + 1)  # not np.zeros(2)

            self.stats_steps = 6000  # for tensorboard
            self.rwd_sum = 0

            self.done = False
            self.control_idx = 0
            return
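
The `Argument()` object constructed above is defined elsewhere in the project and is not part of this snippet. A minimal, hypothetical stand-in that lists only the fields these examples actually read (default values are illustrative guesses, not the project's settings) might look like:

class Argument:
    # Hypothetical config stub; field names follow the usages in these examples.
    def __init__(self):
        self.save_dir = './save_model/'  # where U.save_state() writes checkpoints
        self.load_dir = ''               # falls back to save_dir when empty
        self.restore = False             # reload a previous checkpoint on start
        self.display = False
        self.benchmark = False
        # Fields read internally by MADDPGAgentTrainer in the reference MADDPG
        # implementation (typical defaults, not confirmed for this project):
        self.lr = 1e-2
        self.gamma = 0.95
        self.batch_size = 1024
        self.num_units = 64
        self.max_episode_len = 25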
Example #2
        def init_variables(self, info):
            # Here you have the information of the game (virtual init() in random_walk.cpp)
            # List: game_time, goal, number_of_robots, penalty_area, codewords,
            #       robot_height, robot_radius, max_linear_velocity, field, team_info,
            #       {rating, name}, axle_length, resolution, ball_radius
            # self.game_time = info['game_time']
            self.field = info['field']
            self.robot_size = 2*info['robot_radius']
            self.goal = info['goal']
            self.max_linear_velocity = info['max_linear_velocity']
            self.number_of_robots = info['number_of_robots']
            self.end_of_frame = False
            self.cur_my_posture = []
            self.cur_op_posture = []
            self.cur_ball = []
            self.pre_ball = [0, 0]

            self.state_dim = 2 # relative ball position
            self.history_size = 2 # frame history size
            self.action_dim = 2 # 2                    
            
            self.arglist = Argument()
            self.obs_shape_n = [(self.state_dim * self.history_size,) for _ in range(1)] # state dimension
            self.action_space = [spaces.Discrete(self.action_dim * 2 + 1) for _ in range(1)]
            self.trainers = self.get_trainers(1, self.obs_shape_n, self.action_space, self.arglist)

            # for tensorboard
            self.summary_placeholders, self.update_ops, self.summary_op = self.setup_summary()
            self.summary_writer = tf.summary.FileWriter('summary/aiwc_maddpg', U.get_session().graph)

            U.initialize()
            
            # Load previous results, if necessary
            if self.arglist.load_dir == "":
                self.arglist.load_dir = self.arglist.save_dir
            if self.arglist.display or self.arglist.restore or self.arglist.benchmark:
                print('Loading previous state...')
                U.load_state(self.arglist.load_dir)

            self.final_ep_rewards = []  # sum of rewards for training curve
            self.final_ep_ag_rewards = []  # agent rewards for training curve
            self.agent_info = [[[]]]  # placeholder for benchmarking info
            self.saver = tf.train.Saver()
            self.obs_n = [np.zeros([self.state_dim * self.history_size]) for _ in range(1)] # histories
            self.train_step = 0
            self.wheels = np.zeros(self.number_of_robots*2)
            self.action_n = [np.zeros(self.action_dim * 2 + 1) for _ in range(1)]
                   
            self.save_every_steps = 12000 # save the model every 10 minutes
            self.stats_steps = 6000 # for tensorboard
            self.reward_sum = np.zeros(self.number_of_robots)  # per-robot reward accumulator (used as an array in on_event)
            self.score_sum = 0 
            self.active_flag = [[False for _ in range(5)], [False for _ in range(5)]]   
            self.inner_step = 0

            self.done = False
            self.control_idx = 0
            return
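
The `get_trainers()` helper called above is not included in this snippet. Judging from the direct construction in Examples #1 and #8, it plausibly builds one MADDPGAgentTrainer per controlled agent, roughly:

        def get_trainers(self, num_agents, obs_shape_n, action_space, arglist):
            # Hypothetical reconstruction; the project's actual helper may differ.
            trainers = []
            for i in range(num_agents):
                trainers.append(MADDPGAgentTrainer('agent_%d' % i,
                                                   self.mlp_model,
                                                   obs_shape_n,
                                                   action_space,
                                                   i,
                                                   arglist,
                                                   local_q_func=False))
            return trainers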
Example #3
def train(arglist):
    with U.single_threaded_session():
        if not os.path.isdir(arglist.save_dir):
            os.makedirs(arglist.save_dir)
        if not os.path.isdir(arglist.benchmark_dir):
            os.makedirs(arglist.benchmark_dir)
        if not os.path.isdir(arglist.plots_dir):
            os.makedirs(arglist.plots_dir)

        #tensorboard
        summary_writer = tf.summary.FileWriter(
            "./" + arglist.exp_name + "_graph/",
            U.get_session().graph)
        reward_plot = None
        reward_summary = tf.Summary()
        reward_summary.value.add(tag='reward', simple_value=reward_plot)

        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0]
                         for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        """
        #### USE RVO 
        """
        use_rvo_range = -1  # set to a positive range (e.g. 0.28) to enable RVO; a negative value disables it

        t_start = time.time()

        print('Starting iterations...')
        while True:
            # get action
            action_n = [
                agent.action(obs) for agent, obs in zip(trainers, obs_n)
            ]

            if use_rvo_range < 0:
                new_obs_n, rew_n, done_n, info_n = env.step(action_n,
                                                            use_rvo=None)
            else:
                # use_rvo list
                total_rvo_list = []
                for obs in obs_n:
                    agent_pos = obs[-2 * (env.world.num_agents - 1)::]
                    obst_pos = obs[-2 * (env.world.num_agents +
                                         env.world.num_obstacles)::]
                    agent_rvo_list = []
                    for i in range(0, len(agent_pos), 2):
                        if np.sqrt(np.sum(np.square(
                                agent_pos[i:i + 2]))) < use_rvo_range:
                            agent_rvo_list.append(True)
                        else:
                            agent_rvo_list.append(False)
                    for i in range(0, len(obst_pos), 2):
                        if np.sqrt(np.sum(np.square(
                                obst_pos[i:i + 2]))) < use_rvo_range:
                            agent_rvo_list.append(True)
                        else:
                            agent_rvo_list.append(False)

                    if any(agent_rvo_list):
                        total_rvo_list.append(True)
                    else:
                        total_rvo_list.append(False)
                # environment step
                new_obs_n, rew_n, done_n, info_n = env.step(
                    action_n, use_rvo=total_rvo_list)

            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                 done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])
            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)

            # add reward to tensorboard
            reward_summary.value[0].simple_value = np.mean(
                episode_rewards[-arglist.save_rate:])
            summary_writer.add_summary(reward_summary, len(episode_rewards))

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, time: {}"
                        .format(train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                round(time.time() - t_start, 3)))
                else:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}"
                        .format(train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                [
                                    np.mean(rew[-arglist.save_rate:])
                                    for rew in agent_rewards
                                ], round(time.time() - t_start, 3)))

                t_start = time.time()
            if terminal:
                # Keep track of final episode reward
                final_ep_rewards.append(
                    np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(
                        np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) % 1000 == 0:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl' + str(
                    len(episode_rewards))
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('saved')
            if len(episode_rewards) > arglist.num_episodes:
                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards)))
                break
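
The RVO gating block in the example above amounts to: switch an agent to RVO whenever any other agent or obstacle encoded at the tail of its observation lies within `use_rvo_range`. A condensed sketch of the same check, assuming the observation layout the loop relies on (relative (x, y) pairs at the end of the vector):

import numpy as np

def should_use_rvo(obs, num_agents, num_obstacles, rvo_range):
    # Mirrors the slicing above: the last 2 * (num_agents + num_obstacles)
    # entries are treated as relative (x, y) offsets to nearby entities.
    rel = np.asarray(obs[-2 * (num_agents + num_obstacles):]).reshape(-1, 2)
    return bool((np.linalg.norm(rel, axis=1) < rvo_range).any())

# total_rvo_list = [should_use_rvo(obs, env.world.num_agents,
#                                  env.world.num_obstacles, use_rvo_range)
#                   for obs in obs_n]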
Example #4
def train(arglist, extra_args=None):
    tf_graph = tf.Graph()
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)
    tf_config.gpu_options.allow_growth = True
    with tf.Session(graph=tf_graph, config=tf_config):
        # Create environment
        env = make_env(arglist.scenario, arglist)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        if arglist.num_adversaries is None:
            arglist.num_adversaries = len([
                agent for agent in env.agents
                if (hasattr(agent, "adversary") and agent.adversary)
            ])
        arglist.num_adversaries = min(env.n, arglist.num_adversaries)
        num_adversaries = arglist.num_adversaries
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        if os.environ.get("OUTPUT_GRAPH"):
            tf.summary.FileWriter(os.path.join(logger.get_dir(), "tb"),
                                  U.get_session().graph)

        # Load previous results, if necessary
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0]
                         for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver(max_to_keep=None)
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()

        print('Starting iterations...')
        while True:
            # get action
            action_n = [
                agent.action(obs) for agent, obs in zip(trainers, obs_n)
            ]
            # print("[action] " + ", ".join(["agent {i}: {action}".format(i=i, action=list(action_n[i])) for i in range(len(action_n))]))
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                 done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                if arglist.save_render_images:
                    input_file_name = os.path.join(
                        arglist.render_dir,
                        "image-episode_{}-step_%d.png".format(
                            len(episode_rewards)))
                    output_file_name = os.path.join(
                        arglist.render_dir,
                        "video-episode_{}.mp4".format(len(episode_rewards)))
                    command = "ffmpeg -y -r 10 -i {} {}".format(
                        input_file_name, output_file_name)
                    os.system(command)
                    print("Saved render video at {}".format(output_file_name))

                    for episode_step_ in range(episode_step):
                        file_name = os.path.join(
                            arglist.render_dir,
                            "image-episode_{}-step_{}.png".format(
                                len(episode_rewards), episode_step_))
                        if os.path.exists(file_name):
                            os.remove(file_name)

                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = os.path.join(arglist.benchmark_dir,
                                             'benchmark.pkl')
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                if arglist.save_render_images:
                    images = env.render(mode="rgb_array")
                    image = images[0]
                    file_name = os.path.join(
                        arglist.render_dir,
                        "image-episode_{}-step_{}.png".format(
                            len(episode_rewards), episode_step))
                    plt.imsave(file_name, image)
                    print("Saved render image at {}".format(file_name))
                else:
                    env.render(mode="human")
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)

            # save model
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(os.path.join(
                    arglist.save_dir,
                    "checkpoint-episode_{}".format(len(episode_rewards))),
                             saver=saver)

            # print training scalars
            if terminal and ((len(episode_rewards) % arglist.print_rate == 0)
                             or
                             (len(episode_rewards) % arglist.save_rate == 0)):
                # print statement depends on whether or not there are adversaries
                logger.log("Time: {}".format(
                    datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
                logger.logkv("steps", train_step)
                logger.logkv("episodes", len(episode_rewards))
                logger.logkv("mean_episode_reward",
                             np.mean(episode_rewards[-arglist.save_rate:]))
                if num_adversaries == 0:
                    # print("[{}] steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(time.strftime("%Y-%m-%dT%H:%M:%S", time.localtime()),
                    #     train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]), round(time.time()-t_start, 3)))
                    pass
                else:
                    for agent_index in range(len(agent_rewards)):
                        logger.logkv(
                            "agent_{}_episode_reward".format(agent_index),
                            np.mean(agent_rewards[agent_index]
                                    [-arglist.save_rate:]))
                    # print("[{}] steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(time.strftime("%Y-%m-%dT%H:%M:%S", time.localtime()),
                    #     train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]),
                    #     [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards], round(time.time()-t_start, 3)))
                logger.logkv("time", round(time.time() - t_start, 3))
                logger.dumpkvs()
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(
                    np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(
                        np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = os.path.join(arglist.plots_dir, 'rewards.pkl')
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = os.path.join(arglist.plots_dir,
                                               'average_rewards.pkl')
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards)))
                break
Example #5
    def on_event(self, f):
        @inlineCallbacks
        def set_wheel(self, robot_wheels):
            yield self.call(u'aiwc.set_speed', args.key, robot_wheels)
            return

        # initiate empty frame
        received_frame = Frame()

        if 'time' in f:
            received_frame.time = f['time']
        if 'score' in f:
            received_frame.score = f['score']
        if 'reset_reason' in f:
            received_frame.reset_reason = f['reset_reason']
        if 'coordinates' in f:
            received_frame.coordinates = f['coordinates']
        if 'EOF' in f:
            self.end_of_frame = f['EOF']

        #self.printConsole(received_frame.time)
        #self.printConsole(received_frame.score)
        #self.printConsole(received_frame.reset_reason)
        #self.printConsole(self.end_of_frame)
##############################################################################
        if (self.end_of_frame):

            # How to get the robot and ball coordinates: (ROBOT_ID can be 0,1,2,3,4)
            #self.printConsole(received_frame.coordinates[MY_TEAM][ROBOT_ID][X])
            #self.printConsole(received_frame.coordinates[MY_TEAM][ROBOT_ID][Y])
            #self.printConsole(received_frame.coordinates[MY_TEAM][ROBOT_ID][TH])
            #self.printConsole(received_frame.coordinates[MY_TEAM][ROBOT_ID][ACTIVE])
            #self.printConsole(received_frame.coordinates[MY_TEAM][ROBOT_ID][TOUCH])
            #self.printConsole(received_frame.coordinates[OP_TEAM][ROBOT_ID][X])
            #self.printConsole(received_frame.coordinates[OP_TEAM][ROBOT_ID][Y])
            #self.printConsole(received_frame.coordinates[OP_TEAM][ROBOT_ID][TH])
            #self.printConsole(received_frame.coordinates[OP_TEAM][ROBOT_ID][ACTIVE])
            #self.printConsole(received_frame.coordinates[OP_TEAM][ROBOT_ID][TOUCH])
            #self.printConsole(received_frame.coordinates[BALL][X])
            #self.printConsole(received_frame.coordinates[BALL][Y])

            self.get_coord(received_frame)
            ##############################################################################
            # Next state, Reward, Reset
            # new_obs_n = [np.zeros([self.state_dim * self.history_size]) for _ in range(self.number_of_robots)]
            new_obs_n = []
            rew_n = []
            done_n = []
            for i in range(self.number_of_robots):
                next_state = self.pre_processing(i)
                # self.printConsole(next_state)
                # new_obs_n[i] = np.append(next_state, next_state - self.obs_n[i][:-self.state_dim]) # position and velocity
                new_obs_n.append(
                    np.append(next_state,
                              next_state - self.obs_n[i][:-self.state_dim])
                )  # position and velocity
                # self.printConsole('observation ' + str(i) + ': '+ str(new_obs_n[i]))

                rew_n.append(self.get_reward(received_frame.reset_reason, i))

                if (received_frame.reset_reason != NONE):
                    done_n.append(True)
                else:
                    done_n.append(False)
            done = all(done_n)
            if done:
                self.printConsole("reset reason: " +
                                  str(received_frame.reset_reason))

            # self.printConsole('reward: ' + str(rew_n[0]))
            # rew_n = [sum(rew_n) for i in range(self.number_of_robots)]

            # for i, agent in enumerate(self.trainers):
            #     agent.experience(self.obs_n[i], self.action_n[i], rew_n[i], new_obs_n[i], done_n[i], False)
            for i in range(self.number_of_robots):
                if not self.cur_my_posture[i][ACTIVE]:
                    self.printConsole('robot ' + str(i) + ' is not active')
                    continue
                self.trainers[0].experience(self.obs_n[i], self.action_n[i],
                                            rew_n[i], new_obs_n[i], done_n[i],
                                            False)

            self.obs_n = new_obs_n

            # for i, rew in enumerate(rew_n):
            #     self.episode_rewards[-1] += rew
            #     self.agent_rewards[i][-1] += rew

            # if done:
            #     self.episode_rewards.append(0)
            #     for a in self.agent_rewards:
            #         a.append(0)
            #     self.agent_info.append([[]])
            self.reward_sum += rew_n

            # increment global step counter
            self.train_step += 1

            # update all trainers
            loss = None
            for agent in self.trainers:
                agent.preupdate()
            for agent in self.trainers:
                loss = agent.update(self.trainers, self.train_step)

            # get action
            # self.action_n = [agent.action(obs) for agent, obs in zip(self.trainers,self.obs_n)]
            self.action_n = [
                self.trainers[0].action(obs) for obs in self.obs_n
            ]
            # self.printConsole("original action: " + str(self.action_n[0]))

            for i in range(self.number_of_robots):
                self.wheels[2 * i] = self.max_linear_velocity * (
                    self.action_n[i][1] - self.action_n[i][2] +
                    self.action_n[i][3] - self.action_n[i][4])
                self.wheels[2 * i + 1] = self.max_linear_velocity * (
                    self.action_n[i][1] - self.action_n[i][2] -
                    self.action_n[i][3] + self.action_n[i][4])

            # self.printConsole("                 action: " + str(self.wheels[:2]))
            self.printConsole('step: ' + str(self.train_step))

            self.pre_ball = self.cur_ball
            set_wheel(self, self.wheels.tolist())
            ##############################################################################
            if (self.train_step % self.save_every_steps) == 0:
                U.save_state(self.arglist.save_dir, saver=self.saver)

            # if done: # plot the statistics
            if (self.train_step % self.stats_steps) == 0:  # plot every 6000 steps (about 5 minutes)
                self.printConsole("add data to tensorboard")
                stats = [sum(self.reward_sum)] + [
                    self.reward_sum[i] for i in range(len(self.reward_sum))
                ] + [self.score_sum]
                for i in range(len(stats)):
                    U.get_session().run(self.update_ops[i],
                                        feed_dict={
                                            self.summary_placeholders[i]:
                                            float(stats[i])
                                        })
                summary_str = U.get_session().run(self.summary_op)
                self.summary_writer.add_summary(summary_str, self.inner_step)

                self.reward_sum = np.zeros(len(self.reward_sum))
                self.score_sum = 0
                self.inner_step += 1
##############################################################################
            if (received_frame.reset_reason == GAME_END):
                #(virtual finish() in random_walk.cpp)
                #save your data
                with open(args.datapath + '/result.txt', 'w') as output:
                    #output.write('yourvariables')
                    output.close()
                #unsubscribe; reset or leave
                yield self.sub.unsubscribe()
                try:
                    yield self.leave()
                except Exception as e:
                    self.printConsole("Error: {}".format(e))

            self.end_of_frame = False
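
Examples #5 and #7 convert each 5-slot action vector into two wheel speeds with the same arithmetic. A small helper expressing that mapping (the forward/turn reading of the slots is inferred from the formulas, not documented in the source):

def action_to_wheels(action, max_linear_velocity):
    # Slot 0 is unused here; slots 1/2 form a forward/backward component and
    # slots 3/4 a turning component, matching
    #   left  = v * (a1 - a2 + a3 - a4)
    #   right = v * (a1 - a2 - a3 + a4)
    forward = action[1] - action[2]
    turn = action[3] - action[4]
    return max_linear_velocity * (forward + turn), max_linear_velocity * (forward - turn)

# self.wheels[2 * i], self.wheels[2 * i + 1] = action_to_wheels(
#     self.action_n[i], self.max_linear_velocity)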
Example #6
File: train.py  Project: EcustBoy/IMAC
def train(arglist):
    # random.seed(arglist.random_seed)
    # np.random.seed(arglist.random_seed)
    # tf.set_random_seed(arglist.random_seed)

    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        savers = [
            tf.train.Saver(U.scope_vars(trainer.name)) for trainer in trainers
        ]

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            # U.load_state(arglist.load_dir)
            [
                U.load_state(os.path.join(arglist.load_dir,
                                          'team_{}'.format(i)),
                             saver=saver) for i, saver in enumerate(savers)
            ]

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0]
                         for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        if arglist.trainer in ('tarmac', 'reuse_tarmac', 'ibmac_inter'):
            message_n = np.zeros([len(obs_n), 4])
        is_training = True

        t_start = time.time()

        writer = tf.summary.FileWriter("graph", U.get_session().graph)
        writer.close()

        writer = SummaryWriter(arglist.save_dir)

        print('Starting iterations...')
        while True:
            # get action
            if arglist.trainer == 'ibmac' or arglist.trainer == 'reuse_ibmac':
                is_inference = False
                if arglist.display or arglist.restore or arglist.benchmark:
                    is_inference = False
                if len(trainers) == 2:
                    action_n1 = trainers[0].action(obs_n[:num_adversaries],
                                                   is_inference=is_inference)
                    action_n2 = trainers[1].action(obs_n[num_adversaries:],
                                                   is_inference=is_inference)
                    action_n = [action[0] for action in action_n1
                                ] + [action[0] for action in action_n2]
                else:
                    action_n = trainers[0].action(obs_n,
                                                  is_inference=is_inference)
                    action_n = [action[0] for action in action_n]
            elif arglist.trainer == 'ibmac_inter':
                if len(trainers) == 2:
                    action_n1, message_action_n1 = trainers[0].action(
                        obs_n[:num_adversaries], message_n[:num_adversaries])
                    action_n2, message_action_n2 = trainers[1].action(
                        obs_n[num_adversaries:], message_n[num_adversaries:])
                    action_n = [action[0] for action in action_n1
                                ] + [action[0] for action in action_n2]
                else:
                    action_n, message_action_n = trainers[0].action(
                        obs_n, message_n)
                    action_n = [action[0] for action in action_n]
                    message_n = [
                        message_action[0]
                        for message_action in message_action_n
                    ]
            else:
                action_n = [
                    agent.action(obs) for agent, obs in zip(trainers, obs_n)
                ]
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            if arglist.trainer == 'ibmac':
                if len(trainers) == 2:
                    trainers[0].experience(obs_n[:num_adversaries],
                                           action_n[:num_adversaries],
                                           rew_n[:num_adversaries],
                                           new_obs_n[:num_adversaries],
                                           done_n[:num_adversaries], terminal)
                    trainers[1].experience(obs_n[num_adversaries:],
                                           action_n[num_adversaries:],
                                           rew_n[num_adversaries:],
                                           new_obs_n[num_adversaries:],
                                           done_n[num_adversaries:], terminal)
                else:
                    trainers[0].experience(obs_n, action_n, rew_n, new_obs_n,
                                           done_n, terminal)
            elif arglist.trainer == 'ibmac_inter':
                if len(trainers) == 2:
                    trainers[0].experience(obs_n[:num_adversaries],
                                           message_n[:num_adversaries],
                                           action_n[:num_adversaries],
                                           rew_n[:num_adversaries],
                                           new_obs_n[:num_adversaries],
                                           done_n[:num_adversaries], terminal)
                    trainers[1].experience(obs_n[num_adversaries:],
                                           message_n[num_adversaries:],
                                           action_n[num_adversaries:],
                                           rew_n[num_adversaries:],
                                           new_obs_n[num_adversaries:],
                                           done_n[num_adversaries:], terminal)
                else:
                    trainers[0].experience(obs_n, message_n, action_n, rew_n,
                                           new_obs_n, done_n, terminal)
            else:
                for i, agent in enumerate(trainers):
                    agent.experience(obs_n[i], action_n[i], rew_n[i],
                                     new_obs_n[i], done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for i, agent in enumerate(trainers):
                loss = agent.update(trainers, train_step)
                if loss:
                    if isinstance(agent, IBMACAgentTrainer) or isinstance(
                            agent, ReuseIBMACAgentTrainer):
                        q_loss, p_loss, _, _, _, _, kl_loss = loss
                        writer.add_scalar('agent_{}/loss_kl'.format(i),
                                          kl_loss, train_step)
                    else:
                        q_loss, p_loss, _, _, _, _ = loss
                    writer.add_scalar('agent_{}/loss_policy'.format(i), p_loss,
                                      train_step)
                    writer.add_scalar('agent_{}/loss_critic'.format(i), q_loss,
                                      train_step)

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                [
                    U.save_state(os.path.join(arglist.save_dir,
                                              'team_{}'.format(i)),
                                 saver=saver) for i, saver in enumerate(savers)
                ]
                # print statement depends on whether or not there are adversaries

                for i in range(len(agent_rewards)):
                    writer.add_scalar(
                        'agent_{}/mean_episode_reward'.format(i),
                        np.mean(agent_rewards[i][-arglist.save_rate:]),
                        len(episode_rewards))

                if num_adversaries == 0:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, time: {}"
                        .format(train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                round(time.time() - t_start, 3)))
                else:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}"
                        .format(train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                [
                                    np.mean(rew[-arglist.save_rate:])
                                    for rew in agent_rewards
                                ], round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(
                    np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(
                        np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards)))
                break
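
Example #6 keeps one `tf.train.Saver` per trainer scope (built with `U.scope_vars(trainer.name)`) alongside the global saver, so each team's weights can be checkpointed and restored independently. A minimal usage sketch with the same helpers (the path is illustrative):

# Restore only the first team's variables from its own checkpoint directory.
team_savers = [tf.train.Saver(U.scope_vars(trainer.name)) for trainer in trainers]
U.load_state(os.path.join(arglist.load_dir, 'team_0'), saver=team_savers[0])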
Example #7
    def on_event(self, f):
        @inlineCallbacks
        def set_wheel(self, robot_wheels):
            yield self.call(u'aiwc.set_speed', args.key, robot_wheels)
            return

        # initiate empty frame
        received_frame = Frame()

        if 'time' in f:
            received_frame.time = f['time']
        if 'score' in f:
            received_frame.score = f['score']
        if 'reset_reason' in f:
            received_frame.reset_reason = f['reset_reason']
        if 'coordinates' in f:
            received_frame.coordinates = f['coordinates']
        if 'EOF' in f:
            self.end_of_frame = f['EOF']

        #self.printConsole(received_frame.time)
        #self.printConsole(received_frame.score)
        #self.printConsole(received_frame.reset_reason)
        #self.printConsole(self.end_of_frame)
##############################################################################
        if (self.end_of_frame):

            # How to get the robot and ball coordinates: (ROBOT_ID can be 0,1,2,3,4)
            #self.printConsole(received_frame.coordinates[MY_TEAM][ROBOT_ID][X])
            #self.printConsole(received_frame.coordinates[MY_TEAM][ROBOT_ID][Y])
            #self.printConsole(received_frame.coordinates[MY_TEAM][ROBOT_ID][TH])
            #self.printConsole(received_frame.coordinates[MY_TEAM][ROBOT_ID][ACTIVE])
            #self.printConsole(received_frame.coordinates[MY_TEAM][ROBOT_ID][TOUCH])
            #self.printConsole(received_frame.coordinates[OP_TEAM][ROBOT_ID][X])
            #self.printConsole(received_frame.coordinates[OP_TEAM][ROBOT_ID][Y])
            #self.printConsole(received_frame.coordinates[OP_TEAM][ROBOT_ID][TH])
            #self.printConsole(received_frame.coordinates[OP_TEAM][ROBOT_ID][ACTIVE])
            #self.printConsole(received_frame.coordinates[OP_TEAM][ROBOT_ID][TOUCH])
            #self.printConsole(received_frame.coordinates[BALL][X])
            #self.printConsole(received_frame.coordinates[BALL][Y])

            self.get_coord(received_frame)
            ##############################################################################
            # Next state, Reward, Reset
            if self.done:
                self.control_idx += 1
                self.control_idx %= 5

            # Next state
            next_obs = self.pre_processing(self.control_idx)
            if self.done:
                next_state = np.append(next_obs,
                                       next_obs)  # 2 frames position stack
                self.done = False
            else:
                next_state = np.append(
                    next_obs,
                    self.state[:-self.state_dim])  # 2 frames position stack

            # Reward
            reward = self.get_reward(received_frame.reset_reason,
                                     self.control_idx)

            # Reset
            if (received_frame.reset_reason !=
                    NONE) and (received_frame.reset_reason is not None):
                self.done = True
                self.printConsole("reset reason: " +
                                  str(received_frame.reset_reason))
            else:
                self.done = False

            self.state = next_state

            # get action
            self.action = self.trainers.action(self.state)

            self.wheels = np.zeros(self.number_of_robots * 2)
            self.wheels[2*self.control_idx] = self.max_linear_velocity * \
                    (self.action[1]-self.action[2]+self.action[3]-self.action[4])
            self.wheels[2*self.control_idx + 1] = self.max_linear_velocity * \
                    (self.action[1]-self.action[2]-self.action[3]+self.action[4])

            # Send non-control robot to the side of the field
            for i in range(self.number_of_robots):
                if i == self.control_idx:
                    continue
                else:
                    if (i == 0) or (i == 2):
                        x = self.cur_my[i][X]
                        y = -1.35
                    elif (i == 1) or (i == 3):
                        x = self.cur_my[i][X]
                        y = 1.35
                    else:
                        x = -2.1
                        y = 0
                    self.position(i, x, y)

            # increment global step counter
            # Increase count only when the control robot is active.
            if self.cur_my[self.control_idx][ACTIVE]:
                self.train_step += 1
                self.rwd_sum += reward
                self.printConsole('step: ' + str(self.train_step))

                set_wheel(self, self.wheels.tolist())
##############################################################################
            # plot every 6000 steps (about 5 minutes)
            if ((self.train_step % self.stats_steps) == 0) \
                            and (self.train_step < 1992001):
                stats = [self.rwd_sum]
                for i in range(len(stats)):
                    U.get_session().run(self.update_ops[i],
                                        feed_dict={
                                            self.summary_placeholders[i]:
                                            float(stats[i])
                                        })
                summary_str = U.get_session().run(self.summary_op)
                self.summary_writer.add_summary(summary_str,
                                                self.train_step - 6000)

                self.rwd_sum = 0

                # load new model
                print('Loading %s' % self.train_step)
                U.load_state("./save_model/aiwc_maddpg-%s" % self.train_step)
##############################################################################
            if (received_frame.reset_reason == GAME_END):
                #(virtual finish() in random_walk.cpp)
                #save your data
                with open(args.datapath + '/result.txt', 'w') as output:
                    #output.write('yourvariables')
                    output.close()
                #unsubscribe; reset or leave
                yield self.sub.unsubscribe()
                try:
                    yield self.leave()
                except Exception as e:
                    self.printConsole("Error: {}".format(e))

            self.end_of_frame = False
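
The `setup_summary()` helper used in Examples #1, #2, #7, and #8 is never shown in these snippets. The placeholder/assign/merge pattern they feed at logging time suggests a TF1-style helper roughly like the following (the scalar names and count are assumptions; Example #2 logs several statistics, Example #7 only one):

    def setup_summary(self):
        # One tf.Variable per logged statistic; only a single reward scalar is
        # shown here since the original set of tags is unknown.
        reward_sum = tf.Variable(0.)
        tf.summary.scalar('Reward/Sum', reward_sum)
        summary_vars = [reward_sum]
        summary_placeholders = [tf.placeholder(tf.float32)
                                for _ in summary_vars]
        update_ops = [summary_vars[i].assign(summary_placeholders[i])
                      for i in range(len(summary_vars))]
        summary_op = tf.summary.merge_all()
        return summary_placeholders, update_ops, summary_op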
Example #8
def train(arglist):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist)
        n = len(env.agents)

        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(n)]
        trainers = []
        for i in range(n):
            trainers.append(
                MADDPGAgentTrainer(
                    "agent_%d" % i,
                    mlp_model,
                    obs_shape_n,
                    env.action_space,
                    i,
                    arglist,
                    local_q_func=False
                )
            )

        saver = tf.train.Saver(max_to_keep=None)

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.restore:
            print('Loading previous state...')
            saver.restore(U.get_session(), arglist.load_dir)

        rewards = np.zeros((1, n))  # per-agent reward, one row per episode
        obs_n = env.reset()
        episode_number = 0
        episode_step = 0
        train_step = 0
        t_start = time.time()
        er_fill_frac_min = 0.0  # replay-buffer fill fraction; recomputed at the end of each episode

        # stats buffers
        step_info = {
            'dist':     np.zeros((arglist.max_episode_len, n, n)),
            'speed':    np.zeros((arglist.max_episode_len, n,)),
            'health':   np.zeros((arglist.max_episode_len, n,)),
            'fire':     np.zeros((arglist.max_episode_len, n,)),
            'bite':     np.zeros((arglist.max_episode_len, n, n)),
            'hit':      np.zeros((arglist.max_episode_len, n, n))
        }
        episode_info = {
            'dist':     np.zeros((arglist.num_episodes, n, n)),
            'speed':    np.zeros((arglist.num_episodes, n,)),
            'health':   np.zeros((arglist.num_episodes, n,)),
            'fire':     np.zeros((arglist.num_episodes, n,)),
            'bite':     np.zeros((arglist.num_episodes, n, n)),
            'hit':      np.zeros((arglist.num_episodes, n, n))
        }

        print('Starting iterations...')
        while True:
            # get action
            action_n = [agent.action(obs) for agent, obs in zip(trainers,obs_n)]

            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)

            # update episode step stats
            for key in step_info:
                step_info[key][episode_step] = info_n[key]

            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)

            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done_n[i], terminal)
            obs_n = new_obs_n

            # record reward
            rewards[-1, :] += rew_n

            if done or terminal:
                obs_n = env.reset()
                episode_step = 0
                rewards = np.concatenate((rewards, np.zeros((1, n))))

                # aggregate step_info
                episode_info['dist'][episode_number] = np.mean(step_info['dist'], axis=0)
                episode_info['speed'][episode_number] = np.mean(step_info['speed'], axis=0)
                episode_info['health'][episode_number] = np.min(step_info['health'], axis=0)
                episode_info['fire'][episode_number] = np.sum(step_info['fire'], axis=0)
                episode_info['bite'][episode_number] = np.sum(step_info['bite'], axis=0)
                episode_info['hit'][episode_number] = np.sum(step_info['hit'], axis=0)

                # reset step_info
                for key in step_info: step_info[key][:] = 0.

            # increment global step counter
            train_step += 1

            # for displaying policies while training
            if arglist.display and (episode_number % arglist.display_rate == 0) and episode_number > 0 and er_fill_frac_min >= 1.0:
            #if arglist.display and (episode_number % 5 == 0) and episode_number > 0:
                time.sleep(0.1)
                env.render()

            # update all trainers
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)

            # save model, display training output
            if terminal:
                # check replay buffer status
                er_status = np.array([[len(t.replay_buffer), t.max_replay_buffer_len] for t in trainers])
                er_fill_frac = er_status[:, 0] / er_status[:, 1]
                er_fill_frac_min = er_fill_frac[np.argmin(er_fill_frac)]

                # print progress
                offset = -1 if train_step == 1 else -2
                print("steps: {}\tepisode: {}\treplay: {:.2f}%\treward: {}\ttime: {}".format(
                    train_step,
                    episode_number,
                    er_fill_frac_min * 100,
                    "\t".join(['[', *["%.2f" % r for r in list(rewards[offset])], ']']),
                    round(time.time()-t_start, 3))
                )

                t_start = time.time()

                # save state
                if (episode_number % arglist.save_rate == 0) and er_fill_frac_min >= 1.0:
                    print("saving...", end='')
                    # save policy snapshot
                    snapshot_folder = "{}/{}".format(arglist.save_dir, arglist.exp_name)
                    os.makedirs(snapshot_folder, exist_ok=True)
                    saver.save(U.get_session(), snapshot_folder + '/session', global_step=episode_number)
                    # save rewards
                    rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                    with open(rew_file_name, 'wb') as fp:
                        pickle.dump(rewards, fp)
                    # save stats
                    for key in episode_info:
                        stats_file_name = "{}{}_{}.pkl".format(arglist.plots_dir, arglist.exp_name, key)
                        with open(stats_file_name, 'wb') as fp:
                            pickle.dump(episode_info[key], fp)
                    print("done")

                episode_number += 1

            # saves final episode reward for plotting training curve later
            if episode_number == arglist.num_episodes:
                print('...Finished total of {} episodes.'.format(episode_number))
                break
        def init_variables(self, info):
            # Here you have the information of the game (virtual init() in random_walk.cpp)
            # List: game_time, goal, number_of_robots, penalty_area, codewords,
            #       robot_height, robot_radius, max_linear_velocity, field, team_info,
            #       {rating, name}, axle_length, resolution, ball_radius
            # self.game_time = info['game_time']
            self.field = info['field']
            self.robot_size = 2 * info['robot_radius']
            self.goal = info['goal']
            self.max_linear_velocity = info['max_linear_velocity']
            self.number_of_robots = info['number_of_robots']
            self.end_of_frame = False

            ##################################################################
            # team info, 5 robots, (x,y,th,active,touch)
            self.cur_my = [[] for _ in range(self.number_of_robots)]

            self.cur_ball = []  # ball (x,y) position
            self.prev_ball = [0., 0.]  # previous ball (x,y) position

            # distance to the ball
            self.dist_ball = np.zeros(self.number_of_robots)
            # index for which robot is close to the ball
            self.idxs = [i for i in range(self.number_of_robots)]

            self.dlck_cnt = 0  # deadlock count
            # how many times avoid deadlock function was called
            self.avoid_dlck_cnt = 0

            self.wheels = np.zeros(self.number_of_robots * 2)
            ##################################################################
            self.state_dim = 2  # relative ball
            self.history_size = 2  # frame history size
            self.action_dim = 2  # 2

            # Histories of five robots.
            self.state = [np.zeros([self.state_dim * self.history_size])
                          for _ in range(self.number_of_robots)]

            self.arglist = Argument()

            # state dimension
            self.state_shape = (self.state_dim * self.history_size, )
            self.act_space = [Discrete(self.action_dim * 2 + 1)]
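            # single discrete action head of size 2*2+1 = 5
            # (presumably +/- for each action dimension plus a no-op)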
            self.trainers = MADDPGAgentTrainer('agent_moving',
                                               self.mlp_model,
                                               self.state_shape,
                                               self.act_space,
                                               0,
                                               self.arglist,
                                               local_q_func=False)
            ##################################################################
            self.load_step_list = np.loadtxt('./test_step_list.txt')
            self.step_idx = 0  # For self.load_step_list
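            # test_step_list.txt is assumed to hold one saved training-step number per line;
            # np.loadtxt returns them as a float array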

            # Load previous results, if requested.
            if self.arglist.restore:
                self.printConsole('Loading previous state... %d' % \
                                                self.load_step_list[self.step_idx])
                U.load_state('./save_model/aiwc_maddpg-%d' % \
                                                self.load_step_list[self.step_idx])
            ##################################################################
            # for tensorboard
            self.summary_placeholders, self.update_ops, self.summary_op = \
                                                            self.setup_summary()
            self.summary_writer = \
                tf.summary.FileWriter('summary/moving_test', U.get_session().graph)
            ##################################################################
            self.test_step = 0
            self.stats_steps = 12000  # For tensorboard, about 10 minutes

            self.scr_my = 0.  # my team score
            self.scr_op = 0.  # op team score
            self.scr_sum = 0  # score difference (my team minus opponent)

            self.reset = False
            ##################################################################
            self.cur_time = time.time()  # For check time to take
            return
    def on_event(self, f):
        @inlineCallbacks
        def set_wheel(self, robot_wheels):
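            # send the 10 wheel speeds (5 robots x 2 wheels) to the simulator via the aiwc.set_speed RPC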
            yield self.call(u'aiwc.set_speed', args.key, robot_wheels)
            return

        def avoid_goal_foul(self):
            midfielder(self, self.idxs[0])
            midfielder(self, self.idxs[1])
            self.position(self.idxs[2], 0, 0)
            self.position(self.idxs[3], 0, 0)
            self.position(self.idxs[4], 0, 0)

        def avoid_penalty_foul(self):
            midfielder(self, self.idxs[0])
            midfielder(self, self.idxs[1])
            midfielder(self, self.idxs[2])
            self.position(self.idxs[3], 0, 0)
            self.position(self.idxs[4], 0, 0)

        def avoid_deadlock(self):
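            # line every robot up at the ball's x-coordinate on the centre line to break up the cluster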
            self.position(0, self.cur_ball[X], 0)
            self.position(1, self.cur_ball[X], 0)
            self.position(2, self.cur_ball[X], 0)
            self.position(3, self.cur_ball[X], 0)
            self.position(4, self.cur_ball[X], 0)

            # if the robot closest to the ball has somehow drifted away from it,
            # or we have spent enough frames breaking the deadlock, go back to offense
            if (self.dist_ball[self.idxs[0]] > 0.13) or (self.avoid_dlck_cnt > 20):
                offense(self)

        def midfielder(self, robot_id):
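            # if close to the ball and behind it (ball nearer the opponent goal), drive through the
            # ball to push it forward; otherwise take a position behind the ball on the ball-to-goal line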
            goal_dist = helper.distance(self.cur_my[robot_id][X],
                                        self.field[X] / 2,
                                        self.cur_my[robot_id][Y], 0)
            shoot_mul = 1
            dribble_dist = 0.426
            v = 5
            goal_to_ball_unit = helper.unit(
                [self.field[X] / 2 - self.cur_ball[X], -self.cur_ball[Y]])
            delta = [
                self.cur_ball[X] - self.cur_my[robot_id][X],
                self.cur_ball[Y] - self.cur_my[robot_id][Y]
            ]

            if (self.dist_ball[robot_id] < 0.5) and (delta[X] > 0):
                self.position(robot_id, self.cur_ball[X] + v * delta[X],
                              self.cur_ball[Y] + v * delta[Y])
            else:
                self.position(
                    robot_id,
                    self.cur_ball[X] - dribble_dist * goal_to_ball_unit[X],
                    self.cur_ball[Y] - dribble_dist * goal_to_ball_unit[Y])

        def offense(self):
            midfielder(self, 0)
            midfielder(self, 1)
            midfielder(self, 2)
            midfielder(self, 3)
            midfielder(self, 4)

        def set_formation(self):
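            # rule priority: clear the goal area, then the penalty area, then break deadlocks; otherwise play offense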
            # count how many robots are in the goal area
            goal_area_cnt = self.count_goal_area()
            # count how many robots are in the penalty area
            penalty_area_cnt = self.count_penalty_area()
            self.count_deadlock()

            if goal_area_cnt > 2:
                avoid_goal_foul(self)
                self.printConsole('avoid goal foul')
            elif penalty_area_cnt > 3:
                avoid_penalty_foul(self)
                self.printConsole('avoid penalty foul')
            elif self.dlck_cnt > 15:
                avoid_deadlock(self)
                self.printConsole('avoid deadlock')
                self.avoid_dlck_cnt += 1
            else:
                offense(self)
                self.printConsole('offense')

        # initialize an empty frame
        received_frame = Frame()

        if 'time' in f:
            received_frame.time = f['time']
        if 'score' in f:
            received_frame.score = f['score']
        if 'reset_reason' in f:
            received_frame.reset_reason = f['reset_reason']
        if 'coordinates' in f:
            received_frame.coordinates = f['coordinates']
        if 'EOF' in f:
            self.end_of_frame = f['EOF']

        #self.printConsole(received_frame.time)
        #self.printConsole(received_frame.score)
        #self.printConsole(received_frame.reset_reason)
        #self.printConsole(self.end_of_frame)
##############################################################################
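        # process the frame only after the EOF flag has been received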
        if (self.end_of_frame):

            # How to get the robot and ball coordinates: (ROBOT_ID can be 0,1,2,3,4)
            #self.printConsole(received_frame.coordinates[MY_TEAM][ROBOT_ID][X])
            #self.printConsole(received_frame.coordinates[MY_TEAM][ROBOT_ID][Y])
            #self.printConsole(received_frame.coordinates[MY_TEAM][ROBOT_ID][TH])
            #self.printConsole(received_frame.coordinates[MY_TEAM][ROBOT_ID][ACTIVE])
            #self.printConsole(received_frame.coordinates[MY_TEAM][ROBOT_ID][TOUCH])
            #self.printConsole(received_frame.coordinates[OP_TEAM][ROBOT_ID][X])
            #self.printConsole(received_frame.coordinates[OP_TEAM][ROBOT_ID][Y])
            #self.printConsole(received_frame.coordinates[OP_TEAM][ROBOT_ID][TH])
            #self.printConsole(received_frame.coordinates[OP_TEAM][ROBOT_ID][ACTIVE])
            #self.printConsole(received_frame.coordinates[OP_TEAM][ROBOT_ID][TOUCH])
            #self.printConsole(received_frame.coordinates[BALL][X])
            #self.printConsole(received_frame.coordinates[BALL][Y])
            ##############################################################################
            self.get_coord(received_frame)
            self.idxs = self.get_idxs()
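            # update robot/ball coordinates from the frame and re-rank robots by distance to the ball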

            # Reset
            if (received_frame.reset_reason == SCORE_MYTEAM):
                self.reset = True
                self.scr_my += 1
                self.scr_sum += 1
                self.printConsole("reset reason: " + \
                    str(received_frame.reset_reason))
            elif (received_frame.reset_reason == SCORE_OPPONENT):
                self.reset = True
                self.scr_op += 1
                self.scr_sum -= 1
                self.printConsole("reset reason: " + \
                    str(received_frame.reset_reason))
            elif (received_frame.reset_reason != NONE) or \
                    (received_frame.reset_reason is None):
                self.reset = True
                self.printConsole("reset reason: " + \
                    str(received_frame.reset_reason))
            else:
                self.reset = False

            set_formation(self)  # rule-based formation
            set_wheel(self, self.wheels.tolist())
            self.prev_ball = self.cur_ball

            # increment global step counter
            self.test_step += 1
            self.printConsole('step: ' + str(self.test_step))
            if (self.test_step % 1200) == 0:
                self.printConsole('%d seconds' % (time.time() - self.cur_time))
                self.cur_time = time.time()
##############################################################################
# log test statistics to TensorBoard every self.stats_steps frames (12000, about 10 minutes)
            if (self.test_step % self.stats_steps == 0) and (self.step_idx < 20):
                score_ratio = self.scr_my / self.scr_op \
                                if self.scr_op != 0. else 100

                stats = [score_ratio, self.scr_sum]
                for i in range(len(stats)):
                    U.get_session().run(self.update_ops[i],
                                        feed_dict={
                                            self.summary_placeholders[i]:
                                            float(stats[i])
                                        })
                summary_str = U.get_session().run(self.summary_op)
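                # record the summary at the loaded model's training step,
                # so TensorBoard plots test scores against training progress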
                self.summary_writer.add_summary(
                    summary_str, self.load_step_list[self.step_idx])

                self.step_idx += 1
                self.scr_my, self.scr_op, self.scr_sum = 0, 0, 0

                # load the next saved model, if any remain in the step list
                if self.step_idx < len(self.load_step_list):
                    self.printConsole('Loading %d' % self.load_step_list[self.step_idx])
                    U.load_state('./save_model/aiwc_maddpg-%d' % \
                                    self.load_step_list[self.step_idx])
##############################################################################
            if (received_frame.reset_reason == GAME_END):
                #(virtual finish() in random_walk.cpp)
                #save your data
                with open(args.datapath + '/result.txt', 'w') as output:
                    #output.write('yourvariables')
                    output.close()
                #unsubscribe; reset or leave
                yield self.sub.unsubscribe()
                try:
                    yield self.leave()
                except Exception as e:
                    self.printConsole("Error: {}".format(e))

            self.end_of_frame = False