Example #1
def collect_stats(agent: DQNAgent, n_games=1000):
    MAX_STEPS = 1000
    lengths = []
    looped = 0
    for i in range(1, n_games+1):
        env = gym.make('snake-v0')
        # env.__init__(human_mode=False)
        observation = env.reset()
        done = False
        steps = 0
        agent.epsilon = 0.0
        state = agent.get_last_observations(observation)
        while not done and steps < MAX_STEPS:
            action = agent.act(state)
            next_observation, _, done, _ = env.step(action)
            state = agent.get_last_observations(next_observation)
            steps += 1

        if steps == MAX_STEPS:
            looped += 1
        else:
            lengths.append(len(env.game.snake.body))

        if i % (n_games // 10) == 0:
            avg_len = sum(lengths) / len(lengths) if lengths else 0.0
            print(f"Avg len: {avg_len:.2f}, looped {looped}/{i}")
Example #2
def run_episode(environment: gym.Env, agent: DQNAgent, render: bool,
                max_length: int):
    """
    Run one episode in the given environment with the agent.

    Arguments:
        environment {`gym.Env`} -- Environment representing the Markov Decision Process
        agent {`DQNAgent`} -- Reinforcement Learning agent that acts in the environment
        render {`bool`} -- Whether the frames of the episode should be rendered on the screen
        max_length {`int`} -- Maximum number of steps before the episode is terminated

    Returns:
        `float` -- Cumulative reward that the agent received during the episode
    """
    episode_reward = 0
    state = environment.reset()
    for _ in range(max_length):
        if render:
            environment.render()
        action = agent.act(state)
        next_state, reward, terminal, _ = environment.step(action)
        agent.observe(
            Transition(state, action, reward,
                       None if terminal else next_state))
        episode_reward += reward
        if terminal:
            break
        else:
            state = next_state
    return episode_reward
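A minimal usage sketch for run_episode follows; the environment name, the DQNAgent constructor arguments, and the episode count are illustrative assumptions, not taken from the original project.

# Hypothetical driver loop for run_episode; names and values are placeholders.
env = gym.make("CartPole-v1")
agent = DQNAgent(env.observation_space.shape[0], env.action_space.n)
for episode in range(200):
    episode_reward = run_episode(env, agent, render=False, max_length=500)
    print(f"Episode {episode}: reward {episode_reward:.1f}")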
Example #3
class DQNScheduler:
    def __init__(self, simulator):
        self.agent = DQNAgent(25, 6)
        self.agent.load("./save/car-100-dqn.h5")
        self.simulator = simulator
        self.agent.epsilon = 0

    def schedule(self):
        action = self.agent.act(np.reshape(self.simulator.get_state(),
                                           [1, 25]))
        return action
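A brief usage sketch; the Simulator class and its stepping methods are hypothetical, only DQNScheduler itself comes from the example above.

# Hypothetical polling loop; Simulator and its methods are placeholders.
simulator = Simulator()
scheduler = DQNScheduler(simulator)
while not simulator.finished():
    action = scheduler.schedule()
    simulator.apply(action)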
Example #4
def _run_agent_one_ep(env: BaseEnv,
                      agent: DQNAgent,
                      config: Config,
                      eps: float,
                      behavior_name: str,
                      train: Optional[bool] = True):
    # Get a starting state
    env.reset()

    decision_steps, terminal_steps = env.get_steps(behavior_name)
    state = decision_steps.obs[0]

    agent_id = decision_steps.agent_id[0]
    done = False
    did_win = False
    episode_reward = 0.0
    while not done:
        reward = 0.0
        # Get and perform an action
        action = agent.act(decision_steps.obs[0], eps)
        env.set_actions(behavior_name,
                        np.expand_dims(action, 0).reshape(-1, 1))
        env.step()

        decision_steps, terminal_steps = env.get_steps(behavior_name)
        # Determine S', R, Done
        next_state = None
        if agent_id in decision_steps:
            reward += decision_steps.reward[0]
            next_state = decision_steps.obs[0]
        if agent_id in terminal_steps:
            terminal_reward = terminal_steps.reward[0]
            # Add win/loss
            did_win = math.isclose(terminal_reward, 1.0)
            reward += terminal_reward
            next_state = terminal_steps.obs[0]
            done = True

        assert next_state is not None, f"next_state cannot be None. Agent {agent_id} did not appear in decision or terminal steps"

        if train:
            # Learn from (S, A, R, S')
            experience = Experience(state, action, reward, next_state, done)
            agent.step(experience)

        # Set new state
        state = next_state

        episode_reward += reward

    return (episode_reward, did_win)
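A hedged sketch of an outer training loop around _run_agent_one_ep with a decaying exploration rate; the episode count, the epsilon schedule, and the already-constructed env, agent, config, and behavior_name are assumptions for illustration.

# Hypothetical outer loop; env, agent, config and behavior_name are assumed to exist.
eps, eps_min, eps_decay = 1.0, 0.05, 0.995
rewards, wins = [], 0
for episode in range(500):
    episode_reward, did_win = _run_agent_one_ep(env, agent, config, eps,
                                                behavior_name, train=True)
    rewards.append(episode_reward)
    wins += int(did_win)
    eps = max(eps_min, eps * eps_decay)  # anneal exploration over episodes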
Example #5
def watch_agent(agent: DQNAgent):
    env = gym.make('snake-v0')
    env.__init__(human_mode=True)
    observation = env.reset()
    renderer = Renderer(env.game)
    try:
        done = False
        steps = 0
        agent.epsilon = 0
        state = agent.get_last_observations(observation)
        while not done:
            # time.sleep(0.001)
            renderer.render_frame()
            action = agent.act(state)
            next_observation, _, done, _ = env.step(action)
            state = agent.get_last_observations(next_observation)
            steps += 1
    finally:
        renderer.close_window()
    print(f"Snake length: {len(env.game.snake.body)}")
    print(f"Simulation ended after {steps} steps.")
Example #6
def main(argv):
    args = parser.parse_args(argv[1:])

    if args.usage == 'help':
        return parser.print_help()

    if is_environments_gen(args):
        _write_env_file(args)
    elif is_environments_list(args):
        all_registry = registry.all()
        registry_envs_name = [
            trim_env_spec_name(env.__repr__()) for env in all_registry
        ]
        for environment in registry_envs_name:
            print(environment)
    elif is_environments_act(args):
        env = gym.make(args.environment_name)
        if is_action_type('dqn', args):
            if args.pre_defined_state_size == 'nesgym':
                pre_state_size = 172032
            elif args.pre_defined_state_size == 'gym':
                pre_state_size = env.observation_space.shape[0]
            elif args.pre_defined_state_size == 'gym-atari':
                pre_state_size = 100800
            elif args.pre_defined_state_size == 'gym-atari-extend':
                pre_state_size = 120000
            elif args.pre_defined_state_size == 'gym-atari-small':
                pre_state_size = 100800
            elif args.pre_defined_state_size == 'gym-gomoku':
                pre_state_size = 361
            # state_size = (1,) + env.observation_space.shape
            state_size = pre_state_size
            action_size = env.action_space.n
            agent = DQNAgent(state_size, action_size)
            # try:
            #     agent.load('./weights/dqn_{}_{}_{}.h5'.format(args.environment_name.lower(), args.timesteps,
            #                                           args.i_episodes))
            # except Exception:
            #     pass
            done = False
            batch_size = 64
        i_episodes = args.i_episodes
        timesteps = args.timesteps
        factor = args.seed_factor
        for i_episode in range(i_episodes):
            state = env.reset()
            if is_action_type('dqn', args):
                state = np.reshape(state, [1, pre_state_size])
            for t in range(timesteps):
                try:
                    if args.render == 'present': env.render()
                    if args.render == 'presented': env.render(args.render)
                    if args.action_type == 'alternate':
                        action_choice = i_episodes * 2
                        action = random_action_space_sample_choice(
                            action_choice, env, factor)
                    elif args.action_type == 'specific':
                        action = env.action_space.sample()
                    elif args.action_type == 'conditional':
                        action_choice = i_episodes
                        action = random_action_space_sample_choice(
                            action_choice, env, factor)
                    elif args.action_type == 'numerical':
                        action = env.action_space.n
                    elif is_action_type('dqn', args) and len(state) == 5:
                        action = agent.act(state)
                    elif is_action_type('dqn', args) and len(state) != 5:
                        action = env.action_space.sample()
                    collect_stat(action, ['input', 'actions'], stats)
                    observation, reward, done, info = env.step(action)
                    if is_action_type('dqn', args):
                        reward = reward if not done else -10
                        observation = np.reshape(observation,
                                                 [1, pre_state_size])
                        agent.remember(state, action, reward, observation,
                                       done)
                        state = observation
                    # collect_stat(observation,['observation'],stats)
                    collect_stat(reward, ['rewards'], stats)
                    # collect_stat(done,['output','done'],stats)
                    # collect_stat(info,['output','info'],stats)
                    if done:
                        max_episodes_range = (i_episodes - 1)
                        episode_timesteps_iteration_limit = max_episodes_range - 1
                        is_latest_episode = is_filled_latest_episode_with_iteration(
                            i_episode, episode_timesteps_iteration_limit)
                        increased_timestep = increase_timestep(t)
                        print('i_episode {}'.format(i_episode))
                        print('Episode finished after {} timesteps'.format(
                            increased_timestep))
                        if is_action_type('dqn', args):
                            print('Episode: {}/{}, score: {}, e: {:.2}'.format(
                                i_episode, i_episodes, t, agent.epsilon))
                        collect_stat(t, ['output', 'timestep', 'iteration'],
                                     stats)
                        collect_stat(increased_timestep,
                                     ['output', 'timestep', 'increased'],
                                     stats)
                        is_latest_episode_to_save_state = lambda args_cached: is_latest_episode and args_cached.output_stats_filename
                        if is_latest_episode_to_save_state(args):
                            filename = args.output_stats_filename
                            pre_df = {
                                # 'observations': stats['observations'],
                                'rewards': stats['rewards'],
                                # 'done-output': stats['output']['done'],
                                # 'info-output': stats['output']['info'],
                                # 'iteration-timestep': stats['output']['timestep']['iteration'],
                                # 'increased-timestep': stats['output']['timestep']['increased'],
                                'actions-input': stats['input']['actions']
                            }
                            df = pd.DataFrame(pre_df)
                            stamp = lambda: '%s' % (int(datetime.now().
                                                        timestamp()))
                            with open(
                                    'data/{}-{}.csv'.format(stamp(), filename),
                                    'w') as f:
                                f.write(df.to_csv())
                                f.close()
                            print('Statistics file saved ({}.csv)!'.format(
                                filename))
                            del df
                            del filename
                        print(check_output_env_label())
                        del is_latest_episode_to_save_state
                        del increased_timestep
                        del is_latest_episode
                        del episode_timesteps_iteration_limit
                        del max_episodes_range
                        break
                except Exception as e:
                    print('Rendering execution ({})'.format(e))
                finally:
                    print('Execution of timestep done')
            if is_action_type('dqn',
                              args) and (len(agent.memory) > batch_size):
                agent.replay(batch_size)
        # agent.save('./weights/dqn_{}_{}_{}.h5'.format(args.environment_name.lower(), args.timesteps,
        #                                       args.i_episodes))
        # env.close()
    else:
        parser.print_help()
Example #7
File: test.py  Project: aaiteam/code_gen
def main():
    print "Creating DQN agent..."
    # env = gym.make("codegen-v0")
    set_debugger_org_frc()

    iters = 6300
    n_goal = 0
    n_goal_all = 0
    time_stamp = 0

    max_steps = 5
    agent = DQNAgent(max_steps)
    agent.dqn.initial_exploration = 6000 * max_steps

    for iter in range(iters):
        print "\n********Iteration # ", iter, "***********\n"
        # 1 iteration
        env = gym.make("codegen-v0")
        num = random.randrange(1, 100)
        print "Goal Number : ", num + 1
        env.my_input = num
        #env.goal = "['" + env.my_input + "']"
        env.goal = str(num + 1)

        code = env._reset()
        step_in_episode = 0
        total_score = 0.0
        reward = 0.0
        mystate = []
        my_state_new = []

        # debug : the sys
        # sss = []
        # for arg in sys.argv[1:]:
        #    sss.append(arg)
        # print "sss = " , sss

        # while True:
        while step_in_episode < max_steps:

            # state = env.code_index_list + [-1] * (max_steps - len(env.code_index_list))
            state = env.code_index_list[:]
            state += np.zeros([
                max_steps - len(env.code_index_list), agent.dqn.code_idx_size
            ],
                              dtype=int).tolist()
            # state = state.tolist()
            # state = 1;
            # print "env = ",env.code_index_list
            # print "state = ",state
            # raw_input()

            if step_in_episode == 0:
                action_idx = agent.start(code, state)
            else:
                action_idx = agent.act(code, state, reward)

            code, reward, terminal, info = env._step(action_idx,
                                                     agent.dqn.actions)
            state_prime = env.code_index_list[:]
            state_prime += np.zeros([
                max_steps - len(env.code_index_list), agent.dqn.code_idx_size
            ],
                                    dtype=int).tolist()

            # debug : the sys
            # sss = []
            # for arg in sys.argv[1:]:
            #    sss.append(arg)
            # print "sss = " , sss

            print "state : "
            print state
            print "state' : "
            print state_prime

            if step_in_episode == max_steps - 1:
                agent.dqn.stock_experience(agent.dqn.time_stamp, state,
                                           action_idx, reward, state_prime, 1)
            else:
                agent.dqn.stock_experience(agent.dqn.time_stamp, state,
                                           action_idx, reward, state_prime, 0)

            agent.dqn.experience_replay(agent.dqn.time_stamp)

            agent.dqn.target_model_update(agent.dqn.time_stamp,
                                          soft_update=False)

            total_score += reward

            if terminal:

                agent.dqn.goal_idx.append(agent.dqn.time_stamp)

                agent.end(reward)
                agent.dqn.stock_experience(agent.dqn.time_stamp, state,
                                           action_idx, reward, state_prime, 1)

                n_goal_all += 1
                step_in_episode += 1
                agent.dqn.time_stamp += 1

                if iters - iter <= 100:
                    n_goal += 1

                break

            step_in_episode += 1
            agent.dqn.time_stamp += 1

        if iter == 1 + (agent.dqn.initial_exploration / max_steps):
            print "n_goal_all = ", n_goal_all
            print agent.dqn.goal_idx
            raw_input()

    print "n_goal : ", n_goal
    print "epsilon : ", agent.epsilon
Example #8
class VideoStreamingTest(object):
    def __init__(self, host, port):
        self.state_size = 3
        self.action_size = 7
        self.done = False
        self.batch_size = 32
        self.agent = DQNAgent(self.state_size, self.action_size)
        self.state_now = np.reshape([0.10606659, -0.52737298, 0.47917915],
                                    [1, self.state_size])
        self.state_last = np.reshape([0.10606659, -0.52737298, 0.47917915],
                                     [1, self.state_size])
        self.action_for_next = 0
        self.action_for_now = 0
        self.reward = 0
        self.forward = "T394"
        self.left = "S450"
        self.right = "S270"
        self.backward = "T330"
        self.stop = "T370"
        self.middle = "S360"
        #dqn parameters
        self.server_socket = socket.socket()
        self.server_socket.bind((host, port))
        self.server_socket.listen(0)
        self.connection, self.client_address = self.server_socket.accept()
        self.connection = self.connection.makefile("rb")
        self.host_name = socket.gethostname()
        self.host_ip = socket.gethostbyname(self.host_name)
        self.temp_result = None
        self.finnal_result = None
        self.RANGE = 350
        self.WIDTH = 720
        self.time_now = 0
        self.count = 0
        self.streaming()

    def dqn_loop(self):
        if self.finnal_result['me']['r'] > 1:
            self.done = True
        else:
            self.done = False
        self.prepare_state()  # update the previous state and fetch the current one
        self.prepare_action()  # update the previous action and choose the next one

        if self.count == 1:
            self.prepare_reward()  # compute the reward for the previous action
        else:
            self.count += 1
        self.act_move()  # update the car's motion state
        if self.count == 1:
            self.remember_step()  # store this step's transition
        if len(self.agent.memory) > self.batch_size:
            self.agent.replay(self.batch_size)

    def prepare_state(self):
        self.state_last = self.state_now
        state_now_ = [self.finnal_result['me']['alpha_big'], \
        self.finnal_result['me']['alpha_small'], \
        self.finnal_result['me']['r']]
        self.state_now = np.reshape(state_now_, [1, self.state_size])
        #self.state_now = state_now_

    def prepare_action(self):
        self.action_for_now = self.action_for_next
        self.action_for_next = self.agent.act(self.state_now)

    def prepare_reward(self):  # precondition: state_last is non-empty
        if self.done:
            self.reward = -10
        else:
            self.reward = (self.state_last[0][2] - self.state_now[0][2]) * 100
            #self.reward = (self.state_last[2] - self.state_now[2])*100

    def remember_step(self):
        self.agent.remember(self.state_last, self.action_for_now, self.reward,
                            self.state_now, self.done)

    def act_move(self):
        if self.done:
            self.action_for_next = 0

        if self.action_for_next == 0:  # stop
            str_S = self.middle
            str_T = self.stop
            str_S = str_S.encode("utf-8")
            str_T = str_T.encode("utf-8")
            socket_tcp.send(str_S)
            socket_tcp.send(str_T)

        elif self.action_for_next == 1:  # forward
            str_S = self.middle
            str_T = self.forward
            str_S = str_S.encode("utf-8")
            str_T = str_T.encode("utf-8")
            socket_tcp.send(str_S)
            socket_tcp.send(str_T)

        elif self.action_for_next == 2:  # turn left, forward
            str_S = self.left
            str_T = self.forward
            str_S = str_S.encode("utf-8")
            str_T = str_T.encode("utf-8")
            socket_tcp.send(str_S)
            socket_tcp.send(str_T)

        elif self.action_for_next == 3:  # turn right, forward
            str_S = self.right
            str_T = self.forward
            str_S = str_S.encode("utf-8")
            str_T = str_T.encode("utf-8")
            socket_tcp.send(str_S)
            socket_tcp.send(str_T)

        elif self.action_for_next == 4:  # backward
            str_S = self.middle
            str_T = self.backward
            str_S = str_S.encode("utf-8")
            str_T = str_T.encode("utf-8")
            socket_tcp.send(str_S)
            socket_tcp.send(str_T)
            str_S = self.middle
            str_T = self.stop
            str_S = str_S.encode("utf-8")
            str_T = str_T.encode("utf-8")
            socket_tcp.send(str_S)
            socket_tcp.send(str_T)
            str_S = self.middle
            str_T = self.backward
            str_S = str_S.encode("utf-8")
            str_T = str_T.encode("utf-8")
            socket_tcp.send(str_S)
            socket_tcp.send(str_T)

        elif self.action_for_next == 5:  # turn left, backward
            str_S = self.left
            str_T = self.backward
            str_S = str_S.encode("utf-8")
            str_T = str_T.encode("utf-8")
            socket_tcp.send(str_S)
            socket_tcp.send(str_T)
            str_S = self.left
            str_T = self.stop
            str_S = str_S.encode("utf-8")
            str_T = str_T.encode("utf-8")
            str_S = self.left
            str_T = self.backward
            str_S = str_S.encode("utf-8")
            str_T = str_T.encode("utf-8")

        elif self.action_for_next == 6:  # turn right, backward
            str_S = self.right
            str_T = self.backward
            str_S = str_S.encode("utf-8")
            str_T = str_T.encode("utf-8")
            socket_tcp.send(str_S)
            socket_tcp.send(str_T)
            str_S = self.right
            str_T = self.stop
            str_S = str_S.encode("utf-8")
            str_T = str_T.encode("utf-8")
            socket_tcp.send(str_S)
            socket_tcp.send(str_T)
            str_S = self.right
            str_T = self.backward
            str_S = str_S.encode("utf-8")
            str_T = str_T.encode("utf-8")
            socket_tcp.send(str_S)
            socket_tcp.send(str_T)

    def get_one_car(self, x1, y1, x2, y2):
        x0 = (x1 + x2) / 2
        y0 = (y1 + y2) / 2
        detx = x1 - x2
        dety = y1 - y2
        temp_x0 = x0 - self.WIDTH / 2
        temp_y0 = y0 - self.WIDTH / 2
        if detx > 0:
            alpha_small = math.atan(dety / detx)
        elif detx < 0:
            alpha_small = math.atan(dety / detx) + math.pi
        else:
            if dety > 0:
                alpha_small = math.pi / 2
            else:
                alpha_small = 0 - math.pi / 2

        if temp_x0 > 0:
            alpha_big = math.atan(temp_y0 / temp_x0)
        elif temp_x0 < 0:
            alpha_big = math.atan(temp_y0 / temp_x0) + math.pi
        else:
            if temp_y0 > 0:
                alpha_big = math.pi / 2
            else:
                alpha_big = 0 - math.pi / 2

        alpha_small = alpha_small / math.pi - 0.5
        alpha_big = alpha_big / math.pi - 0.5
        r = math.sqrt(temp_x0**2 + temp_y0**2) / self.RANGE
        return {
            "alpha_big": alpha_big,
            "alpha_small": alpha_small,
            "r": r,
            "x0": x0,
            "y0": y0
        }

    def get_finnal_result(self):
        red_x = self.temp_result["red"]["x"]
        red_y = self.temp_result["red"]["y"]
        green_x = self.temp_result["green"]["x"]
        green_y = self.temp_result["green"]["y"]
        blue_x = self.temp_result["blue"]["x"]
        blue_y = self.temp_result["blue"]["y"]
        yellow_x = self.temp_result["yellow"]["x"]
        yellow_y = self.temp_result["yellow"]["y"]
        finnal_temp = {}
        me_temp = self.get_one_car(red_x, red_y, green_x, green_y)
        enemy_temp = self.get_one_car(blue_x, blue_y, yellow_x, yellow_y)
        finnal_temp["me"] = me_temp
        finnal_temp["enemy"] = enemy_temp
        self.finnal_result = finnal_temp

    def draw(self, frame, lowerRGB, upperRGB, word):

        hsv = cv2.cvtColor(frame, cv2.COLOR_BGR2HSV)
        # build a mask for pixels inside the threshold range
        mask = cv2.inRange(hsv, lowerRGB, upperRGB)
        # erode to remove small noise
        mask = cv2.erode(mask, None, iterations=2)
        # dilate back; erosion followed by dilation is an opening, which removes noise
        mask = cv2.dilate(mask, None, iterations=2)
        cnts = cv2.findContours(mask.copy(), cv2.RETR_EXTERNAL,
                                cv2.CHAIN_APPROX_SIMPLE)[-2]
        # initialize the centroid of the detected circular contour
        center = None
        # if any contours were found
        if len(cnts) > 0:
            # take the contour with the largest area
            c = max(cnts, key=cv2.contourArea)
            # fit the minimum enclosing circle of that contour
            ((x, y), radius) = cv2.minEnclosingCircle(c)
            # compute the contour moments
            M = cv2.moments(c)
            # compute the centroid
            center = (int(M["m10"] / M["m00"]), int(M["m01"] / M["m00"]))
            # only draw when the radius is greater than 10
            if radius > 10:
                cv2.circle(frame, (int(x), int(y)), int(radius), (0, 255, 255),
                           2)
                cv2.circle(frame, center, 5, (0, 0, 255), -1)

                font = cv2.FONT_HERSHEY_SIMPLEX
                cv2.putText(frame, word, (int(x), int(y)), font, 1.2,
                            (255, 255, 255), 2)
                result = {}
                result["x"] = x
                result["y"] = y

                return result

    def streaming(self):

        try:
            print("Host: ", self.host_name + " " + self.host_ip)
            print("Connection from: ", self.client_address)
            print("Streaming...")
            print("Press 'q' to exit")

            redLower = np.array([170, 100, 200])
            redUpper = np.array([179, 255, 255])

            greenLower = np.array([65, 100, 100])
            greenUpper = np.array([85, 255, 255])

            #blueLower = np.array([0, 0, 150])
            #blueUpper = np.array([100, 100, 255])
            blueLower = np.array([95, 100, 100])
            blueUpper = np.array([115, 255, 255])
            yellowLower = np.array([5, 100, 100])
            yellowUpper = np.array([20, 255, 255])
            # need bytes here
            stream_bytes = b" "
            while True:
                stream_bytes += self.connection.read(1024)
                first = stream_bytes.find(b"\xff\xd8")
                last = stream_bytes.find(b"\xff\xd9")
                #str_ = 'S270'
                #str_ = str_.encode("utf-8")
                #socket_tcp.send(str_)

                #f = open('record_' + str(self.count) + '.json', 'w')
                #json.dump(dic_dump, f)
                #f.close()

                if first != -1 and last != -1:
                    jpg = stream_bytes[first:last + 2]
                    stream_bytes = stream_bytes[last + 2:]
                    image = cv2.imdecode(np.frombuffer(jpg, dtype=np.uint8),
                                         cv2.IMREAD_COLOR)
                    frame = image
                    result_red = self.draw(frame, redLower, redUpper, "RED")
                    result_green = self.draw(frame, greenLower, greenUpper,
                                             "GREEN")
                    result_blue = self.draw(frame, blueLower, blueUpper,
                                            "blue")
                    result_yellow = self.draw(frame, yellowLower, yellowUpper,
                                              "YELLOW")
                    result = {}
                    result["red"] = result_red
                    result["green"] = result_green
                    result["blue"] = result_blue
                    result["yellow"] = result_yellow

                    self.temp_result = result
                    flag = True
                    if not result_red:
                        flag = False
                    if not result_green:
                        flag = False
                    if not result_blue:
                        flag = False
                    if not result_yellow:
                        flag = False
                    if flag:
                        self.get_finnal_result()
                        self.time_now = int((time.time() - start_time) * 1000)
                        self.dqn_loop()
                        '''
                        dic_dump = {'data': self.finnal_result, 'time' : self.time_now}
                        f = open('./test_1/record_' + str(self.count) + '.json', 'w')
                        json.dump(dic_dump, f)
                        f.close()
                        self.count +=1
                        '''
                        cv2.line(frame, (int(self.temp_result["red"]["x"]),
                                         int(self.temp_result["red"]["y"])),
                                 (int(self.temp_result["green"]["x"]),
                                  int(self.temp_result["green"]["y"])),
                                 (0, 255, 0), 1, 4)
                        cv2.line(frame, (int(self.temp_result["blue"]["x"]),
                                         int(self.temp_result["blue"]["y"])),
                                 (int(self.temp_result["yellow"]["x"]),
                                  int(self.temp_result["yellow"]["y"])),
                                 (0, 255, 0), 1, 4)
                        cv2.line(frame, (int(self.finnal_result["me"]["x0"]),
                                         int(self.finnal_result["me"]["y0"])),
                                 (int(self.WIDTH / 2), int(self.WIDTH / 2)),
                                 (0, 0, 255), 4, 4)
                        cv2.line(frame,
                                 (int(self.finnal_result["enemy"]["x0"]),
                                  int(self.finnal_result["enemy"]["y0"])),
                                 (int(self.WIDTH / 2), int(self.WIDTH / 2)),
                                 (255, 0, 0), 4, 4)
                        font = cv2.FONT_HERSHEY_SIMPLEX
                        cv2.putText(frame,
                                    str(self.finnal_result["me"]["alpha_big"]),
                                    (int(self.finnal_result["me"]["x0"]),
                                     int(self.finnal_result["me"]["y0"])),
                                    font, 1, (0, 255, 0), 1)
                        cv2.putText(
                            frame,
                            str(self.finnal_result["enemy"]["alpha_small"]),
                            (int(self.finnal_result["enemy"]["x0"]),
                             int(self.finnal_result["enemy"]["y0"])), font, 1,
                            (0, 255, 0), 1)
                    else:
                        str_S = "S360"
                        str_T = "T370"
                        str_S = str_S.encode("utf-8")
                        str_T = str_T.encode("utf-8")
                        socket_tcp.send(str_S)
                        socket_tcp.send(str_T)
                    #print(self.finnal_result)
                    cv2.imshow("Frame", frame)

                    if cv2.waitKey(1) & 0xFF == ord("q"):
                        break
        finally:
            self.connection.close()
            self.server_socket.close()
Example #9
i = []
v = []
r = []
for e in range(EPISODES):
    WH = w.generateWind()
    hdg0_rand = random.choice(hdg0_rand_vec) * TORAD
    hdg0 = hdg0_rand * np.ones(10)

    mdp.simulator.hyst.reset()

    #  We reinitialize the memory of the flow
    state = mdp.initializeMDP(hdg0, WH)
    loss_sim_list = []
    for time in range(80):
        WH = w.generateWind()
        action = agent.act(state)
        next_state, reward = mdp.transition(action, WH)
        # store the transition (the state flow is contained in the final state)
        agent.remember(state, action, reward, next_state)
        state = next_state
        if len(agent.memory) >= batch_size:
            loss_sim_list.append(agent.replay(batch_size))
            # For data visualisation
            i.append(mdp.s[0, -1])
            v.append(mdp.s[1, -1])
            r.append(mdp.reward)

    loss_over_simulation_time = np.sum(loss_sim_list) / len(loss_sim_list)
Example #10
File: test.py  Project: aaiteam/code_gen
def main():
    print "Creating DQN agent..."

    iters = 10000
    n_goal = 0
    n_goal_all = 0
    time_stamp = 0

    ############################################################
    # print x
    # max_steps = 3
    # actions = ["print", " ", "x"]
    ############################################################

    ############################################################
    # print x+1
    max_steps = 5
    actions = ["print", " ", "x", "+", "1"]
    ############################################################

    agent = DQNAgent(max_steps, actions)
    agent.dqn.initial_exploration = iters * 0.6

    results = []
    policy_frozen = False
    wins_file = "wins.txt"
    with io.FileIO(wins_file, "w") as file:
        file.write("Winning codes:\n")

    for iter in range(iters):
        print "\n\n::{}::".format(iter)

        if iter == 4300:  # 2300:
            policy_frozen = True

        env = gym.make("codegen-v0")
        num = random.randrange(1, 100)
        env.my_input = num

        ############################################################
        # print x
        # env.goal = str(num)
        ############################################################

        ############################################################
        # print x+1
        env.goal = str(num + 1)
        ############################################################

        code = env._reset()
        step_in_episode = 0
        total_score = 0.0
        reward = 0.0
        mystate = []
        my_state_new = []

        while step_in_episode < max_steps:
            state = env.code_index_list[:]
            state += np.zeros([
                max_steps - len(env.code_index_list), agent.dqn.code_idx_size
            ],
                              dtype=int).tolist()

            if step_in_episode == 0:
                action_idx = agent.start(code, state, policy_frozen)
            else:
                action_idx = agent.act(code, state, reward)

            code, reward, terminal, info = env._step(action_idx,
                                                     agent.dqn.actions)
            state_prime = env.code_index_list[:]
            state_prime += np.zeros([
                max_steps - len(env.code_index_list), agent.dqn.code_idx_size
            ],
                                    dtype=int).tolist()

            agent.dqn.experience_replay(agent.dqn.time_stamp)
            if step_in_episode == max_steps - 1 or terminal:
                agent.dqn.stock_experience(agent.dqn.time_stamp, state,
                                           action_idx, reward, state_prime,
                                           True)
                if terminal:
                    agent.dqn.goal_idx.append(agent.dqn.time_stamp)
                agent.dqn.time_stamp += 1
            else:
                agent.dqn.stock_experience(agent.dqn.time_stamp, state,
                                           action_idx, reward, state_prime,
                                           False)

            total_score += reward

            if terminal:
                agent.end(reward)

                n_goal_all += 1
                step_in_episode += 1

                if iters - iter <= 100:
                    n_goal += 1

            step_in_episode += 1

        if iter >= 100:
            results = results[1:]
        if reward >= 1:
            print "WIN"
            results.append(1.0)
            with io.FileIO(wins_file, "a") as f:
                f.write(
                    "\n=====================\n{}\n=====================\n\n".
                    format(code))
                f.flush()
                os.fsync(f)
        else:
            results.append(0.0)
        total_iters = 100 if iter >= 100 else iter + 1
        print "TOTAL {:.2f}% of wins in last {} iters, sum: {}, total good: {}".format(
            100 * sum(results) / total_iters, total_iters, sum(results),
            len(agent.dqn.goal_idx))

        if iter == 1 + agent.dqn.initial_exploration:
            print "n_goal_all = ", n_goal_all
            print agent.dqn.goal_idx
            raw_input()

    print "n_goal : ", n_goal
    print "epsilon : ", agent.epsilon
Example #11
        batch_size = 32

        title = env.symbol.upper() + ' MDP Replay ' + os.path.basename(
            __file__).split('.')[0]
        grapher = Grapher(title)

        with open('./save/losses_' + stock_name + '.txt', 'w') as f:
            for e in range(EPISODES + 1):
                # Train
                state = env.reset()
                state = np.reshape(state, [1, state_size])
                for time in range(500):
                    cash, nown, price = env.holdings[0], env.holdings[
                        1], env.state[-1]
                    # env.render()
                    action = agent.act(state, time)
                    next_state, reward, done, _ = env.step(action)
                    next_state = np.reshape(next_state, [1, state_size])
                    agent.remember(state, action, reward, next_state, done)
                    # agent.train(state, action, reward, next_state, done)
                    if len(agent.memory) > batch_size:
                        agent.replay(batch_size)
                    if e % 2 == 0:
                        #cash, nown, price = state[0, 1], state[0, 2], state[0, -1]
                        # cash, nown, price = *env.holdings, state[0,-1]
                        grapher.add(cash,
                                    nown,
                                    price,
                                    action,
                                    reward,
                                    loss=agent.loss)
Example #12
                 observation=obs,
                 input_shape=[len(obs)],
                 training=True,
                 policy=policy)
agent.compile()

result = []
for episode in range(500):  # run 500 episodes
    agent.reset()
    observation = env.reset()  # initialize the environment
    # observation, _, _, _ = env.step(env.action_space.sample())
    observation = deepcopy(observation)
    agent.observe(observation)
    for t in range(250):  # take up to 250 steps per episode
        # env.render()  # render to screen
        action = agent.act()
        # step returns the next state, the reward, whether the game is done, and extra info
        observation, reward, done, info = env.step(action)
        observation = deepcopy(observation)
        agent.observe(observation, reward, done)
        if done:
            break

    # test
    agent.training = False
    observation = env.reset()  # initialize the environment
    agent.observe(observation)
    for t in range(250):
        # env.render()  # render to screen
        action = agent.act()
        observation, reward, done, info = env.step(action)
Example #13
class AgentTrainer(object):
    def __init__(self, config):
        # Create session to store trained parameters
        self.session = tf.Session()

        self.action_count = config["action_count"]

        # Create agent for training
        self.agent = DQNAgent(self.action_count)

        # Create memory to store observations
        self.memory = ExperienceMemory(config["replay_memory_size"])

        # Tools for saving and loading networks
        self.saver = tf.train.Saver()

        # Last action that agent performed
        self.last_action_index = None

        # Deque to keep track of average reward and play time
        self.game_history = GameHistory(config["match_memory_size"])

        # Deque to store losses
        self.episode_history = EpisodeHistory(config["replay_memory_size"])

        self.INITIAL_EPSILON = config["initial_epsilon"]
        self.FINAL_EPSILON = config["final_epsilon"]
        self.OBSERVE = config["observe_step_count"]
        self.EXPLORE = config["explore_step_count"]
        self.FRAME_PER_ACTION = config["frame_per_action"]
        self.GAMMA = config["gamma"]
        self.LOG_PERIOD = config["log_period"]
        self.BATCH_SIZE = config["batch_size"]

    def init_training(self):
        # Initialize training parameters
        self.session.run(tf.global_variables_initializer())
        self.epsilon = self.INITIAL_EPSILON
        self.t = 0
        self.last_action_index = None

    def load_model(self, path):
        checkpoint = tf.train.get_checkpoint_state(path)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.session, checkpoint.model_checkpoint_path)
            print("Successfully loaded: {}".format(checkpoint.model_checkpoint_path))
        else:
            print("Could not find old network weights")

    def save_model(self, path):
        # Replace with os.path.join
        self.saver.save(self.session, path + "/dqn", global_step=self.t)

    def reset_state(self, initial_state):
        # Get the first state by doing nothing and preprocess the image to 80x80x4
        x_t = initial_state
        x_t = transformImage(x_t)
        self.s_t = np.concatenate((x_t, x_t, x_t, x_t), axis=2)
        self.match_reward = 0
        self.match_playtime = 0
        self.gamma_pow = 1

    def act(self):
        # Choose an action epsilon greedily
        action_index = 0
        if self.t % self.FRAME_PER_ACTION == 0:
            if np.random.random() <= self.epsilon:
                action_index = np.random.randint(0, self.action_count)
            else:
                action_index = self.agent.act(self.session, self.s_t)
        else:
            action_index = self.last_action_index  # do the same thing as before
        self.last_action_index = action_index
        return action_index

    def process_frame(self, screen, reward, terminal):
        if self.last_action_index is None:
            self.reset_state(screen)
            return

        a_t = np.zeros([self.action_count])
        a_t[self.last_action_index] = 1

        # scale down epsilon
        if self.epsilon > self.FINAL_EPSILON and self.t > self.OBSERVE:
            self.epsilon -= (self.INITIAL_EPSILON - self.FINAL_EPSILON) / self.EXPLORE

        # run the selected action and observe next state and reward
        x_t1, r_t = screen, reward
        x_t1 = transformImage(x_t1)
        s_t1 = np.append(x_t1, self.s_t[:, :, :3], axis=2)

        # store the transition in memory
        self.memory.add_experience((self.s_t, a_t, r_t, s_t1, terminal))

        # only train if done observing
        if self.t > self.OBSERVE:
            loss = self.make_train_step()
            self.episode_history.add_episode(Episode(loss))

        # update the old values
        self.s_t = s_t1
        self.t += 1

        # print info
        if self.t % self.LOG_PERIOD == 0:
            print("TIMESTEP {}, EPSILON {}, EPISODE_STATS {}, MATCH_STATS {}".format(
                self.t,
                self.epsilon,
                self.episode_history.get_average_stats(),
                self.game_history.get_average_stats()))
            sys.stdout.flush()

        self.match_reward += r_t * self.gamma_pow
        self.match_playtime += 1
        self.gamma_pow *= self.GAMMA

        if terminal:
            self.game_history.add_match(MatchResults(
                self.match_reward,
                self.match_playtime,
                reward))
            self.reset_state(screen)

    def make_train_step(self):
        # sample a minibatch to train on
        minibatch = self.memory.sample(self.BATCH_SIZE)

        # get the batch variables
        s_j_batch = [d[0] for d in minibatch]
        a_batch = [d[1] for d in minibatch]
        r_batch = [d[2] for d in minibatch]
        s_j1_batch = [d[3] for d in minibatch]

        # get the batch variables
        # s_j_batch, a_batch, r_batch, s_j1_batch, terminal_batch = zip(*minibatch)
        action_scores_batch = np.array(self.agent.score_actions(self.session, s_j1_batch))
        # r_future = GAMMA * (1 - np.array(terminal_batch)) * np.max(action_scores_batch, axis=1)
        # y_batch = r_batch + r_future

        y_batch = []
        for i in range(0, len(minibatch)):
            # if terminal only equals reward
            if minibatch[i][4]:
                y_batch.append(r_batch[i])
            else:
                y_batch.append(r_batch[i] + self.GAMMA * np.max(action_scores_batch[i]))

        return self.agent.train(self.session, y_batch, a_batch, s_j_batch)
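A rough sketch of how the trainer's act/process_frame cycle could be driven by a game loop; the game object, its reset/step methods, and the config dictionary are placeholders, not part of the original code.

# Hypothetical driver; `game` and `config` are placeholders.
trainer = AgentTrainer(config)
trainer.init_training()
screen = game.reset()
trainer.process_frame(screen, 0.0, False)  # first call only initializes the frame stack
while True:
    action = trainer.act()
    screen, reward, terminal = game.step(action)
    trainer.process_frame(screen, reward, terminal)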