Example #1
def train():
    FLAGS(sys.argv)
    with sc2_env.SC2Env(
            map_name='MoveToBeacon',
            agent_interface_format=sc2_env.parse_agent_interface_format(
                feature_screen=64,
                feature_minimap=64,
                rgb_screen=None,
                rgb_minimap=None,
                action_space=None,
                use_feature_units=False),
            step_mul=step_mul,
            game_steps_per_episode=None,
            disable_fog=False,
            visualize=True) as env:
        sess = tf.Session()
        actor = Actor(sess, n_features=2, n_actions=4, lr=0.001)
        critic = Critic(sess, n_features=2, lr=0.001)
        sess.run(tf.global_variables_initializer())
        for episodes in range(EPISODES):
            done = False
            obs = env.reset()
            while 331 not in obs[0].observation["available_actions"]:  # select the marine so Move_screen (331) is available
                actions = actAgent2Pysc2(100, obs)
                obs = env.step(actions=[actions])
            state = np.array(obs2state(obs))
            #print('episode start')
            global_step = 0
            reward = 0
            while not done: 
                global_step += 1
                action = actor.choose_action(state)
                actions = actAgent2Pysc2(action,obs)
                obs = env.step(actions=[actions])
                for i in range(3):
                    actions = no_operation(obs)
                    obs = env.step(actions=[actions])
                distance = obs2distance(obs)
                if global_step == 1:
                    pre_distance = distance
                next_state = np.array(obs2state(obs))
                reward = -(distance-pre_distance)*400
                
                if distance < 0.03 or global_step == 200:   # episode end
                    if distance < 0.03:
                        reward = 10
                    if global_step == 200:
                        reward = -10
                    done = True
                
                td_error = critic.learn(state, reward, next_state)
                actor.learn(state, action, td_error)

                if distance < 0.03 or global_step == 200:   # episode end
                    break
                state = next_state
                pre_distance = distance
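
Every example on this page calls a small set of helpers (actAgent2Pysc2, obs2state, obs2distance, no_operation) that are defined elsewhere in the corresponding projects. The sketch below is only an assumption about what they do, inferred from how they are called here: action 100 selects the marine, PySC2 function 331 is Move_screen, obs2state returns a 2-vector, and obs2distance returns a normalized marine-to-beacon distance. It is not the original implementation.

# Minimal sketch of the helpers assumed by these examples; names, coordinates
# and the action mapping are assumptions, not the projects' own code.
import numpy as np
from pysc2.lib import actions as sc2_actions
from pysc2.lib import features

_PLAYER_RELATIVE = features.SCREEN_FEATURES.player_relative.index
_SELECT_ARMY = sc2_actions.FUNCTIONS.select_army.id   # id 7
_MOVE_SCREEN = sc2_actions.FUNCTIONS.Move_screen.id   # id 331
_NO_OP = sc2_actions.FUNCTIONS.no_op.id

def _positions(obs):
    """Return (marine_xy, beacon_xy) in normalized screen coordinates (assumed)."""
    screen = obs[0].observation.feature_screen[_PLAYER_RELATIVE]
    marine_y, marine_x = (screen == features.PlayerRelative.SELF).nonzero()
    beacon_y, beacon_x = (screen == features.PlayerRelative.NEUTRAL).nonzero()
    size = screen.shape[0]
    return (np.array([marine_x.mean(), marine_y.mean()]) / size,
            np.array([beacon_x.mean(), beacon_y.mean()]) / size)

def obs2state(obs):
    """State fed to the agents: marine position relative to the beacon (assumed)."""
    marine, beacon = _positions(obs)
    return marine - beacon

def obs2distance(obs):
    """Normalized Euclidean distance between marine and beacon (assumed)."""
    marine, beacon = _positions(obs)
    return np.linalg.norm(marine - beacon)

def no_operation(obs):
    return sc2_actions.FunctionCall(_NO_OP, [])

def actAgent2Pysc2(action, obs):
    """Map a discrete action to a PySC2 FunctionCall.
    100 -> select the army (marine); 0-3 -> move a fixed step up/down/left/right (assumed)."""
    if action == 100:
        return sc2_actions.FunctionCall(_SELECT_ARMY, [[0]])
    screen = obs[0].observation.feature_screen[_PLAYER_RELATIVE]
    size = screen.shape[0]
    marine, _ = _positions(obs)
    x, y = (marine * size).astype(int)
    step = 8
    dx, dy = [(0, -step), (0, step), (-step, 0), (step, 0)][action]
    target = [int(np.clip(x + dx, 0, size - 1)), int(np.clip(y + dy, 0, size - 1))]
    return sc2_actions.FunctionCall(_MOVE_SCREEN, [[0], target])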
Example #2
File: test_policy.py  Project: 170928/gail
def train():
    FLAGS(sys.argv)
    with sc2_env.SC2Env(
            map_name='MoveToBeacon',
            agent_interface_format=sc2_env.parse_agent_interface_format(
                feature_screen=64,
                feature_minimap=64,
                rgb_screen=None,
                rgb_minimap=None,
                action_space=None,
                use_feature_units=False),
            step_mul=step_mul,
            game_steps_per_episode=None,
            disable_fog=False,
            visualize=True) as env:
        with tf.Session() as sess:
            Policy = Policy_net('policy', 2, 4)
            Old_Policy = Policy_net('old_policy', 2, 4)
            PPO = PPOTrain(Policy, Old_Policy)
            D = Discriminator()
            saver = tf.train.Saver()
            saver.restore(sess, './model/gail.cpkt')
            c = 0
            for episodes in range(100000):
                done = False
                obs = env.reset()
                while 331 not in obs[0].observation.available_actions:  # select the marine
                    actions = actAgent2Pysc2(100, obs)
                    obs = env.step(actions=[actions])
                state = obs2state(obs)
                observations = []
                actions_list = []
                rewards = []
                v_preds = []
                reward = 0
                global_step = 0
                while not done:
                    global_step += 1
                    state = np.stack([state]).astype(dtype=np.float32)
                    act, v_pred = Policy.act(obs=state, stochastic=True)
                    act, v_pred = np.asscalar(act), np.asscalar(v_pred)
                    observations.append(state)
                    actions_list.append(act)
                    rewards.append(reward)
                    v_preds.append(v_pred)
                    actions = actAgent2Pysc2(act, obs)
                    obs = env.step(actions=[actions])
                    next_state = obs2state(obs)
                    distance = obs2distance(obs)
                    if distance < 0.03 or global_step == 100:
                        done = True
                    if done:
                        v_preds_next = v_preds[1:] + [0]
                        break
                    state = next_state
Example #3
def worker(remote, visualize):
    env = sc2_env.SC2Env(
        map_name='MoveToBeacon',
        agent_interface_format=sc2_env.parse_agent_interface_format(
            feature_screen=32,
            feature_minimap=32,
            rgb_screen=None,
            rgb_minimap=None,
            action_space=None,
            use_feature_units=False),
        step_mul=4,
        game_steps_per_episode=None,
        disable_fog=False,
        visualize=False)
    done = False
    while True:
        cmd, action, obs, global_step = remote.recv()
        end_step = 400
        if cmd == 'step':
            if not action == 'done':
                available_action, state = obs
                a = actAgent2Pysc2(action, state)
                obs = env.step(actions=[a])
                distance = obs2distance(obs)
                state = obs2state(obs)
                obs = (obs[0].observation.available_actions, state)

                #reward shaping

                reward = -0.1
                if distance < 0.03 or global_step == end_step - 1:
                    if distance < 0.03:
                        reward = 1
                    if global_step == end_step - 1:
                        reward = -1
                    done = True
                remote.send((obs, state, action, reward, done))
            else:
                remote.send((0, 0, 0, 0, True))

        if cmd == 'reset':
            pre_num_mineral = 20
            done = False
            obs = env.reset()  # reset the env
            while 331 not in obs[0].observation.available_actions:  # select the marine
                actions = actAgent2Pysc2(100, obs)
                obs = env.step(actions=[actions])
            state = obs2state(obs)
            obs = (obs[0].observation.available_actions, state)
            remote.send((obs, state, 0, 0, False))

        if cmd == 'close':
            remote.close()
            break
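
The worker functions in Examples #3, #4 and #6 are meant to run in a child process and be driven over a multiprocessing pipe with ('step' | 'reset' | 'close', action, obs, global_step) messages. The driver is not shown on this page; the snippet below is a minimal sketch of one, with the number of workers and the chosen actions purely illustrative.

# Minimal sketch of a driver for worker(); not the original training loop.
from multiprocessing import Pipe, Process

n_workers = 4
parents, children = zip(*[Pipe() for _ in range(n_workers)])
procs = [Process(target=worker, args=(child, False), daemon=True)
         for child in children]
for p in procs:
    p.start()

# reset all environments
for parent in parents:
    parent.send(('reset', 0, 0, 0))
results = [parent.recv() for parent in parents]   # (obs, state, action, reward, done)

# one synchronous step with an illustrative action (0) per worker
for parent, (obs, state, _, _, done) in zip(parents, results):
    parent.send(('step', 0 if not done else 'done', obs, 0))
results = [parent.recv() for parent in parents]

# shut down
for parent in parents:
    parent.send(('close', 0, 0, 0))
for p in procs:
    p.join()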
Example #4
def worker(remote, visualize):
    env = sc2_env.SC2Env(
        map_name='MoveToBeacon',
        agent_interface_format=sc2_env.parse_agent_interface_format(
            feature_screen=64,
            feature_minimap=64,
            rgb_screen=None,
            rgb_minimap=None,
            action_space=None,
            use_feature_units=False),
        step_mul=4,
        game_steps_per_episode=None,
        disable_fog=False,
        visualize=True)
    done = False
    while True:
        cmd, action, obs, global_step = remote.recv()
        end_step = 100
        if cmd == 'step':
            if not action == 'done':
                #while not 331 in obs[0].observation['available_actions']:   # select the marine
                #    actions = actAgent2Pysc2(100, obs)
                #    obs = env.step(actions=[actions])
                a = actAgent2Pysc2(action, obs)
                obs = env.step(actions=[a])
                for i in range(1):
                    actions = no_operation(obs)
                    obs = env.step(actions=[actions])
                state = obs2state(obs)
                distance = obs2distance(obs)
                reward = -0.1
                obs = obs[0].observation.feature_screen.base[4]
                if distance < 0.03 or global_step == end_step - 1:
                    if distance < 0.03:
                        reward = 1
                    done = True
                remote.send((obs, state, action, reward, done))
            else:
                remote.send((0, 0, 0, 0, True))

        if cmd == 'reset':
            done = False
            obs = env.reset()          # reset the env
            while 331 not in obs[0].observation['available_actions']:   # select the marine
                actions = actAgent2Pysc2(100, obs)
                obs = env.step(actions=[actions])
            state = obs2state(obs)
            obs = obs[0].observation.feature_screen.base[4]
            remote.send((obs, state, 0, 0, False))

        if cmd == 'close':
            remote.close()
            break
Example #5
def test():
    FLAGS(sys.argv)
    with sc2_env.SC2Env(map_name="MoveToBeacon", step_mul=step_mul) as env:
        sess = tf.Session()
        mainDQN = DQN(sess, 2, 4, name='main')
        targetDQN = DQN(sess, 2, 4, name='target')
        #sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        saver.restore(sess, './Move2Beacon(DQN)/model.cpkt')
        copy_ops = get_copy_var_ops(dest_scope_name="target",
                                    src_scope_name="main")
        sess.run(copy_ops)
        for episodes in range(EPISODES):
            done = False
            obs = env.reset()
            while 331 not in obs[0].observation["available_actions"]:  # select the marine
                actions = actAgent2Pysc2(100, obs)
                obs = env.step(actions=[actions])
            state = obs2state(obs)
            print('episode start')
            global_step = 0
            random_rate = 0
            e = 1. / ((episodes / 10) + 1)
            reward = 0
            while not done:
                time.sleep(0.13)
                global_step += 1

                action = np.argmax(mainDQN.predict(state))
                actions = actAgent2Pysc2(action, obs)
                obs = env.step(actions=[actions])
                for i in range(1):
                    actions = no_operation(obs)
                    obs = env.step(actions=[actions])
                distance = obs2distance(obs)
                if global_step == 1:
                    pre_distance = distance
                next_state = obs2state(obs)
                reward = -(distance - pre_distance) * 400
                if distance < 0.015 or global_step == 200:  # episode end
                    done = True

                if distance < 0.015 or global_step == 200:  # episode end
                    print(reward, episodes, random_rate / global_step)
                    break
                state = next_state
                pre_distance = distance
Example #6
def worker(remote, visualize):
    env = sc2_env.SC2Env(map_name='MoveToBeacon',
                         step_mul=4,
                         visualize=visualize,
                         screen_size_px=(64, 64),
                         minimap_size_px=(64, 64))
    done = False
    while True:
        cmd, action, obs, global_step = remote.recv()
        end_step = 100
        if cmd == 'step':
            if not action == 'done':
                while 331 not in obs[0].observation['available_actions']:  # select the marine
                    actions = actAgent2Pysc2(100, obs)
                    obs = env.step(actions=[actions])
                a = actAgent2Pysc2(action, obs)
                obs = env.step(actions=[a])
                for i in range(1):
                    actions = no_operation(obs)
                    obs = env.step(actions=[actions])
                state = obs2state(obs)
                distance = obs2distance(obs)
                reward = -0.01
                if distance < 0.03 or global_step == end_step - 1:
                    if distance < 0.03:
                        reward = 1
                    if global_step == end_step - 1:
                        reward = -1
                    done = True
                remote.send((obs, state, action, reward, done))
            else:
                remote.send((0, 0, 0, 0, True))

        if cmd == 'reset':
            done = False
            obs = env.reset()  # reset the env
            while 331 not in obs[0].observation['available_actions']:  # select the marine
                actions = actAgent2Pysc2(100, obs)
                obs = env.step(actions=[actions])
            state = obs2state(obs)
            remote.send((obs, state, 0, 0, False))

        if cmd == 'close':
            remote.close()
            break
Example #7
def train():
    FLAGS(sys.argv)
    with sc2_env.SC2Env(map_name="MoveToBeacon", step_mul=step_mul) as env:
        sess = tf.Session()
        actor = Actor(sess, n_features=2, n_actions=4, lr=0.001)
        critic = Critic(sess, n_features=2, lr=0.001)
        sess.run(tf.global_variables_initializer())
        for episodes in range(EPISODES):
            done = False
            obs = env.reset()
            while 331 not in obs[0].observation["available_actions"]:  # select the marine
                actions = actAgent2Pysc2(100, obs)
                obs = env.step(actions=[actions])
            state = np.array(obs2state(obs))
            print('episode start')
            global_step = 0
            reward = 0
            while not done:
                global_step += 1
                time.sleep(0.2)
                action = actor.choose_action(state)
                actions = actAgent2Pysc2(action, obs)
                obs = env.step(actions=[actions])
                for i in range(3):
                    actions = no_operation(obs)
                    obs = env.step(actions=[actions])
                distance = obs2distance(obs)
                if global_step == 1:
                    pre_distance = distance
                next_state = np.array(obs2state(obs))
                reward = -(distance - pre_distance) * 400

                if distance < 0.03 or global_step == 200:  # episode end
                    if distance < 0.03:
                        reward = 10
                    if global_step == 200:
                        reward = -10
                    done = True

                td_error = critic.learn(state, reward, next_state)
                actor.learn(state, action, td_error)

                if distance < 0.03 or global_step == 200:  # episode end
                    break
                state = next_state
                pre_distance = distance
Example #8
def train():
    FLAGS(sys.argv)
    with sc2_env.SC2Env(map_name="MoveToBeacon",
                        step_mul=step_mul,
                        screen_size_px=(16, 16),
                        minimap_size_px=(16, 16)) as env:
        Policy = Policy_net('policy', 16 * 16 * 2, 4)
        Old_Policy = Policy_net('old_policy', 16 * 16 * 2, 4)
        PPO = PPOTrain(Policy, Old_Policy, gamma=0.95)
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            for episodes in range(EPISODES):
                done = False
                obs = env.reset()
                while 331 not in obs[0].observation["available_actions"]:  # select the marine
                    actions = actAgent2Pysc2(100, obs)
                    obs = env.step(actions=[actions])
                state = np.array(obs2state(obs))
                print('episode start')
                global_step = 0
                reward = 0

                observations = []
                actions_list = []
                v_preds = []
                rewards = []

                while not done:
                    global_step += 1
                    time.sleep(0.05)

                    state = np.stack([state]).astype(dtype=np.float32)
                    act, v_pred = Policy.act(obs=state, stochastic=True)
                    act, v_pred = np.asscalar(act), np.asscalar(v_pred)
                    actions = actAgent2Pysc2(act, obs)
                    obs = env.step(actions=[actions])

                    for i in range(1):
                        actions = no_operation(obs)
                        obs = env.step(actions=[actions])
                    distance = obs2distance(obs)
                    if global_step == 1:
                        pre_distance = distance
                    next_state = np.array(obs2state(obs))
                    reward = -10 * (distance - pre_distance)
                    #if reward < 0 :
                    #    reward = -0.01
                    #if reward <= 0:
                    #    reward = 0
                    #elif reward > 0:
                    #    reward = 0
                    reward = -0.01
                    if distance < 0.03 or global_step == 100:  # episode end
                        if distance < 0.03:
                            reward = 1
                        if global_step == 100:
                            reward = -1
                        done = True

                    observations.append(state)
                    actions_list.append(act)
                    v_preds.append(v_pred)
                    rewards.append(reward)

                    if distance < 0.03 or global_step == 100:  # episode end
                        v_preds_next = v_preds[1:] + [0]
                        gaes = PPO.get_gaes(rewards=rewards,
                                            v_preds=v_preds,
                                            v_preds_next=v_preds_next)
                        observations = np.reshape(observations,
                                                  newshape=[-1, 16 * 16 * 2])
                        actions = np.array(actions_list).astype(dtype=np.int32)
                        rewards = np.array(rewards).astype(dtype=np.float32)
                        v_preds_next = np.array(v_preds_next).astype(
                            dtype=np.float32)
                        gaes = np.array(gaes).astype(dtype=np.float32)
                        gaes = (gaes - gaes.mean())

                        PPO.assign_policy_parameters()

                        inp = [
                            observations, actions, rewards, v_preds_next, gaes
                        ]
                        for epoch in range(5):
                            sample_indices = np.random.randint(
                                low=0, high=observations.shape[0],
                                size=64)  # indices are in [low, high)
                            sampled_inp = [
                                np.take(a=a, indices=sample_indices, axis=0)
                                for a in inp
                            ]  # sample training data
                            PPO.train(obs=sampled_inp[0],
                                      actions=sampled_inp[1],
                                      rewards=sampled_inp[2],
                                      v_preds_next=sampled_inp[3],
                                      gaes=sampled_inp[4])
                        print(episodes, global_step)
                        break
                    state = next_state
                    pre_distance = distance
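
PPO.get_gaes() above (and in Example #9 below) belongs to the PPOTrain class, which is not shown on this page. A standard generalized advantage estimation routine that is consistent with how it is called here (lists of per-step rewards, value predictions, and shifted value predictions) could look like the sketch below; the gamma and lambda values are assumptions.

# Minimal sketch of get_gaes(); not the project's own implementation.
def get_gaes(rewards, v_preds, v_preds_next, gamma=0.95, lam=0.95):
    # one-step TD residuals: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
    deltas = [r + gamma * v_next - v
              for r, v_next, v in zip(rewards, v_preds_next, v_preds)]
    # discounted, lambda-weighted sum of future residuals, accumulated backwards
    gaes = list(deltas)
    for t in reversed(range(len(gaes) - 1)):
        gaes[t] = gaes[t] + gamma * lam * gaes[t + 1]
    return gaes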
Example #9
def train():
    FLAGS(sys.argv)
    with sc2_env.SC2Env(
            map_name='MoveToBeacon',
            agent_interface_format=sc2_env.parse_agent_interface_format(
                feature_screen=64,
                feature_minimap=64,
                rgb_screen=None,
                rgb_minimap=None,
                action_space=None,
                use_feature_units=False),
            step_mul=step_mul,
            game_steps_per_episode=None,
            disable_fog=False,
            visualize=False) as env:
        r = tf.placeholder(tf.float32)  ########
        rr = tf.summary.scalar('reward', r)
        merged = tf.summary.merge_all()  ########
        expert_observations = np.genfromtxt('trajectory/observations.csv')
        expert_actions = np.genfromtxt('trajectory/actions.csv',
                                       dtype=np.int32)
        with tf.Session() as sess:
            Policy = Policy_net('policy', 2, 4)
            Old_Policy = Policy_net('old_policy', 2, 4)
            PPO = PPOTrain(Policy, Old_Policy)
            D = Discriminator()
            sess.run(tf.global_variables_initializer())
            saver = tf.train.Saver()
            writer = tf.summary.FileWriter('./board/gail',
                                           sess.graph)  ########
            c = 0
            for episodes in range(100000):
                done = False
                obs = env.reset()
                while 331 not in obs[0].observation.available_actions:  # select the marine
                    actions = actAgent2Pysc2(100, obs)
                    obs = env.step(actions=[actions])
                state = obs2state(obs)
                observations = []
                actions_list = []
                rewards = []
                v_preds = []
                reward = 0
                global_step = 0
                while not done:
                    global_step += 1
                    state = np.stack([state]).astype(dtype=np.float32)
                    act, v_pred = Policy.act(obs=state, stochastic=True)
                    act, v_pred = np.asscalar(act), np.asscalar(v_pred)
                    observations.append(state)
                    actions_list.append(act)
                    rewards.append(reward)
                    v_preds.append(v_pred)
                    actions = actAgent2Pysc2(act, obs)
                    obs = env.step(actions=[actions])
                    next_state = obs2state(obs)
                    distance = obs2distance(obs)
                    if distance < 0.03 or global_step == 100:
                        done = True
                    if done:
                        v_preds_next = v_preds[1:] + [0]
                        break
                    state = next_state
                observations = np.reshape(observations, newshape=[-1, 2])
                actions_list = np.array(actions_list).astype(dtype=np.int32)
                for i in range(2):
                    sample_indices = (np.random.randint(
                        expert_observations.shape[0],
                        size=observations.shape[0]))
                    inp = [expert_observations, expert_actions]
                    sampled_inp = [
                        np.take(a=a, indices=sample_indices, axis=0)
                        for a in inp
                    ]  # sample training data
                    D.train(expert_s=sampled_inp[0],
                            expert_a=sampled_inp[1],
                            agent_s=observations,
                            agent_a=actions_list)
                d_rewards = D.get_rewards(agent_s=observations,
                                          agent_a=actions_list)
                d_rewards = np.reshape(d_rewards,
                                       newshape=[-1]).astype(dtype=np.float32)

                gaes = PPO.get_gaes(rewards=d_rewards,
                                    v_preds=v_preds,
                                    v_preds_next=v_preds_next)
                gaes = np.array(gaes).astype(dtype=np.float32)
                v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)

                inp = [
                    observations, actions_list, gaes, d_rewards, v_preds_next
                ]
                PPO.assign_policy_parameters()
                for epoch in range(15):
                    sample_indices = np.random.randint(
                        low=0, high=observations.shape[0],
                        size=32)  # indices are in [low, high)
                    sampled_inp = [
                        np.take(a=a, indices=sample_indices, axis=0)
                        for a in inp
                    ]  # sample training data
                    PPO.train(obs=sampled_inp[0],
                              actions=sampled_inp[1],
                              gaes=sampled_inp[2],
                              rewards=sampled_inp[3],
                              v_preds_next=sampled_inp[4])
                summary = sess.run(merged, feed_dict={r: global_step})
                writer.add_summary(summary, episodes)
                if global_step < 50:
                    c += 1
                else:
                    c = 0
                if c > 10:
                    saver.save(sess, './model/gail.cpkt')
                    print('save model')
                    break
                print(episodes, global_step, c)
Example #10
def train():
    FLAGS(sys.argv)
    with sc2_env.SC2Env(map_name="MoveToBeacon", step_mul=step_mul) as env:
        replay_buffer = deque(maxlen=1000)
        sess = tf.Session()
        mainDQN = DQN(sess, 2, 4, name='main')
        targetDQN = DQN(sess, 2, 4, name='target')
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        #saver.restore(sess, './Move2Beacon/model.cpkt')
        copy_ops = get_copy_var_ops(dest_scope_name="target",
                                    src_scope_name="main")
        sess.run(copy_ops)
        for episodes in range(EPISODES):
            done = False
            obs = env.reset()
            while 331 not in obs[0].observation["available_actions"]:  # select the marine
                actions = actAgent2Pysc2(100, obs)
                obs = env.step(actions=[actions])
            state = obs2state(obs)
            print('episode start')
            global_step = 0
            random_rate = 0
            e = 1. / ((episodes / 10) + 1)
            reward = 0
            while not done:
                global_step += 1
                time.sleep(0.05)
                if np.random.rand() < e:
                    random_rate += 1
                    action = random.randrange(4)
                else:
                    action = np.argmax(mainDQN.predict(state))
                #action = np.argmax(mainDQN.predict(state))
                actions = actAgent2Pysc2(action, obs)
                obs = env.step(actions=[actions])
                for i in range(3):
                    actions = no_operation(obs)
                    obs = env.step(actions=[actions])
                distance = obs2distance(obs)
                if global_step == 1:
                    pre_distance = distance
                next_state = obs2state(obs)
                reward = -(distance - pre_distance) * 400
                #print(reward)
                if distance < 0.03 or global_step == 200:  # episode end
                    if distance < 0.03:
                        reward = 10
                    if global_step == 200:
                        reward = -10
                    done = True

                #print(next_state, reward)
                replay_buffer.append((state, action, reward, next_state, done))

                if distance < 0.03 or global_step == 200:  # episode end
                    if len(replay_buffer) > BATCH_SIZE:
                        minibatch = random.sample(replay_buffer, BATCH_SIZE)
                        loss, _ = replay_train(mainDQN, targetDQN, minibatch)
                        sess.run(copy_ops)
                        print('model trained')
                        saver.save(sess, './Move2Beacon/model.cpkt')
                    print(reward, episodes, random_rate / global_step)
                    break
                state = next_state
                pre_distance = distance
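
replay_train() and get_copy_var_ops(), used in Examples #5 and #10, are likewise not shown on this page. The sketch below is a conventional DQN implementation consistent with how they are called: a minibatch of (state, action, reward, next_state, done) tuples, targets bootstrapped from the target network, and main-to-target weight copies. The discount factor and the DQN.predict()/update() interface are assumptions.

# Minimal sketch of the DQN utilities referenced above; not the original code.
# Assumes DQN exposes predict(states) -> Q-values and update(states, targets) -> (loss, _),
# and uses a discount factor of 0.99 (both assumptions).
import numpy as np
import tensorflow as tf

DISCOUNT = 0.99

def replay_train(mainDQN, targetDQN, minibatch):
    states = np.vstack([x[0] for x in minibatch])
    actions = np.array([x[1] for x in minibatch])
    rewards = np.array([x[2] for x in minibatch])
    next_states = np.vstack([x[3] for x in minibatch])
    dones = np.array([x[4] for x in minibatch])

    # bootstrap targets from the target network; terminal transitions keep only the reward
    q_target = rewards + DISCOUNT * np.max(targetDQN.predict(next_states), axis=1) * ~dones
    y = mainDQN.predict(states)
    y[np.arange(len(states)), actions] = q_target
    return mainDQN.update(states, y)

def get_copy_var_ops(dest_scope_name, src_scope_name):
    # ops that copy the 'main' network weights into the 'target' network
    src_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=src_scope_name)
    dest_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=dest_scope_name)
    return [dest.assign(src.value()) for src, dest in zip(src_vars, dest_vars)]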