Example #1
def train():
    FLAGS(sys.argv)
    with sc2_env.SC2Env(
                        map_name='MoveToBeacon',
                        agent_interface_format=sc2_env.parse_agent_interface_format(
                            feature_screen=64,
                            feature_minimap=64,
                            rgb_screen=None,
                            rgb_minimap=None,
                            action_space=None,
                            use_feature_units=False),
                        step_mul=step_mul,
                        game_steps_per_episode=None,
                        disable_fog=False,
                    visualize=True) as env:
        sess = tf.Session()
        actor = Actor(sess, n_features=2, n_actions=4, lr=0.001)
        critic = Critic(sess, n_features=2, lr=0.001)
        sess.run(tf.global_variables_initializer())
        for episodes in range(EPISODES):
            done = False
            obs = env.reset()
            while 331 not in obs[0].observation["available_actions"]:  # select the Marine so Move_screen (331) is available
                actions = actAgent2Pysc2(100, obs)
                obs = env.step(actions=[actions])
            state = np.array(obs2state(obs))
            #print('episode start')
            global_step = 0
            reward = 0
            while not done: 
                global_step += 1
                action = actor.choose_action(state)
                actions = actAgent2Pysc2(action,obs)
                obs = env.step(actions=[actions])
                for i in range(3):
                    actions = no_operation(obs)
                    obs = env.step(actions=[actions])
                distance = obs2distance(obs)
                if global_step == 1:
                    pre_distance = distance
                next_state = np.array(obs2state(obs))
                reward = -(distance - pre_distance) * 400
                
                if distance < 0.03 or global_step == 200:   # at episode end
                    if distance < 0.03:
                        reward = 10
                    if global_step == 200:
                        reward = -10
                    done = True
                
                td_error = critic.learn(state, reward, next_state)
                actor.learn(state, action, td_error)

                if distance < 0.03 or global_step == 200:   # at episode end
                    break
                state = next_state
                pre_distance = distance
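
All of the snippets on this page rely on the same module-level setup and a few small helpers (actAgent2Pysc2, no_operation, obs2state, obs2distance) that are never shown. The sketch below is only an assumption about what that scaffolding could look like for the 2-feature MoveToBeacon state; the helper names come from the snippets, everything else is illustrative and not the original author's code.

# Hypothetical scaffolding assumed by the snippets above (illustrative only).
import sys
import numpy as np
import tensorflow as tf
from absl import flags
from pysc2.env import sc2_env
from pysc2.lib import actions, features

FLAGS = flags.FLAGS        # FLAGS(sys.argv) inside train()/test() parses pysc2's flags
step_mul = 8               # game steps per agent step
EPISODES = 10000           # number of training episodes

_PLAYER_RELATIVE = features.SCREEN_FEATURES.player_relative.index

def no_operation(obs):
    # do nothing for one environment step
    return actions.FunctionCall(actions.FUNCTIONS.no_op.id, [])

def obs2state(obs):
    # 2-feature state: normalized (dx, dy) offset from the Marine to the beacon
    screen = obs[0].observation["feature_screen"][_PLAYER_RELATIVE]
    marine_y, marine_x = np.nonzero(screen == 1)   # 1 = own units
    beacon_y, beacon_x = np.nonzero(screen == 3)   # 3 = neutral (the beacon)
    if marine_x.size == 0 or beacon_x.size == 0:
        return [0.0, 0.0]
    dx = (beacon_x.mean() - marine_x.mean()) / screen.shape[1]
    dy = (beacon_y.mean() - marine_y.mean()) / screen.shape[0]
    return [dx, dy]

def obs2distance(obs):
    # normalized Euclidean distance between the Marine and the beacon
    dx, dy = obs2state(obs)
    return float(np.sqrt(dx * dx + dy * dy))

def actAgent2Pysc2(action, obs):
    # map a discrete action to a pysc2 FunctionCall:
    # 0-3 move the selected Marine by a fixed offset, 100 selects the army
    if action == 100:
        return actions.FunctionCall(actions.FUNCTIONS.select_army.id, [[0]])
    screen = obs[0].observation["feature_screen"][_PLAYER_RELATIVE]
    marine_y, marine_x = np.nonzero(screen == 1)
    x, y = int(marine_x.mean()), int(marine_y.mean())
    offset = 8
    dx, dy = [(0, -offset), (0, offset), (-offset, 0), (offset, 0)][action]
    target = [int(np.clip(x + dx, 0, screen.shape[1] - 1)),
              int(np.clip(y + dy, 0, screen.shape[0] - 1))]
    return actions.FunctionCall(actions.FUNCTIONS.Move_screen.id, [[0], target])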
Example #2
def worker(remote, visualize):
    env = sc2_env.SC2Env(
                        map_name='MoveToBeacon',
                        agent_interface_format=sc2_env.parse_agent_interface_format(
                            feature_screen=64,
                            feature_minimap=64,
                            rgb_screen=None,
                            rgb_minimap=None,
                            action_space=None,
                            use_feature_units=False),
                        step_mul=4,
                        game_steps_per_episode=None,
                        disable_fog=False,
                    visualize=visualize)
    done = False
    while True:
        cmd, action, obs, global_step = remote.recv()
        end_step = 100
        if cmd == 'step':
            if not action == 'done':
                #while 331 not in obs[0].observation['available_actions']:   # select the Marine
                #    actions = actAgent2Pysc2(100, obs)
                #    obs = env.step(actions=[actions])
                a = actAgent2Pysc2(action, obs)
                obs = env.step(actions=[a])
                for i in range(1):
                    actions = no_operation(obs)
                    obs = env.step(actions=[actions])
                state = obs2state(obs)
                distance = obs2distance(obs)
                reward = -0.1
                obs = obs[0].observation.feature_screen.base[4]
                if distance < 0.03 or global_step == end_step - 1:
                    if distance < 0.03:
                        reward = 1
                    done = True
                remote.send((obs, state, action, reward, done))
            else:
                remote.send((0, 0, 0, 0, True))

        if cmd == 'reset':
            done = False
            obs = env.reset()          # reset the env
            while 331 not in obs[0].observation['available_actions']:   # select the Marine
                actions = actAgent2Pysc2(100, obs)
                obs = env.step(actions=[actions])
            state = obs2state(obs)
            obs = obs[0].observation.feature_screen.base[4]
            remote.send((obs, state, 0, 0, False))

        if cmd == 'close':
            remote.close()
            break
Example #3
def test():
    FLAGS(sys.argv)
    with sc2_env.SC2Env(map_name="MoveToBeacon", step_mul=step_mul) as env:
        sess = tf.Session()
        mainDQN = DQN(sess, 2, 4, name='main')
        targetDQN = DQN(sess, 2, 4, name='target')
        #sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        saver.restore(sess, './Move2Beacon(DQN)/model.cpkt')
        copy_ops = get_copy_var_ops(dest_scope_name="target",
                                    src_scope_name="main")
        sess.run(copy_ops)
        for episodes in range(EPISODES):
            done = False
            obs = env.reset()
            while 331 not in obs[0].observation["available_actions"]:  # select the Marine
                actions = actAgent2Pysc2(100, obs)
                obs = env.step(actions=[actions])
            state = obs2state(obs)
            print('episode start')
            global_step = 0
            random_rate = 0
            e = 1. / ((episodes / 10) + 1)
            reward = 0
            while not done:
                time.sleep(0.13)
                global_step += 1

                action = np.argmax(mainDQN.predict(state))
                actions = actAgent2Pysc2(action, obs)
                obs = env.step(actions=[actions])
                for i in range(1):
                    actions = no_operation(obs)
                    obs = env.step(actions=[actions])
                distance = obs2distance(obs)
                if global_step == 1:
                    pre_distance = distance
                next_state = obs2state(obs)
                reward = -(distance - pre_distance) * 400
                if distance < 0.015 or global_step == 200:  # at episode end
                    done = True

                if distance < 0.015 or global_step == 200:  # at episode end
                    print(reward, episodes, random_rate / global_step)
                    break
                state = next_state
                pre_distance = distance
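
Examples #3 and #7 call get_copy_var_ops to sync the target network with the main network, but the helper itself is never defined here. It is presumably the usual TF1 variable-copy helper, something along these lines (an assumption, not the original definition):

def get_copy_var_ops(dest_scope_name, src_scope_name):
    # build assign ops that copy every trainable variable from 'main' to 'target'
    op_holder = []
    src_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                 scope=src_scope_name)
    dest_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                  scope=dest_scope_name)
    for src_var, dest_var in zip(src_vars, dest_vars):
        op_holder.append(dest_var.assign(src_var.value()))
    return op_holder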
Example #4
def worker(remote, visualize):
    env = sc2_env.SC2Env(map_name='MoveToBeacon',
                         step_mul=4,
                         visualize=visualize,
                         screen_size_px=(64, 64),
                         minimap_size_px=(64, 64))
    done = False
    while True:
        cmd, action, obs, global_step = remote.recv()
        end_step = 100
        if cmd == 'step':
            if not action == 'done':
                while 331 not in obs[0].observation[
                        'available_actions']:  # select the Marine
                    actions = actAgent2Pysc2(100, obs)
                    obs = env.step(actions=[actions])
                a = actAgent2Pysc2(action, obs)
                obs = env.step(actions=[a])
                for i in range(1):
                    actions = no_operation(obs)
                    obs = env.step(actions=[actions])
                state = obs2state(obs)
                distance = obs2distance(obs)
                reward = -0.01
                if distance < 0.03 or global_step == end_step - 1:
                    if distance < 0.03:
                        reward = 1
                    if global_step == end_step - 1:
                        reward = -1
                    done = True
                remote.send((obs, state, action, reward, done))
            else:
                remote.send((0, 0, 0, 0, True))

        if cmd == 'reset':
            done = False
            obs = env.reset()  # reset the env
            while 331 not in obs[0].observation['available_actions']:  # select the Marine
                actions = actAgent2Pysc2(100, obs)
                obs = env.step(actions=[actions])
            state = obs2state(obs)
            remote.send((obs, state, 0, 0, False))

        if cmd == 'close':
            remote.close()
            break
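
The worker() functions in Examples #2 and #4 are meant to be driven from a parent process over a multiprocessing.Pipe: the parent sends (cmd, action, obs, global_step) tuples and receives (obs, state, action, reward, done) back. The loop below is a rough sketch of that parent side under those assumptions; the actual runner is not part of the snippets.

import numpy as np
from multiprocessing import Pipe, Process

def run_worker_episode(n_steps=100):
    # spawn one worker and step it through a single episode with a random policy
    parent_conn, child_conn = Pipe()
    proc = Process(target=worker, args=(child_conn, False))
    proc.start()

    parent_conn.send(('reset', 0, None, 0))
    obs, state, _, _, done = parent_conn.recv()

    for global_step in range(n_steps):
        action = np.random.randint(4)          # placeholder policy
        parent_conn.send(('step', action, obs, global_step))
        obs, state, action, reward, done = parent_conn.recv()
        if done:
            break

    parent_conn.send(('close', 0, None, 0))
    proc.join()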
Example #5
def train():
    FLAGS(sys.argv)
    with sc2_env.SC2Env(map_name="MoveToBeacon", step_mul=step_mul) as env:
        sess = tf.Session()
        actor = Actor(sess, n_features=2, n_actions=4, lr=0.001)
        critic = Critic(sess, n_features=2, lr=0.001)
        sess.run(tf.global_variables_initializer())
        for episodes in range(EPISODES):
            done = False
            obs = env.reset()
            while 331 not in obs[0].observation["available_actions"]:  # select the Marine
                actions = actAgent2Pysc2(100, obs)
                obs = env.step(actions=[actions])
            state = np.array(obs2state(obs))
            print('episode start')
            global_step = 0
            reward = 0
            while not done:
                global_step += 1
                time.sleep(0.2)
                action = actor.choose_action(state)
                actions = actAgent2Pysc2(action, obs)
                obs = env.step(actions=[actions])
                for i in range(3):
                    actions = no_operation(obs)
                    obs = env.step(actions=[actions])
                distance = obs2distance(obs)
                if global_step == 1:
                    pre_distance = distance
                next_state = np.array(obs2state(obs))
                reward = -(distance - pre_distance) * 400

                if distance < 0.03 or global_step == 200:  # at episode end
                    if distance < 0.03:
                        reward = 10
                    if global_step == 200:
                        reward = -10
                    done = True

                td_error = critic.learn(state, reward, next_state)
                actor.learn(state, action, td_error)

                if distance < 0.03 or global_step == 200:  # at episode end
                    break
                state = next_state
                pre_distance = distance
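
Examples #1 and #5 use Actor and Critic classes that are not shown. The Critic sketch below is only a guess at the contract the trainers rely on: learn(state, reward, next_state) does one TD(0) update and returns the TD error, which the Actor then uses to scale its policy-gradient step. It is a plain TF1 implementation of that idea, not the original class.

class Critic:
    # minimal TF1 state-value critic; learn() returns the TD error consumed by the Actor
    def __init__(self, sess, n_features, lr=0.01, gamma=0.95):
        self.sess = sess
        self.s = tf.placeholder(tf.float32, [1, n_features], 'state')
        self.v_next = tf.placeholder(tf.float32, [1, 1], 'v_next')
        self.r = tf.placeholder(tf.float32, None, 'reward')
        hidden = tf.layers.dense(self.s, 20, tf.nn.relu)
        self.v = tf.layers.dense(hidden, 1)
        self.td_error = self.r + gamma * self.v_next - self.v
        loss = tf.square(self.td_error)
        self.train_op = tf.train.AdamOptimizer(lr).minimize(loss)

    def learn(self, s, r, s_):
        s, s_ = s[np.newaxis, :], s_[np.newaxis, :]
        v_next = self.sess.run(self.v, {self.s: s_})
        td_error, _ = self.sess.run([self.td_error, self.train_op],
                                    {self.s: s, self.v_next: v_next, self.r: r})
        return td_error

The matching Actor would expose choose_action(state), sampling from a softmax policy over the 4 moves, and learn(state, action, td_error), which scales the log-probability gradient by the TD error.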
Example #6
def train():
    FLAGS(sys.argv)
    with sc2_env.SC2Env(map_name="MoveToBeacon",
                        step_mul=step_mul,
                        screen_size_px=(16, 16),
                        minimap_size_px=(16, 16)) as env:
        Policy = Policy_net('policy', 16 * 16 * 2, 4)
        Old_Policy = Policy_net('old_policy', 16 * 16 * 2, 4)
        PPO = PPOTrain(Policy, Old_Policy, gamma=0.95)
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            for episodes in range(EPISODES):
                done = False
                obs = env.reset()
                while 331 not in obs[0].observation["available_actions"]:  # select the Marine
                    actions = actAgent2Pysc2(100, obs)
                    obs = env.step(actions=[actions])
                state = np.array(obs2state(obs))
                print('episode start')
                global_step = 0
                reward = 0

                observations = []
                actions_list = []
                v_preds = []
                rewards = []

                while not done:
                    global_step += 1
                    time.sleep(0.05)

                    state = np.stack([state]).astype(dtype=np.float32)
                    act, v_pred = Policy.act(obs=state, stochastic=True)
                    act, v_pred = np.asscalar(act), np.asscalar(v_pred)
                    actions = actAgent2Pysc2(act, obs)
                    obs = env.step(actions=[actions])

                    for i in range(1):
                        actions = no_operation(obs)
                        obs = env.step(actions=[actions])
                    distance = obs2distance(obs)
                    if global_step == 1:
                        pre_distance = distance
                    next_state = np.array(obs2state(obs))
                    reward = -10 * (distance - pre_distance)
                    #if reward < 0 :
                    #    reward = -0.01
                    #if reward <= 0:
                    #    reward = 0
                    #elif reward > 0:
                    #    reward = 0
                    reward = -0.01  # constant step penalty; overrides the shaped reward above
                    if distance < 0.03 or global_step == 100:  # at episode end
                        if distance < 0.03:
                            reward = 1
                        if global_step == 100:
                            reward = -1
                        done = True

                    observations.append(state)
                    actions_list.append(act)
                    v_preds.append(v_pred)
                    rewards.append(reward)

                    if distance < 0.03 or global_step == 100:  # at episode end
                        v_preds_next = v_preds[1:] + [0]
                        gaes = PPO.get_gaes(rewards=rewards,
                                            v_preds=v_preds,
                                            v_preds_next=v_preds_next)
                        observations = np.reshape(observations,
                                                  newshape=[-1, 16 * 16 * 2])
                        actions = np.array(actions_list).astype(dtype=np.int32)
                        rewards = np.array(rewards).astype(dtype=np.float32)
                        v_preds_next = np.array(v_preds_next).astype(
                            dtype=np.float32)
                        gaes = np.array(gaes).astype(dtype=np.float32)
                        gaes = (gaes - gaes.mean())

                        PPO.assign_policy_parameters()

                        inp = [
                            observations, actions, rewards, v_preds_next, gaes
                        ]
                        for epoch in range(5):
                            sample_indices = np.random.randint(
                                low=0, high=observations.shape[0],
                                size=64)  # indices are in [low, high)
                            sampled_inp = [
                                np.take(a=a, indices=sample_indices, axis=0)
                                for a in inp
                            ]  # sample training data
                            PPO.train(obs=sampled_inp[0],
                                      actions=sampled_inp[1],
                                      rewards=sampled_inp[2],
                                      v_preds_next=sampled_inp[3],
                                      gaes=sampled_inp[4])
                        print(episodes, global_step)
                        break
                    state = next_state
                    pre_distance = distance
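
PPO.get_gaes is called here with the per-step rewards, value predictions, and shifted value predictions; the class itself is not shown. A standard implementation is the GAE recursion below (an assumption about what PPOTrain does): delta_t = r_t + gamma * V(s_{t+1}) - V(s_t), followed by a backward sum discounted by gamma * lambda.

def get_gaes(rewards, v_preds, v_preds_next, gamma=0.95, lam=0.95):
    # generalized advantage estimation:
    # delta_t = r_t + gamma * V(s_{t+1}) - V(s_t), then a backward gamma*lam-discounted sum
    deltas = [r + gamma * v_next - v
              for r, v_next, v in zip(rewards, v_preds_next, v_preds)]
    gaes = list(deltas)
    for t in reversed(range(len(gaes) - 1)):
        gaes[t] = gaes[t] + gamma * lam * gaes[t + 1]
    return gaes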
Example #7
def train():
    FLAGS(sys.argv)
    with sc2_env.SC2Env(map_name="MoveToBeacon", step_mul=step_mul) as env:
        replay_buffer = deque(maxlen=1000)
        sess = tf.Session()
        mainDQN = DQN(sess, 2, 4, name='main')
        targetDQN = DQN(sess, 2, 4, name='target')
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        #saver.restore(sess, './Move2Beacon/model.cpkt')
        copy_ops = get_copy_var_ops(dest_scope_name="target",
                                    src_scope_name="main")
        sess.run(copy_ops)
        for episodes in range(EPISODES):
            done = False
            obs = env.reset()
            while 331 not in obs[0].observation["available_actions"]:  # select the Marine
                actions = actAgent2Pysc2(100, obs)
                obs = env.step(actions=[actions])
            state = obs2state(obs)
            print('episode start')
            global_step = 0
            random_rate = 0
            e = 1. / ((episodes / 10) + 1)
            reward = 0
            while not done:
                global_step += 1
                time.sleep(0.05)
                if np.random.rand() < e:
                    random_rate += 1
                    action = random.randrange(4)
                else:
                    action = np.argmax(mainDQN.predict(state))
                #action = np.argmax(mainDQN.predict(state))
                actions = actAgent2Pysc2(action, obs)
                obs = env.step(actions=[actions])
                for i in range(3):
                    actions = no_operation(obs)
                    obs = env.step(actions=[actions])
                distance = obs2distance(obs)
                if global_step == 1:
                    pre_distance = distance
                next_state = obs2state(obs)
                reward = -(distance - pre_distance) * 400
                #print(reward)
                if distance < 0.03 or global_step == 200:  # at episode end
                    if distance < 0.03:
                        reward = 10
                    if global_step == 200:
                        reward = -10
                    done = True

                #print(next_state, reward)
                replay_buffer.append((state, action, reward, next_state, done))

                if distance < 0.03 or global_step == 200:  # at episode end
                    if len(replay_buffer) > BATCH_SIZE:
                        minibatch = random.sample(replay_buffer, BATCH_SIZE)
                        loss, _ = replay_train(mainDQN, targetDQN, minibatch)
                        sess.run(copy_ops)
                        print('model trained')
                        saver.save(sess, './Move2Beacon/model.cpkt')
                    print(reward, episodes, random_rate / global_step)
                    break
                state = next_state
                pre_distance = distance
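
replay_train(mainDQN, targetDQN, minibatch) is also left undefined. Assuming the DQN class exposes predict(states) and update(states, targets) in the usual way (an assumption; the class is not shown and may only accept single states), it would be the standard target-network minibatch update, roughly:

def replay_train(mainDQN, targetDQN, minibatch, gamma=0.99):
    # one DQN update: targets come from the frozen target network,
    # and only the main network is trained
    states = np.vstack([x[0] for x in minibatch])
    acts = np.array([x[1] for x in minibatch])
    rewards = np.array([x[2] for x in minibatch])
    next_states = np.vstack([x[3] for x in minibatch])
    dones = np.array([x[4] for x in minibatch])

    q_target = rewards + gamma * np.max(targetDQN.predict(next_states), axis=1) * ~dones
    y = mainDQN.predict(states)
    y[np.arange(len(states)), acts] = q_target
    return mainDQN.update(states, y)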
Example #8
def train():
    FLAGS(sys.argv)
    with sc2_env.SC2Env(map_name="CollectMineralShards", step_mul=step_mul,
                        screen_size_px=(32, 32), minimap_size_px=(32, 32)) as env:
        Policy = Policy_net('policy', 32*32*2, 4)
        Old_Policy = Policy_net('old_policy', 32*32*2, 4)
        PPO = PPOTrain(Policy, Old_Policy, gamma=0.95)
        saver = tf.train.Saver()
        with tf.Session() as sess:
            print('a')
            saver.restore(sess, './model/model.ckpt')
            print('a')
            #sess.run(tf.global_variables_initializer())
            for episodes in range(EPISODES):
                done = False
                obs = env.reset()
                while 331 not in obs[0].observation["available_actions"]:  # select the Marine
                    actions = actAgent2Pysc2(100, obs)
                    obs = env.step(actions=[actions])
                actions = gather(obs)
                obs = env.step(actions=[actions])
                end_step = 200
                global_step = 0
                score = 0
                reward = 0
                for i in range(100):
                    time.sleep(0.01)
                    actions = no_operation(obs)
                    obs = env.step(actions=[actions])       
                state = obs2state(obs)
                observations = []
                actions_list = []
                v_preds = []
                rewards = []

                print('episode start')
                while not done:
                    global_step += 1
                    time.sleep(0.05)
                    state = np.stack([state]).astype(dtype=np.float32)
                    act, v_pred = Policy.act(obs=state, stochastic=True)
                    act, v_pred = np.asscalar(act), np.asscalar(v_pred)
                    actions = actAgent2Pysc2(act, obs)
                    #while not 331 in obs[0].observation["available_actions"]:
                    #    actions = actAgent2Pysc2(100, obs)
                    #    obs = env.step(actions=[actions])
                    obs = env.step(actions=[actions])
                    
                    # the episode ends when the step budget runs out or all the minerals have been collected
                    if global_step == end_step or obs2done(obs) >= 1900:
                        done = True
                    
                    next_state = obs2state(obs)
                    reward = obs[0].reward

                    if reward == 0:
                        reward = -0.1

                    if done:
                        if obs2done(obs) >= 1900:   # episode ended with all minerals collected
                            reward = 3
                        else:                       # episode ended before all minerals were collected
                            reward = -3

                    score += reward

                    observations.append(state)
                    actions_list.append(act)
                    v_preds.append(v_pred)
                    rewards.append(reward)

                    if done:   # at episode end
                        v_preds_next = v_preds[1:] + [0]
                        gaes = PPO.get_gaes(rewards=rewards, v_preds=v_preds, v_preds_next=v_preds_next)
                        observations = np.reshape(observations, newshape=[-1, 32*32*2])
                        actions = np.array(actions_list).astype(dtype=np.int32)
                        rewards = np.array(rewards).astype(np.float32)
                        v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)
                        gaes = np.array(gaes).astype(dtype=np.float32)
                        gaes = (gaes - gaes.mean())
                        PPO.assign_policy_parameters()
                        inp = [observations, actions, rewards, v_preds_next, gaes]
                        for epoch in range(5):
                            sample_indices = np.random.randint(low=0, high=observations.shape[0], size=64)
                            sampled_inp = [np.take(a=a, indices=sample_indices, axis=0) for a in inp]  # sample training data
                            PPO.train(obs=sampled_inp[0],
                                      actions=sampled_inp[1],
                                      rewards=sampled_inp[2],
                                      v_preds_next=sampled_inp[3],
                                      gaes=sampled_inp[4])
                        print(episodes, score)
                        save_path = saver.save(sess, './model/model.ckpt')
                        if episodes == 0:
                            f = open('test2.csv', 'w', encoding='utf-8', newline='')
                        else:
                            f = open('test2.csv', 'a', encoding='utf-8', newline='')
                        wr = csv.writer(f)
                        wr.writerow([episodes, score])
                        f.close()
                        break
                    state = next_state
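
Every train()/test() entry point above starts with FLAGS(sys.argv) before building SC2Env, so the snippets are written to be run as standalone scripts that accept pysc2's absl flags on the command line. A minimal launcher (assumed; the originals do not show one, and the module name is hypothetical) is just:

if __name__ == '__main__':
    train()   # FLAGS(sys.argv) inside train() parses any pysc2 flags passed on the CLI
              # e.g. run as: python collect_mineral_shards.py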