Example #1
import os

import ppad

# `boards` is assumed to be the list of pre-solved boards shipped with ppad (see solved_boards).
def main():
    visualize_n_boards = 5
    for i in range(visualize_n_boards):
        env = ppad.PAD(board=boards[i])
        env.episode2gif(path=os.environ['PYTHONPATH'] +
                        '/visualizations/solved_board' + str(i + 1) + '.png',
                        shrink=8,
                        ext='png')
Example #2

from importlib import reload
import ppad
ppad = reload(ppad)
from ppad.pad.utils import episode2gif

SOMEPATH = 'yourpath'

# Example 1: Visualize directly from the environment itself.
env = ppad.PAD()
for _ in range(100):
    env.step(action=env.action_space.sample(), verbose=True)
env.visualize(filename=SOMEPATH + '/random_sampling.gif')

env.step(action='pass', verbose=True)

# Example 2: Visualize using the episode information.
# Generate observations and actions in the expected format, using any method you like.
# Here we generate them with smart_data; steps = -1 means each trajectory terminates when no combo is left.
observations, actions, rewards = ppad.smart_data(boards=1,
                                                 permutations=1,
                                                 trajectories=1,
                                                 steps=-1)
episode2gif(observations, actions, filename=SOMEPATH + '/smart_data.gif')
Example #3
def smart_data(boards=1,
               permutations=1,
               trajectories=1,
               steps=100,
               discount=True,
               gamma=0.9,
               log10=True,
               allowed_orbs=solved_boards.allowed_orbs):
    """
    Generate smart training data in a format that can be directly fed into the learning agent.
    The generation 1 of smart training data is derived from human-solved boards and random sampling.
    :param boards: The number of boards out of the solved boards to randomly use.
    :param permutations: The number of orb identity permutations to perform for each board.
    :param trajectories: The number of trajectories to generate from each permutation of each chosen board.
    :param steps: The number of steps to generate in each trajectory. If steps = -1, terminates the trajectories when
           and only when there is no more combos on the board.
    :param discount: True for doing discounting.
    :param gamma: Discount rate.
    :param allowed_orbs: A list of allowed orb identities.
    :return: observations, actions and rewards defined exactly as the same-named variables in ppad.pad.game.
    """
    observations_sd = []
    actions_sd = []
    rewards_sd = []
    env = ppad.PAD(skyfall_damage=False)

    if boards < 0 or boards > len(solved_boards.boards):
        raise ValueError('Invalid input value for boards = {0}.'.format(boards))
    if trajectories < 0:
        raise ValueError(
            'Invalid input value for trajectories = {0}.'.format(trajectories))
    if permutations < 0:
        raise ValueError(
            'Invalid input value for permutations = {0}.'.format(permutations))

    board_indices = random.sample(range(0, len(solved_boards.boards)), boards)
    for index in board_indices:
        current_board = solved_boards.boards[index]
        for _ in range(permutations):
            # The permutations generated this way are not unique.
            current_permutation = random.sample(allowed_orbs,
                                                len(allowed_orbs))
            current_board = permutation_mapping(
                original_board=current_board,
                original_orbs=solved_boards.allowed_orbs,
                mapping=current_permutation)
            for _ in range(trajectories):
                env.reset(board=current_board)
                env.step('pass')
                final_reward = env.rewards[-1]
                env.reset(board=current_board)
                if steps != -1:
                    for _ in range(steps - 1):
                        action = env.action_space.sample()
                        env.step(action)
                else:
                    # When steps is -1, reverse-sample the board until the first time no combo is left.
                    combos = [0]
                    while len(combos) > 0:
                        action = env.action_space.sample()
                        env.step(action)
                        combos = pad_utils.cancel(np.copy(env.board))
                env.step('pass')
                observations_sd.append(revert_observations(env.observations))
                actions_sd.append(revert_actions(env.actions))
                rewards_sd.append(
                    revert_rewards(len(env.actions), final_reward))

    if discount:
        discounted_rewards_list = []
        for rewards_one_traj in rewards_sd:
            discounted_rewards_list.append(
                ppad.discount(rewards=rewards_one_traj,
                              gamma=gamma,
                              log10=log10))
        rewards_sd = discounted_rewards_list

    return observations_sd, actions_sd, rewards_sd
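
A hedged usage sketch of smart_data as defined above: the argument values and the printed inspection are illustrative assumptions, not part of the module.

# Minimal sketch, assuming smart_data is importable from this module.
observations, actions, rewards = smart_data(boards=2,
                                            permutations=3,
                                            trajectories=5,
                                            steps=-1,  # Run each trajectory until no combo is left.
                                            discount=True,
                                            gamma=0.9)

# One entry per generated trajectory: boards * permutations * trajectories in total.
print('Trajectories generated:', len(observations))
print('Actions in the first trajectory:', len(actions[0]))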
Example #4
ID2ACTION = {0: 'up', 1: 'down', 2: 'left', 3: 'right', 4: 'pass'}
NON_PASS_ACTIONS = {'up', 'down', 'left', 'right'}


############################
# 2. Set-up.
############################

# Agent initialization.
sess = tf.Session()
agent = Agent01(sess, conv_layers=((2, 128), (2, 64)),
                dense_layers=(64, 32, 5), learning_rate=0.0001)
agent.copy_A_to_B()

# Environment initialization.
env = ppad.PAD(skyfall_damage=False)

# (s,a,r) tuples.
sar_data = []

# Metrics variables.
beta = BETA_INIT
print('BETA value at the end of training:',
      BETA_INIT * BETA_INCREASE_RATE**(STEPS / BETA_INCREASE_FREQ))
total_loss = 0
total_rmse = 0
total_reward = 0
total_new_data_points = 0
total_actions = 0
total_episodes = 0
max_reward = 0
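
The print statement above implies a geometric schedule: beta is multiplied by BETA_INCREASE_RATE once every BETA_INCREASE_FREQ training steps. Below is a minimal sketch of that schedule using hypothetical constant values; the real BETA_INIT, BETA_INCREASE_RATE, BETA_INCREASE_FREQ and STEPS are defined elsewhere in the training script.

import math

# Hypothetical values for illustration only.
BETA_INIT = 1.0
BETA_INCREASE_RATE = 1.05
BETA_INCREASE_FREQ = 1000
STEPS = 100000

beta = BETA_INIT
for step in range(1, STEPS + 1):
    if step % BETA_INCREASE_FREQ == 0:
        beta *= BETA_INCREASE_RATE

# After STEPS steps, beta has been multiplied STEPS / BETA_INCREASE_FREQ times,
# which matches the closed form printed during set-up.
assert math.isclose(beta,
                    BETA_INIT * BETA_INCREASE_RATE**(STEPS / BETA_INCREASE_FREQ),
                    rel_tol=1e-9)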