import gym
import ppo
import collections
import tensorflow as tf

from tensorboardX import SummaryWriter

writer = SummaryWriter('from_base')

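# bookkeeping counters (episode return, episode index, step count) and a training on/off flag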
score = 0
episode = 0
p = 0
train_flag = 0

sess = tf.Session()

agent = ppo.PPO(sess, 4, 2)

env = gym.make('CartPole-v0')

state = env.reset()

score_board = collections.deque(maxlen=10)  # rolling window of the last 10 episode scores
expert_state_action = []  # (state, action) pairs saved for later use as expert demonstrations

while True:

    values_list, states_list, actions_list, dones_list, logp_ts_list, rewards_list = \
                [], [], [], [], [], []

    for _ in range(128):
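        # Each of the 128 steps samples an action (plus value estimate and
        # log-probability) from the agent, steps the environment, and appends
        # the results to the lists above; the ppo.PPO method names used for
        # this are not shown in this snippet.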
Example #2
    env = gym.make(ENV)

    with tf.Session() as sess:
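        # command-line flags: "-t" runs in training mode, "-n" starts from
        # freshly initialised weights instead of restoring the saved model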
        training = "-t" in sys.argv

        actor = ppo.PPO(4,
                        2,
                        gamma=0.90,
                        lam=0.90,
                        clip_param=0.1,
                        horizon=5,
                        optim_epoch=4,
                        lr=0.0001,
                        lr_decay=0.001,
                        storage=10,
                        value_coefficient=1,
                        entropy_coefficient=0.001,
                        training=training,
                        continous=False)

        saver = tf.train.Saver()
        if "-n" in sys.argv:
            sess.run(tf.global_variables_initializer())
        else:
            saver.restore(sess, "model/cartpole")
            print("Restored...")

        try:
Example #3
lambd = 0.95
epsilon = 0.1
num_epochs = 3
num_actors = 4
T_horizon = 51
batch_size = 64
num_iters = 1000
print_interval = 20

if __name__ == "__main__":
    env = ChessEnv(
        r"/home/keerthan/Softwares/stockfish-10-linux/Linux/stockfish_10_x64")
    model = ppo.PPO(env.state_dimension,
                    env.action_dimension,
                    learning_rate=learning_rate,
                    gamma=gamma,
                    lambd=lambd,
                    epsilon=epsilon,
                    num_epochs=num_epochs,
                    batch_size=batch_size)

    # copy global variables to local
    iters = num_iters
    _print_interval = print_interval
    horizon = T_horizon
    score = 0
    render = False
    renormalize = True
    # run the algorithm
    for i in range(1, iters + 1):
        start_state = env.reset()
        avg_score = 0
Example #4
import env
import time
import random
import ppo
import numpy as np
import logging
import matplotlib.pyplot as plt
import naive_controller

my_env = env.TrafficEnv()
#bt.saver.restore(bt.sess,"./model/my_light_model/my-model.ckpt-3500")
all_ep_r = []
myppo = ppo.PPO()

myppo.saver.restore(myppo.sess, "./model/my-model2.ckpt-2000")
myNaiveCon = naive_controller.NaiveCon()

for i_episode in range(1000000):
    # listener()
    s, rawOcc = my_env.reset()
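    # flatten the structured observation into a single feature vector for the policy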
    s = np.concatenate([s[0], np.reshape(s[1] + s[2], -1)])

    buffer_s, buffer_a, buffer_r = [], [], []

    k = 0
    ep_r = 0
    mydict = ["go", "stop", "left", "right", "nothing"]  # human-readable action labels
    while True:
        a = myppo.choose_action(s)
        #a = myNaiveCon.gen_action(s,rawOcc)
Example #5
if __name__ == "__main__":

    env = gym.make(ENV)

    with tf.Session() as sess:
        training = "-n" in sys.argv

        actor = ppo.PPO(3,
                        1,
                        gamma=0.95,
                        clip_param=0.2,
                        horizon=32,
                        optim_epoch=4,
                        lr_decay=0.001,
                        storage=32,
                        training=training,
                        continous=True)

        saver = tf.train.Saver()
        if "-n" in sys.argv:
            sess.run(tf.global_variables_initializer())
        else:
            saver.restore(sess, "model/pendelum")
            print("Restored...")

        try:
            if "-p" in sys.argv:
                print("Playing...")
Example #6

            discountFactor = d.get('discountFactor')
            explorationRate = d.get('explorationRate')
            learnStart = d.get('learnStart')
            memorySize = d.get('memorySize')
            current_epoch = d.get('current_epoch')
            stepCounter = d.get('stepCounter')
            loadsim_seconds = d.get('loadsim_seconds')

        clear_monitor_files(outdir)
        copy_tree(monitor_path, outdir)
        env = gym.wrappers.Monitor(env, outdir, resume=True)
    ppo = ppo.PPO(S_DIM=S_DIM,
                  A_DIM=A_DIM,
                  EP_MAX=epochs,
                  EP_LEN=episode_steps,
                  GAMMA=discountFactor,
                  A_LR=A_learningRate,
                  C_LR=C_learningRate,
                  BATCH=minibatch_size,
                  propeller_hovering_speed=0.0)
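    # ring buffer of the most recent 100 episode rewards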
    last100Rewards = [0] * 100
    last100RewardsIndex = 0
    last100Filled = False
    all_ep_r = []

    start_time = time.time()

    # start iterating from 'current epoch'.
    for epoch in range(current_epoch + 1, epochs + 1, 1):
        observation = env.reset()
        cumulated_reward = 0

Example #7
import gym
import collections
import numpy as np
import tensorflow as tf

import ppo
import discriminator

from tensorboardX import SummaryWriter

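# pre-recorded expert (state, action) pairs used by the GAIL discriminator below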
data = np.load('expert_data.npy')
expert_data = collections.deque(data)

sess = tf.Session()

state_size = 4   # CartPole observation dimension
action_size = 2  # number of discrete CartPole actions
n_step = 128     # rollout length collected before each update
agent = ppo.PPO(sess, state_size, action_size)
dis = discriminator.Discriminator(sess, state_size, action_size)

env = gym.make('CartPole-v0')
score = 0
episode = 0
p = 0
gail = True

writer = SummaryWriter()

state = env.reset()

while True:
    values_list, states_list, actions_list, dones_list, logp_ts_list, rewards_list = \
                [], [], [], [], [], []
Example #8

                           force=True)
    render = True
    start_state = env.reset()
    actor_history, actor_score, start_state, done = ppo.generate_trajectory(
        env, model, start_state, 500, render)
    env.close()
    env_to_wrap.close()


if __name__ == "__main__":
    env = gym.make("CartPole-v1")
    s = env.reset()
    model = ppo.PPO(s.shape[0],
                    2,
                    learning_rate=learning_rate,
                    gamma=gamma,
                    lambd=lambd,
                    epsilon=epsilon,
                    num_epochs=num_epochs,
                    batch_size=batch_size)

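    # save the untrained policy's outputs, then train and save again for comparison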
    store_output(model, '0')

    train(env, model, 250)
    env.close()

    store_output(model, '250')

    env = gym.make("CartPole-v1")
    train(env, model, 250)
    env.close()
Example #9
from nav_wrapper import NavigationEnv
import ppo
import models_ppo as models
import numpy as np
import os
import rl_eval

batch_size = 64
eval_eps = 50
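# PPO agent built from separate policy and value networks, each with its own learning rate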
rl_core = ppo.PPO(
    model=[models.PolicyNet, models.ValueNet],
    learning_rate=[0.0001, 0.0001],
    reward_decay=0.99,
    batch_size=1)

# training configuration
is_train = True
render = True
load_model = False
# evaluation-only configuration (swap in by uncommenting):
'''
is_train = False
render = True
load_model = True
'''
map_path = "Maps/map.png"
gif_path = "out/"
model_path = "save/"
if not os.path.exists(model_path):
    os.makedirs(model_path)

if load_model:
    print("Load model ...", model_path)