예제 #1
0
def _enjoy():
    """Play the saved DDPG policy in the simulator, rendering every step.

    Builds the wrapped environment, restores the ``ddpg`` checkpoint from
    ``reinforcement/pytorch/models/`` and then runs episode after episode.
    The loop has no exit condition, so this function never returns.
    """
    # Create the simulator through the project helper.
    env = launch_env()
    print("Initialized environment")

    # Apply the wrappers innermost-first. ImgWrapper turns the 160x120x3
    # images into 3x160x120.
    for wrapper_cls in (ResizeWrapper, NormalizeWrapper, ImgWrapper,
                        ActionWrapper, DtRewardWrapper):
        env = wrapper_cls(env)
    print("Initialized Wrappers")

    obs_shape = env.observation_space.shape
    n_actions = env.action_space.shape[0]
    action_limit = float(env.action_space.high[0])

    # Build the policy and restore its trained weights.
    policy = DDPG(obs_shape, n_actions, action_limit, net_type="cnn")
    policy.load(filename='ddpg', directory='reinforcement/pytorch/models/')

    obs = env.reset()
    while True:
        # Play one episode to completion, then start a fresh one.
        episode_over = False
        while not episode_over:
            action = policy.predict(np.array(obs))
            obs, _, episode_over, _ = env.step(action)
            env.render()
        obs = env.reset()
예제 #2
0
def initenv2():
    """Create the ``launch_env2`` environment with the standard wrapper stack.

    Returns:
        The environment wrapped, from innermost to outermost, by
        ResizeWrapper, NormalizeWrapper, ImgWrapper, ActionWrapper and
        DtRewardWrapper.
    """
    wrapped = launch_env2()
    for wrapper_cls in (ResizeWrapper, NormalizeWrapper, ImgWrapper,
                        ActionWrapper, DtRewardWrapper):
        wrapped = wrapper_cls(wrapped)
    return wrapped
def make_env():
    """Launch the simulator and return it with all wrappers applied.

    Prints a progress message after the environment is created and again
    after the wrappers have been stacked.

    Returns:
        The wrapped environment (DtRewardWrapper is the outermost layer).
    """
    # Create the simulator through the project helper.
    base_env = launch_env()
    print("Initialized environment")

    # Observation wrappers first; ImgWrapper turns the 160x120x3 images
    # into 3x160x120.
    resized = ResizeWrapper(base_env)
    normalized = NormalizeWrapper(resized)
    channels_first = ImgWrapper(normalized)
    # Then the action and reward wrappers.
    wrapped = DtRewardWrapper(ActionWrapper(channels_first))
    print("Initialized Wrappers")

    return wrapped
def _enjoyWindow():
    """Drive the simulator forever with the saved window-imitation model.

    Loads ``./models/windowimitate.pt`` into a ``WindowModel``, builds the
    wrapped ``launch_env1`` environment and then repeatedly feeds the model
    a 12-channel window holding the four most recent 3-channel observations.
    The loop has no exit condition, so this function only returns by
    terminating the process.

    Raises:
        SystemExit: with status 1 if the checkpoint cannot be loaded.
    """
    model = WindowModel(action_dim=2, max_action=1.)

    try:
        # map_location lets a checkpoint saved on another device (e.g. a GPU)
        # be loaded on whatever `device` this process uses.
        state_dict = torch.load('./models/windowimitate.pt',
                                map_location=device)
        model.load_state_dict(state_dict)
    except Exception as err:
        # Report the cause and exit with a failing status. Unlike a bare
        # `except:`, this does not swallow KeyboardInterrupt/SystemExit.
        print('failed to load model:', err)
        raise SystemExit(1) from err

    model.eval().to(device)

    env = launch_env1()
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env)
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)

    obs = env.reset()

    # Rolling buffer of four 3-channel frames, oldest first.
    obs_window = np.zeros((12, 160, 120))

    while True:
        # Drop the oldest frame and append the newest observation.
        obs_window[:9, :, :] = obs_window[3:, :, :]
        obs_window[9:12, :, :] = obs
        batch = torch.from_numpy(obs_window).float().to(device).unsqueeze(0)

        # Pure inference: no autograd graph is needed.
        with torch.no_grad():
            action = model(batch).squeeze().cpu().numpy()

        obs, reward, done, info = env.step(action)
        env.render()

        if done:
            if reward < 0:
                print('*** FAILED ***')
                time.sleep(0.7)

            # NOTE(review): the frame window is not cleared here, so the
            # first steps of a new episode still see frames from the
            # previous one - confirm this matches how the model was trained.
            obs = env.reset()
            env.render()
def _enjoy():
    """Drive the simulator forever with the saved imitation model.

    Loads ``trained_models/imitate.pt`` into a ``Model``, builds the wrapped
    environment and then repeatedly predicts an action from the current
    observation, steps the environment and renders it. The loop has no exit
    condition, so this function only returns by terminating the process.

    Raises:
        SystemExit: with status 1 if the checkpoint cannot be loaded.
    """
    model = Model(action_dim=2, max_action=1.)

    try:
        state_dict = torch.load('trained_models/imitate.pt',
                                map_location=device)
        model.load_state_dict(state_dict)
    except Exception as err:
        # Report the cause and exit with a failing status. Unlike a bare
        # `except:`, this does not swallow KeyboardInterrupt/SystemExit.
        print('failed to load model:', err)
        raise SystemExit(1) from err

    model.eval().to(device)

    env = launch_env()
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env)
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)

    obs = env.reset()

    while True:
        # Add a leading batch dimension before feeding the model.
        batch = torch.from_numpy(obs).float().to(device).unsqueeze(0)

        # Pure inference: no autograd graph is needed.
        with torch.no_grad():
            action = model(batch).squeeze().cpu().numpy()

        obs, reward, done, info = env.step(action)
        env.render()

        if done:
            if reward < 0:
                print('*** FAILED ***')
                time.sleep(0.7)

            obs = env.reset()
            env.render()
def extract(num_steps=2):
    """Step the wrapped environment with random actions and print the results.

    Launches the environment, applies the wrapper stack, then samples
    ``num_steps`` random actions. After each step the resulting observation
    and the action that produced it are printed.

    Args:
        num_steps: Number of random steps to take. Defaults to 2, the value
            that used to be hard-coded.
    """
    env = launch_env()
    print("Initialized environment")

    # Wrappers
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env)  # to make the images from 160x120x3 into 3x160x120
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)
    print("Initialized Wrappers")

    env.reset()

    print("Starting training")
    for _ in range(num_steps):
        action = env.action_space.sample()
        # Perform action
        obs, _reward, done, _info = env.step(action)

        print(obs)
        print(action)

        # Start a fresh episode instead of stepping a finished one.
        if done:
            env.reset()
예제 #7
0
def initWindowModel():
    """Load the window-imitation model and create its wrapped environment.

    Restores ``./models/windowimitate.pt`` into a ``WindowModel`` set to
    eval mode on ``device``, builds the wrapped ``launch_env1`` environment,
    and resets and renders it once.

    Returns:
        A ``(env, model)`` tuple.

    Raises:
        SystemExit: with status 1 if the checkpoint cannot be loaded.
    """
    model = WindowModel(action_dim=2, max_action=1.)

    try:
        # map_location lets a checkpoint saved on another device (e.g. a GPU)
        # be loaded on whatever `device` this process uses.
        state_dict = torch.load('./models/windowimitate.pt',
                                map_location=device)
        model.load_state_dict(state_dict)
    except Exception as err:
        # Report the cause and exit with a failing status. Unlike a bare
        # `except:`, this does not swallow KeyboardInterrupt/SystemExit.
        print('failed to load model:', err)
        raise SystemExit(1) from err

    model.eval().to(device)

    env = launch_env1()

    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env)
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)

    env.reset()
    env.render()

    return env, model
예제 #8
0
def _train(args):
    """Train a DDPG policy on the wrapped environment and save it.

    Collects transitions into a replay buffer (random actions for the first
    ``args.start_timesteps`` steps, the noisy policy afterwards), trains the
    policy at the end of every episode, periodically evaluates it, and
    finally saves the policy as ``ddpg`` in ``args.model_dir``. Evaluation
    rewards are written to ``./results/rewards.npz``.

    Args:
        args: Namespace providing ``model_dir``, ``seed``,
            ``replay_buffer_max_size``, ``max_timesteps``, ``batch_size``,
            ``discount``, ``tau``, ``eval_freq``, ``save_models``,
            ``start_timesteps``, ``expl_noise`` and ``env_timesteps``.
    """
    # exist_ok avoids the race between checking for and creating a directory.
    os.makedirs("./results", exist_ok=True)
    os.makedirs(args.model_dir, exist_ok=True)

    # Launch the env with our helper function
    env = launch_env()
    print("Initialized environment")

    # Wrappers
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env)  # to make the images from 160x120x3 into 3x160x120
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)
    print("Initialized Wrappers")

    # Set seeds
    seed(args.seed)

    state_dim = env.observation_space.shape
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Initialize policy
    policy = DDPG(state_dim, action_dim, max_action, net_type="cnn")
    replay_buffer = ReplayBuffer(args.replay_buffer_max_size)
    print("Initialized DDPG")

    # Evaluate untrained policy
    evaluations = [evaluate_policy(env, policy)]

    total_timesteps = 0
    timesteps_since_eval = 0
    episode_num = 0
    done = True  # forces an environment reset on the first iteration
    episode_reward = None
    reward = 0
    episode_timesteps = 0

    print("Starting training")
    while total_timesteps < args.max_timesteps:

        print("timestep: {} | reward: {}".format(total_timesteps, reward))

        if done:
            # Train on the episode that just ended (skipped on the very
            # first iteration, when nothing has been collected yet).
            if total_timesteps != 0:
                print(
                    ("Total T: %d Episode Num: %d Episode T: %d Reward: %f") %
                    (total_timesteps, episode_num, episode_timesteps,
                     episode_reward))
                policy.train(replay_buffer, episode_timesteps, args.batch_size,
                             args.discount, args.tau)

                # Evaluate episode
                if timesteps_since_eval >= args.eval_freq:
                    timesteps_since_eval %= args.eval_freq
                    evaluations.append(evaluate_policy(env, policy))
                    print("rewards at time {}: {}".format(
                        total_timesteps, evaluations[-1]))

                    if args.save_models:
                        policy.save(filename='{}_{}'.format(
                            'ddpg', total_timesteps),
                                    directory=args.model_dir)
                    np.savez("./results/rewards.npz", evaluations)

            # Reset environment
            obs = env.reset()
            done = False
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

        # Select action randomly or according to policy
        if total_timesteps < args.start_timesteps:
            action = env.action_space.sample()
        else:
            action = policy.predict(np.array(obs))
            if args.expl_noise != 0:
                noise = np.random.normal(
                    0, args.expl_noise, size=env.action_space.shape[0])
                action = (action + noise).clip(
                    env.action_space.low, env.action_space.high)

        # Perform action
        new_obs, reward, done, _ = env.step(action)

        # Cut the episode once it has run for args.env_timesteps steps.
        timed_out = episode_timesteps >= args.env_timesteps
        if timed_out:
            done = True

        # A time-limit cut-off is not a real terminal state, so it is stored
        # with done=0. The flag is derived from the same condition that ends
        # the episode, so the two can no longer disagree by one step.
        done_bool = 0.0 if timed_out else float(done)
        episode_reward += reward

        # Store data in replay buffer
        replay_buffer.add(obs, new_obs, action, reward, done_bool)

        obs = new_obs

        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1

    print("Training done, about to save..")
    policy.save(filename='ddpg', directory=args.model_dir)
    print("Finished saving..should return now!")
예제 #9
0
def _train(args):
    """Train the imitation model on rollouts of the pure-pursuit expert.

    Runs ``args.episodes`` episodes of ``args.steps`` steps with
    ``PurePursuitExpert`` choosing the actions, recording every
    (observation, action) pair. It then fits ``Model`` to those pairs with
    SGD for ``args.epochs`` iterations on random mini-batches of
    ``args.batch_size`` samples. The weights are written to
    ``imitation/pytorch/models/imitate.pt`` every 200 epochs and once more
    when training finishes.

    Args:
        args: Namespace providing ``episodes``, ``steps``, ``epochs`` and
            ``batch_size``.
    """
    env = launch_env()
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env)
    env = DtRewardWrapper(env)
    env = MetricsWrapper(env)
    env = ActionWrapper(env)
    print("Initialized Wrappers")

    # Create an imperfect demonstrator
    expert = PurePursuitExpert(env=env)

    observations = []
    actions = []

    # let's collect our samples
    for episode in range(args.episodes):
        print("Starting episode", episode)
        for _ in range(args.steps):
            # use our 'expert' to predict the next action.
            action = expert.predict(None)
            observation, reward, done, info = env.step(action)
            observations.append(observation)
            actions.append(action)
        env.reset()
    env.close()

    actions = np.array(actions)
    observations = np.array(observations)

    model = Model(action_dim=2, max_action=1.)
    model.train().to(device)

    # weight_decay is L2 regularization, helps avoid overfitting
    optimizer = optim.SGD(model.parameters(), lr=0.0004, weight_decay=1e-3)

    model_path = 'imitation/pytorch/models/imitate.pt'

    avg_loss = 0
    for epoch in range(args.epochs):
        optimizer.zero_grad()

        # Sample a random mini-batch (with replacement).
        batch_indices = np.random.randint(0, observations.shape[0],
                                          args.batch_size)
        obs_batch = torch.from_numpy(
            observations[batch_indices]).float().to(device)
        act_batch = torch.from_numpy(actions[batch_indices]).float().to(device)

        model_actions = model(obs_batch)

        loss = (model_actions - act_batch).norm(2).mean()
        loss.backward()
        optimizer.step()

        # .item() is the supported way to read a 0-dim tensor; indexing it
        # with `loss.data[0]` raises on current PyTorch.
        loss_value = loss.item()
        # Exponential moving average of the loss, used only for logging.
        avg_loss = avg_loss * 0.995 + loss_value * 0.005

        print('epoch %d, loss=%.3f' % (epoch, avg_loss))

        # Periodically save the trained model
        if epoch % 200 == 0:
            torch.save(model.state_dict(), model_path)

    # Save the final weights so that the epochs after the last periodic
    # checkpoint are not lost.
    torch.save(model.state_dict(), model_path)
def _dagger():
    """Run the imitation model while recording keyboard corrections.

    Loads ``./models/imitate.pt``, drives the wrapped ``launch_env1``
    environment with the model's own actions, and on every step records the
    observation returned by that step together with the action indicated by
    the arrow keys (``[0, 0]`` when none is pressed). While SPACE is held,
    the recorded history is written to ``./dagger/obs_<n>.npy`` and
    ``./dagger/actions_<n>.npy``, where ``<n>`` counts the saves made so
    far. The loop has no exit condition.

    Raises:
        SystemExit: with status 1 if the checkpoint cannot be loaded.
    """
    model = Model(action_dim=2, max_action=1.)

    try:
        # map_location lets a checkpoint saved on another device (e.g. a GPU)
        # be loaded on whatever `device` this process uses.
        state_dict = torch.load('./models/imitate.pt', map_location=device)
        model.load_state_dict(state_dict)
    except Exception as err:
        # Report the cause and exit with a failing status. Unlike a bare
        # `except:`, this does not swallow KeyboardInterrupt/SystemExit.
        print('failed to load model:', err)
        raise SystemExit(1) from err

    model.eval().to(device)

    env = launch_env1()

    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env)
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)

    obs = env.reset()
    env.render()

    # Register a keyboard handler on the simulator window.
    # NOTE(review): presumably the window only exists after the first
    # render() call above - confirm before reordering.
    key_handler = key.KeyStateHandler()
    env.unwrapped.window.push_handlers(key_handler)

    print(env.map_name)

    # Corrective action for each arrow key. They are checked in this order,
    # so when several keys are held the last match wins.
    key_actions = (
        (key.UP, (1.00, 0.0)),
        (key.DOWN, (-1.00, 0.0)),
        (key.LEFT, (0.35, 1.0)),
        (key.RIGHT, (0.35, -1.0)),
    )

    obsHistory = []
    actionHistory = []
    save_count = 0  # index used in the names of the saved files

    while True:
        batch = torch.from_numpy(obs).float().to(device).unsqueeze(0)

        # Pure inference: no autograd graph is needed.
        with torch.no_grad():
            action = model(batch).squeeze().cpu().numpy()

        obs, reward, done, info = env.step(action)

        print(key_handler)
        daggerAction = np.array([0.0, 0.0])
        for key_code, correction in key_actions:
            if key_handler[key_code]:
                print("as===as=df=sad=f=asdf=sad=fs=adf")
                daggerAction = np.array(correction)

        if key_handler[key.SPACE]:
            # NOTE(review): the history is not cleared after a save, so each
            # file contains everything recorded so far, and holding SPACE
            # over several frames writes several files - confirm intent.
            np.save('./dagger/obs_{}.npy'.format(save_count),
                    np.array(obsHistory))
            np.save('./dagger/actions_{}.npy'.format(save_count),
                    np.array(actionHistory))
            save_count += 1

        print(daggerAction)
        obsHistory.append(obs)
        actionHistory.append(daggerAction)

        env.render()

        if done:
            if reward < 0:
                print('*** FAILED ***')
                time.sleep(0.7)

            obs = env.reset()
            env.render()
예제 #11
0
def _train(args):
    """Train the imitation model on recorded per-map demonstration files.

    For every map in ``MAP_NAMES`` this loads the ``actions_<i>.npy`` and
    ``obs_<i>.npy`` episode files located via ``TRAINING_DATA_PATH``,
    concatenates them, runs each raw observation through the
    resize/normalize/image wrappers, and fits ``Model`` with SGD for
    ``args.epochs`` iterations on random mini-batches of ``args.batch_size``
    samples. Weights go to ``imitation/pytorch/models/imitate.pt`` and the
    loss history to ``imitation/pytorch/loss.npy`` every 50 epochs; the
    weights are saved once more at the end.

    Args:
        args: Namespace providing ``epochs`` and ``batch_size``.
    """
    env = launch_env1()
    # These three each wrap the *raw* env: they are used only for their
    # observation() transforms, applied in sequence by transformObs below.
    env1 = ResizeWrapper(env)
    env2 = NormalizeWrapper(env)
    env3 = ImgWrapper(env)
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)
    print("Initialized Wrappers")

    def transformObs(obs):
        """Apply the resize, normalize and image transforms to one frame."""
        obs = env1.observation(obs)
        obs = env2.observation(obs)
        obs = env3.observation(obs)
        return obs

    actions = None
    rawObs = None
    for map_name in MAP_NAMES:
        # "loop_obstacles" has three recorded episodes, every other map two.
        episodes = 3 if map_name == "loop_obstacles" else 2

        print(map_name)
        for episode in range(episodes):
            actionFile = "actions_{}.npy".format(episode)
            action = np.load(TRAINING_DATA_PATH.format(map_name, actionFile))
            print(action.shape)

            observationFile = "obs_{}.npy".format(episode)
            observation = np.load(
                TRAINING_DATA_PATH.format(map_name, observationFile))

            if actions is None:
                actions = action
                rawObs = observation
            else:
                actions = np.concatenate((actions, action), axis=0)
                rawObs = np.concatenate((rawObs, observation), axis=0)
            print(actions.shape)
        print(actions.shape)
        print("---")

    # float32 matches the .float() cast applied to every batch below and
    # halves the memory used by the buffer.
    observations = np.zeros((rawObs.shape[0], 3, 160, 120), dtype=np.float32)
    for i, obs in enumerate(rawObs):
        observations[i] = transformObs(obs)

    env.close()

    model = Model(action_dim=2, max_action=1.)
    model.train().to(device)

    # weight_decay is L2 regularization, helps avoid overfitting
    optimizer = optim.SGD(
        model.parameters(),
        lr=0.0004,
        weight_decay=1e-3
    )

    model_path = 'imitation/pytorch/models/imitate.pt'

    loss_list = []
    for epoch in range(args.epochs):
        optimizer.zero_grad()

        # Sample a random mini-batch (with replacement).
        batch_indices = np.random.randint(
            0, observations.shape[0], args.batch_size)
        obs_batch = torch.from_numpy(
            observations[batch_indices]).float().to(device)
        # The targets are continuous actions: keep them as floats. Casting
        # to long would truncate values such as 0.35 to 0.
        act_batch = torch.from_numpy(actions[batch_indices]).float().to(device)

        model_actions = model(obs_batch)

        loss = (model_actions - act_batch).norm(2).mean()
        loss.backward()
        optimizer.step()

        loss_value = loss.item()

        print('epoch %d, loss=%.3f' % (epoch, loss_value))
        loss_list.append(loss_value)

        # Periodically save the trained model
        if epoch % 50 == 0:
            print("Saving...")
            torch.save(model.state_dict(), model_path)
            save_loss(loss_list, 'imitation/pytorch/loss.npy')

    print("Saving...")
    torch.save(model.state_dict(), model_path)