Example #1
def _enjoy():
    # Launch the env with our helper function
    env = launch_env()
    print("Initialized environment")

    # Wrappers
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env) # to make the images from 160x120x3 into 3x160x120
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)
    print("Initialized Wrappers")

    state_dim = env.observation_space.shape
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Initialize policy
    policy = DDPG(state_dim, action_dim, max_action, net_type="cnn")
    policy.load(filename='ddpg', directory='reinforcement/pytorch/models/')

    obs = env.reset()
    done = False

    while True:
        while not done:
            action = policy.predict(np.array(obs))
            # Perform action
            obs, reward, done, _ = env.step(action)
            env.render()
        done = False
        obs = env.reset()
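
The ImgWrapper used above only reorders the image axes so that channel-first CNNs can consume the observations. A minimal sketch of such a wrapper, assuming the standard gym.ObservationWrapper API (the repo's actual ImgWrapper may differ in detail):

import gym
import numpy as np
from gym import spaces

class ImgWrapperSketch(gym.ObservationWrapper):
    def __init__(self, env):
        super(ImgWrapperSketch, self).__init__(env)
        h, w, c = env.observation_space.shape
        # Assumes observations are already scaled to [0, 1] by NormalizeWrapper
        self.observation_space = spaces.Box(low=0.0, high=1.0, shape=(c, h, w), dtype=np.float32)

    def observation(self, obs):
        # Move the channel axis to the front: HxWx3 -> 3xHxW
        return np.transpose(obs, (2, 0, 1))
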
Example #2
def _train(args):
    print("Running Expert for {} Episodes of {} Steps".format(args.episodes, args.steps))
    print("Training Learning for {} Epochs with Batch Size of {}".format(args.epochs, args.batch_size))

    env = launch_env()
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env) 
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)
    print("Initialized Wrappers")

    observation_shape = (None, ) + env.observation_space.shape
    action_shape = (None, ) + env.action_space.shape

    # Create an imperfect demonstrator
    expert = PurePursuitExpert(env=env)

    observations = []
    actions = []

    # let's collect our samples
    for episode in range(0, args.episodes):
        print("Starting episode", episode)
        for steps in range(0, args.steps):
            # use our 'expert' to predict the next action.
            action = expert.predict(None)
            observation, reward, done, info = env.step(action)
            observations.append(observation)
            actions.append(action)
            
        env.reset()

    env.close()

    actions = np.array(actions)
    observations = np.array(observations)

    model = TensorflowModel(
        observation_shape=observation_shape,  # from the logs we've got
        action_shape=action_shape,  # same
        graph_location=args.model_directory,  # where do we want to store our trained models
        seed=args.seed  # to seed all random operations in the model (e.g., dropout)
    )

    for i in range(args.epochs):
        # the batch size can be adjusted according to your computing resources
        loss = None
        for batch in range(0, len(observations), args.batch_size):
            print("Training batch", batch)
            loss = model.train(
                observations=observations[batch:batch + args.batch_size],
                actions=actions[batch:batch + args.batch_size]
            )

        # every 10 epochs, we store the model we have
        if i % 10 == 0:
            model.commit()

    print("Training complete!")
Example #3
def make_env():
    # Launch the env with our helper function
    env = launch_env()
    print("Initialized environment")

    # Wrappers
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env) # to make the images from 160x120x3 into 3x160x120
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)
    print("Initialized Wrappers")

    return env
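
A quick smoke test of the wrapped environment returned by make_env(), rolling out a few random actions (sketch only):

env = make_env()
obs = env.reset()
for _ in range(10):
    # Sample a random action just to verify the wrapper chain works end to end
    obs, reward, done, info = env.step(env.action_space.sample())
    if done:
        obs = env.reset()
env.close()
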
Example #4
def launch_local_experiment(gym_environment=None):
    # Use our environment launcher
    env = launch_env(gym_environment)

    # If you want to use a wrapper, just import it!
    # from utils.wrappers import ResizeWrapper
    # env = ResizeWrapper(env)

    observation = env.reset()
    # While there is no signal of completion (simulation done),
    # we run predictions for a number of episodes; don't worry, we control this part.

    # We need to launch the ROS stuff in the background
    # ROSLaunch API doesn't play well with our environment setup, so we use subprocess
    import subprocess
    subprocess.Popen(["roslaunch lf_slim.launch"], shell=True)

    # Start the ROSAgent, which handles publishing images and subscribing to action
    agent = ROSAgent()
    r = rospy.Rate(15)

    while not rospy.is_shutdown():
        # we pass the observation to our model and get an action in return;
        # we tell the environment to perform this action and get some info back in OpenAI Gym style

        # To trigger the lane following pipeline, we publish the image
        # and camera_infos to the correct topics defined in rosagent
        agent._publish_img(observation)
        agent._publish_info()

        # The action is updated inside of agent by other nodes asynchronously
        action = agent.action
        # Edge case - if the nodes aren't ready yet
        if np.array_equal(action, np.array([0, 0])):
            continue

        observation, reward, done, info = env.step(action)
        # here you may want to compute some stats, like how much reward you are getting
        # note that this reward may not be the same as the challenge score.

        # it is important to check for this flag: the Evaluation Engine will let us know when we should finish.
        # If we are not careful with this, the Evaluation Engine will kill our container and we will get no score
        # for this submission.
        if 'simulation_done' in info:
            break
        if done:
            env.reset()

        # Run the main loop at 15Hz
        r.sleep()
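
The agent's action attribute is filled in asynchronously by the ROS lane-following nodes. A hypothetical sketch of how that could look inside ROSAgent; the topic name and the WheelsCmdStamped message type are assumptions, the real rosagent.py defines the actual ones:

import numpy as np
import rospy
from duckietown_msgs.msg import WheelsCmdStamped  # assumed message type

class MinimalROSAgent(object):
    def __init__(self):
        self.action = np.array([0.0, 0.0])
        # The callback runs in a background thread and updates self.action,
        # which the 15Hz loop above reads on every iteration.
        rospy.Subscriber("wheels_cmd", WheelsCmdStamped, self._on_cmd, queue_size=1)

    def _on_cmd(self, msg):
        self.action = np.array([msg.vel_left, msg.vel_right])
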
Example #5
def _enjoy():
    model = Model(action_dim=2, max_action=1.)

    try:
        state_dict = torch.load('trained_models/imitate.pt',
                                map_location=device)
        model.load_state_dict(state_dict)
    except Exception:
        print('failed to load model')
        exit()

    model.eval().to(device)

    env = launch_env()
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env)
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)

    obs = env.reset()

    while True:
        obs = torch.from_numpy(obs).float().to(device).unsqueeze(0)

        action = model(obs)
        action = action.squeeze().data.cpu().numpy()

        obs, reward, done, info = env.step(action)
        env.render()

        if done:
            if reward < 0:
                print('*** FAILED ***')
                time.sleep(0.7)

            obs = env.reset()
            env.render()
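
The Model loaded above maps an image observation to two wheel velocities bounded by max_action. A minimal sketch of a network with the same interface (the repo's actual architecture will differ):

import torch
import torch.nn as nn

class ModelSketch(nn.Module):
    def __init__(self, action_dim=2, max_action=1.):
        super(ModelSketch, self).__init__()
        self.max_action = max_action
        self.features = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=5, stride=2), nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=5, stride=2), nn.ReLU(),
            nn.AdaptiveAvgPool2d(1),  # avoids hard-coding the input resolution
        )
        self.head = nn.Linear(64, action_dim)

    def forward(self, obs):
        x = self.features(obs).flatten(1)
        # tanh squashes to [-1, 1]; scale by the action bound
        return self.max_action * torch.tanh(self.head(x))
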
Example #6
def extract():

  env = launch_env()
  print("Initialized environment")

  # Wrappers
  env = ResizeWrapper(env)
  env = NormalizeWrapper(env)
  env = ImgWrapper(env)  # to make the images from 160x120x3 into 3x160x120
  env = ActionWrapper(env)
  env = DtRewardWrapper(env)
  print("Initialized Wrappers")

  state_dim = env.observation_space.shape
  action_dim = env.action_space.shape[0]
  max_action = float(env.action_space.high[0])

  total_timesteps = 0
  timesteps_since_eval = 0
  episode_num = 0
  done = True
  episode_reward = None
  env_counter = 0
  reward = 0
  episode_timesteps = 0
  obs = env.reset()

  print("Starting training")
  while total_timesteps < 2:
    action = env.action_space.sample()
    # Perform action
    new_obs, reward, done, _ = env.step(action)

    obs = new_obs
    print(obs)
    print(action)
    total_timesteps += 1
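
Most of these examples wrap the env in DtRewardWrapper before training. A hedged sketch of what such a reward-shaping wrapper can look like; the threshold and replacement value below are illustrative, not the repo's actual numbers:

import gym

class RewardShapingSketch(gym.RewardWrapper):
    def reward(self, reward):
        # Clip the simulator's large crash penalty (value assumed) so that a
        # single collision does not dominate the learning signal
        if reward == -1000:
            return -10.0
        return reward
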
Example #7
def _enjoy(args):
    # Launch the env with our helper function
    env = launch_env()
    print("Initialized environment")

    # Wrappers
    env = ResizeWrapper(env)
    env = GrayscaleWrapper(env)
    env = NormalizeWrapper(env)
    env = FrameStack(env, 4)
    env = DtRewardWrapper(env)
    env = ActionWrapper(env)
    print("Initialized Wrappers")

    state_dim = env.observation_space.shape
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Initialize policy
    # policy = TD3(state_dim, action_dim, max_action, net_type="cnn")
    # policy.load(filename=args.policy, directory='reinforcement/pytorch/models/')

    policy = policies[args.policy](state_dim, action_dim, max_action)
    policy.load("reinforcement/pytorch/models/", args.policy)

    obs = env.reset()
    done = False

    while True:
        while not done:
            action = policy.predict(np.array(obs))
            # Perform action
            obs, reward, done, _ = env.step(action)
            env.render()
        done = False
        obs = env.reset()
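
This example looks the policy class up in a policies registry keyed by args.policy. A hedged sketch of that mapping (the actual dict lives elsewhere in the repo; TD3 is taken from the commented-out initialization above):

policies = {
    "ddpg": DDPG,  # as used in the other examples
    "td3": TD3,    # matches the commented-out TD3 initialization above
}
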
Example #8
def _train(args):
    if not os.path.exists("./results"):
        os.makedirs("./results")
    if not os.path.exists(args.model_dir):
        os.makedirs(args.model_dir)

    # Launch the env with our helper function
    env = launch_env()
    print("Initialized environment")

    # Wrappers
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env)  # to make the images from 160x120x3 into 3x160x120
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)
    print("Initialized Wrappers")

    # Set seeds
    seed(args.seed)

    state_dim = env.observation_space.shape
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Initialize policy
    policy = DDPG(state_dim, action_dim, max_action, net_type="cnn")
    replay_buffer = ReplayBuffer(args.replay_buffer_max_size)
    print("Initialized DDPG")

    # Evaluate untrained policy
    evaluations = [evaluate_policy(env, policy)]

    total_timesteps = 0
    timesteps_since_eval = 0
    episode_num = 0
    done = True
    episode_reward = None
    env_counter = 0
    reward = 0
    episode_timesteps = 0

    print("Starting training")
    while total_timesteps < args.max_timesteps:

        print("timestep: {} | reward: {}".format(total_timesteps, reward))

        if done:
            if total_timesteps != 0:
                print(
                    ("Total T: %d Episode Num: %d Episode T: %d Reward: %f") %
                    (total_timesteps, episode_num, episode_timesteps,
                     episode_reward))
                policy.train(replay_buffer, episode_timesteps, args.batch_size,
                             args.discount, args.tau)

                # Evaluate episode
                if timesteps_since_eval >= args.eval_freq:
                    timesteps_since_eval %= args.eval_freq
                    evaluations.append(evaluate_policy(env, policy))
                    print("rewards at time {}: {}".format(
                        total_timesteps, evaluations[-1]))

                    if args.save_models:
                        policy.save(filename='{}_{}'.format(
                            'ddpg', total_timesteps),
                                    directory=args.model_dir)
                    np.savez("./results/rewards.npz", evaluations)

            # Reset environment
            env_counter += 1
            obs = env.reset()
            done = False
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

        # Select action randomly or according to policy
        if total_timesteps < args.start_timesteps:
            action = env.action_space.sample()
        else:
            action = policy.predict(np.array(obs))
            if args.expl_noise != 0:
                action = (action + np.random.normal(
                    0, args.expl_noise, size=env.action_space.shape[0])).clip(
                        env.action_space.low, env.action_space.high)

        # Perform action
        new_obs, reward, done, _ = env.step(action)

        if episode_timesteps >= args.env_timesteps:
            done = True

        done_bool = 0 if episode_timesteps + 1 == args.env_timesteps else float(
            done)
        episode_reward += reward

        # Store data in replay buffer
        replay_buffer.add(obs, new_obs, action, reward, done_bool)

        obs = new_obs

        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1

    print("Training done, about to save..")
    policy.save(filename='ddpg', directory=args.model_dir)
    print("Finished saving..should return now!")
Example #9
def _train(args):
    env = launch_env()
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env)
    env = DtRewardWrapper(env)
    env = MetricsWrapper(env)
    env = ActionWrapper(env)
    print("Initialized Wrappers")

    observation_shape = (None, ) + env.observation_space.shape
    action_shape = (None, ) + env.action_space.shape

    # Create an imperfect demonstrator
    expert = PurePursuitExpert(env=env)

    observations = []
    actions = []

    # let's collect our samples
    for episode in range(0, args.episodes):
        print("Starting episode", episode)
        for steps in range(0, args.steps):
            # use our 'expert' to predict the next action.
            action = expert.predict(None)
            observation, reward, done, info = env.step(action)
            observations.append(observation)
            actions.append(action)
        env.reset()
    env.close()

    actions = np.array(actions)
    observations = np.array(observations)

    model = Model(action_dim=2, max_action=1.)
    model.train().to(device)

    # weight_decay is L2 regularization, helps avoid overfitting
    optimizer = optim.SGD(model.parameters(), lr=0.0004, weight_decay=1e-3)

    avg_loss = 0
    for epoch in range(args.epochs):
        optimizer.zero_grad()

        batch_indices = np.random.randint(0, observations.shape[0],
                                          (args.batch_size))
        obs_batch = torch.from_numpy(
            observations[batch_indices]).float().to(device)
        act_batch = torch.from_numpy(actions[batch_indices]).float().to(device)

        model_actions = model(obs_batch)

        loss = (model_actions - act_batch).norm(2).mean()
        loss.backward()
        optimizer.step()

        loss = loss.item()  # .item() extracts the scalar; loss.data[0] fails on modern PyTorch
        avg_loss = avg_loss * 0.995 + loss * 0.005

        print('epoch %d, loss=%.3f' % (epoch, avg_loss))

        # Periodically save the trained model
        if epoch % 200 == 0:
            torch.save(model.state_dict(),
                       'imitation/pytorch/models/imitate.pt')
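
The loop above draws random minibatch indices each epoch. An alternative sketch using torch's DataLoader for shuffled, non-overlapping batches, reusing model, optimizer, device and args from the example (illustrative only, not the repo's training loop):

import torch
from torch.utils.data import DataLoader, TensorDataset

dataset = TensorDataset(torch.from_numpy(observations).float(),
                        torch.from_numpy(actions).float())
loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True)

for obs_batch, act_batch in loader:
    obs_batch, act_batch = obs_batch.to(device), act_batch.to(device)
    optimizer.zero_grad()
    loss = (model(obs_batch) - act_batch).norm(2).mean()
    loss.backward()
    optimizer.step()
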
Example #10
def _train(args):
    if not os.path.exists("./results"):
        os.makedirs("./results")
    if not os.path.exists(args.model_dir):
        os.makedirs(args.model_dir)

    # Launch the env with our helper function
    env = launch_env()
    print("Initialized environment")

    # Wrappers
    env = ResizeWrapper(env)
    env = GrayscaleWrapper(env)
    env = NormalizeWrapper(env)
    env = FrameStack(env, 4)
    env = DtRewardWrapper(env)
    env = ActionWrapper(env)
    print("Initialized Wrappers")

    # Set seeds
    seed(args.seed)

    state_dim = env.observation_space.shape
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Init training data
    total_timesteps = 0
    timesteps_since_eval = 0
    episode_num = 0
    episode_reward = 0
    env_counter = 0
    reward = 0
    episode_timesteps = 0

    avg_episodes = 100

    # Keep track of the best reward over time
    best_reward = -np.inf

    # Keep track of train_rewards
    train_rewards = []

    # To print mean actions per episode
    mean_action = []

    # To keep track of moving averages
    moving_avgs = []

    # Summary writer for tensorboard
    writer = SummaryWriter(log_dir="reinforcement/pytorch/runs")

    # Initialize policy
    if args.policy not in policies:
        raise ValueError(
            "Policy {} is not available, chose one of : {}".format(
                args.policy, list(policies.keys())))

    policy = policies[args.policy](state_dim, action_dim, max_action, args.per,
                                   args.gradclip)

    # Evaluate untrained policy
    evaluations = [evaluate_policy(env, policy)]
    moving_avgs.append(evaluations[0])

    writer.add_scalar("Timesteps/EvaluationReward", evaluations[0],
                      total_timesteps)

    # Initialize replay buffer
    if args.per:
        print("Training with Prioritized Experience Replay")
        replay_buffer = PrioritizedReplayBuffer(
            args.replay_buffer_max_size,
            args.batch_size,
            args.seed,
            initial_beta=0.5,
            delta_beta=2 / args.max_timesteps,
        )
    else:
        replay_buffer = ReplayBuffer(args.replay_buffer_max_size,
                                     args.batch_size, args.seed)

    # Load previous policy
    if args.load_initial_policy:

        # Disable random start steps
        args.start_timesteps = 0

        # Load training data
        checkpoint = load_training_state(args.model_dir,
                                         args.policy + "_training")

        evaluations = checkpoint["evaluations"]
        total_timesteps = checkpoint["total_timesteps"]
        train_rewards = checkpoint["train_rewards"]
        episode_num = checkpoint["episode_num"]
        best_reward = checkpoint["best_reward"]
        moving_avgs = checkpoint["moving_avgs"]

        # Load policy
        policy.load(args.model_dir, args.policy)

    print("Starting training")

    obs = env.reset()

    while total_timesteps < args.max_timesteps:

        # Select action
        if total_timesteps < args.start_timesteps:
            action = env.action_space.sample()
        else:
            action = policy.predict(np.array(obs))
            action = add_noise(action, args.expl_noise, env.action_space.low,
                               env.action_space.high)

        mean_action.append(action)

        # Perform action
        new_obs, reward, done, _ = env.step(action)

        # Update episode reward
        episode_reward += reward

        # Store data in replay buffer
        replay_buffer.add(obs, action, reward, new_obs, float(done))

        # Update network
        if len(replay_buffer) >= args.batch_size:
            policy.update(replay_buffer, args.discount, args.tau)

        # Update env
        obs = new_obs
        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1

        if episode_timesteps >= args.env_timesteps:
            done = True

        if done:
            print((
                "Total T: %d Episode Num: %d \nMean actions: %.2f %.2f Episode T: %d Reward: %.1f Moving Average: %.1f"
            ) % (
                total_timesteps,
                episode_num,
                np.mean(np.array(mean_action), axis=0)[0],
                np.mean(np.array(mean_action), axis=0)[1],
                episode_timesteps,
                episode_reward,
                moving_avgs[-1],
            ))

            train_rewards.append(episode_reward)
            moving_avgs.append(moving_average(train_rewards, avg_episodes))

            writer.add_scalar("Timesteps/Rewards", episode_reward,
                              total_timesteps)
            writer.add_scalar("Timesteps/MovingAverage", moving_avgs[-1],
                              total_timesteps)
            # writer.add_scalar("Episode/Wheel1Mean", np.mean(np.array(mean_action), axis=0)[0], episode_num)
            # writer.add_scalar("Episode/Wheel2Mean", np.mean(np.array(mean_action), axis=0)[1], episode_num)

            # Evaluate episode
            if timesteps_since_eval >= args.eval_freq:

                timesteps_since_eval %= args.eval_freq
                eval_reward = evaluate_policy(env, policy)
                evaluations.append(eval_reward)

                writer.add_scalar("Timesteps/EvaluationReward", eval_reward,
                                  total_timesteps)

                print(
                    "\n-+-+-+-+-+-+-+-+-+-+ Evaluation reward at time {}: {} +-+-+-+-+-+-+-+-+-+-"
                    .format(total_timesteps, eval_reward))

                np.savetxt(
                    "reinforcement/pytorch/results/eval_rewards_" +
                    args.policy + ".csv",
                    np.array(evaluations),
                    delimiter=",",
                )
                np.savetxt(
                    "reinforcement/pytorch/results/train_rewards_" +
                    args.policy + ".csv",
                    np.array(train_rewards),
                    delimiter=",",
                )
                np.savetxt(
                    "reinforcement/pytorch/results/moving_averages_" +
                    args.policy + ".csv",
                    np.array(moving_avgs),
                    delimiter=",",
                )

                # Save the policy according to the best reward over training
                if eval_reward > best_reward:
                    best_reward = eval_reward
                    policy.save(args.model_dir, args.policy)
                    save_training_state(
                        args.model_dir,
                        args.policy + "_training",
                        best_reward,
                        total_timesteps,
                        evaluations,
                        train_rewards,
                        episode_num,
                        moving_avgs,
                    )

                    print(
                        "-+-+-+-+-+-+-+-+-+-+ Model saved +-+-+-+-+-+-+-+-+-+-\n"
                    )

            # Reset environment
            mean_action = []

            obs = env.reset()
            env_counter += 1
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

    print("Finished..should return now!")