Пример #1
0
def _train(args):
    print("Running Expert for {} Episodes of {} Steps".format(args.episodes, args.steps))
    print("Training Learning for {} Epochs with Batch Size of {}".format(args.epochs, args.batch_size))

    env = launch_env()
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env) 
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)
    print("Initialized Wrappers")

    observation_shape = (None, ) + env.observation_space.shape
    action_shape = (None, ) + env.action_space.shape

    # Create an imperfect demonstrator
    expert = PurePursuitExpert(env=env)

    observations = []
    actions = []

    # let's collect our samples
    for episode in range(0, args.episodes):
        print("Starting episode", episode)
        for steps in range(0, args.steps):
            # use our 'expert' to predict the next action.
            action = expert.predict(None)
            observation, reward, done, info = env.step(action)
            observations.append(observation)
            actions.append(action)
            
        env.reset()

    env.close()

    actions = np.array(actions)
    observations = np.array(observations)

    model = TensorflowModel(
        observation_shape=observation_shape,  # from the logs we've got
        action_shape=action_shape,  # same
        graph_location=args.model_directory,  # where do we want to store our trained models
        seed=args.seed  # to seed all random operations in the model (e.g., dropout)
    )

    for i in range(args.epochs):
        # we defined the batch size, this can be adjusted according to your computing resources
        loss = None
        for batch in range(0, len(observations), args.batch_size):
            print("Training batch", batch)
            loss = model.train(
                observations=observations[batch:batch + args.batch_size],
                actions=actions[batch:batch + args.batch_size]
            )

        # every 10 epochs, we store the model we have
        if i % 10 == 0:
            model.commit()

    print("Training complete!")
Пример #2
0
def initenv2():
    env = launch_env2()
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env)
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)

    return env
def make_env():
    # Launch the env with our helper function
    env = launch_env()
    print("Initialized environment")

    # Wrappers
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env) # to make the images from 160x120x3 into 3x160x120
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)
    print("Initialized Wrappers")

    return env
Пример #4
0
def initWindowModel():
    model = WindowModel(action_dim=2, max_action=1.)

    try:
        state_dict = torch.load('./models/windowimitate.pt')
        model.load_state_dict(state_dict)
    except:
        print('failed to load model')
        exit()

    model.eval().to(device)

    env = launch_env1()

    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env)
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)

    env.reset()
    env.render()

    return env, model
def extract():

  env = launch_env()
  print("Initialized environment")

  # Wrappers
  env = ResizeWrapper(env)
  env = NormalizeWrapper(env)
  env = ImgWrapper(env)  # to make the images from 160x120x3 into 3x160x120
  env = ActionWrapper(env)
  env = DtRewardWrapper(env)
  print("Initialized Wrappers")

  state_dim = env.observation_space.shape
  action_dim = env.action_space.shape[0]
  max_action = float(env.action_space.high[0])

  total_timesteps = 0
  timesteps_since_eval = 0
  episode_num = 0
  done = True
  episode_reward = None
  env_counter = 0
  reward = 0
  episode_timesteps = 0
  obs = env.reset()

  print("Starting training")
  while total_timesteps < 2:
    action = env.action_space.sample()
    # Perform action
    new_obs, reward, done, _ = env.step(action)

    obs = new_obs
    print(obs)
    print(action)
    total_timesteps += 1
Пример #6
0
def _enjoy():
    if not os.path.exists("./results"):
        os.makedirs("./results")
    if not os.path.exists(args.model_dir):
        os.makedirs(args.model_dir)

    # Launch the env with our helper function
    env = launch_env()
    print("Initialized environment")

    # Wrappers
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env)  # to make the images from 160x120x3 into 3x160x120
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)
    print("Initialized Wrappers")

    state_dim = env.observation_space.shape
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Initialize policy
    policy = DDPG(state_dim, action_dim, max_action, net_type="cnn")
    policy.load(filename='ddpg', directory='reinforcement/pytorch/models/')

    obs = env.reset()

    while True:
        while not done:
            action = policy.predict(np.array(obs))
            # Perform action
            obs, reward, done, _ = env.step(action)
            env.render()
        done = False
        obs = env.reset()
Пример #7
0
def _train(args):
    if not os.path.exists("./results"):
        os.makedirs("./results")
    if not os.path.exists(args.model_dir):
        os.makedirs(args.model_dir)

    # Launch the env with our helper function
    env = launch_env()
    print("Initialized environment")

    # Wrappers
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env)  # to make the images from 160x120x3 into 3x160x120
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)
    print("Initialized Wrappers")

    # Set seeds
    seed(args.seed)

    state_dim = env.observation_space.shape
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Initialize policy
    policy = DDPG(state_dim, action_dim, max_action, net_type="cnn")
    replay_buffer = ReplayBuffer(args.replay_buffer_max_size)
    print("Initialized DDPG")

    # Evaluate untrained policy
    evaluations = [evaluate_policy(env, policy)]

    total_timesteps = 0
    timesteps_since_eval = 0
    episode_num = 0
    done = True
    episode_reward = None
    env_counter = 0
    reward = 0
    episode_timesteps = 0

    print("Starting training")
    while total_timesteps < args.max_timesteps:

        print("timestep: {} | reward: {}".format(total_timesteps, reward))

        if done:
            if total_timesteps != 0:
                print(
                    ("Total T: %d Episode Num: %d Episode T: %d Reward: %f") %
                    (total_timesteps, episode_num, episode_timesteps,
                     episode_reward))
                policy.train(replay_buffer, episode_timesteps, args.batch_size,
                             args.discount, args.tau)

                # Evaluate episode
                if timesteps_since_eval >= args.eval_freq:
                    timesteps_since_eval %= args.eval_freq
                    evaluations.append(evaluate_policy(env, policy))
                    print("rewards at time {}: {}".format(
                        total_timesteps, evaluations[-1]))

                    if args.save_models:
                        policy.save(filename='{}_{}'.format(
                            'ddpg', total_timesteps),
                                    directory=args.model_dir)
                    np.savez("./results/rewards.npz", evaluations)

            # Reset environment
            env_counter += 1
            obs = env.reset()
            done = False
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

        # Select action randomly or according to policy
        if total_timesteps < args.start_timesteps:
            action = env.action_space.sample()
        else:
            action = policy.predict(np.array(obs))
            if args.expl_noise != 0:
                action = (action + np.random.normal(
                    0, args.expl_noise, size=env.action_space.shape[0])).clip(
                        env.action_space.low, env.action_space.high)

        # Perform action
        new_obs, reward, done, _ = env.step(action)

        if episode_timesteps >= args.env_timesteps:
            done = True

        done_bool = 0 if episode_timesteps + 1 == args.env_timesteps else float(
            done)
        episode_reward += reward

        # Store data in replay buffer
        replay_buffer.add(obs, new_obs, action, reward, done_bool)

        obs = new_obs

        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1

    print("Training done, about to save..")
    policy.save(filename='ddpg', directory=args.model_dir)
    print("Finished saving..should return now!")
Пример #8
0
def _train(args):
    env = launch_env()
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env)
    env = DtRewardWrapper(env)
    env = MetricsWrapper(env)
    env = ActionWrapper(env)
    print("Initialized Wrappers")

    observation_shape = (None, ) + env.observation_space.shape
    action_shape = (None, ) + env.action_space.shape

    # Create an imperfect demonstrator
    expert = PurePursuitExpert(env=env)

    observations = []
    actions = []

    # let's collect our samples
    for episode in range(0, args.episodes):
        print("Starting episode", episode)
        for steps in range(0, args.steps):
            # use our 'expert' to predict the next action.
            action = expert.predict(None)
            observation, reward, done, info = env.step(action)
            observations.append(observation)
            actions.append(action)
        env.reset()
    env.close()

    actions = np.array(actions)
    observations = np.array(observations)

    model = Model(action_dim=2, max_action=1.)
    model.train().to(device)

    # weight_decay is L2 regularization, helps avoid overfitting
    optimizer = optim.SGD(model.parameters(), lr=0.0004, weight_decay=1e-3)

    avg_loss = 0
    for epoch in range(args.epochs):
        optimizer.zero_grad()

        batch_indices = np.random.randint(0, observations.shape[0],
                                          (args.batch_size))
        obs_batch = torch.from_numpy(
            observations[batch_indices]).float().to(device)
        act_batch = torch.from_numpy(actions[batch_indices]).float().to(device)

        model_actions = model(obs_batch)

        loss = (model_actions - act_batch).norm(2).mean()
        loss.backward()
        optimizer.step()

        loss = loss.data[0]
        avg_loss = avg_loss * 0.995 + loss * 0.005

        print('epoch %d, loss=%.3f' % (epoch, avg_loss))

        # Periodically save the trained model
        if epoch % 200 == 0:
            torch.save(model.state_dict(),
                       'imitation/pytorch/models/imitate.pt')
def _enjoy():
    model = Model(action_dim=2, max_action=1.)

    try:
        state_dict = torch.load('trained_models/imitate.pt',
                                map_location=device)
        model.load_state_dict(state_dict)
    except:
        print('failed to load model')
        exit()

    model.eval().to(device)

    env = launch_env()
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env)
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)

    obs = env.reset()

    while True:
        obs = torch.from_numpy(obs).float().to(device).unsqueeze(0)

        action = model(obs)
        action = action.squeeze().data.cpu().numpy()

        obs, reward, done, info = env.step(action)
        env.render()

        if done:
            if reward < 0:
                print('*** FAILED ***')
                time.sleep(0.7)

            obs = env.reset()
            env.render()
def _enjoyWindow():
    model = WindowModel(action_dim=2, max_action=1.)

    try:
        state_dict = torch.load('./models/windowimitate.pt')
        model.load_state_dict(state_dict)
    except:
        print('failed to load model')
        exit()

    model.eval().to(device)

    env = launch_env1()
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env)
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)

    obs = env.reset()

    obsWindow = np.zeros((12, 160, 120))

    while True:
        obsWindow[:9, :, :] = obsWindow[3:, :, :]
        obsWindow[9:12, :, :] = obs
        obs = torch.from_numpy(obsWindow).float().to(device).unsqueeze(0)

        action = model(obs)
        action = action.squeeze().data.cpu().numpy()

        obs, reward, done, info = env.step(action)
        env.render()

        if done:
            if reward < 0:
                print('*** FAILED ***')
                time.sleep(0.7)

            obs = env.reset()
            env.render()
def _dagger():
    model = Model(action_dim=2, max_action=1.)

    try:
        state_dict = torch.load('./models/imitate.pt')
        model.load_state_dict(state_dict)
    except:
        print('failed to load model')
        exit()

    model.eval().to(device)

    env = launch_env1()
    # Register a keyboard handler

    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env)
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)

    obs = env.reset()
    env.render()
    key_handler = key.KeyStateHandler()
    env.unwrapped.window.push_handlers(key_handler)

    print(env.map_name)
    raise Exception("asdfsadf")

    obsHistory = []
    actionHistory = []

    while True:
        obs = torch.from_numpy(obs).float().to(device).unsqueeze(0)

        action = model(obs)
        action = action.squeeze().data.cpu().numpy()

        obs, reward, done, info = env.step(action)

        print(key_handler)
        daggerAction = np.array([0.0, 0.0])
        if key_handler[key.UP]:
            print("as===as=df=sad=f=asdf=sad=fs=adf")
            daggerAction = np.array([1.00, 0.0])
            #action = np.array([0.44, 0.0])
        if key_handler[key.DOWN]:
            print("as===as=df=sad=f=asdf=sad=fs=adf")
            daggerAction = np.array([-1.00, 0])
            #action = np.array([-0.44, 0])
        if key_handler[key.LEFT]:
            print("as===as=df=sad=f=asdf=sad=fs=adf")
            daggerAction = np.array([0.35, +1])
        if key_handler[key.RIGHT]:
            print("as===as=df=sad=f=asdf=sad=fs=adf")
            daggerAction = np.array([0.35, -1])
        if key_handler[key.SPACE]:
            obsHistoryArray = np.array(obsHistory)
            actionHistoryArray = np.array(actionHistory)
            np.save('./dagger/obs_{}.npy'.format(len(count)), obsHistoryArray)
            np.save('./dagger/actions_{}.npy'.format(len(count)),
                    actionHistoryArray)

        print(daggerAction)
        obsHistory.append(obs)
        actionHistory.append(daggerAction)

        env.render()

        if done:
            if reward < 0:
                print('*** FAILED ***')
                time.sleep(0.7)

            obs = env.reset()
            env.render()
Пример #12
0
def _train(args):
    env = launch_env1()
    env1 = ResizeWrapper(env)
    env2 = NormalizeWrapper(env) 
    env3 = ImgWrapper(env)
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)
    print("Initialized Wrappers")

    def transformObs(obs):
        obs = env1.observation(obs)
        obs = env2.observation(obs)
        obs = env3.observation(obs)
        return obs

    actions = None
    rawObs = None
    for map in MAP_NAMES:
        if map == "loop_obstacles":
            episodes = 3
        else:
            episodes = 2

        print(map)
        for episode in range(episodes):
            actionFile = "actions_{}.npy".format(episode)
            action = np.load(TRAINING_DATA_PATH.format(map, actionFile))
            print(action.shape)

            observationFile = "obs_{}.npy".format(episode)
            observation = np.load(TRAINING_DATA_PATH.format(map, observationFile))

            if actions is None:
                actions = action
                rawObs = observation
            else:
                actions = np.concatenate((actions, action), axis=0)
                rawObs = np.concatenate((rawObs, observation), axis=0)
            print(actions.shape)
        print(actions.shape)
        print("---")

    observations = np.zeros((rawObs.shape[0], 3, 160, 120))
    for i, obs in enumerate(rawObs):
        observations[i] = transformObs(obs)

    
    '''
    # Create an imperfect demonstrator
    expert = PurePursuitExpert(env=env)

    observations = []
    actions = []

    # let's collect our samples
    for episode in range(0, 2):
    #for episode in range(0, args.episodes):
        print("Starting episode", episode)
        #for steps in range(0, args.steps):
        for steps in range(0, 4):
            # use our 'expert' to predict the next action.
            action = expert.predict(None)
            observation, reward, done, info = env.step(action)
            observations.append(observation)
            actions.append(action)
        env.reset()

    actions = np.array(actions)
    observations = np.array(observations)
    print(observations.shape)
    '''

    env.close()
    #raise Exception("Done with testing")

    model = Model(action_dim=2, max_action=1.)
    model.train().to(device)

    # weight_decay is L2 regularization, helps avoid overfitting
    optimizer = optim.SGD(
        model.parameters(),
        lr=0.0004,
        weight_decay=1e-3
    )

    loss_list = []
    avg_loss = 0
    for epoch in range(args.epochs):
        optimizer.zero_grad()

        batch_indices = np.random.randint(0, observations.shape[0], (args.batch_size))
        obs_batch = torch.from_numpy(observations[batch_indices]).float().to(device)
        act_batch = torch.from_numpy(actions[batch_indices]).long().to(device)

        model_actions = model(obs_batch)

        loss = (model_actions - act_batch).norm(2).mean()
        loss.backward()
        optimizer.step()

        #loss = loss.data[0]
        loss = loss.item()
        avg_loss = avg_loss * 0.995 + loss * 0.005

        print('epoch %d, loss=%.3f' % (epoch, loss))
        loss_list.append(loss)

        # Periodically save the trained model
        if epoch % 50 == 0:
            print("Saving...")
            torch.save(model.state_dict(), 'imitation/pytorch/models/imitate.pt')
            save_loss(loss_list, 'imitation/pytorch/loss.npy')

    print("Saving...")
    torch.save(model.state_dict(), 'imitation/pytorch/models/imitate.pt')
def _enjoy(args):
    # Launch the env with our helper function
    env = launch_env()
    print("Initialized environment")

    # Wrappers
    env = ResizeWrapper(env)
    env = GrayscaleWrapper(env)
    env = NormalizeWrapper(env)
    env = FrameStack(env, 4)
    env = DtRewardWrapper(env)
    env = ActionWrapper(env)
    print("Initialized Wrappers")

    state_dim = env.observation_space.shape
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Initialize policy
    # policy = TD3(state_dim, action_dim, max_action, net_type="cnn")
    # policy.load(filename=args.policy, directory='reinforcement/pytorch/models/')

    policy = policies[args.policy](state_dim, action_dim, max_action)
    policy.load("reinforcement/pytorch/models/", args.policy)

    obs = env.reset()
    done = False

    while True:
        while not done:
            action = policy.predict(np.array(obs))
            # Perform action
            obs, reward, done, _ = env.step(action)
            env.render()
        done = False
        obs = env.reset()
Пример #14
0
def _enjoy():
    # Launch the env with our helper function
    env = launch_env()
    print("Initialized environment")

    # Wrappers
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(
        env, canny=False)  # to make the images from 160x120x3 into 3x160x120
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)
    print("Initialized Wrappers")

    state_dim = env.observation_space.shape
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Initialize policy
    policy = DDPG(state_dim, action_dim, max_action, net_type="cnn")
    policy.load(filename='ddpg', directory='reinforcement/pytorch/models/')

    obs = env.reset()
    done = False

    i = 0
    while True:
        while not done:
            action = policy.predict(np.array(obs))
            # Perform action
            obs, reward, done, _ = env.step(action)
            print(action, reward)
            env.render()
            time.sleep(0.1)
        print("\nRESET\n")
        done = False
        obs = env.reset()
        np.save('env{}.npz'.format(i), obs)
        i += 1
Пример #15
0
def _train(args):
    if not os.path.exists("./results"):
        os.makedirs("./results")
    if not os.path.exists(args.model_dir):
        os.makedirs(args.model_dir)

    # Launch the env with our helper function
    env = launch_env()
    print("Initialized environment")

    # Wrappers
    env = ResizeWrapper(env)
    env = GrayscaleWrapper(env)
    env = NormalizeWrapper(env)
    env = FrameStack(env, 4)
    env = DtRewardWrapper(env)
    env = ActionWrapper(env)
    print("Initialized Wrappers")

    # Set seeds
    seed(args.seed)

    state_dim = env.observation_space.shape
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Init training data
    total_timesteps = 0
    timesteps_since_eval = 0
    episode_num = 0
    episode_reward = 0
    env_counter = 0
    reward = 0
    episode_timesteps = 0

    avg_episodes = 100

    # Keep track of the best reward over time
    best_reward = -np.inf

    # Keep track of train_rewards
    train_rewards = []

    # To print mean actions per episode
    mean_action = []

    # To keep track of moving averages
    moving_avgs = []

    # Summary writer for tensorboard
    writer = SummaryWriter(log_dir="reinforcement/pytorch/runs")

    # Initialize policy
    if args.policy not in policies:
        raise ValueError(
            "Policy {} is not available, chose one of : {}".format(
                args.policy, list(policies.keys())))

    policy = policies[args.policy](state_dim, action_dim, max_action, args.per,
                                   args.gradclip)

    # Evaluate untrained policy
    evaluations = [evaluate_policy(env, policy)]
    moving_avgs.append(evaluations[0])

    writer.add_scalar("Timesteps/EvaluationReward", evaluations[0],
                      total_timesteps)

    ## Initialize ReplayBuffer
    if args.per:
        print("Training with Prioritized Experience Reply")
        replay_buffer = PrioritizedReplayBuffer(
            args.replay_buffer_max_size,
            args.batch_size,
            args.seed,
            initial_beta=0.5,
            delta_beta=2 / args.max_timesteps,
        )
    else:
        replay_buffer = ReplayBuffer(args.replay_buffer_max_size,
                                     args.batch_size, args.seed)

    # Load previous policy
    if args.load_initial_policy:

        # Disable random start steps
        args.start_timesteps = 0

        # Load training data
        checkpoint = load_training_state(args.model_dir,
                                         args.policy + "_training")

        evaluations = checkpoint["evaluations"]
        total_timesteps = checkpoint["total_timesteps"]
        train_rewards = checkpoint["train_rewards"]
        episode_num = checkpoint["episode_num"]
        best_reward = checkpoint["best_reward"]
        moving_avgs = checkpoint["moving_avgs"]

        # Load policy
        policy.load(args.model_dir, args.policy)

    print("Starting training")

    obs = env.reset()

    while total_timesteps < args.max_timesteps:

        # Select action
        if total_timesteps < args.start_timesteps:
            action = env.action_space.sample()
        else:
            action = policy.predict(np.array(obs))
            action = add_noise(action, args.expl_noise, env.action_space.low,
                               env.action_space.high)

        mean_action.append(action)

        # Perform action
        new_obs, reward, done, _ = env.step(action)

        # Update episode reward
        episode_reward += reward

        # Store data in replay buffer
        replay_buffer.add(obs, action, reward, new_obs, float(done))

        # Update network
        if len(replay_buffer) >= args.batch_size:
            policy.update(replay_buffer, args.discount, args.tau)

        # Update env
        obs = new_obs
        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1

        if episode_timesteps >= args.env_timesteps:
            done = True

        if done:
            print((
                "Total T: %d Episode Num: %d \nMean actions: %.2f %.2f Episode T: %d Reward: %.1f Moving Average: %.1f"
            ) % (
                total_timesteps,
                episode_num,
                np.mean(np.array(mean_action), axis=0)[0],
                np.mean(np.array(mean_action), axis=0)[1],
                episode_timesteps,
                episode_reward,
                moving_avgs[-1],
            ))

            train_rewards.append(episode_reward)
            moving_avgs.append(moving_average(train_rewards, avg_episodes))

            writer.add_scalar("Timesteps/Rewards", episode_reward,
                              total_timesteps)
            writer.add_scalar("Timesteps/MovingAverage", moving_avgs[-1],
                              total_timesteps)
            # writer.add_scalar("Episode/Wheel1Mean", np.mean(np.array(mean_action), axis=0)[0], episode_num)
            # writer.add_scalar("Episode/Wheel2Mean", np.mean(np.array(mean_action), axis=0)[1], episode_num)

            # Evaluate episode
            if timesteps_since_eval >= args.eval_freq:

                timesteps_since_eval %= args.eval_freq
                eval_reward = evaluate_policy(env, policy)
                evaluations.append(eval_reward)

                writer.add_scalar("Timesteps/EvaluationReward", eval_reward,
                                  total_timesteps)

                print(
                    "\n-+-+-+-+-+-+-+-+-+-+ Evaluation reward at time {}: {} +-+-+-+-+-+-+-+-+-+-"
                    .format(total_timesteps, eval_reward))

                np.savetxt(
                    "reinforcement/pytorch/results/eval_rewards_" +
                    args.policy + ".csv",
                    np.array(evaluations),
                    delimiter=",",
                )
                np.savetxt(
                    "reinforcement/pytorch/results/train_rewards_" +
                    args.policy + ".csv",
                    np.array(train_rewards),
                    delimiter=",",
                )
                np.savetxt(
                    "reinforcement/pytorch/results/moving_averages_" +
                    args.policy + ".csv",
                    np.array(moving_avgs),
                    delimiter=",",
                )

                # Save the policy according to the best reward over training
                if eval_reward > best_reward:
                    best_reward = eval_reward
                    policy.save(args.model_dir, args.policy)
                    save_training_state(
                        args.model_dir,
                        args.policy + "_training",
                        best_reward,
                        total_timesteps,
                        evaluations,
                        train_rewards,
                        episode_num,
                        moving_avgs,
                    )

                    print(
                        "-+-+-+-+-+-+-+-+-+-+ Model saved +-+-+-+-+-+-+-+-+-+-\n"
                    )

            # Reset environment
            mean_action = []

            obs = env.reset()
            env_counter += 1
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

    print("Finished..should return now!")