Example No. 1
def main():
    global subdata
    t_start = time.time()

    parser = argparse.ArgumentParser(description='PyTorch X-job')
    parser.add_argument('--env_name',
                        default="OurEnv-v0",
                        help='name of the environment')
    parser.add_argument('--gamma',
                        type=float,
                        default=0.99,
                        metavar='G',
                        help='discount factor for reward (default: 0.99)')
    parser.add_argument('--tau',
                        type=float,
                        default=0.001,
                        help='soft update factor for the target network (default: 0.001)')
    parser.add_argument('--ou_noise', type=bool, default=True)
    parser.add_argument('--noise_scale',
                        type=float,
                        default=0.4,
                        metavar='G',
                        help='initial noise scale (default: 0.4)')
    parser.add_argument('--final_noise_scale',
                        type=float,
                        default=0.3,
                        metavar='G',
                        help='final noise scale (default: 0.3)')
    parser.add_argument('--exploration_end',
                        type=int,
                        default=33,
                        metavar='N',
                        help='number of episodes with noise (default: 33)')
    parser.add_argument('--seed',
                        type=int,
                        default=4,
                        metavar='N',
                        help='random seed (default: 4)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=512,
                        metavar='N',
                        help='batch size (default: 512)')
    parser.add_argument('--num_steps',
                        type=int,
                        default=300,
                        metavar='N',
                        help='max episode length (default: 300)')
    parser.add_argument('--num_episodes',
                        type=int,
                        default=50,
                        metavar='N',
                        help='number of episodes (default: 50)')
    parser.add_argument('--hidden_size',
                        type=int,
                        default=128,
                        metavar='N',
                        help='hidden size (default: 128)')
    parser.add_argument('--replay_size',
                        type=int,
                        default=1000000,
                        metavar='N',
                        help='size of replay buffer (default: 1000000)')
    parser.add_argument('--save_agent',
                        type=bool,
                        default=True,
                        help='save model to file')
    parser.add_argument('--load_agent',
                        type=bool,
                        default=False,
                        help='load model from file')
    parser.add_argument('--train_model',
                        type=bool,
                        default=True,
                        help='Training or run')
    parser.add_argument('--load_exp',
                        type=bool,
                        default=False,
                        help='load saved experience')
    parser.add_argument('--state_plot',
                        type=bool,
                        default=True,
                        help='plot Q values for environment')
    parser.add_argument('--greedy_steps',
                        type=int,
                        default=5,
                        metavar='N',
                        help='number of greedy evaluation episodes (default: 5)')

    args = parser.parse_args()

    #env = gym.make(args.env_name)

    env = Env()

    #env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    # -- initialize agent, Q and Q' --
    agent = NAF(args.gamma, args.tau, args.hidden_size,
                env.observation_space.shape[0], env.action_space)

    # -- declare memory buffer and random process N
    memory = ReplayMemory(args.replay_size)
    memory_g = ReplayMemory(args.replay_size)
    ounoise = OUNoise(env.action_space.shape[0]) if args.ou_noise else None

    # -- load existing model --
    if args.load_agent:
        agent.load_model(args.env_name, args.batch_size, '.pth')
        print("agent: naf_{}_{}_{}, is loaded").format(args.env_name,
                                                       args.batch_size, '.pth')
    # -- load experience buffer --
    if args.load_exp:
        with open('/home/aass/catkin_workspace/src/panda_demos/exp_replay.pk1',
                  'rb') as input:
            memory.memory = pickle.load(input)
            memory.position = len(memory)

    #sate_Q_plot(agent, 50)

    rewards = []
    total_numsteps = 0
    greedy_reward = []
    avg_greedy_reward = []
    upper_reward = []
    lower_reward = []
    steps_to_goal = []
    avg_steps_to_goal = []
    state_plot = []

    sim_reset_start()

    pub = rospy.Publisher('/ee_rl/act', DesiredErrorDynamicsMsg, queue_size=10)
    rospy.Subscriber("/ee_rl/state", StateMsg, callback)
    rate = rospy.Rate(9)
    rate.sleep()

    for i_episode in range(args.num_episodes + 1):
        # -- reset environment for every episode --
        sim_reset()
        state = torch.Tensor(subdata).unsqueeze(0)

        # -- initialize noise (random process N) --
        if args.ou_noise:
            ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(
                0, args.exploration_end -
                i_episode) / args.exploration_end + args.final_noise_scale
            ounoise.reset()

        episode_reward = 0

        while True:
            # -- action selection, observation and store transition --
            action = agent.select_action(
                state,
                ounoise) if args.train_model else agent.select_action(state)
            a = action.numpy()[0] * 50
            act_pub = [a[0], a[1]]
            pub.publish(act_pub)
            next_state = torch.Tensor(subdata).unsqueeze(0)
            reward, done, _ = env.calc_shaped_reward(next_state)

            total_numsteps += 1
            episode_reward += reward

            action = torch.Tensor(action)
            mask = torch.Tensor([not done])
            reward = torch.Tensor([reward])

            memory.push(state, action, mask, next_state, reward)
            # if done:
            #     for i in range(total_numsteps % args.num_steps):
            #         a = i+1
            #         memory_g.memory.append(memory.memory[-a])
            #         memory_g.position += 1

            state = next_state

            #-- training --
            # if len(memory_g) > args.batch_size / 2 and len(memory) > args.batch_size/2 and args.train_model:
            #     for _ in range(10):
            #         transitions_b = memory.sample(args.batch_size/2)
            #         transitions_g = memory_g.sample(args.batch_size/2)
            #         for i in range(transitions_g):
            #             transitions_b.append(transitions_g[i])
            #         batch = Transition(*zip(*transitions_b))
            #         agent.update_parameters(batch)

            if len(memory) > args.batch_size and args.train_model:
                for _ in range(10):
                    transitions = memory.sample(args.batch_size)
                    batch = Transition(*zip(*transitions))
                    agent.update_parameters(batch)

            else:
                time.sleep(0.1)
            rate.sleep()

            if done or total_numsteps % args.num_steps == 0:
                break

        pub.publish([0, 0])
        rewards.append(episode_reward)

        # -- plot Q value --
        if i_episode % 10 == 0:

            sate_Q_plot(agent, i_episode)
            # -- saves model --
            if args.save_agent:
                agent.save_model(args.env_name, args.batch_size, i_episode,
                                 '.pth')
                with open('exp_replay.pk1', 'wb') as output:
                    pickle.dump(memory.memory, output, pickle.HIGHEST_PROTOCOL)
                #with open('exp_replay_g.pk1', 'wb') as output:
                #pickle.dump(memory_g.memory, output, pickle.HIGHEST_PROTOCOL)

        if args.train_model:
            greedy_episode = max(args.num_episodes / 100, 5)
        else:
            greedy_episode = 10
        greedy_range = min(args.greedy_steps, greedy_episode)

        # -- calculates episode without noise --
        if i_episode % greedy_episode == 0 and not i_episode == 0:
            for _ in range(0, greedy_range + 1):
                # -- reset environment for every episode --
                sim_reset()
                state_visited = []
                action_taken = []
                print("Greedy episode ongoing")

                state = torch.Tensor(subdata).unsqueeze(0)
                episode_reward = 0
                steps = 0

                state_plot.append([])
                st = state.numpy()[0]
                sta = [st[0], st[1]]
                state_plot[-1].append(sta)

                while True:
                    action = agent.select_action(state)
                    a = action.numpy()[0] * 50
                    act_pub = [a[0], a[1]]
                    pub.publish(act_pub)
                    next_state = torch.Tensor(subdata).unsqueeze(0)
                    reward, done, obs_hit = env.calc_shaped_reward(next_state)
                    episode_reward += reward

                    state_visited.append(state)
                    action_taken.append(action)

                    state = next_state

                    steps += 1
                    if done or steps == args.num_steps:
                        greedy_reward.append(episode_reward)
                        break
                    rate.sleep()

                if obs_hit:
                    steps = 300

                steps_to_goal.append(steps)

                # -- plot path --
                if i_episode % 10 == 0:
                    agent.plot_path(state_visited, action_taken, i_episode)

            upper_reward.append((np.max(greedy_reward[-greedy_range:])))
            lower_reward.append((np.min(greedy_reward[-greedy_range:])))
            avg_greedy_reward.append((np.mean(greedy_reward[-greedy_range:])))
            avg_steps_to_goal.append((np.mean(steps_to_goal[-greedy_range:])))

            print(
                "Episode: {}, total numsteps: {}, avg_greedy_reward: {}, average reward: {}"
                .format(i_episode, total_numsteps, avg_greedy_reward[-1],
                        np.mean(rewards[-greedy_episode:])))

    #-- saves model --
    if args.save_agent:
        agent.save_model(args.env_name, args.batch_size, i_episode, '.pth')
        with open('exp_replay.pk1', 'wb') as output:
            pickle.dump(memory.memory, output, pickle.HIGHEST_PROTOCOL)
        #with open('exp_replay_g.pk1', 'wb') as output:
        #    pickle.dump(memory_g.memory, output, pickle.HIGHEST_PROTOCOL)

    print('Training ended after {} minutes'.format(
        (time.time() - t_start) / 60))
    print('Time per episode: {} s'.format(
        (time.time() - t_start) / args.num_episodes))
    print('Mean greedy reward: {}'.format(np.mean(greedy_reward)))
    print('Mean reward: {}'.format(np.mean(rewards)))
    print('Max reward: {}'.format(np.max(rewards)))
    print('Min reward: {}'.format(np.min(rewards)))

    # -- plot learning curve --
    pos_greedy = []
    for pos in range(0, len(lower_reward)):
        pos_greedy.append(pos * greedy_episode)

    plt.title('Greedy policy outcome')
    plt.fill_between(pos_greedy,
                     lower_reward,
                     upper_reward,
                     facecolor='red',
                     alpha=0.3)
    plt.plot(pos_greedy, avg_greedy_reward, 'r')
    plt.xlabel('Number of episodes')
    plt.ylabel('Rewards')
    fname1 = 'plot1_obs_{}_{}.png'.format(args.env_name, args.batch_size)
    plt.savefig(fname1)
    plt.close()

    plt.title('Steps to reach goal')
    plt.plot(steps_to_goal)
    plt.ylabel('Number of steps')
    plt.xlabel('Number of episodes')
    fname2 = 'plot2_obs_{}_{}.png'.format(args.env_name, args.batch_size)
    plt.savefig(fname2)
    plt.close()
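
Example No. 1 pushes (state, action, mask, next_state, reward) tuples into ReplayMemory and rebuilds training batches with Transition(*zip(*transitions)). Below is a minimal sketch of the buffer this usage assumes, following the common PyTorch NAF/DDPG reference pattern; the ReplayMemory class actually imported by this script may differ in detail.

import random
from collections import namedtuple

# Field order matches the memory.push(state, action, mask, next_state, reward)
# calls in the example above.
Transition = namedtuple('Transition',
                        ('state', 'action', 'mask', 'next_state', 'reward'))


class ReplayMemory(object):
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        # Store a transition, overwriting the oldest entry once full.
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)
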
Example No. 2
def main():
    parser = argparse.ArgumentParser(description='PyTorch X-job')
    parser.add_argument('--env_name',
                        default="Pendulum-v0",
                        help='name of the environment')
    parser.add_argument('--gamma',
                        type=float,
                        default=0.99,
                        metavar='G',
                        help='discount factor for reward (default: 0.99)')
    parser.add_argument('--tau',
                        type=float,
                        default=0.001,
                        help='soft update factor for the target network (default: 0.001)')
    parser.add_argument('--ou_noise', type=bool, default=True)
    parser.add_argument('--noise_scale',
                        type=float,
                        default=0.4,
                        metavar='G',
                        help='initial noise scale (default: 0.4)')
    parser.add_argument('--final_noise_scale',
                        type=float,
                        default=0.3,
                        metavar='G',
                        help='final noise scale (default: 0.3)')
    parser.add_argument('--exploration_end',
                        type=int,
                        default=33,
                        metavar='N',
                        help='number of episodes with noise (default: 33)')
    parser.add_argument('--seed',
                        type=int,
                        default=4,
                        metavar='N',
                        help='random seed (default: 4)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=200,
                        metavar='N',
                        help='batch size (default: 200)')
    parser.add_argument('--num_steps',
                        type=int,
                        default=100,
                        metavar='N',
                        help='max episode length (default: 100)')
    parser.add_argument('--num_episodes',
                        type=int,
                        default=5000,
                        metavar='N',
                        help='number of episodes (default: 5000)')
    parser.add_argument('--hidden_size',
                        type=int,
                        default=128,
                        metavar='N',
                        help='hidden size (default: 128)')
    parser.add_argument('--updates_per_step',
                        type=int,
                        default=5,
                        metavar='N',
                        help='model updates per simulator step (default: 5)')
    parser.add_argument('--replay_size',
                        type=int,
                        default=1000000,
                        metavar='N',
                        help='size of replay buffer (default: 1000000)')
    parser.add_argument('--save_agent',
                        type=bool,
                        default=True,
                        help='save model to file')
    parser.add_argument('--train_model',
                        type=bool,
                        default=True,
                        help='Training or run')
    parser.add_argument('--load_agent',
                        type=bool,
                        default=False,
                        help='load model from file')
    parser.add_argument('--load_exp',
                        type=bool,
                        default=False,
                        help='load saved experience')
    parser.add_argument('--greedy_steps',
                        type=int,
                        default=10,
                        metavar='N',
                        help='number of greedy evaluation episodes (default: 10)')

    args = parser.parse_args()

    env = ManipulateEnv()
    #env = gym.make(args.env_name)
    writer = SummaryWriter('runs/')

    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    # -- initialize agent --
    agent = NAF(args.gamma, args.tau, args.hidden_size,
                env.observation_space.shape[0], env.action_space)

    # -- declare memory buffer and random process N
    memory = ReplayMemory(args.replay_size)
    ounoise = OUNoise(env.action_space.shape[0]) if args.ou_noise else None

    # -- load existing model --
    if args.load_agent:
        agent.load_model(args.env_name, args.batch_size, args.num_episodes,
                         '.pth')
        print("agent: naf_{}_{}_{}_{}, is loaded".format(
            args.env_name, args.batch_size, args.num_episodes, '.pth'))

    # -- load experience buffer --
    if args.load_exp:
        with open(
                '/home/quantao/Workspaces/catkin_ws/src/panda_demos/naf_env/src/exp_replay.pk1',
                'rb') as input:
            memory.memory = pickle.load(input)
            memory.position = len(memory)

    rewards = []
    total_numsteps = 0
    updates = 0

    #env.init_ros()
    #env.reset()

    t_start = time.time()

    for i_episode in range(args.num_episodes + 1):
        # -- reset environment for every episode --
        #state = env.reset()
        state = torch.Tensor([env.reset()])

        # -- initialize noise (random process N) --
        if args.ou_noise:
            ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(
                0, args.exploration_end -
                i_episode) / args.exploration_end + args.final_noise_scale
            ounoise.reset()

        episode_reward = 0
        while True:
            # -- action selection, observation and store transition --
            action = agent.select_action(
                state,
                ounoise) if args.train_model else agent.select_action(state)

            next_state, reward, done, info = env.step(action)

            #env.render()
            total_numsteps += 1
            episode_reward += reward

            action = torch.Tensor(action)
            mask = torch.Tensor([not done])
            reward = torch.Tensor([reward])
            next_state = torch.Tensor([next_state])

            #print('reward:', reward)
            memory.push(state, action, mask, next_state, reward)

            state = next_state

            #else:
            #    time.sleep(0.005)

            #env.render()
            #time.sleep(0.005)
            #env.rate.sleep()

            if done or total_numsteps % args.num_steps == 0:
                break

        if len(memory) >= args.batch_size and args.train_model:
            env.reset()
            print("Training model")

            for _ in range(args.updates_per_step * args.num_steps):
                transitions = memory.sample(args.batch_size)
                batch = Transition(*zip(*transitions))
                value_loss, policy_loss = agent.update_parameters(batch)

                writer.add_scalar('loss/value', value_loss, updates)
                writer.add_scalar('loss/policy', policy_loss, updates)

                updates += 1
        writer.add_scalar('reward/train', episode_reward, i_episode)
        print("Train Episode: {}, total numsteps: {}, reward: {}".format(
            i_episode, total_numsteps, episode_reward))

        rewards.append(episode_reward)

        greedy_numsteps = 0
        if i_episode % 10 == 0:
            #state = env.reset()
            state = torch.Tensor([env.reset()])

            episode_reward = 0
            while True:
                action = agent.select_action(state)

                next_state, reward, done, info = env.step(action)
                episode_reward += reward
                greedy_numsteps += 1

                #state = next_state
                state = torch.Tensor([next_state])

                #env.render()
                #time.sleep(0.01)
                #   env.rate.sleep()

                if done or greedy_numsteps % args.num_steps == 0:
                    break

            writer.add_scalar('reward/test', episode_reward, i_episode)

            rewards.append(episode_reward)
            print(
                "Episode: {}, total numsteps: {}, reward: {}, average reward: {}"
                .format(i_episode, total_numsteps, rewards[-1],
                        np.mean(rewards[-10:])))

    #-- saves model --
    if args.save_agent:
        agent.save_model(args.env_name, args.batch_size, args.num_episodes,
                         '.pth')
        with open('exp_replay.pk1', 'wb') as output:
            pickle.dump(memory.memory, output, pickle.HIGHEST_PROTOCOL)

    print('Training ended after {} minutes'.format(
        (time.time() - t_start) / 60))
    print('Time per episode: {} s'.format(
        (time.time() - t_start) / args.num_episodes))
    print('Mean reward: {}'.format(np.mean(rewards)))
    print('Max reward: {}'.format(np.max(rewards)))
    print('Min reward: {}'.format(np.min(rewards)))
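
Examples No. 1 and No. 2 anneal ounoise.scale between episodes and call ounoise.reset() before each rollout. A minimal sketch of the Ornstein-Uhlenbeck noise process this assumes follows; the parameter defaults (mu, theta, sigma) are conventional values, not taken from these scripts.

import numpy as np


class OUNoise:
    def __init__(self, action_dimension, scale=0.1, mu=0.0, theta=0.15,
                 sigma=0.2):
        self.action_dimension = action_dimension
        self.scale = scale
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = np.ones(self.action_dimension) * self.mu

    def reset(self):
        # Restart the process at its mean at the beginning of an episode.
        self.state = np.ones(self.action_dimension) * self.mu

    def noise(self):
        # One Euler step of the OU process, scaled by the annealed factor.
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return self.state * self.scale
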
Example No. 3
def fit_nash():
    suffix = 'Nash_{}_RC_{}_AttackMode_{}_RewardMode_{}'.format(args.NashMode, RC, args.AttackMode, args.RewardMode)
    # reward_file = open('reward' + suffix + '.txt', 'w')
    # attack_file = open('attacker_action' + suffix + '.txt', 'w')
    # weight_file = open('vehicle_weight' + suffix + '.txt', 'w')
    # distance_file = open('Distance' + suffix + '.txt', 'w')



#     reward_file.write("""
# Environment Initializing...
# The initial head car velocity is {}
# The initial safe distance is     {}
# The Nash Eq* Factor RC is        {}
# The Reward Calculation Mode is   {}
# The Attack Mode is               {}
# The Nash Mode is                 {}
# """.format(env.v_head, env.d0, RC, env.reward_mode, env.attack_mode, args.Nash))

    # reward_file.close()
    # attack_file.close()
    # weight_file.close()
    # distance_file.close()

    agent_vehicle = NAF(args.gamma, args.tau, args.hidden_size,
                        env.observation_space, env.vehicle_action_space, 'veh')
    agent_attacker = NAF(args.gamma, args.tau, args.hidden_size,
                         env.observation_space, env.attacker_action_space, 'att')
    try:
        agent_vehicle.load_model('models/vehicle_' + suffix)
        print('Load vehicle RL model successfully')

    except Exception:
        print('No existing vehicle RL model found')
    try:
        agent_attacker.load_model('models/attacker_' + suffix)
        print('Load attacker RL model successfully')

    except Exception:
        print('No existing attacker RL model found')
    try:
        policy_vehicle = load_model('models/vehicle_' + suffix + '.h5')
        print('Load vehicle SL model successfully')
    except Exception:
        policy_vehicle = create_SL_model(env.observation_space, env.vehicle_action_space, 'vehicle')
    try:
        policy_attacker = load_model('models/attacker_' + suffix + '.h5')
        print('Load attacker SL model successfully')
    except Exception:
        policy_attacker = create_SL_model(env.observation_space, env.attacker_action_space, 'attacker')
    print('*'*20, '\n\n\n')
    memory_vehicle = ReplayMemory(100000)
    memory_attacker = ReplayMemory(100000)

    memory_SL_vehicle = ReplayMemory(400000)
    memory_SL_attacker = ReplayMemory(400000)

    ounoise_vehicle = OUNoise(env.vehicle_action_space) if args.ou_noise else None
    ounoise_attacker = OUNoise(env.attacker_action_space) if args.ou_noise else None

    param_noise_vehicle = AdaptiveParamNoiseSpec(initial_stddev=0.05,
                                                 desired_action_stddev=args.noise_scale,
                                                 adaptation_coefficient=1.05) if args.param_noise else None
    param_noise_attacker = AdaptiveParamNoiseSpec(initial_stddev=0.05,
                                                  desired_action_stddev=args.noise_scale,
                                                  adaptation_coefficient=1.05) if args.param_noise else None
    res_data = pd.DataFrame(columns=['Weight', 'Attack', 'Eva_distance'])
    reward_data = pd.DataFrame(columns=['Reward'])
    rewards = []
    total_numsteps = 0
    for i_episode in range(args.num_episodes):
        if i_episode % 100 == 0 and i_episode != 0:
            print('Writing to CSV files...')
            reward_data.to_csv(suffix + '_reward.csv', index=False)
            res_data.to_csv(suffix + '.csv', index=False)

        if args.NashMode == 0:
            ETA = 0
        elif args.NashMode == 1:
            ETA = 0.5
        elif args.NashMode == 2:
            ETA = 0.1 - i_episode/args.num_episodes * 0.1

        print('No.{} episode starts... ETA is {}'.format(i_episode, ETA))

        # reward_file = open('reward' + suffix + '.txt', 'a')
        # attack_file = open('attacker_action' + suffix + '.txt', 'a')
        # weight_file = open('vehicle_weight' + suffix + '.txt', 'a')
        # distance_file = open('Distance' + suffix + '.txt', 'a')

        local_steps = 0
        state = env.reset()
        state_record = [np.array([state])]
        episode_steps = 0
        while len(state_record) < 20:
            a, b = env.random_action()
            s, _, _ = env.step(np.array([a]), np.zeros(4))
            local_steps += 1
            state_record.append(s)
        if args.ou_noise:
            ounoise_vehicle.scale = (args.noise_scale - args.final_noise_scale) * max(0, args.exploration_end -
                                                                                      i_episode) / args.exploration_end + args.final_noise_scale
            ounoise_vehicle.reset()

            ounoise_attacker.scale = (args.noise_scale - args.final_noise_scale) * max(0, args.exploration_end -
                                                                                       i_episode) / args.exploration_end + args.final_noise_scale
            ounoise_attacker.reset()
        episode_reward = 0
        local_steps = 0
        while True:
            sigma = random.random()
            if sigma > ETA:
                # print(state_record[-20:])
                # print('rl', torch.Tensor(state_record[-20:]).shape)
                action_vehicle = agent_vehicle.select_action(torch.Tensor(state_record[-20:]), ounoise_vehicle,
                                                             param_noise_vehicle)[:, -1, :]
                # print('rl', action_vehicle.shape)
                action_attacker = agent_attacker.select_action(torch.Tensor(state_record[-20:]), ounoise_attacker,
                                                               param_noise_attacker)[:, -1, :]
                # print('rl', action_vehicle.shape)
            else:
                action_vehicle = torch.Tensor(
                    [policy_vehicle.predict(state_record[-1].reshape(-1, 4)) / policy_vehicle.predict(
                        state_record[-1].reshape(-1, 4)).sum()])[0]
                action_attacker = torch.Tensor(
                    [policy_attacker.predict(state_record[-1].reshape(-1, 4)) / policy_attacker.predict(
                        state_record[-1].reshape(-1, 4)).sum()])[0]

            # Constrain the vehicle action weights to sum to 1
            action_vehicle = action_vehicle.numpy()[0]/(action_vehicle.numpy()[0].sum())
            action_attacker = action_attacker.numpy()[0]

            next_state, reward, done = env.step(action_vehicle, action_attacker)
            res_data = res_data.append([{'Attack':env.action_attacker, 'Weight':action_vehicle, 'Eva_distance':env.d}])

            # Overwrite with the processed attack values from the environment
            action_attacker = env.action_attacker

            total_numsteps += 1
            episode_reward += reward

            state_record.append(next_state)
            local_steps += 1
            episode_steps += 1

            if sigma > ETA:
                memory_SL_vehicle.append(state_record[-1], action_vehicle)
                memory_SL_attacker.append(state_record[-1], action_attacker)

            action_vehicle = torch.Tensor(action_vehicle.reshape(1,4))
            action_attacker = torch.Tensor(action_attacker.reshape(1,4))

            mask = torch.Tensor([not done])
            prev_state = torch.Tensor(state_record[-20:]).transpose(0, 1)
            next_state = torch.Tensor([next_state])

            reward_vehicle = torch.Tensor([reward])
            reward_attacker = torch.Tensor([RC - reward])

            memory_vehicle.push(prev_state, torch.Tensor(action_vehicle), mask, next_state, reward_vehicle)
            memory_attacker.push(prev_state, torch.Tensor(action_attacker), mask, next_state, reward_attacker)

            if done:
                rewards.append(episode_reward)
                print('Episode {} ends, instant reward is {:.2f}'.format(i_episode, episode_reward))
                reward_data = reward_data.append([{'Reward': episode_reward}])
                    # reward_file.write('Episode {} ends, instant reward is {:.2f}\n'.format(i_episode, episode_reward))
                break

        if min(len(memory_vehicle), len(memory_SL_vehicle)) > args.batch_size:  # start training
            for _ in range(args.updates_per_step):
                transitions_vehicle = memory_vehicle.sample(args.batch_size)
                batch_vehicle = Transition(*zip(*transitions_vehicle))

                transitions_attacker = memory_attacker.sample(args.batch_size)
                batch_attacker = Transition(*zip(*transitions_attacker))

                trans_veh = memory_SL_vehicle.sample(args.batch_size)
                trans_att = memory_SL_attacker.sample(args.batch_size)

                states_veh = []
                actions_veh = []
                states_att = []
                actions_att = []
                for sample in trans_veh:
                    state_veh, act_veh = sample
                    states_veh.append(state_veh)
                    actions_veh.append(act_veh)
                for sample in trans_att:
                    state_att, act_att = sample
                    states_att.append(state_att)
                    actions_att.append(act_att)

                states_veh = np.reshape(states_veh, (-1, env.observation_space))
                states_att = np.reshape(states_att, (-1, env.observation_space))
                actions_veh = np.reshape(actions_veh, (-1, env.vehicle_action_space))
                actions_att = np.reshape(actions_att, (-1, env.attacker_action_space))

                policy_vehicle.fit(states_veh, actions_veh, verbose=False)
                policy_attacker.fit(states_att, actions_att, verbose=False)
                agent_vehicle.update_parameters(batch_vehicle)
                agent_attacker.update_parameters(batch_attacker)

                # writer.add_scalar('loss/value', value_loss, updates)
                # writer.add_scalar('loss/policy', policy_loss, updates)

        if i_episode % 10 == 0 and i_episode != 0:

            eva_res_data = pd.DataFrame(columns=['Eva_reward', 'Eva_distance'])


            # distance_file.write('{} episode starts, recording distance...\n'.format(i_episode))
            state = env.reset()
            state_record = [np.array([state])]
            evaluate_reward = 0
            while len(state_record) < 20:
                a, b = env.random_action()
                s, _, _ = env.step(np.array([a]), np.zeros(4))
                local_steps += 1
                state_record.append(s)
            while True:
                if random.random() < ETA:
                    action_vehicle = agent_vehicle.select_action(torch.Tensor(state_record[-20:]), ounoise_vehicle,
                                                                 param_noise_vehicle)[:, -1, :]
                    # print('rl', action_vehicle.shape)
                    action_attacker = agent_attacker.select_action(torch.Tensor(state_record[-20:]), ounoise_attacker,
                                                                   param_noise_attacker)[:, -1, :]
                else:
                    action_vehicle = torch.Tensor(
                        [policy_vehicle.predict(state_record[-1].reshape(-1, 4)) / policy_vehicle.predict(
                            state_record[-1].reshape(-1, 4)).sum()])[0]
                    action_attacker = torch.Tensor(
                        [policy_attacker.predict(state_record[-1].reshape(-1, 4))])[0]

                action_vehicle = action_vehicle.numpy()[0] / action_vehicle.numpy()[0].sum()
                action_attacker = action_attacker.numpy()[0]
                next_state, reward, done = env.step(action_vehicle, action_attacker, attack_mode=2)

                eva_res_data = eva_res_data.append([{'Eva_reward':evaluate_reward, 'Eva_distance':env.d}])
                evaluate_reward += reward


                if done:
                    print("Episode: {}, total numsteps: {}, reward: {}, average reward: {}".format(i_episode,
                                                                                                   total_numsteps,
                                                                                                   evaluate_reward,
                                                                                                   np.mean(rewards[-10:])))
                    # reward_file.write("Episode: {}, total numsteps: {}, reward: {}, average reward: {}\n".format(i_episode,
                    #                                                                                              total_numsteps,
                    #                                                                                              evaluate_reward,
                    #                                                                                              np.mean(rewards[-10:])))
                    break
        #         # writer.add_scalar('reward/test', episode_reward, i_episode)
        # reward_file.close()
        # attack_file.close()
        # weight_file.close()
        # distance_file.close()
    env.close()
    reward_data.to_csv(suffix+'_reward.csv', index=False)
    res_data.to_csv(suffix+'.csv', index=False)
    eva_res_data.to_csv(suffix+'_eva.csv', index=False)

    # save model
    agent_vehicle.save_model('vehicle_'+suffix)
    agent_attacker.save_model('attacker_'+suffix)

    policy_attacker.save('models/attacker_'+suffix+'.h5')
    policy_vehicle.save('models/vehicle_'+suffix+'.h5')
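
Example No. 3 falls back to create_SL_model(...) when no saved Keras policy is found, and later calls predict, fit, and save on the returned model. The sketch below is a hypothetical version of such a supervised policy network; the layer widths, loss, and softmax output are illustrative assumptions, not the original definition.

from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential


def create_SL_model(observation_space, action_space, name):
    # Hypothetical average-policy network: state in, action weights out.
    model = Sequential(name=name)
    model.add(Dense(128, activation='relu', input_shape=(observation_space,)))
    model.add(Dense(128, activation='relu'))
    # Softmax keeps the predicted weights non-negative and summing to one,
    # matching the normalization applied in the example above.
    model.add(Dense(action_space, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy')
    return model
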
Example No. 4
        #         action = agent.select_action(state)
        #
        #         next_state, reward, done, _ = env.step(action.numpy()[0])
        #         episode_reward += reward
        #
        #         next_state = torch.Tensor([next_state])
        #         t+=1
        #         state = next_state
        #         if done or t==args.num_steps:
        #             break
        #
        #
        #
        #     rewards.append(episode_reward)
        #     print("Test: Episode: {}, total numsteps: {}, reward: {}, average reward: {}".format(i_episode, total_numsteps, rewards[-1], np.mean(rewards[-10:])))

    # Export the reward data to an Excel file
    reward = np.array(rewards)
    data = pd.DataFrame(reward)
    writer = pd.ExcelWriter('NAF_dual_rewards.xlsx')  # write to an Excel file
    data.to_excel(writer, 'page_1', float_format='%.5f')
    # 'page_1' is the name of the Excel sheet being written
    ep_r_hist = np.reshape(reward, [-1, 1])
    # plot rewards
    plt.figure(1)
    plt.plot(ep_r_hist)
    plt.savefig('NAF_dual_obs.png')
    writer.save()
    writer.close()
    agent.save_model(args.env_name, "6DOF")
Example No. 5
    if i_episode % 10 == 0:
        state = torch.Tensor([env.reset()]).to(device)
        episode_reward = 0
        while True:
            action = agent.select_action(state)

            next_state, reward, done, _ = env.step(
                np.append(args.heading_speed,
                          action.numpy()[0]))
            episode_reward += reward

            next_state = torch.Tensor([next_state]).to(device)

            state = next_state
            if done:
                break

        rewards_test.append(episode_reward)
        print(
            "Episode: {}, total numsteps: {}, test_reward: {}, test_average reward: {}"
            .format(i_episode, total_numsteps, rewards_test[-1],
                    np.mean(rewards_test[-10:])))

        logger.log_scalar_rl("reward/test", rewards_test,
                             args.sliding_window_size,
                             [i_episode, total_numsteps, updates])
        agent.save_model(args.env_name, args.suffix)

env.close()
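
Example No. 5 is a fragment of a longer training script; it presumes setup along these lines earlier in the file (a sketch with hypothetical placeholder values chosen to match the names used in the fragment, not the original code).

import torch

# Target for the .to(device) calls in the fragment.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

rewards_test = []   # greedy-evaluation returns, appended every 10th episode
total_numsteps = 0  # environment steps taken so far
updates = 0         # gradient updates performed so far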