Example 1
    def __init__(self):
        # if gpu is to be used
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.device = device
        self.policy_net = ActorCritic().to(device).double()
        self.target_net = ActorCritic().to(device).double()
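        # target_net starts as an exact copy of policy_net and is kept in eval mode.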
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.lr = 1e-5
        # Actor optimizer: the two action heads plus the shared trunk.
        self.optimizer = optim.Adam([
            {"params": self.policy_net.head_a_m.parameters()},
            {"params": self.policy_net.head_a_t.parameters()},
            {"params": self.policy_net.fc.parameters()},
        ], lr=self.lr)
        # Critic optimizer: the value head plus the same shared trunk
        # (the fc parameters are registered with both optimizers).
        self.optimizer2 = optim.Adam([
            {"params": self.policy_net.head_v.parameters()},
            {"params": self.policy_net.fc.parameters()},
        ], lr=self.lr)
        self.memory = ReplayMemory(100000)
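
ReplayMemory is not defined in this snippet. A minimal sketch in the spirit of the standard PyTorch DQN tutorial buffer (the Transition fields here are an assumption, not taken from the listing):

import random
from collections import deque, namedtuple

Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))

class ReplayMemory:
    def __init__(self, capacity):
        # Bounded FIFO buffer: the oldest transitions are dropped once capacity is reached.
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)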
Example 2
if __name__ == '__main__':
    mp.set_start_method("spawn")
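    # Limit each worker to one OpenMP thread so the A3C processes don't oversubscribe CPU cores.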
    os.environ['OMP_NUM_THREADS'] = '1'

    args = parser.parse_args()

    torch.cuda.set_device(args.gpu_id)

    torch.manual_seed(args.seed)
    env = create_atari_env(args.env_name, args)
    if args.black_box_attack:
        shared_model = ActorCritic_Substitude(env.observation_space.shape[0],
                                              env.action_space)
    else:
        shared_model = ActorCritic(env.observation_space.shape[0],
                                   env.action_space)
    shared_model.share_memory()

    if args.no_shared:
        optimizer = None
    else:
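        # SharedAdam (as in the common pytorch-a3c implementation) keeps the Adam
        # state buffers in shared memory so all worker processes update the same optimizer state.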
        optimizer = my_optim.SharedAdam(shared_model.parameters(), lr=args.lr)
        optimizer.share_memory()

    # load a pre-trained model according to the ft-setting

    if args.ft_setting == 'full-ft':
        if args.env_name == 'BreakoutDeterministic-v4':
            fname = './agent/trained_model/breakout/11000.pth.tar'
        elif args.env_name == 'PongDeterministic-v4':
            fname = './agent/trained_model/pong/4000.pth.tar'
Example 3
def train(rank, args, shared_model, counter, lock, optimizer=None):
    print('Train with A3C')
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name, args)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()
    output_directory = 'outputs/' + args.env_name
    checkpoint_directory, result_directory = prepare_sub_folder(
        output_directory)
    print(f'checkpoint directory {checkpoint_directory}')
    time.sleep(10)
    state = env.reset()
    state = torch.from_numpy(state)
    done = True
    episode_length = 0
    total_step = 0
    rewards_ep = []
    policy_loss_ep = []
    value_loss_ep = []
    for epoch in range(100000000):
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())

        values = []
        log_probs = []
        rewards = []
        entropies = []

        # for step in range(args.num_steps):
        is_Terminal = False
        while not is_Terminal:
            episode_length += 1
            total_step += 1
            value, logit = model(state.unsqueeze(0))
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).detach()
            log_prob = log_prob.gather(1, action)

            state, reward, done, _ = env.step(action.numpy())

            done = done or episode_length >= args.max_episode_length
            reward = max(min(reward, 1), -1)  # clip rewards to [-1, 1], as is standard for Atari

            with lock:
                counter.value += 1

            if done:
                # print(episode_length)
                print(
                    f'epoch {epoch} - steps {total_step} - total rewards {np.sum(rewards) + reward}'
                )
                total_step = 1
                episode_length = 0
                state = env.reset()

            state = torch.from_numpy(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                rewards_ep.append(np.sum(rewards))
                is_Terminal = True
                # break

        R = torch.zeros(1, 1)
        if not done:
            value, _ = model(state.unsqueeze(0))
            R = value.detach()

        values.append(R)
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
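        # Walk the trajectory backwards, accumulating the discounted return R for the
        # value loss and the GAE advantage for the policy loss.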
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation (GAE):
            # delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
            # gae_t   = sum_l (gamma * tau)^l * delta_{t+l}
            delta_t = rewards[i] + args.gamma * \
                values[i + 1] - values[i]
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * gae.detach() - args.entropy_coef * entropies[i]

        optimizer.zero_grad()

        policy_loss_ep.append(policy_loss.detach().numpy()[0, 0])
        value_loss_ep.append(value_loss.detach().numpy()[0, 0])

        (policy_loss + args.value_loss_coef * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

        ensure_shared_grads(model, shared_model)
        optimizer.step()

        if epoch % 1000 == 0:
            torch.save({'state_dict': model.state_dict()},
                       checkpoint_directory + '/' + str(epoch) + ".pth.tar")
            with open(result_directory + '/' + str(epoch) + '_rewards.pkl',
                      'wb') as f:
                pickle.dump(rewards_ep, f)
            with open(result_directory + '/' + str(epoch) + '_policy_loss.pkl',
                      'wb') as f:
                pickle.dump(policy_loss_ep, f)
            with open(result_directory + '/' + str(epoch) + '_value_loss.pkl',
                      'wb') as f:
                pickle.dump(value_loss_ep, f)

        if episode_length >= 10000000:
            break

    torch.save({
        'state_dict': model.state_dict(),
    }, checkpoint_directory + '/Last' + ".pth.tar")
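
ensure_shared_grads is not defined in this listing. A minimal sketch, assuming the usual pytorch-a3c behaviour of pointing the shared model's gradients at the worker's gradients (and skipping the copy once they are already attached):

def ensure_shared_grads(model, shared_model):
    # Hand the local worker's gradients to the shared model so the shared
    # optimizer can apply them. If the shared parameters already hold a
    # gradient tensor, assume they are already wired up and return early.
    for param, shared_param in zip(model.parameters(),
                                   shared_model.parameters()):
        if shared_param.grad is not None:
            return
        shared_param._grad = param.grad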
Example 4
def transfer_defense(rank, args, shared_model, counter):
    torch.manual_seed(args.seed + rank)

    rl_vaegan_path = 'rl_vaegan/output/' + args.env_name + '/checkpoints'

    env = create_atari_env(args.env_name, args)
    '''load trained RL-VAEGAN model'''
    import rl_vaegan.transfer as t
    translate_model = t.TransferModel()
    translate_model.initialize(rl_vaegan_path, args.which_epoch, args)

    env.seed(args.seed + rank)
    if args.black_box_attack:
        print('Black Box Attack')
        model = ActorCritic_Substitude(env.observation_space.shape[0],
                                       env.action_space)
    else:
        print('White Box Attack')
        model = ActorCritic(env.observation_space.shape[0], env.action_space)

    if args.test_attacker == 'rand_fgsm':
        test_alpha_adv = args.test_epsilon_adv * 0.5

    print(
        f'Adversarial test - attacker: {args.test_attacker} - epsilon: {args.test_epsilon_adv}'
    )

    # The same number of evaluation episodes is used for every attacker.
    test_iteration = 30

    state = env.reset()
    state = torch.from_numpy(state).unsqueeze(0).cuda()
    reward_sum = 0
    done = True
    episode_length = 0
    total_step = 0
    actions = deque(maxlen=100)
    reward_ep = []
    for epoch in range(test_iteration):
        model.load_state_dict(shared_model.state_dict())
        model.eval().cuda()
        rewards = []
        # for step in range(args.num_steps):
        is_Terminal = False
        while not is_Terminal:
            episode_length += 1
            total_step += 1
            with torch.no_grad():
                value, logit = model(state)
            prob = F.softmax(logit, dim=-1)
            action = prob.multinomial(num_samples=1)[0]
            '''adversarial attack'''
            if args.variation == 'adversary':
                if args.test_attacker == 'fgsm':
                    # args.epsilon_adv = random.randint(1,5) * 0.001
                    state_adv = FGSM(model,
                                     name='a3c',
                                     eps=args.test_epsilon_adv)._attack(
                                         state, action)  #(1,3,80,80)
                elif args.test_attacker == 'rand_fgsm':
                    # args.epsilon_adv = random.randint(2,5) * 0.001
                    # args.alpha_adv = args.epsilon_adv * 0.5
                    state_adv = RandFGSM(model,
                                         name='a3c',
                                         eps=args.test_epsilon_adv,
                                         alpha=test_alpha_adv)._attack(
                                             state, action)
                elif args.test_attacker == 'cw2':
                    state_adv = CW2(model, name='a3c')._attack(
                        state, action, env.action_space.n)
                else:
                    sys.exit('test_attacker must be one of: fgsm | rand_fgsm | cw2')
            else:
                # No attack requested: pass the clean state through the defense.
                state_adv = state
            '''rl_vaegan style transfer defense'''
            state_def = translate_model.transform_adv(state_adv)

            with torch.no_grad():
                value_def, logit_def = model(state_def)

            prob_def = F.softmax(logit_def, dim=-1)
            action_def = prob_def.multinomial(num_samples=1)[0]

            state, reward, done, _ = env.step(action_def.item())
            done = done or episode_length >= args.max_episode_length
            actions.append(action_def.item())
            # a quick hack to prevent the agent from getting stuck
            if actions.count(actions[0]) == actions.maxlen:
                done = True
            if done:
                # print(episode_length)
                print(
                    f'epoch {epoch} | {test_iteration} - steps {episode_length} - total rewards {np.sum(rewards) + reward}'
                )
                reward_ep.append(np.sum(rewards) + reward)
                print('episode rewards:', reward_ep, 'avg: ',
                      np.sum(reward_ep) / len(reward_ep))
                episode_length = 0
                actions.clear()
                state = env.reset()
            rewards.append(reward)
            state = torch.from_numpy(state).unsqueeze(0).cuda()
            if done:
                is_Terminal = True

    print('episode rewards:', reward_ep, 'avg: ',
          np.sum(reward_ep) / len(reward_ep))
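
The FGSM, RandFGSM, and CW2 attack classes are likewise not shown. As a rough sketch of the idea behind the FGSM case only (the helper name and the [0, 1] pixel range are assumptions, not taken from this listing):

import torch
import torch.nn.functional as F

def fgsm_perturb(model, state, action, eps):
    # Hypothetical stand-in for FGSM(...)._attack(state, action): take one
    # gradient-sign step that increases the cross-entropy of the chosen action.
    state = state.clone().detach().requires_grad_(True)
    _, logit = model(state)
    loss = F.cross_entropy(logit, action.view(-1))
    loss.backward()
    state_adv = state + eps * state.grad.sign()
    # Clamp back to the valid (assumed normalized) observation range.
    return state_adv.clamp(0, 1).detach()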