Example #1
def collect_data():
    env = Osillator()
    EP_NUM = 1500
    data_set = []
    for ep in range(EP_NUM):
        ep_loss = 0
        state = env.reset()
        for t in range(env.max_iteration):
            state = torch.from_numpy(state).float().to(device)
            action = ppo.choose_action(state.cpu().data.numpy(), False)
            with torch.no_grad():
                ca1 = model_1(state)
                ca2 = model_2(state)
            control_action = ca1 * action[0] + ca2 * action[1]

            next_state, reward, done = env.step(
                control_action.cpu().data.numpy()[0])
            data_set.append([
                state.cpu().data.numpy()[0],
                state.cpu().data.numpy()[1],
                np.clip(control_action.cpu().data.numpy()[0], -1, 1)
            ])
            state = next_state
            if done:
                break
        print(t)
    return np.array(data_set)
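Note: every example on this page instantiates the same Osillator environment from env.py (the class keeps its original spelling), which is not reproduced here. The stub below is a minimal sketch of just the interface the examples rely on (reset, step, max_iteration, plus no-op render/close); its dynamics are a placeholder assumption, not the original model.

import numpy as np

class Osillator:
    """Placeholder for the environment used in these examples.

    Only the interface matters: reset() -> state of shape (2,),
    step(u) -> (next_state, reward, done), and a max_iteration attribute.
    The dynamics here are an assumed damped oscillator, not the real model.
    """

    def __init__(self, max_iteration=200):
        self.max_iteration = max_iteration
        self.t = 0
        self.state = np.zeros(2)

    def reset(self, x0=None, x1=None):
        # The examples call both reset() and reset(x0, x1).
        if x0 is None:
            self.state = np.random.uniform(-1.0, 1.0, size=2)
        else:
            self.state = np.array([x0, x1], dtype=np.float64)
        self.t = 0
        return self.state.copy()

    def step(self, u, smoothness=None):
        # Placeholder update; `smoothness` is accepted because Example #2 passes it.
        x, v = self.state
        u = float(np.clip(u, -1.0, 1.0))
        dt = 0.05
        x_next = x + dt * v
        v_next = v + dt * (-x - 0.1 * v + u)
        self.state = np.array([x_next, v_next])
        self.t += 1
        done = self.t >= self.max_iteration or abs(x_next) > 2.0 or abs(v_next) > 2.0
        reward = 1.0 - (abs(x_next) + abs(v_next))
        return self.state.copy(), reward, done

    def render(self):
        pass

    def close(self):
        pass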
Example #2
def train_weight_adapter_DDPG(EP_NUM=2000):
	mkdir('./adapter_soft')
	env = Osillator()
	scores_deque = deque(maxlen=100)
	scores = []

	for ep in range(EP_NUM):
		state = env.reset()
		agent.reset()
		score = 0
		for t in range(200):
			action = agent.act(state)
			ca1 = model_1(torch.from_numpy(state).float().to(device)).cpu().data.numpy()[0]
			ca2 = model_2(torch.from_numpy(state).float().to(device)).cpu().data.numpy()[0]
			control_action = action[0]*ca1 + action[1]*ca2
			next_state, _, done = env.step(control_action, smoothness=0.5)
			reward = 5
			reward -= weight * abs(control_action) * 20
			reward -= 1 / weight * (abs(next_state[0]) + abs(next_state[1]))
			if done and t < 95:
				reward -= 100
			agent.step(state, action, reward, next_state, done, t)
			score += reward
			state = next_state            
			if done:
				break
		scores_deque.append(score)
		scores.append(score)
		score_average = np.mean(scores_deque)
		if ep % 1 == 0:
			print('\rEpisode {}, Average Score: {:.2f}, Current Score: {:.2f}, Max: {:.2f}, Min: {:.2f}, Epsilon: {:.2f}, Memory: {:.1f}'\
				  .format(ep, score_average, scores[-1], np.max(scores), np.min(scores), agent.epsilon, len(agent.memory)), end="\n")
		if ep > 0 and ep % 100 == 0:
			torch.save(agent.actor_local.state_dict(), './adapter_soft/adapter_'+str(ep)+'_'+str(weight)+ '.pth')
Example #3
def collect_data(adapter_name, INDI_NAME):
    assert EXP1 == True
    env = Osillator()
    model = Weight_adapter(2, 2).to(device)
    model.load_state_dict(torch.load(adapter_name))
    EP_NUM = 1500
    data_set = []
    for ep in range(EP_NUM):
        ep_loss = 0
        state = env.reset()
        for t in range(env.max_iteration):
            state = torch.from_numpy(state).float().to(device)
            action = model(state).cpu().data.numpy()
            with torch.no_grad():
                ca1 = model_1(state)
                ca2 = model_2(state)
            control_action = ca1 * action[0] + ca2 * action[1]

            next_state, reward, done = env.step(
                control_action.cpu().data.numpy()[0])
            data_set.append([
                state.cpu().data.numpy()[0],
                state.cpu().data.numpy()[1],
                control_action.cpu().data.numpy()[0]
            ])
            state = next_state
            if done:
                break
        print(ep_loss, t)
    return np.array(data_set)
Example #4
def train_switcher_DDQN():
    mkdir('./adapter_ab')
    env = Osillator()
    model = DQN(2, 2).to(device)
    target_model = DQN(2, 2).to(device)
    optimizer = optim.Adam(model.parameters())
    EP_NUM = 2001
    frame_idx = 0
    fuel_list = []
    ep_reward = deque(maxlen=100)

    for ep in range(EP_NUM):
        state = env.reset()
        ep_r = 0
        for t in range(200):
            state = torch.from_numpy(state).float().to(device)
            epsilon = epsilon_by_frame(frame_idx)
            action = model.act(state, epsilon)
            with torch.no_grad():
                if action == 0:
                    control_action = model_1(state).cpu().data.numpy()[0]
                elif action == 1:
                    control_action = model_2(state).cpu().data.numpy()[0]
                else:
                    assert False
                    control_action = 0
            next_state, _, done = env.step(control_action)
            reward = 2
            reward -= weight * abs(control_action) * 20
            if done and t < 190:
                reward -= 100
            replay_buffer.push(state.cpu().numpy(), action, reward, next_state,
                               done)
            fuel_list.append(abs(control_action) * 20)
            state = next_state
            ep_r += reward
            frame_idx += 1
            if len(replay_buffer) > batch_size:
                loss = compute_td_loss(model, target_model, batch_size,
                                       optimizer)
            if frame_idx % 100 == 0:
                update_target(model, target_model)
            if done:
                break
        ep_reward.append(ep_r)
        print('epoch:', ep, 'reward:', ep_r, 'average reward:',
              np.mean(ep_reward), 'fuel cost:', sum(fuel_list[-t - 1:]),
              'epsilon:', epsilon, len(replay_buffer))
        if ep >= 100 and ep % 100 == 0:
            torch.save(
                model.state_dict(),
                './adapter_ab/ddqn_' + str(ep) + '_' + str(weight) + '.pth')
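Example #4 above relies on several helpers that are not reproduced on this page (epsilon_by_frame, update_target, compute_td_loss, and a replay_buffer). A minimal sketch of the two simplest ones, assuming the usual DQN recipe; the decay constants are assumptions, not values from the original code:

import math

# Assumed epsilon-greedy schedule: exponential decay from eps_start to eps_final.
eps_start, eps_final, eps_decay = 1.0, 0.01, 500

def epsilon_by_frame(frame_idx):
    return eps_final + (eps_start - eps_final) * math.exp(-frame_idx / eps_decay)

def update_target(model, target_model):
    # Hard update: copy the online network's weights into the target network.
    target_model.load_state_dict(model.state_dict())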
Example #5
def train():
    env = Osillator()
    state_dim = 2
    action_dim = 2

    # reproducible
    # env.seed(RANDOMSEED)
    np.random.seed(RANDOMSEED)
    torch.manual_seed(RANDOMSEED)

    ppo = PPO(state_dim, action_dim, method=METHOD)
    global all_ep_r, update_plot, stop_plot
    all_ep_r = []
    for ep in range(EP_MAX):
        s = env.reset()
        ep_r = 0
        t0 = time.time()
        for t in range(EP_LEN):
            if RENDER:
                env.render()
            a = ppo.choose_action(s)
            u = gene_u(s, a, model_1, model_2)
            s_, _, done = env.step(u)
            # print(s, a, s_, r, done)
            # assert False
            r = 10
            r -= WEIGHT * abs(np.clip(u, -1, 1)) * 20
            r -= 1 / WEIGHT * (abs(s_[0]) + abs(s_[1]))
            if done and t < 95:
                r -= 100
            ppo.store_transition(
                s, a, r
            )  # useful for pendulum since the nets are very small, normalization make it easier to learn
            s = s_
            ep_r += r

            # update ppo
            if len(ppo.state_buffer) == BATCH_SIZE:
                ppo.finish_path(s_, done)
                ppo.update()
            # if done:
            #     break
        ppo.finish_path(s_, done)
        print(
            'Episode: {}/{}  | Episode Reward: {:.4f}  | Running Time: {:.4f}'.
            format(ep + 1, EP_MAX, ep_r,
                   time.time() - t0))
        if ep == 0:
            all_ep_r.append(ep_r)
        else:
            all_ep_r.append(all_ep_r[-1] * 0.9 + ep_r * 0.1)
        if PLOT_RESULT:
            update_plot.set()
        if (ep + 1) % 500 == 0 and ep >= 3000:
            ppo.save_model(path='ppo', ep=ep, weight=WEIGHT)
    if PLOT_RESULT:
        stop_plot.set()
    env.close()
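Example #5 above (and Example #11 below) builds the control input with gene_u(s, a, model_1, model_2), a helper that never appears on this page. A minimal sketch, assuming it follows the same weighted blend of the two controllers used in Examples #1, #8 and #10 (the device global mirrors the rest of the examples):

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def gene_u(state, action, model_1, model_2):
    # Assumed helper: use the PPO action as blending weights for the two
    # low-level controllers, i.e. u = action[0]*ca1 + action[1]*ca2,
    # mirroring the pattern used elsewhere in these examples.
    state_t = torch.from_numpy(state).float().to(device)
    with torch.no_grad():
        ca1 = model_1(state_t).cpu().data.numpy()[0]
        ca2 = model_2(state_t).cpu().data.numpy()[0]
    return float(action[0] * ca1 + action[1] * ca2)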
Example #6
def distill(adapter_name, INDI_NAME):
    optimizer = torch.optim.SGD(Individual.parameters(),
                                lr=0.001,
                                momentum=0.9)
    loss_func = torch.nn.MSELoss()
    env = Osillator()
    model = DQN(2, 2).to(device)
    EP_NUM = 500

    model.load_state_dict(torch.load(adapter_name))

    for ep in range(EP_NUM):
        ep_loss = 0
        state = env.reset()
        for t in range(env.max_iteration):
            state = torch.from_numpy(state).float().to(device)
            action = model.act(state, epsilon=0)
            with torch.no_grad():
                if action == 0:
                    control_action = model_1(state)
                elif action == 1:
                    control_action = model_2(state)

            control_action.requires_grad = False
            prediction = Individual(state)
            loss = loss_func(prediction, control_action)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            ep_loss += loss.item()

            next_state, reward, done = env.step(
                control_action.cpu().data.numpy()[0])
            state = next_state
            if done:
                break
        print(ep_loss)
    torch.save(Individual.state_dict(), INDI_NAME)
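Example #6 above (and the 'individual' mode in the test functions further down) distills the switching controller into a single global network Individual, whose definition is not shown. Any small 2-input / 1-output regressor fits the distillation loop; the definition below is an assumed placeholder, not the original architecture:

import torch
import torch.nn as nn

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Assumed student network for distillation: same 2-in / 1-out signature as
# model_1 and model_2; the original architecture is not shown in the listing.
Individual = nn.Sequential(
    nn.Linear(2, 64),
    nn.Tanh(),
    nn.Linear(64, 64),
    nn.Tanh(),
    nn.Linear(64, 1),
).to(device)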
Example #7
# This file trains and tests the NN controller.
# The invariant of the Bernstein polynomial approximation is also shown here; its computation is
# deferred to the files in the ./mat folder, where a value-based method and polySOS are used.
import gym.spaces
import random
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
from interval import Interval
from env import Osillator
import scipy.io as io
from scipy.interpolate import interp2d

env = Osillator()

import os
import sys
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
	sys.path.append(module_path)
from Agent import Agent

def mkdir(path):
	folder = os.path.exists(path)
	if not folder:
		os.makedirs(path)

def save_model(i_episode, score_average):
	print("Model Save...")
	if score_average > 300:
		# assumed completion: checkpoint the actor once the 100-episode average passes the threshold
		torch.save(agent.actor_local.state_dict(), 'best_actor_' + str(i_episode) + '.pth')
Example #8
def test(adapter_name=None, state_list=None, renew=False, mode='switch', INDI_NAME=None):
	print(mode)
	env = Osillator()
	EP_NUM = 500
	if mode == 'switch':
		model = DQN(2, 2).to(device)
		model.load_state_dict(torch.load(adapter_name))
	if mode == 'weight':
		model = Weight_adapter(2, 2).to(device)
		model.load_state_dict(torch.load(adapter_name))
	if mode == 'individual':
		Individual.load_state_dict(torch.load(INDI_NAME))
	if renew:
		state_list = []
	fuel_list = []
	ep_reward = []
	trajectory = []
	safe = []
	unsafe = []
	control_action_list = []
	for ep in range(EP_NUM):
		if renew:
			state = env.reset()
			state_list.append(state)
		else:
			assert len(state_list) == EP_NUM
			state = env.reset(state_list[ep][0], state_list[ep][1])
		ep_r = 0
		fuel = 0
		if ep == 0:
			trajectory.append(state)
		for t in range(env.max_iteration):
			state = torch.from_numpy(state).float().to(device)
			if mode == 'switch':
				action = model.act(state, epsilon=0)
				with torch.no_grad():
					if action == 0:
						control_action = model_1(state).cpu().data.numpy()[0]
					elif action == 1:
						control_action = model_2(state).cpu().data.numpy()[0]
					else:
						assert False
						control_action = 0
			elif mode == 'ppo':
				action = ppo.choose_action(state.cpu().data.numpy(), True)
				ca1 = model_1(state).cpu().data.numpy()[0]
				ca2 = model_2(state).cpu().data.numpy()[0]
				control_action = action[0]*ca1 + action[1]*ca2
				if ep == 0:
					print(t, state, control_action, action, ca1, ca2)				

			elif mode == 'average':
				ca1 = model_1(state).cpu().data.numpy()[0]
				ca2 = model_2(state).cpu().data.numpy()[0]
				control_action = (ca1 + ca2)/2
			elif mode == 'planning':
				ca1 = model_1(state).cpu().data.numpy()[0]
				ca2 = model_2(state).cpu().data.numpy()[0]
				control_action = plan(state, ca1, ca2) 

			elif mode == 'd1':
				control_action = model_1(state).cpu().data.numpy()[0]
				if ep == 0:
					print(state, control_action)

			elif mode == 'd2':
				control_action = model_2(state).cpu().data.numpy()[0]
				
			elif mode == 'individual':
				if ATTACK:
					delta, original = fgsm(Individual, state)
					# delta = torch.from_numpy(np.random.uniform(low=-SCALE, high=SCALE, size=state.shape)).float().to(device)
					control_action = Individual(state+delta).cpu().data.numpy()[0]
				else:
					control_action = Individual(state).cpu().data.numpy()[0]

			next_state, reward, done = env.step(control_action)
			control_action = np.clip(control_action, -1, 1)
			fuel += abs(control_action) * 20
			state = next_state
			if ep == 0:
				trajectory.append(state)
				control_action_list.append(control_action)
			ep_r += reward
			if done:
				break
		
		ep_reward.append(ep_r)
		if t >= 95:
			fuel_list.append(fuel)
			safe.append(state_list[ep])
		else:
			print(ep, state_list[ep])
			unsafe.append(state_list[ep])
	safe = np.array(safe)
	unsafe = np.array(unsafe)
	np.save('./plot/'+mode+'_safe.npy', safe)
	np.save('./plot/'+mode+'_unsafe.npy', unsafe)
	return ep_reward, np.array(fuel_list), state_list, np.array(control_action_list)
Example #9
def test(adapter_name=None,
         state_list=None,
         renew=False,
         mode='switch',
         INDI_NAME=None):
    print(mode)
    env = Osillator()
    model = DQN(2, 2).to(device)
    EP_NUM = 500
    if mode == 'switch':
        model.load_state_dict(torch.load(adapter_name))
    if mode == 'individual':
        Individual.load_state_dict(torch.load(INDI_NAME))
    if renew:
        state_list = []
    fuel_list = []
    ep_reward = []
    trajectory = []
    for ep in range(EP_NUM):
        if renew:
            state = env.reset()
            state_list.append(state)
        else:
            assert len(state_list) == EP_NUM
            state = env.reset(state_list[ep][0], state_list[ep][1])
        ep_r = 0
        fuel = 0
        if ep == 0:
            trajectory.append(state)
        for t in range(env.max_iteration):
            state = torch.from_numpy(state).float().to(device)
            # flag = where_inv(state.cpu().numpy())
            if mode == 'switch':
                action = model.act(state, epsilon=0)
                with torch.no_grad():
                    if action == 0:
                        control_action = model_1(state).cpu().data.numpy()[0]
                    elif action == 1:
                        control_action = model_2(state).cpu().data.numpy()[0]
                    else:
                        assert False
                        control_action = 0
                if ep == 0:
                    print(t, state, action, control_action * 20)

            elif mode == 'd1':
                control_action = model_1(state).cpu().data.numpy()[0]

            elif mode == 'd2':
                control_action = model_2(state).cpu().data.numpy()[0]

            elif mode == 'individual':
                control_action = Individual(state).cpu().data.numpy()[0]

            next_state, reward, done = env.step(control_action)
            fuel += abs(control_action) * 20
            state = next_state
            if ep == 0:
                trajectory.append(state)
            ep_r += reward
            if done:
                break

        ep_reward.append(ep_r)
        if t >= 95:
            fuel_list.append(fuel)
        else:
            print(ep, state_list[ep])
        if ep == 0:
            trajectory = np.array(trajectory)
            # plt.figure()
            plt.plot(trajectory[:, 0], trajectory[:, 1], label=mode)
            plt.legend()
            plt.savefig('trajectory.png')
    return ep_reward, np.array(fuel_list), state_list
Example #10
def test(adapter_name=None,
         state_list=None,
         renew=False,
         mode='switch',
         INDI_NAME=None):
    print(mode)
    env = Osillator()
    EP_NUM = 1
    if mode == 'switch':
        model = DQN(2, 2).to(device)
        model.load_state_dict(torch.load(adapter_name))
    if mode == 'weight':
        model = Weight_adapter(2, 2).to(device)
        model.load_state_dict(torch.load(adapter_name))
    if mode == 'individual':
        Individual.load_state_dict(torch.load(INDI_NAME))
    if renew:
        state_list = []
    fuel_list = []
    ep_reward = []
    trajectory = []
    safe = []
    unsafe = []
    control_action_list = []
    for ep in range(EP_NUM):
        if renew:
            state = env.reset()
            state_list.append(state)
        else:
            assert len(state_list) == EP_NUM
            state = env.reset(state_list[ep][0], state_list[ep][1])
        ep_r = 0
        fuel = 0
        if ep == 0:
            trajectory.append(state)
        for t in range(env.max_iteration):
            # attack happens here
            # state += np.random.uniform(low=-0.35, high=0.35, size=state.shape)
            state = torch.from_numpy(state).float().to(device)
            if mode == 'switch':
                action = model.act(state, epsilon=0)
                with torch.no_grad():
                    if action == 0:
                        control_action = model_1(state).cpu().data.numpy()[0]
                    elif action == 1:
                        control_action = model_2(state).cpu().data.numpy()[0]
                    else:
                        assert False
                        control_action = 0
            elif mode == 'weight':
                action = model(state).cpu().data.numpy()
                ca1 = model_1(state).cpu().data.numpy()[0]
                ca2 = model_2(state).cpu().data.numpy()[0]
                control_action = action[0] * ca1 + action[1] * ca2
                if ep == 0:
                    print(t, state, control_action, action, ca1, ca2)
            elif mode == 'average':
                ca1 = model_1(state).cpu().data.numpy()[0]
                ca2 = model_2(state).cpu().data.numpy()[0]
                control_action = (ca1 + ca2) / 2
            elif mode == 'planning':
                ca1 = model_1(state).cpu().data.numpy()[0]
                ca2 = model_2(state).cpu().data.numpy()[0]
                control_action = plan(state, ca1, ca2)

            elif mode == 'd1':
                control_action = model_1(state).cpu().data.numpy()[0]

            elif mode == 'd2':
                control_action = model_2(state).cpu().data.numpy()[0]

            elif mode == 'individual':
                # delta, original = fgsm(Individual, state)
                # if ep == 0:
                # 	print(delta, original)
                # control_action = Individual(state+delta).cpu().data.numpy()[0]
                control_action = Individual(state).cpu().data.numpy()[0]

            next_state, reward, done = env.step(control_action)
            control_action = np.clip(control_action, -1, 1)
            fuel += abs(control_action) * 20
            state = next_state
            if ep == 0:
                trajectory.append(state)
                control_action_list.append(control_action)
            ep_r += reward
            if done:
                break

        ep_reward.append(ep_r)
        if t >= 95:
            fuel_list.append(fuel)
            safe.append(state_list[ep])
        else:
            print(ep, state_list[ep])
            unsafe.append(state_list[ep])
        if ep == 0:
            trajectory = np.array(trajectory)
            # plt.figure()
            plt.plot(trajectory[:, 0], trajectory[:, 1], label=mode)
            plt.legend()
            plt.savefig('trajectory.png')
    # safe = np.array(safe)
    # unsafe = np.array(unsafe)
    # plt.figure()
    # plt.scatter(safe[:, 0], safe[:, 1], c='green')
    # plt.scatter(unsafe[:, 0], unsafe[:, 1], c='red')
    # plt.savefig('./safe_sample_plot/'+ mode +'.png')
    return ep_reward, np.array(fuel_list), state_list, np.array(
        control_action_list)
Example #11
if __name__ == '__main__':

    # if args.train:
    #     thread = threading.Thread(target=train)
    #     thread.daemon = True
    #     thread.start()
    #     if PLOT_RESULT:
    #         drawer = Drawer()
    #         drawer.plot()
    #         drawer.save()
    #     thread.join()
    train()
    assert False
    # test
    env = Osillator()
    state_dim = 2
    action_dim = 2
    ppo = PPO(state_dim, action_dim, method=METHOD)
    ppo.load_model()
    mean_epoch_reward = 0
    for _ in range(TEST_EP):
        state = env.reset()
        for i in range(EP_LEN):
            if RENDER:
                env.render()
            action = ppo.choose_action(state, True)
            u = gene_u(state, action, model_1, model_2)
            next_state, reward, done = env.step(u)
            mean_epoch_reward += reward
            state = next_state