def __init__(self, env, optim=Adam, policy_lr=0.01, value_lr=0.1,
             policy_hidden_size=[32], value_hidden_size=[32],
             batch_size=5000, render=False):
    self.env = env
    self.batch_size = batch_size
    self.render = render

    obs_size = np.prod(env.observation_space.shape)
    action_size = env.action_space.n

    self.policy_mlp = MLP([obs_size] + policy_hidden_size + [action_size])
    self.policy_optim = optim(self.policy_mlp.parameters(), lr=policy_lr)
    self.value_mlp = MLP([obs_size] + value_hidden_size + [1])
    self.value_optim = optim(self.value_mlp.parameters(), lr=value_lr)
def __init__(self): """ Virtually private constructor. """ if Ensembling.__instance != None: raise Exception("This class is a singleton!") else: Ensembling.__instance = self # for root, dirs, files in os.walk("D:\\MSc\\Chat Parser Script\\models\\ensemble"): # for foldername in dirs: # model = tf.keras.models.load_model("D:\\MSc\\Chat Parser Script\\models\\ensemble\\" + foldername, compile=False) # self.models[foldername] = model self.svmInstance = SVM.getInstance() self.mlpInstance = MLP.getInstance() self.tensorflowNNInstance = TensorflowNN.getInstance() self.naiveBayesInstance = NaiveBayes.getInstance()
def __init__(self):
    self.svmInstance = SVM.getInstance()
    self.tensorflowNNInstance = TensorflowNN.getInstance()
    self.mlpInstance = MLP.getInstance()
    self.naiveBayesInstance = NaiveBayes.getInstance()
    self.ensemblingInstance = Ensembling.getInstance()
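# The classifier classes above are only accessed through getInstance(), which is
# defined elsewhere in the project. As a rough illustration (an assumption, not
# the project's actual code), the accessor is expected to behave like the classic
# lazy singleton below: create the instance on first use, then keep returning
# the same object.
class _SingletonSketch:
    __instance = None

    def __init__(self):
        if _SingletonSketch.__instance is not None:
            raise Exception("This class is a singleton!")
        _SingletonSketch.__instance = self

    @staticmethod
    def getInstance():
        # Build the instance lazily on the first call.
        if _SingletonSketch.__instance is None:
            _SingletonSketch()
        return _SingletonSketch.__instance

# Usage: both calls return the same object.
# assert _SingletonSketch.getInstance() is _SingletonSketch.getInstance()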
class ActorCriticContinuous:
    def __init__(self, env, optim=Adam, policy_lr=0.001, value_lr=0.001,
                 policy_hidden_size=[64], value_hidden_size=[64], gamma=0.9,
                 batch_size=5000, epochs=50, update_every=50, render=False):
        self.env = env
        self.batch_size = batch_size
        self.render = render
        self.epochs = epochs
        self.gamma = gamma
        self.update_every = update_every
        self.writer_count = 0

        obs_size = np.prod(env.observation_space.shape)
        action_size = env.action_space.shape[0]
        action_limit = env.action_space.high

        self.policy_mlp = GaussianMLP([obs_size] + policy_hidden_size + [action_size], action_limit)
        self.policy_optim = optim(self.policy_mlp.parameters(), lr=policy_lr)
        self.value_mlp = MLP([obs_size] + value_hidden_size + [1])
        self.value_optim = optim(self.value_mlp.parameters(), lr=value_lr)

    def train(self):
        for epoch in range(self.epochs):
            returns, lens = self.train_single_batch(render=self.render)
            print("Epoch %2d, Return: %5.1f, Length: %3d" %
                  (epoch, np.mean(returns), np.mean(lens)))

    def train_single_batch(self, render=False):
        group_data = []
        batch_returns = []
        batch_lens = []
        episode_rewards = []
        done = False
        obs = self.env.reset()
        I_val = 1
        first_episode_render = True

        for t in range(self.batch_size):
            if render and first_episode_render:
                self.env.render()

            curr_obs = obs
            action = self.get_action(torch.as_tensor(obs, dtype=torch.float32))
            # Clamp the sampled action to the environment's action bounds before stepping.
            clamped_action = action.clamp(self.env.action_space.low.min(),
                                          self.env.action_space.high.max())
            obs, reward, done, _ = self.env.step(clamped_action.detach().numpy())
            episode_rewards.append(reward)

            group_data.append((curr_obs, action, reward, obs, done, I_val))
            I_val *= self.gamma

            if t > 0 and t % self.update_every == 0:
                (error, value_loss, policy_loss,
                 value_grad, policy_grad) = 0, 0, 0, 0, 0
                for data in group_data:
                    (error, value_loss, policy_loss,
                     value_grad, policy_grad) = self.update(data)
                    self.writer_count += 1
                group_data = []

            if done:
                ep_return, ep_len = sum(episode_rewards), len(episode_rewards)
                batch_returns.append(ep_return)
                batch_lens.append(ep_len)
                episode_rewards = []
                obs, done = self.env.reset(), False
                I_val = 1
                first_episode_render = False

        return batch_returns, batch_lens

    def update(self, data):
        obs, action, reward, next_obs, done, I_val = data
        obs = torch.as_tensor([obs], dtype=torch.float32)
        next_obs = torch.as_tensor([next_obs], dtype=torch.float32)
        action = torch.as_tensor([action], dtype=torch.float32)
        reward = torch.as_tensor(reward, dtype=torch.float32)

        error = self.get_value_error(obs, next_obs, reward, done)

        self.value_optim.zero_grad()
        value_loss = self.value_update(obs, error)
        value_loss.backward()
        # nn.utils.clip_grad_norm_(self.value_mlp.parameters(), 0.5)
        self.value_optim.step()

        self.policy_optim.zero_grad()
        policy_loss = self.policy_update(obs, action, error, I_val)
        policy_loss.backward()
        # nn.utils.clip_grad_norm_(self.policy_mlp.parameters(), 0.5)
        self.policy_optim.step()

        # Diagnostic only: sum of each network's parameters after the step.
        value_grad_sum = 0
        policy_grad_sum = 0
        for p in self.value_mlp.parameters():
            value_grad_sum += p.sum()
        for p in self.policy_mlp.parameters():
            policy_grad_sum += p.sum()

        return error, value_loss, policy_loss, value_grad_sum, policy_grad_sum

    def policy(self, obs):
        # GaussianMLP already returns a torch distribution over actions.
        return self.policy_mlp(obs)

    def get_action(self, obs):
        policy_dist = self.policy_mlp(obs)
        action = policy_dist.rsample()
        return action

    def policy_update(self, obs, action, error, I):
        policy_dist = self.policy_mlp(obs)
        log_proba = policy_dist.log_prob(action).mean()
        return -(error * I * log_proba)

    def state_value(self, obs):
        return self.value_mlp(obs)

    def value_update(self, obs, error):
        value = self.state_value(obs)
        return -(error * value)

    def get_value_error(self, obs, next_obs, reward, done):
        # One-step TD error: r + gamma * V(s') - V(s), with both values detached.
        value = self.state_value(obs).clone().detach()
        next_value = 0 if done else self.state_value(next_obs).detach().clone()
        return reward + self.gamma * next_value - value
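# Minimal usage sketch for ActorCriticContinuous. This is an assumption about
# how the class is driven, not part of the original project: it presumes the
# classic gym API (env.reset() returns only obs, env.step() returns 4 values,
# i.e. gym < 0.26) and a continuous-control task such as "Pendulum-v1".
def _run_actor_critic_continuous_example():
    import gym  # assumed dependency

    env = gym.make("Pendulum-v1")
    agent = ActorCriticContinuous(env, epochs=5, batch_size=1000, render=False)
    agent.train()  # prints mean return / episode length per epoch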
class ActorCriticEligibilityTrace:
    def __init__(self, env, optim=Adam, policy_lr=0.001, value_lr=0.001,
                 policy_hidden_size=[32], value_hidden_size=[32], gamma=0.95,
                 policy_lambda=0.9, value_lambda=0.9, batch_size=5000,
                 epochs=50, update_every=50, render=False):
        self.env = env
        self.batch_size = batch_size
        self.render = render
        self.epochs = epochs
        self.gamma = gamma
        self.policy_lambda = policy_lambda
        self.value_lambda = value_lambda
        self.update_every = update_every

        obs_size = np.prod(env.observation_space.shape)
        action_size = env.action_space.n

        self.policy_mlp = CategoricalMLP([obs_size] + policy_hidden_size + [action_size])
        self.policy_optim = optim(self.policy_mlp.parameters(), lr=policy_lr)
        self.value_mlp = MLP([obs_size] + value_hidden_size + [1])
        self.value_optim = optim(self.value_mlp.parameters(), lr=value_lr)

    def train(self):
        for epoch in range(self.epochs):
            returns, lens = self.train_single_batch(render=self.render)
            print("Epoch %2d, Return: %5.1f, Length: %3d" %
                  (epoch, np.mean(returns), np.mean(lens)))

    def train_single_batch(self, render=False):
        batch_returns = []
        batch_lens = []
        episode_rewards = []
        done = False
        obs = self.env.reset()
        I_val = 1
        # Eligibility traces are reset at the start of every batch.
        self.policy_trace = self.create_trace(self.policy_mlp)
        self.value_trace = self.create_trace(self.value_mlp)
        first_episode_render = True

        for t in range(self.batch_size):
            if render and first_episode_render:
                self.env.render()

            curr_obs = obs
            action, log_prob = self.policy_mlp(torch.as_tensor(obs, dtype=torch.float32))
            obs, reward, done, _ = self.env.step(action.detach().numpy())
            episode_rewards.append(reward)

            self.update((curr_obs, action, log_prob, reward, obs, done, I_val))
            I_val *= self.gamma

            if done:
                ep_return, ep_len = sum(episode_rewards), len(episode_rewards)
                batch_returns.append(ep_return)
                batch_lens.append(ep_len)
                episode_rewards = []
                obs, done = self.env.reset(), False
                I_val = 1
                first_episode_render = False

        return batch_returns, batch_lens

    def update(self, data):
        obs, action, log_prob, reward, next_obs, done, I_val = data
        obs = torch.as_tensor([obs], dtype=torch.float32)
        next_obs = torch.as_tensor([next_obs], dtype=torch.float32)
        action = torch.as_tensor(action, dtype=torch.float32)
        reward = torch.as_tensor(reward, dtype=torch.float32)

        error = self.get_value_error(obs, next_obs, reward, done)

        self.value_optim.zero_grad()
        self.value_set_grad(obs, error)
        self.value_optim.step()

        self.policy_optim.zero_grad()
        self.policy_set_grad(obs, action, log_prob, error, I_val)
        self.policy_optim.step()

    def policy_set_grad(self, obs, action, log_prob, error, I):
        # Accumulate grad(log pi(a|s)) into the policy trace, then write the
        # actor-critic(lambda) update -error * trace into p.grad for the optimizer.
        log_prob.backward()
        for i, p in enumerate(self.policy_mlp.parameters()):
            self.policy_trace[i] = (self.gamma * self.policy_lambda * self.policy_trace[i]
                                    + I * p.grad)
            p.grad = -(error * self.policy_trace[i])

    def state_value(self, obs):
        return self.value_mlp(obs)

    def value_set_grad(self, obs, error):
        value = self.state_value(obs)
        value.backward()
        for i, p in enumerate(self.value_mlp.parameters()):
            self.value_trace[i] = (self.gamma * self.value_lambda * self.value_trace[i]
                                   + p.grad)
            p.grad = -(error * self.value_trace[i])

    def get_value_error(self, obs, next_obs, reward, done):
        # One-step TD error: r + gamma * V(s') - V(s).
        value = self.state_value(obs).clone().detach()
        next_value = 0 if done else self.state_value(next_obs).clone().detach()
        return (reward + self.gamma * next_value - value).item()

    def create_trace(self, model):
        return [torch.zeros(p.shape) for p in model.parameters()]
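# ActorCriticEligibilityTrace unpacks `action, log_prob = self.policy_mlp(obs)`,
# so CategoricalMLP (defined elsewhere in the project) is expected to sample an
# action and return it together with its log-probability. The module below is a
# minimal sketch of that contract, assuming tanh hidden layers; it is
# illustrative only, not the original implementation.
import torch.nn as nn
from torch.distributions import Categorical

class CategoricalMLPSketch(nn.Module):
    def __init__(self, sizes):
        super().__init__()
        layers = []
        for in_dim, out_dim in zip(sizes[:-1], sizes[1:]):
            layers += [nn.Linear(int(in_dim), int(out_dim)), nn.Tanh()]
        # Drop the activation after the final layer so the output is raw logits.
        self.net = nn.Sequential(*layers[:-1])

    def forward(self, obs):
        dist = Categorical(logits=self.net(obs))
        action = dist.sample()
        return action, dist.log_prob(action)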
class PolicyGradient:
    def __init__(self, env, optim=Adam, lr=0.01, hidden_size=[64],
                 batch_size=5000, n_episodes=2000, render=False):
        self.env = env
        self.batch_size = batch_size
        self.n_episodes = n_episodes
        self.lr = lr
        self.render = render

        obs_size = np.prod(env.observation_space.shape)
        action_size = env.action_space.n

        self.mlp = MLP([obs_size] + hidden_size + [action_size])
        self.optim = optim(self.mlp.parameters(), lr=lr)

    def train(self):
        for epoch in range(50):
            render = False
            if self.render:
                render = True if epoch % 5 == 0 else False
            loss, returns, lens = self.train_single_batch(render=render)
            print("Epoch %2d, Loss %5.1f, Return: %5.1f, Length: %3d" %
                  (epoch, loss.item(), np.mean(returns), np.mean(lens)))

    def train_single_batch(self, render=False):
        timestep = 0
        batch_obss = []
        batch_actions = []
        batch_weights = []
        batch_returns = []
        batch_lens = []
        episode_rewards = []
        done = False
        obs = self.env.reset()
        first_episode_render = True

        while True:
            if render and first_episode_render:
                self.env.render()

            batch_obss.append(obs)
            action = self.get_action(torch.as_tensor(obs, dtype=torch.float32))
            obs, reward, done, _ = self.env.step(action)
            batch_actions.append(action)
            episode_rewards.append(reward)
            timestep += 1

            if done:
                episode_return = sum(episode_rewards)
                episode_len = len(episode_rewards)
                batch_returns.append(episode_return)
                batch_lens.append(episode_len)
                # Reward-to-go weights: each action is weighted by the return
                # collected from its timestep to the end of the episode.
                batch_weights += [
                    sum(episode_rewards[i:]) for i, _ in enumerate(episode_rewards)
                ]
                first_episode_render = False
                obs, done, episode_rewards = self.env.reset(), False, []
                if len(batch_obss) > self.batch_size:
                    break

        self.optim.zero_grad()
        batch_loss = self.policy_update(
            torch.as_tensor(batch_obss, dtype=torch.float32),
            torch.as_tensor(batch_actions, dtype=torch.float32),
            torch.as_tensor(batch_weights, dtype=torch.float32))
        batch_loss.backward()
        self.optim.step()

        return batch_loss, batch_returns, batch_lens

    def policy(self, obs):
        return Categorical(logits=self.mlp(obs))

    def get_action(self, obs):
        return self.policy(obs).sample().item()

    def policy_update(self, obs, actions, returns):
        policy_dist = self.policy(obs)
        log_proba = policy_dist.log_prob(actions)
        return -(returns * log_proba).mean()
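# Minimal usage sketch for PolicyGradient. This is an assumption, not part of
# the original project: it presumes the classic gym API (env.step() returning
# obs, reward, done, info) and a discrete-action task such as "CartPole-v1".
def _run_policy_gradient_example():
    import gym  # assumed dependency

    env = gym.make("CartPole-v1")
    agent = PolicyGradient(env, lr=0.01, hidden_size=[64], batch_size=5000)
    agent.train()  # runs 50 epochs, printing loss / return / episode length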
if "NRF" in list(data): data.pop("NRF") if "POSTCR" in list(data): data.pop("POSTCR") if "OpID" in list(data): data.pop("OpID") if "PatID" in list(data): data.pop("PatID") if "DOA" in list(data): data.pop("DOA") y = data.pop("HAEMOFIL") ros = RandomOverSampler(random_state=1) scaler = scale() scaler.fit(data) print("full") param = {'layers':[2,5], 'nodes':[5,10], 'dropout':[0.4,0.8], 'epochs':[50]} gsearch = GridSearchCV(estimator = MLP(), param_grid = param, scoring='roc_auc', iid=False, cv=rkf_search, verbose=2) gsearch.fit(scaler.transform(data.values), y.values) clf = gsearch.best_estimator_ pd.DataFrame(gsearch.cv_results_).to_csv("output/HF/MLPfull.csv") output = cross_validate(clf, scaler.transform(data.values), y.values, scoring=metrics,cv=rkf, verbose=2,return_train_score=True) pd.DataFrame(output).to_csv('output/HF/performanceMLPfull.csv')
class ActorCritic:
    def __init__(self, env, optim=Adam, policy_lr=0.001, value_lr=0.001,
                 policy_hidden_size=[32], value_hidden_size=[32], gamma=0.9,
                 batch_size=5000, epochs=50, update_every=50, render=False):
        self.env = env
        self.batch_size = batch_size
        self.render = render
        self.epochs = epochs
        self.gamma = gamma
        self.update_every = update_every

        obs_size = np.prod(env.observation_space.shape)
        action_size = env.action_space.n

        self.policy_mlp = MLP([obs_size] + policy_hidden_size + [action_size])
        self.policy_optim = optim(self.policy_mlp.parameters(), lr=policy_lr)
        self.value_mlp = MLP([obs_size] + value_hidden_size + [1])
        self.value_optim = optim(self.value_mlp.parameters(), lr=value_lr)

    def train(self):
        for epoch in range(self.epochs):
            render = False
            if self.render:
                render = True if epoch % 5 == 0 else False
            returns, lens = self.train_single_batch(render=render)
            print("Epoch %2d, Return: %5.1f, Length: %3d" %
                  (epoch, np.mean(returns), np.mean(lens)))

    def train_single_batch(self, render=False):
        group_data = []
        batch_returns = []
        batch_lens = []
        episode_rewards = []
        done = False
        obs = self.env.reset()
        I_val = 1
        first_episode_render = True

        for t in range(self.batch_size):
            if render and first_episode_render:
                self.env.render()

            curr_obs = obs
            action = self.get_action(torch.as_tensor(obs, dtype=torch.float32))
            obs, reward, done, _ = self.env.step(action)
            episode_rewards.append(reward)

            group_data.append((curr_obs, action, reward, obs, done, I_val))
            I_val *= self.gamma

            if t > 0 and t % self.update_every == 0:
                for data in group_data:
                    self.update(data)
                group_data = []

            if done:
                ep_return, ep_len = sum(episode_rewards), len(episode_rewards)
                batch_returns.append(ep_return)
                batch_lens.append(ep_len)
                episode_rewards = []
                obs, done = self.env.reset(), False
                I_val = 1
                first_episode_render = False

        return batch_returns, batch_lens

    def update(self, data):
        obs, action, reward, next_obs, done, I_val = data
        obs = torch.as_tensor([obs], dtype=torch.float32)
        next_obs = torch.as_tensor([next_obs], dtype=torch.float32)
        action = torch.as_tensor([action], dtype=torch.float32)
        reward = torch.as_tensor(reward, dtype=torch.float32)

        error = self.get_value_error(obs, next_obs, reward, done)

        self.value_optim.zero_grad()
        value_loss = self.value_update(obs, error)
        value_loss.backward()
        self.value_optim.step()

        self.policy_optim.zero_grad()
        policy_loss = self.policy_update(obs, action, error, I_val)
        policy_loss.backward()
        self.policy_optim.step()

    def policy(self, obs):
        return Categorical(logits=self.policy_mlp(obs))

    def get_action(self, obs):
        return self.policy(obs).sample().item()

    def policy_update(self, obs, action, error, I):
        policy_dist = self.policy(obs)
        log_proba = policy_dist.log_prob(action)
        return -(error * I * log_proba)

    def state_value(self, obs):
        return self.value_mlp(obs)

    def value_update(self, obs, error):
        value = self.state_value(obs)
        return -(error * value)

    def get_value_error(self, obs, next_obs, reward, done):
        # One-step TD error: r + gamma * V(s') - V(s), computed without tracking gradients.
        value = self.state_value(obs).clone().detach()
        next_value = 0 if done else self.state_value(next_obs).clone().detach()
        return reward + self.gamma * next_value - value
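# All of the agents above build their networks through an MLP helper that takes
# a list of layer sizes and is defined elsewhere in the project. The module
# below is a minimal sketch of such a helper (an assumption, not the original
# code): tanh hidden layers and a linear output, which matches how it is used
# both as a logits network and as a state-value network.
import torch.nn as nn

class MLPSketch(nn.Module):
    def __init__(self, sizes):
        super().__init__()
        layers = []
        for i, (in_dim, out_dim) in enumerate(zip(sizes[:-1], sizes[1:])):
            layers.append(nn.Linear(int(in_dim), int(out_dim)))
            if i < len(sizes) - 2:
                layers.append(nn.Tanh())  # no activation on the output layer
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)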