class VGG19_C:

    def __init__(self):
        self.covid_path = 'dataset/covid_dataset.csv'
        self.covid_image_path = 'dataset/covid_adjusted/'
        self.normal_path = 'dataset/normal_xray_dataset.csv'
        self.normal_image_path = 'dataset/normal_dataset/'
        self.head_count = 99
        self.test_ratio = 0.15
        self.shape = (224, 224, 3)
        self.folds = 5
        self.batch_size = 32
        self.epochs = 500
        self.verbose = 2
        self.activation_optimizer = Adam(lr=0.0001, decay=1e-6)
        self.early_stop_criteria = EarlyStopping(
            patience=100, restore_best_weights=True)
        self.prior_model_path = 'prior_model.h5'

    def Generate_Model(self, params='default'):
        if params == 'default':
            shape = self.shape
        start_generate = datetime.datetime.now()

        model = tf.keras.Sequential()
        model.add(Conv2D(64, kernel_size=(3, 3), input_shape=self.shape,
                         activation='relu', padding='same'))
        model.add(Conv2D(64, kernel_size=(3, 3), input_shape=self.shape,
                         activation='relu', padding='same'))

        model_to_transfer = self.load_model(self.prior_model_path)
        self.model_to_transfer = model_to_transfer
        # Transfer all but the first two layers of the prior model's
        # feature extractor, then its remaining (classifier) layers.
        for i, layer in enumerate(model_to_transfer.layers[0].layers[2:]):
            model.add(layer)
        print('model_to_transfer-feedforward')
        for i, layer in enumerate(model_to_transfer.layers[1:]):
            model.add(layer)
        # Freeze every transferred layer; only the two new Conv2D layers train.
        for i, layer in enumerate(model.layers):
            if i > 1:
                layer.trainable = False

        opt = self.activation_optimizer
        model.compile(
            loss='categorical_crossentropy',
            optimizer=opt,
            metrics=['accuracy']
        )

        train_aug = ImageDataGenerator(
            rotation_range=20,
            width_shift_range=0.2,
            height_shift_range=0.2,
            horizontal_flip=True
        )
        self.train_aug = train_aug

        end_generate = datetime.datetime.now()
        self.generate_time = str(end_generate - start_generate)
        print('calculation time for ALL: {}'.format(self.generate_time))
        return model

    def Run_Model(self, params='default'):
        if params == 'default':
            folds = self.folds; batch_size = self.batch_size; epochs = self.epochs
            X_train = self.X_train; y_train = self.y_train
            X_test = self.X_test; y_test = self.y_test
            model = self.model; train_aug = self.train_aug

        lst_perf_folds = []
        lst_perf_folds_evaluate = []
        lst_perf_folds_history = []
        lst_perf_folds_report = []
        start_run = datetime.datetime.now()

        kf = KFold(n_splits=folds, random_state=1, shuffle=True)
        kf.get_n_splits(X_train)

        for fold, (train_index, validation_index) in enumerate(kf.split(X_train)):
            print('\n Fold %d' % (fold))
            start_iter = datetime.datetime.now()

            X_train_fold, X_validation = X_train[train_index], X_train[validation_index]
            y_train_fold, y_validation = y_train[train_index], y_train[validation_index]

            early_stop_criteria = self.early_stop_criteria
            history = model.fit(
                train_aug.flow(X_train_fold, y_train_fold, batch_size=batch_size),
                validation_data=(X_validation, y_validation),
                validation_steps=len(X_validation) / batch_size,
                steps_per_epoch=len(X_train_fold) / batch_size,
                epochs=epochs,
                verbose=self.verbose,
                callbacks=[early_stop_criteria]
            )

            y_pred_test = model.predict(X_test, batch_size=batch_size)
            y_pred_train = model.predict(X_train, batch_size=batch_size)
            y_pred_train_fold = model.predict(X_train_fold, batch_size=batch_size)
            y_pred_validation = model.predict(X_validation, batch_size=batch_size)

            rep1 = classification_report(np.argmax(y_test, axis=1),
                                         np.argmax(y_pred_test, axis=1),
                                         output_dict=True)
            rep2 = classification_report(np.argmax(y_train, axis=1),
                                         np.argmax(y_pred_train, axis=1),
                                         output_dict=True)
            rep3 = classification_report(np.argmax(y_train_fold, axis=1),
                                         np.argmax(y_pred_train_fold, axis=1),
                                         output_dict=True)
            rep4 = classification_report(np.argmax(y_validation, axis=1),
                                         np.argmax(y_pred_validation, axis=1),
                                         output_dict=True)

            lst_perf_folds.append((rep1['accuracy'], rep2['accuracy'],
                                   rep3['accuracy'], rep4['accuracy']))
            lst_perf_folds_history.append(model.history.history)
            lst_perf_folds_report.append((rep1, rep2, rep3, rep4))

            evaluate_TEST = model.evaluate(X_test, y_test, verbose=0)
            evaluate_TRAIN = model.evaluate(X_train, y_train, verbose=0)
            evaluate_TRAIN_Fold = model.evaluate(X_train_fold, y_train_fold, verbose=0)
            evaluate_VALIDATION = model.evaluate(X_validation, y_validation, verbose=0)
            lst_perf_folds_evaluate.append((evaluate_TEST, evaluate_TRAIN,
                                            evaluate_TRAIN_Fold, evaluate_VALIDATION))

            end_iter = datetime.datetime.now()
            print('calculation time for iteration-{}: {}'.format(
                str(fold), str(end_iter - start_iter)))

        mean_Accuracy_TEST = round(np.mean(np.array(lst_perf_folds)[:, 0]), 4)
        self.mean_Accuracy_TEST = mean_Accuracy_TEST
        mean_Accuracy_TRAIN = round(np.mean(np.array(lst_perf_folds)[:, 1]), 4)
        self.mean_Accuracy_TRAIN = mean_Accuracy_TRAIN
        mean_Accuracy_TRAIN_Fold = round(np.mean(np.array(lst_perf_folds)[:, 2]), 4)
        mean_Accuracy_VALIDATION = round(np.mean(np.array(lst_perf_folds)[:, 3]), 4)
        self.mean_Accuracy_VALIDATION = mean_Accuracy_VALIDATION

        print('Avg-TEST Acc: {} ... Avg-VALIDATION Acc: {}'.format(
            mean_Accuracy_TEST, mean_Accuracy_VALIDATION))
        print('Avg-TRAIN Acc: {} ... Avg-TRAIN_Fold Acc: {}'.format(
            mean_Accuracy_TRAIN, mean_Accuracy_TRAIN_Fold))
        print('lst_perf_folds_evaluate: {}'.format(lst_perf_folds_evaluate))
        print('Support Check:')
        print('TEST: # of Images Normal: {} vs Covid-19: {}'.format(
            rep1['1']['support'], rep1['0']['support']))
        print('VALIDATION: # of Images Normal: {} vs Covid-19: {}'.format(
            rep4['1']['support'], rep4['0']['support']))
        print('TRAIN: # of Images Normal: {} vs Covid-19: {}'.format(
            rep2['1']['support'], rep2['0']['support']))
        print('TRAIN-Fold: # of Images Normal: {} vs Covid-19: {}'.format(
            rep3['1']['support'], rep3['0']['support']))

        end_run = datetime.datetime.now()
        self.run_time = str(end_run - start_run)
        print('calculation time for RUN: {}'.format(self.run_time))
        print('Finished at {}'.format(datetime.datetime.now()))

    def Create_DataFrames(self, params='default'):
        if params == 'default':
            covid_path = self.covid_path; normal_path = self.normal_path
            head_count = self.head_count
        covid_df = pd.read_csv(covid_path, usecols=['filename', 'finding'])
        normal_df = pd.read_csv(normal_path, usecols=['filename', 'finding'])
        normal_df = normal_df.head(head_count)
        self.covid_df = covid_df
        self.normal_df = normal_df
        return covid_df, normal_df

    def Fetch_Images(self, params='default'):
        if params == 'default':
            covid_df = self.covid_df; normal_df = self.normal_df

        covid_images_lst = []
        covid_labels = []
        covid_image_path = self.covid_image_path
        normal_image_path = self.normal_image_path

        for index, row in covid_df.iterrows():
            filename = row['filename']
            label = row['finding']
            path = covid_image_path + filename
            image = cv2.imread(path)
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            covid_images_lst.append(image)
            covid_labels.append(label)

        normal_images_lst = []
        normal_labels = []
        for index, row in normal_df.iterrows():
            filename = row['filename']
            label = row['finding']
            path = normal_image_path + filename
            # temporary fix while we preprocess ALL the images
            if filename == '4c268764-b5e5-4417-85a3-da52916984d8.jpg':
                break
            image = cv2.imread(path)
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            normal_images_lst.append(image)
            normal_labels.append(label)

        # normalize to interval of [0,1]
        covid_images = np.array(covid_images_lst) / 255
        # normalize to interval of [0,1]
        normal_images = np.array(normal_images_lst) / 255

        self.covid_images = covid_images
        self.normal_images = normal_images
        self.covid_labels = covid_labels
        self.normal_labels = normal_labels
        return covid_images, normal_images, covid_labels, normal_labels

    def plot_images(self, images, title):
        nrows, ncols = 10, 10
        figsize = [5, 5]
        fig, ax = plt.subplots(nrows=nrows, ncols=ncols,
                               figsize=figsize, facecolor=(1, 1, 1))
        for i, axi in enumerate(ax.flat):
            axi.imshow(images[i])
            axi.set_axis_off()
        plt.suptitle(title, fontsize=24)
        plt.tight_layout(pad=0.2, rect=[0, 0, 1, 0.9])
        plt.show()

    def Split_Train_Test(self, params='default'):
        if params == 'default':
            covid_images = self.covid_images; normal_images = self.normal_images
            covid_labels = self.covid_labels; normal_labels = self.normal_labels
            test_ratio = self.test_ratio

        # covid_images=92, normal_images=99 >> 191 = 152_Train + 39_Test
        # split into training and testing
        # , shuffle = True
        covid_x_train, covid_x_test, covid_y_train, covid_y_test = \
            train_test_split(covid_images, covid_labels, test_size=test_ratio)
        normal_x_train, normal_x_test, normal_y_train, normal_y_test = \
            train_test_split(normal_images, normal_labels, test_size=test_ratio)

        X_train = np.concatenate((normal_x_train, covid_x_train), axis=0)
        X_test = np.concatenate((normal_x_test, covid_x_test), axis=0)
        y_train = np.concatenate((normal_y_train, covid_y_train), axis=0)
        y_test = np.concatenate((normal_y_test, covid_y_test), axis=0)

        # make labels into categories - either 0 or 1
        y_train = LabelBinarizer().fit_transform(y_train)
        y_train = to_categorical(y_train)
        y_test = LabelBinarizer().fit_transform(y_test)
        y_test = to_categorical(y_test)

        self.X_train = X_train; self.y_train = y_train
        self.X_test = X_test; self.y_test = y_test
        return X_train, y_train, X_test, y_test

    def Time_Stamp(self):
        date_time = datetime.datetime.now()
        D = str(date_time.day)
        M = str(date_time.month)
        Y = str(date_time.year)
        h = str(date_time.hour)
        m = str(date_time.minute)
        s = str(date_time.second)
        lst_date = [D, M, Y, h, m, s]
        return lst_date

    def FileNameUnique(self, prefix="Grp16_", suffix='.csv'):
        file_name = prefix
        lst_date = self.Time_Stamp()
        for idx, i in enumerate(lst_date):
            if idx == 2:
                file_name += i + '_'
            elif idx == 5:
                file_name += i + suffix
            else:
                file_name += i + '.'
        return file_name

    def model_parameters(self):
        list_param_name = ['test_ratio', 'folds', 'batch_size', 'epochs',
                           'verbose', 'shape', 'activation_optimizer',
                           'early_stop_criteria', 'covid_path',
                           'covid_image_path', 'normal_path',
                           'normal_image_path', 'head_count']
        list_param_values = [self.test_ratio, self.folds, self.batch_size,
                             self.epochs, self.verbose, self.shape,
                             self.activation_optimizer,
                             self.early_stop_criteria, self.covid_path,
                             self.covid_image_path, self.normal_path,
                             self.normal_image_path, self.head_count]
        dict_params = {'parameter': list_param_name, 'value': list_param_values}
        df_params = pd.DataFrame(dict_params)
        return df_params

    def model_parameters_save(self):
        list_param_name = ['mean_Accuracy_TEST', 'mean_Accuracy_VALIDATION',
                           'mean_Accuracy_TRAIN', 'Run_Time', 'test_ratio',
                           'folds', 'batch_size', 'epochs', 'verbose', 'shape',
                           'activation_opt_keys', 'activation_opt_vals',
                           'early_stop_criteria', 'covid_path',
                           'covid_image_path', 'normal_path',
                           'normal_image_path', 'head_count']
        opt_config = self.activation_optimizer.get_config()
        list_optimizer_keys = [k for k in opt_config]
        list_optimizer_values = [v for v in opt_config.values()]
        list_param_values = [self.mean_Accuracy_TEST,
                             self.mean_Accuracy_VALIDATION,
                             self.mean_Accuracy_TRAIN, self.run_time,
                             self.test_ratio, self.folds, self.batch_size,
                             self.epochs, self.verbose, self.shape,
                             list_optimizer_keys, list_optimizer_values,
                             self.early_stop_criteria, self.covid_path,
                             self.covid_image_path, self.normal_path,
                             self.normal_image_path, self.head_count]
        dict_params = {'parameter': list_param_name, 'value': list_param_values}
        df_params = pd.DataFrame(dict_params)
        return df_params

    def save_model(self, model_to_save, file_name):
        save_model(model_to_save, file_name)
        print('model saved as: {}'.format(file_name))

    def load_model(self, file_name):
        loaded_model = load_model(file_name)
        return loaded_model
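
# Usage sketch (assumption, not from the repo): VGG19_C relies on module-level
# imports such as tensorflow.keras's Conv2D / Adam / EarlyStopping /
# ImageDataGenerator / save_model / load_model / to_categorical, sklearn's
# KFold / train_test_split / LabelBinarizer / classification_report, plus
# numpy, pandas, cv2, matplotlib and datetime, and it expects 'prior_model.h5'
# to exist on disk. The intended call order is shown below; Generate_Model's
# return value is assigned to self.model because Run_Model reads it from there.
if __name__ == "__main__":
    classifier = VGG19_C()
    classifier.Create_DataFrames()                  # read the two CSVs
    classifier.Fetch_Images()                       # load and normalise the X-ray images
    classifier.Split_Train_Test()                   # build X_train / y_train / X_test / y_test
    classifier.model = classifier.Generate_Model()  # transfer layers from prior_model.h5
    classifier.Run_Model()                          # k-fold training and reporting
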
class VPGSolver(StandardAgent):
    """
    A standard vpg_solver, inspired by:
      https://github.com/jachiam/rl-intro/blob/master/pg_cartpole.py
    NOTE: will need to examine steps (total_t), not episodes, as VPG
    doesn't implement episodes per-training-step
    """
    can_graph = True  # batch size is variable, cannot use tf graphing

    def __init__(self, experiment_name, env_wrapper,
                 gamma=0.99,
                 epsilon=None,
                 epsilon_decay_rate=0.995,
                 epsilon_min=0.1,
                 batch_size=64,
                 n_cycles=128,
                 learning_rate=0.01,
                 model_name="vpg",
                 saving=True):

        super(VPGSolver, self).__init__(
            env_wrapper, model_name, experiment_name, saving=saving)

        self.label = "Batch"  # not by episode, by arbitrary batch
        self.action_size_tensor = tf.constant(self.action_size)

        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay_rate = epsilon_decay_rate
        self.epsilon_min = epsilon_min  # TODO could go to standard..
        self.batch_size = batch_size
        self.n_cycles = n_cycles

        self.memory = []  # state
        self.solved_on = None

        self.model = self.build_model()
        self.optimizer = Adam(lr=learning_rate)  # decay=learning_rate_decay)

        self.load_state()  # TODO rollout steps

    @staticmethod
    def discount_future_cumsum(episode_rewards, gamma):
        """
        Takes: A list of rewards per step for an episode
        Returns: The future reward at each step, with the future
          discounting rate applied from that step onwards.
        """
        ep_rwds = np.array(episode_rewards)
        n = len(ep_rwds)
        discounts = gamma ** np.arange(n)
        discounted_futures = np.zeros_like(ep_rwds, dtype=np.float64)
        for j in range(n):
            discounted_futures[j] = sum(ep_rwds[j:] * discounts[:(n - j)])
        assert len(discounted_futures) == len(episode_rewards)
        return discounted_futures

    def solve(self, max_iters, verbose=False, render=False):
        start_time = datetime.datetime.now()
        env = self.env_wrapper.env
        state, done, episode_rewards = env.reset(), False, []
        success_steps = 0

        for batch_num in range(max_iters):
            # Refresh every batch (on-policy)
            state_batch, act_batch, batch_future_rewards = [], [], []
            for step in range(self.n_cycles):
                if render:
                    env.render()
                action = self.act(self.model, state, epsilon=self.epsilon)
                state_next, reward, done, _ = env.step(action)
                # Custom reward if required by env wrapper
                reward = self.env_wrapper.reward_on_step(
                    state, state_next, reward, done, step)

                state_batch.append(state.copy())
                act_batch.append(np.int32(action))
                episode_rewards.append(reward)  # NOTE: Removed copy
                state = state_next

                self.report_step(step, batch_num, max_iters)

                if done:
                    # At the end of each episode:
                    # Create a list of future rewards,
                    # discounting by how far in the future
                    batch_future_rewards += list(
                        self.discount_future_cumsum(
                            episode_rewards, self.gamma))
                    self.scores.append(success_steps)
                    state, done, episode_rewards = env.reset(), False, []
                    success_steps = 0
                else:
                    success_steps += 1

            # Add any trailing rewards to done
            batch_future_rewards += list(
                self.discount_future_cumsum(
                    episode_rewards, self.gamma)
            )
            episode_rewards = []

            # HANDLE END OF EPISODE
            batch_advs = np.array(batch_future_rewards)
            # This is R(tau), normalised
            normalised_batch_advs = (
                (batch_advs - np.mean(batch_advs))
                / (np.std(batch_advs) + 1e-8)
            )
            self.remember(state_batch, act_batch, normalised_batch_advs)
            self.learn(*self.get_batch_to_train())

            solved = self.handle_episode_end(
                state, state_next, reward, step, max_iters, verbose=verbose)

            if solved:
                break

        self.elapsed_time += (datetime.datetime.now() - start_time)
        return solved

    def remember(self, state_batch, act_batch, batch_advs):
        self.memory = (state_batch, act_batch, batch_advs)

    def get_batch_to_train(self):
        assert len(self.memory[0]) == len(self.memory[1]), (
            f"{len(self.memory[0])}, {len(self.memory[1])}")
        assert len(self.memory[1]) == len(self.memory[2]), (
            f"{len(self.memory[1])}, {len(self.memory[2])}")

        minibatch_i = np.random.choice(
            len(self.memory[0]),
            min(self.batch_size, len(self.memory[0])),
        )
        sampled_memory = []
        for i in range(len(self.memory)):
            sampled_memory.append(tf.convert_to_tensor(
                [self.memory[i][j] for j in minibatch_i]))

        self.memory = []  # Only learning from last set of trajectories
        return sampled_memory

    def learn(self, sts, acts, advs):
        """Updated the agent's decision network based on a sample of
        previous decisions it has seen. Here, we combine the target
        and action networks.
        """
        loss_value = self.take_training_step(sts, acts, advs)
        if self.epsilon:
            if self.epsilon > self.epsilon_min:
                self.epsilon *= self.epsilon_decay_rate
        return loss_value

    @conditional_decorator(tf.function, can_graph)
    def take_training_step(self, sts, acts, advs):
        tf.debugging.assert_equal(
            tf.shape(sts)[0], tf.size(acts), summarize=1)
        tf.debugging.assert_equal(tf.size(acts), tf.size(advs), summarize=1)

        with tf.GradientTape() as tape:
            # One step away from Pi_theta(at|st)
            pi_action_logits = self.model(sts)
            action_one_hots = tf.one_hot(
                acts, self.action_size_tensor, dtype=tf.float64)
            # This IS pi_theta(at|st), only at the actual action taken
            pi_action_log_probs = tf.math.reduce_sum(
                action_one_hots * tf.nn.log_softmax(pi_action_logits),
                axis=1)
            tf.debugging.assert_equal(
                tf.size(advs), tf.size(pi_action_log_probs))
            loss_value = - tf.math.reduce_mean(
                advs * pi_action_log_probs
            )

        grads = tape.gradient(loss_value, self.model.trainable_variables)
        self.optimizer.apply_gradients(
            zip(grads, self.model.trainable_variables))

        return loss_value

    def save_state(self, add_to_save={}):
        """Save a (trained) model with its weights to a specified file.
        Metadata should be passed to keep information available.
        """
        self.save_state_to_dict(append_dict={
            "optimizer_config": self.optimizer.get_config(),
            "epsilon": self.epsilon,
        })
        self.model.save(self.model_location)

    def load_state(self):
        """Load a model with the specified name"""
        model_dict = self.load_state_from_dict()
        print("Loading weights from", self.model_location + "...", end="")
        if os.path.exists(self.model_location):
            self.model = tf.keras.models.load_model(self.model_location)
            self.optimizer = self.optimizer.from_config(self.optimizer_config)
            del model_dict["optimizer_config"], self.optimizer_config
            print(" Loaded.")
        else:
            print(" Model not yet saved at location.")
        if "memory" in model_dict:
            del model_dict["memory"]
        print("Loaded state:")
        pprint.pprint(model_dict, depth=1)
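
# Quick sanity check of discount_future_cumsum on a toy episode (hypothetical
# numbers, not from the repo): with rewards [1, 1, 1] and gamma = 0.5 the
# reward-to-go at each step is 1 + 0.5 + 0.25, then 1 + 0.5, then 1.
if __name__ == "__main__":
    print(VPGSolver.discount_future_cumsum([1.0, 1.0, 1.0], gamma=0.5))
    # -> [1.75 1.5  1.  ]
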
class PPOSolver(StandardAgent):
    """
    PPO Solver
    Inspired by:
      https://github.com/anita-hu/TF2-RL/blob/master/PPO/TF2_PPO.py
      https://github.com/ajleite/basic-ppo/blob/master/ppo.py
    """
    can_graph = True

    def __init__(self, experiment_name, env_wrapper,
                 clip_ratio=0.2,
                 val_coef=1.0,
                 entropy_coef=0.01,
                 lam=1.0,
                 gamma=0.95,
                 actors=1,
                 cycle_length=128,
                 minibatch_size_per_actor=64,
                 cycle_epochs=4,
                 learning_rate=5e-4,
                 model_name="ppo",
                 saving=True):

        super(PPOSolver, self).__init__(
            env_wrapper, model_name, experiment_name, saving=saving)

        self.clip_ratio = clip_ratio
        self.gamma = gamma
        self.lam = lam
        self.val_coef = val_coef
        self.entropy_coef = entropy_coef

        self.actors = actors
        self.cycle_length = cycle_length  # Run this many per epoch
        self.batch_size = cycle_length * actors  # Sample from the memory
        self.minibatch_size = minibatch_size_per_actor * actors  # train on batch
        self.cycle_epochs = cycle_epochs  # Train for this many epochs
        # self.num_init_random_rollouts = num_init_random_rollouts

        self.model_name = model_name
        self.solved_on = None

        self.model = PPOModel(
            self.state_size, self.action_size, model_name=self.model_name)
        self.model.build(input_shape=(None, self.state_size))
        # self._random_dataset = self._gather_rollouts(
        #     env_wrapper, num_init_random_rollouts, epsilon=1.)
        self.optimizer = Adam(lr=learning_rate)

        head, _, _ = self.model_location.rpartition(".h5")
        self.model_location = head + ".weights"

        self.load_state()

    def show(self, render=False):
        raise NotImplementedError("self.model needs to be adapted in super")

    def solve(self, max_iters, verbose=False, render=False):
        start_time = datetime.datetime.now()
        env_trackers = [EnvTracker(self.env_wrapper)
                        for _ in range(self.actors)]
        solved = False

        # Every episode return ever
        all_episode_returns = []
        all_episode_steps = []

        for iteration in range(max_iters):
            data = []  # Refresh every batch (on-policy)
            for env_tracker in env_trackers:
                state = env_tracker.latest_state
                states, actions, log_probs, rewards, v_preds = [], [], [], [], []
                for step in range(self.cycle_length):
                    if render:
                        env_tracker.env.render()
                    action, value, log_prob = (
                        tf.squeeze(x).numpy()
                        for x in self.model.act_value_logprobs(
                            state, eps=None)
                    )
                    observation, reward, done, _ = env_tracker.env.step(action)
                    state_next = observation
                    # Custom reward if required by env wrapper
                    reward = self.env_wrapper.reward_on_step(
                        state, state_next, reward, done, step)
                    env_tracker.return_so_far += reward

                    states.append(state)
                    actions.append(action)
                    log_probs.append(log_prob)
                    rewards.append(np.float64(reward))
                    v_preds.append(value)

                    self.report_step(step, iteration, max_iters)

                    if done:
                        all_episode_returns.append(
                            env_tracker.return_so_far)
                        all_episode_steps.append(env_tracker.steps_so_far)
                        state = env_tracker.env.reset()
                        env_tracker.steps_so_far = 0
                        env_tracker.return_so_far = 0.
                    else:
                        env_tracker.steps_so_far += 1
                        state = observation

                next_v_preds = v_preds[1:] + [0.]  # TODO - both right float?
                gaes = self.get_norm_general_advantage_est(
                    rewards, v_preds, next_v_preds)

                # TODO make a handler object
                if not data:
                    data = [
                        states, actions, log_probs, next_v_preds, rewards, gaes
                    ]
                else:
                    data[0] += states; data[1] += actions; data[2] += log_probs
                    data[3] += next_v_preds; data[4] += rewards; data[5] += gaes

                env_tracker.latest_state = state

            self.scores = all_episode_steps
            # FIXME this won't handle picking up from left-off
            solved = self.handle_episode_end(
                state, state_next, reward, step, max_iters, verbose=verbose)

            if solved:
                break

            self.take_training_step(
                *(tf.convert_to_tensor(lst) for lst in data)
                # *tuple(map(tf.convert_to_tensor, zip(*memory)))
            )

        self.elapsed_time += (datetime.datetime.now() - start_time)
        return solved

    def get_norm_general_advantage_est(self, rewards, v_preds, next_v_preds):
        # Sources:
        #   https://github.com/uidilr/ppo_tf/blob/master/ppo.py#L98
        #   https://github.com/anita-hu/TF2-RL/blob/master/PPO/TF2_PPO.py
        deltas = [
            r_t + self.gamma * v_next - v
            for r_t, v_next, v in zip(rewards, next_v_preds, v_preds)
        ]
        gaes = copy.deepcopy(deltas)
        for t in reversed(range(len(gaes) - 1)):
            gaes[t] = gaes[t] + self.lam * self.gamma * gaes[t + 1]
        gaes = np.array(gaes).astype(np.float64)
        norm_gaes = (gaes - gaes.mean()) / gaes.std()
        return norm_gaes

    @conditional_decorator(tf.function, can_graph)
    def take_training_step(self, sts, a, log_p, nxt_v_pred, r, adv):
        """
        Performs gradient DEscent on minibatches of minibatch_size,
        sampled from a batch of batch_size, sampled from the memory

        Samples without replacement (to check)
        """
        assert self.batch_size == len(r)

        for _ in range(self.cycle_epochs):
            # Batch from the examples in the memory
            shuffled_indices = tf.random.shuffle(tf.range(self.batch_size))
            # Every index of the cycle examples
            num_mb = self.batch_size // self.minibatch_size
            # Pick minibatch-sized samples from there
            for minibatch_i in tf.split(shuffled_indices, num_mb):
                minibatch = (
                    tf.gather(x, minibatch_i, axis=0)
                    for x in (sts, a, log_p, nxt_v_pred, r, adv)
                )
                self.train_minibatch(*minibatch)

        # TODO used to be zip weights and assign
        # for pi_old_w, pi_w in zip(
        #         self.pi_model_old.weights, self.pi_model.weights):
        #     pi_old_w.assign(pi_w)

    @conditional_decorator(tf.function, can_graph)
    def train_minibatch(self, sts, a, log_p, nxt_v_pred, r, adv):
        # Convert from (64,) to (64, 1)
        r = tf.expand_dims(r, axis=-1)
        nxt_v_pred = tf.expand_dims(nxt_v_pred, axis=-1)

        with tf.GradientTape() as tape:
            new_log_p, entropy, sts_vals = self.model.evaluate_actions(sts, a)
            ratios = tf.exp(new_log_p - log_p)
            clipped_ratios = tf.clip_by_value(
                ratios,
                clip_value_min=1 - self.clip_ratio,
                clip_value_max=1 + self.clip_ratio
            )
            loss_clip = tf.reduce_mean(
                tf.minimum((adv * ratios), (adv * clipped_ratios))
            )
            target_values = r + self.gamma * nxt_v_pred
            vf_loss = tf.reduce_mean(
                tf.math.square(sts_vals - target_values)
            )
            entropy = tf.reduce_mean(entropy)
            total_loss = (
                - loss_clip
                + self.val_coef * vf_loss
                - self.entropy_coef * entropy
            )

        train_variables = self.model.trainable_variables
        grads = tape.gradient(total_loss, train_variables)
        self.optimizer.apply_gradients(zip(grads, train_variables))

    def save_state(self, verbose=False):
        """
        Called at the end of saving-episodes.

        Save a (trained) model with its weights to a specified file.
        Passes the required information to add to the pickle dict for
        the model.
        """
        add_to_save = {
            # "epsilon": self.epsilon,
            # "memory": self.memory,
            "optimizer_config": self.optimizer.get_config(),
        }
        self.save_state_to_dict(append_dict=add_to_save)
        if verbose:
            print("Saving to", self.model_location)
        self.model.save_weights(self.model_location)  # , save_format='tf')

    def load_state(self):
        """Load a model with the specified name"""
        model_dict = self.load_state_from_dict()
        print("Loading weights from", self.model_location + "...", end="")
        if os.path.exists(self.model_location):
            self.model.load_weights(self.model_location)
            self.optimizer = self.optimizer.from_config(self.optimizer_config)
            del model_dict["optimizer_config"], self.optimizer_config
            print(" Loaded.")
        else:
            print(" Model not yet saved at location.")
        if "memory" in model_dict:
            del model_dict["memory"]
        print("Loaded state:")
        pprint.pprint(model_dict, depth=1)
class DDPGSolver(StandardAgent):
    """
    A standard ddpg solver:
      https://github.com/openai/baselines/blob/master/baselines/a2c/a2c.py
    Inspired by
      https://github.com/anita-hu/TF2-RL/blob/master/DDPG/TF2_DDPG_Basic.py
    """

    def __init__(self, experiment_name, env_wrapper,
                 ent_coef=1e-4,
                 vf_coef=0.5,
                 n_cycles=128,
                 batch_size=64,
                 max_grad_norm=0.5,
                 learning_rate_actor=1e-5,
                 learning_rate_critic=1e-3,
                 memory_len=100000,
                 gamma=0.99,
                 epsilon=None,
                 tau=0.125,
                 lrschedule='linear',
                 model_name="ddpg",
                 saving=True,
                 rollout_steps=5000,
                 ):

        super(DDPGSolver, self).__init__(
            env_wrapper, model_name, experiment_name, saving=saving)

        self.n_cycles = n_cycles
        self.batch_size = batch_size
        self.gamma = gamma
        self.ent_coef = ent_coef
        self.vf_coef = vf_coef

        # NOTE new AND need to verify deque is safe
        self.memory = deque(maxlen=memory_len)
        self.epsilon = epsilon  # new but should be in A2C
        self.tau = tau

        # TODO reimplement
        # self.max_grad_norm = max_grad_norm
        # self.epsilon = epsilon  # exploration rate

        self.solved_on = None

        self.actor = self.build_model(model_name=model_name + "_actor")
        self.actor.build(input_shape=(None, self.state_size,))
        self.actor_dash = self.build_model(
            model_name=model_name + "_actor_target")
        self.actor_dash.build(input_shape=(None, self.state_size,))
        self.actor_dash.set_weights(self.actor.get_weights())
        self.actor_optimizer = Adam(learning_rate=learning_rate_actor)
        self.actor.summary()

        self.critic = self.build_critic_model(
            self.state_size, self.action_size,
            model_name=model_name + "_critic")
        # self.critic.build(input_shape=[(state_size,), (action_size,)])
        self.critic_dash = self.build_critic_model(
            self.state_size, self.action_size,
            model_name=model_name + "_critic_target")
        # self.critic_dash.build(input_shape=[(state_size,), (action_size,)])
        self.critic_dash.set_weights(self.critic.get_weights())
        self.critic_optimizer = Adam(learning_rate=learning_rate_critic)
        self.critic.summary()

        self.load_state()
        self.rollout_memory(rollout_steps - len(self.memory))

    def build_critic_model(self, input_size, action_size, model_name='critic'):
        """
        Returns Q(st+1 | a, s)
        """
        inputs = [Input(shape=(input_size,)), Input(shape=(action_size,))]
        concat = Concatenate(axis=-1)(inputs)
        x = Dense(24, name="hidden_1", activation='tanh')(concat)
        x = Dense(48, name="hidden_2", activation='tanh')(x)
        output = Dense(1, name="Out")(x)
        model = Model(inputs=inputs, outputs=output, name=model_name)
        model.build(input_shape=[(input_size,), (action_size,)])
        return model

    def act_with_noise(self, state, add_noise=True):
        raise NotImplementedError(
            "Consider implementing from\nhttps://github.com/anita-hu/"
            "TF2-RL/blob/master/DDPG/TF2_DDPG_Basic.py")

    def show(self, render=False):
        raise NotImplementedError("self.model needs to be adapted in super")

    def solve(self, max_iters, verbose=False, render=False):
        start_time = datetime.datetime.now()
        env = self.env_wrapper.env
        state = env.reset()
        success_steps = 0

        for iteration in range(max_iters):
            for step in range(self.n_cycles):  # itertools.count():
                if render:
                    env.render()
                # TODO implement act and add noise
                action_dist = self.actor(tf.expand_dims(state, axis=0))
                observation, reward, done, _ = env.step(np.argmax(action_dist))
                # Custom reward if required by env wrapper
                reward = self.env_wrapper.reward_on_step(
                    state, observation, reward, done, step)
                self.memory.append((state, tf.squeeze(action_dist),
                                    np.float64(reward), observation, done))
                state = observation
                self.report_step(step, iteration, max_iters)

                if done:
                    # OR env_wrapper.get_score(state, observation, reward, step)
                    self.scores.append(success_steps)
                    success_steps = 0
                    state = env.reset()
                else:
                    success_steps += 1

                self.take_training_step()

            solved = self.handle_episode_end(
                state, observation, reward, step, max_iters, verbose=verbose)

            if solved:
                break

        self.elapsed_time += (datetime.datetime.now() - start_time)
        return solved

    def take_training_step(self):
        if len(self.memory) < self.batch_size:
            return

        # Note min is actually unnecessary with cond above
        minibatch_i = np.random.choice(
            len(self.memory),
            min(self.batch_size, len(self.memory)),
        )
        minibatch = [self.memory[i] for i in minibatch_i]

        # Obs on [adv, return]
        loss_value = self.train_on_minibatch(
            *tuple(map(tf.convert_to_tensor, zip(*minibatch))))

        # Update weights
        for model_name in "actor", "critic":
            self.update_weights(model_name, self.tau)

        # TODO decrease epsilon if not None

    @tf.function()
    def train_on_minibatch(self, sts, a, r, n_sts, d):
        # r + gam(1-d)Q_phi_targ(s_t+1, mu_theta_targ(s_t+1))
        n_a = self.actor_dash(n_sts)
        q_future_pred = self.critic_dash([n_sts, n_a])
        target_qs = r + tf.where(
            d,
            tf.zeros(shape=q_future_pred.shape, dtype=tf.dtypes.float64),
            self.gamma * q_future_pred)

        # Minimise (r + target on next state) - (current critic on sts and a)
        # Makes critic better at predicting future
        with tf.GradientTape() as tape:
            updated_q_values = self.critic([sts, a])
            critic_loss = tf.reduce_mean(
                tf.math.square(updated_q_values - target_qs))

        critic_grad = tape.gradient(
            critic_loss, self.critic.trainable_variables)
        self.critic_optimizer.apply_gradients(
            zip(critic_grad, self.critic.trainable_variables))

        # Use the (improving) critic to rate the actor's updated decision
        # Minimising loss means maximising actor's expectation
        with tf.GradientTape() as tape:
            # mu_phi(s)
            updated_action_dist = self.actor(sts)
            # Works due to chain rule, tracks mu gradients to improve mu prediction
            # TODO this is quite nuanced - check this through
            actor_loss = -tf.reduce_mean(
                self.critic([sts, updated_action_dist]))

        actor_grad = tape.gradient(actor_loss, self.actor.trainable_variables)
        self.actor_optimizer.apply_gradients(
            zip(actor_grad, self.actor.trainable_variables))

    def update_weights(self, model_name, tau):
        weights = getattr(getattr(self, model_name), "weights")
        target_model = getattr(self, model_name + "_dash")
        target_weights = target_model.weights
        target_model.set_weights([
            weights[i] * tau + target_weights[i] * (1. - tau)
            for i in range(len(weights))
        ])

    def save_state(self):
        """
        Called at the end of saving-episodes.

        Save a (trained) model with its weights to a specified file.
        Passes the required information to add to the pickle dict for
        the model.
        """
        add_to_save = {
            "memory": self.memory,
            "epsilon": self.epsilon,
            "actor_optimizer_config": self.actor_optimizer.get_config(),
            "critic_optimizer_config": self.critic_optimizer.get_config(),
        }
        self.save_state_to_dict(append_dict=add_to_save)
        for var in ("actor", "actor_dash", "critic", "critic_dash"):
            model = getattr(self, var)
            model.save_weights(
                self.model_location.replace(".h5", "_" + var + ".h5"))

    def load_state(self):
        """Load a model with the specified name"""
        model_dict = self.load_state_from_dict()
        print("Loading weights from", self.model_location + "...", end="")
        if os.path.exists(self.model_location):
            for var in ("actor", "actor_dash", "critic", "critic_dash"):
                model = getattr(self, var)
                model.load_weights(
                    self.model_location.replace(".h5", "_" + var + ".h5"))
            self.actor_optimizer = self.actor_optimizer.from_config(
                self.actor_optimizer_config)
            self.critic_optimizer = self.critic_optimizer.from_config(
                self.critic_optimizer_config)
            del model_dict[
                "actor_optimizer_config"], self.actor_optimizer_config
            del model_dict[
                "critic_optimizer_config"], self.critic_optimizer_config
            print(" Loaded.")
        else:
            print(" Model not yet saved at location.")
        if "memory" in model_dict:
            del model_dict["memory"]
        print("Loaded state:")
        pprint.pprint(model_dict, depth=1)

    def rollout_memory(self, rollout_steps, render=False):
        if rollout_steps <= 0:
            return
        print("Rolling out steps", rollout_steps)
        env = self.env_wrapper.env
        state = env.reset()
        max_iters = rollout_steps // self.n_cycles
        for iteration in range(max_iters):
            for step in range(self.n_cycles):
                if render:
                    env.render()
                # TODO implement act and add noise
                action_dist = self.actor(tf.expand_dims(state, axis=0))
                observation, reward, done, _ = env.step(np.argmax(action_dist))
                # Custom reward if required by env wrapper
                reward = self.env_wrapper.reward_on_step(
                    state, observation, reward, done, step)
                self.memory.append((state, tf.squeeze(action_dist),
                                    np.float64(reward), observation, done))
                state = observation
                self.report_step(step, iteration, max_iters)
                if done:
                    state = env.reset()
        print("\nCompleted.")
class DQNSolver(StandardAgent):
    """
    A standard dqn_solver, inspired by:
      https://gym.openai.com/evaluations/eval_EIcM1ZBnQW2LBaFN6FY65g/
    Implements a simple DNN that predicts values.
    """

    def __init__(self, experiment_name, env_wrapper,
                 memory_len=100000,
                 gamma=0.99,
                 batch_size=64,
                 n_cycles=128,
                 epsilon=1.,
                 epsilon_min=0.01,
                 epsilon_decay=0.995,
                 learning_rate=0.01,
                 learning_rate_decay=0.01,
                 rollout_steps=10000,
                 model_name="dqn",
                 saving=True):

        super(DQNSolver, self).__init__(
            env_wrapper, model_name, experiment_name, saving=saving)

        # Training
        self.batch_size = batch_size
        self.n_cycles = n_cycles

        self.memory = deque(maxlen=memory_len)
        self.solved_on = None

        self.gamma = gamma  # discount rate was 1
        self.epsilon = epsilon  # exploration rate
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay  # 0.995

        self.model = self.build_model()
        self.optimizer = Adam(lr=learning_rate, decay=learning_rate_decay)

        self.load_state()
        self.rollout_memory(rollout_steps - len(self.memory))

    def rollout_memory(self, rollout_steps, verbose=False, render=False):
        if rollout_steps <= 0:
            return
        env = self.env_wrapper.env
        state = env.reset()
        for step in range(rollout_steps):
            if render:
                env.render()
            action = self.act(self.model, state, epsilon=1.)  # Max random
            observation, reward, done, _ = env.step(action)
            state_next = observation
            # Custom reward if required by env wrapper
            reward = self.env_wrapper.reward_on_step(
                state, state_next, reward, done, step)
            self.memory.append(
                (state, np.int32(action), reward, state_next, done))
            state = observation
            if done:
                state = env.reset()
                # OR env_wrapper.get_score(state, state_next, reward, step)
        print(f"Rolled out {len(self.memory)}")

    def solve(self, max_iters, verbose=False, render=False):
        start_time = datetime.datetime.now()
        env = self.env_wrapper.env
        state = env.reset()
        success_steps = 0

        for iteration in range(max_iters):
            for step in range(self.n_cycles):
                if render:
                    env.render()
                action = self.act(self.model, state, epsilon=self.epsilon)
                observation, reward, done, _ = env.step(action)
                state_next = observation
                # Custom reward if required by env wrapper
                reward = self.env_wrapper.reward_on_step(
                    state, state_next, reward, done, step)
                self.memory.append(
                    (state, np.int32(action), reward, state_next, done))
                state = observation
                self.report_step(step, iteration, max_iters)

                if done:
                    state = env.reset()
                    # OR env_wrapper.get_score(state, state_next, reward, step)
                    self.scores.append(success_steps)
                    success_steps = 0
                else:
                    success_steps += 1

                self.learn()

            score = step
            solved = self.handle_episode_end(
                state, state_next, reward, step, max_iters, verbose=verbose)

            if solved:
                break

        self.elapsed_time += (datetime.datetime.now() - start_time)
        return solved

    def learn(self):
        """
        Updated the agent's decision network based on a sample of
        previous decisions it has seen. Here, we combine the target
        and action networks.
        """
        if len(self.memory) < self.batch_size:
            return

        args_as_tuple = get_batch_from_memory(self.memory, self.batch_size)
        loss_value = self.take_training_step(*args_as_tuple)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    @tf.function
    def take_training_step(self, sts, a, r, n_sts, d):
        future_q_pred = tf.math.reduce_max(self.model(n_sts), axis=-1)
        future_q_pred = tf.where(
            d, tf.zeros((1,), dtype=tf.dtypes.float64), future_q_pred)
        q_targets = tf.cast(r, tf.float64) + self.gamma * future_q_pred

        loss_value, grads = self.squared_diff_loss_at_a(sts, a, q_targets)

        self.optimizer.apply_gradients(
            zip(grads, self.model.trainable_variables))

        return loss_value

    @tf.function
    def squared_diff_loss_at_a(self, sts, a, q_next):
        """
        A squared difference loss function

        Diffs the Q model's predicted values for a state with the actual
        reward + predicted values for the next state
        """
        with tf.GradientTape() as tape:
            q_s = self.model(sts)  # Q(st)
            # Take only predicted value of the action taken for Q(st|at)
            gather_indices = tf.range(a.shape[0]) * tf.shape(q_s)[-1] + a
            q_s_a = tf.gather(tf.reshape(q_s, [-1]), gather_indices)
            # Q(st|at) diff Q(st+1)
            losses = tf.math.squared_difference(q_s_a, q_next)
            reduced_loss = tf.math.reduce_mean(losses)

        return (reduced_loss,
                tape.gradient(reduced_loss, self.model.trainable_variables))

    def save_state(self):
        """
        Called at the end of saving-episodes.

        Save a (trained) model with its weights to a specified file.
        Passes the required information to add to the pickle dict for
        the model.
        """
        add_to_save = {
            "epsilon": self.epsilon,
            "memory": self.memory,
            "optimizer_config": self.optimizer.get_config(),
        }
        self.save_state_to_dict(append_dict=add_to_save)
        self.model.save(self.model_location)

    def load_state(self):
        """Load a model with the specified name"""
        model_dict = self.load_state_from_dict()
        print("Loading weights from", self.model_location + "...", end="")
        if os.path.exists(self.model_location):
            self.model = tf.keras.models.load_model(self.model_location)
            self.optimizer = self.optimizer.from_config(self.optimizer_config)
            del model_dict["optimizer_config"], self.optimizer_config
            print(" Loaded.")
        else:
            print(" Model not yet saved at location.")
        if "memory" in model_dict:
            del model_dict["memory"]
        print("Loaded state:")
        pprint.pprint(model_dict, depth=1)