def reward_value_constrained(v_log_counts_future, v_log_counts_old,
                             KQ_f_new, KQ_r_new,
                             E_Regulation_new, E_Regulation_old):
    final_reward = 0.0
    reward_s = reward_intermediate(v_log_counts_future, v_log_counts_old)

    # psi originally does nothing (1.0), but becomes a penalty factor when a
    # new reaction is regulated.
    psi = 1.0
    #reward_s = e_val_old - e_val_future

    num_regulated_new = np.sum(E_Regulation_new == 1)
    num_regulated_old = np.sum(E_Regulation_old == 1)

    if (num_regulated_new != num_regulated_old):
        # Then you regulated a new reaction:
        psi = penalty_reward_scalar

    if (reward_s < 0.0):
        final_reward = penalty_exclusion_reward
    else:
        # If negative (-0.01) -> take fastest path;
        # if positive (0.01) -> take slowest path.
        final_reward = psi * reward_s

    if (np.max(v_log_counts_future - target_v_log_counts) <= 0.0):
        # The final reward is meant to maximize the EPR value. However, there
        # was some residual error in ds_metab that must be taken into account,
        # so we add the last reward_s to the EPR value.
        epr_future = max_entropy_functions.entropy_production_rate(
            KQ_f_new, KQ_r_new, E_Regulation_new)
        final_reward = epr_future + psi * reward_s

    return final_reward

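# A minimal standalone sketch (hypothetical, not part of the original module)
# of the psi-penalty branching above; the default values stand in for the
# module-level constants penalty_reward_scalar and penalty_exclusion_reward.
def psi_penalty_demo(reward_s, regulated_new_reaction,
                     penalty_reward_scalar=0.5,
                     penalty_exclusion_reward=-1.0):
    # psi stays 1.0 unless a previously unregulated reaction was just regulated.
    psi = penalty_reward_scalar if regulated_new_reaction else 1.0
    if reward_s < 0.0:
        return penalty_exclusion_reward  # regressions get a fixed penalty
    return psi * reward_s  # progress is scaled down after regulating anew

# e.g. psi_penalty_demo(0.02, True) == 0.01; psi_penalty_demo(-0.5, False) == -1.0
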
def reward_value(v_log_counts_future, v_log_counts_old,
                 KQ_f_new, KQ_r_new, E_Regulation_new, E_Regulation_old):
    # NOTE: a second reward_value is defined below; in a single module the
    # later definition takes effect. This variant computes reward_s inline
    # via the log-sum-exp trick instead of calling reward_intermediate.
    final_reward = 0.0

    # Here we use the midpoint of the max and min for the scaling. The logic
    # is the log-sum-exp trick:
    # https://www.xarg.org/2016/06/the-log-sum-exp-trick-in-machine-learning/
    scale_old_max = np.max(v_log_counts_old - target_v_log_counts)
    scale_old_min = np.min(v_log_counts_old - target_v_log_counts)
    scale_old = (scale_old_max + scale_old_min) / 2.0

    e_val_old = np.exp(v_log_counts_old - target_v_log_counts - scale_old)
    e_val_old = scale_old + np.log(np.sum(e_val_old))

    scale_future_max = np.max(v_log_counts_future - target_v_log_counts)
    scale_future_min = np.min(v_log_counts_future - target_v_log_counts)
    scale_future = (scale_future_max + scale_future_min) / 2.0

    e_val_future = np.exp(v_log_counts_future - target_v_log_counts - scale_future)
    e_val_future = scale_future + np.log(np.sum(e_val_future))

    reward_s = e_val_old - e_val_future
    final_reward = reward_s

    if (scale_future_max <= 0.0):
        # The final reward is meant to maximize the EPR value. However, there
        # was some residual error that must be taken into account, so we add
        # the last reward_s to the EPR value.
        epr_future = max_entropy_functions.entropy_production_rate(
            KQ_f_new, KQ_r_new, E_Regulation_new)
        final_reward = epr_future + reward_s

    return final_reward

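# The scaling above is the log-sum-exp trick from the linked post: for any
# shift c, log(sum(exp(x))) == c + log(sum(exp(x - c))), so subtracting a
# scale before exponentiating avoids overflow. A minimal standalone check
# (the function name is illustrative, not part of the original module):
import numpy as np

def logsumexp_midpoint(x):
    c = (np.max(x) + np.min(x)) / 2.0  # midpoint scale, as in reward_value
    return c + np.log(np.sum(np.exp(x - c)))

# e.g. logsumexp_midpoint(np.array([1000.0, 1000.1])) is finite, while the
# naive np.log(np.sum(np.exp(np.array([1000.0, 1000.1])))) overflows to inf.
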
def reward_value(v_log_counts_future, v_log_counts_old,
                 KQ_f_new, KQ_r_new, E_Regulation_new, E_Regulation_old):
    final_reward = 0.0
    reward_s = reward_intermediate(v_log_counts_future, v_log_counts_old)
    final_reward = reward_s

    if (np.max(v_log_counts_future - target_v_log_counts) <= 0.0):
        # The final reward is meant to maximize the EPR value. However, there
        # was some residual error in ds_metab that must be taken into account,
        # so we add the last reward_s to the EPR value.
        epr_future = max_entropy_functions.entropy_production_rate(
            KQ_f_new, KQ_r_new, E_Regulation_new)
        final_reward = epr_future + reward_s

    return final_reward

def sarsa_n(nn_model, loss_fn, optimizer, scheduler, state_sample,
            n_back_step, epsilon_greedy):
    total_time_cpu = 0
    total_time_nn = 0

    # Reset per episode; the policy will add to these counters.
    random_steps_taken = 0
    nn_steps_taken = 0

    final_state = []
    final_KQ_f = []
    final_KQ_r = []
    reached_terminal_state = False
    average_loss = []

    final_reward = 0
    sum_reward_episode = 0
    end_of_path = 5000  # maximum length a path can take

    KQ_f_matrix = np.zeros(shape=(num_rxns, end_of_path + 1))
    KQ_r_matrix = np.zeros(shape=(num_rxns, end_of_path + 1))
    states_matrix = np.zeros(shape=(num_rxns, end_of_path + 1))
    delta_S_metab_matrix = np.zeros(shape=(nvar, end_of_path + 1))
    v_log_counts_matrix = np.zeros(shape=(nvar, end_of_path + 1))

    states_matrix[:, 0] = state_sample

    res_lsq = least_squares(
        max_entropy_functions.derivatives, v_log_counts_static, method='lm',
        xtol=1e-15,
        args=(f_log_counts, mu0, S_mat, R_back_mat, P_mat,
              delta_increment_for_small_concs, Keq_constant,
              states_matrix[:, 0]))
    v_log_counts_matrix[:, 0] = res_lsq.x.copy()

    log_metabolites = np.append(v_log_counts_matrix[:, 0], f_log_counts)

    rxn_flux_init = max_entropy_functions.oddsDiff(
        v_log_counts_matrix[:, 0], f_log_counts, mu0, S_mat, R_back_mat,
        P_mat, delta_increment_for_small_concs, Keq_constant,
        states_matrix[:, 0])

    KQ_f_matrix[:, 0] = max_entropy_functions.odds(
        log_metabolites, mu0, S_mat, R_back_mat, P_mat,
        delta_increment_for_small_concs, Keq_constant)
    Keq_inverse = np.power(Keq_constant, -1)
    KQ_r_matrix[:, 0] = max_entropy_functions.odds(
        log_metabolites, mu0, -S_mat, P_mat, R_back_mat,
        delta_increment_for_small_concs, Keq_inverse, -1)

    delta_S_metab_matrix[:, 0] = max_entropy_functions.calc_deltaS_metab(
        v_log_counts_matrix[:, 0], target_v_log_counts)

    reward_vec = np.zeros(end_of_path + 1)
    reward_vec[0] = 0.0
    rxn_flux_path = rxn_flux_init.copy()

    for t in range(0, end_of_path):
        # Once a terminal state shortens end_of_path, this guard skips further
        # policy steps while the learning block below finishes the remaining
        # n-step updates.
        if (t < end_of_path):
            # This represents the choice from the current policy: regulate one
            # reaction.
            [React_Choice, reward_vec[t + 1],
             KQ_f_matrix[:, t + 1], KQ_r_matrix[:, t + 1],
             v_log_counts_matrix[:, t + 1],
             states_matrix[:, t + 1],
             delta_S_metab_matrix[:, t + 1],
             used_random_step, time_cpu, time_nn] = policy_function(
                 nn_model, states_matrix[:, t], v_log_counts_matrix[:, t],
                 epsilon_greedy)

            total_time_cpu += time_cpu
            total_time_nn += time_nn

            if (used_random_step):
                random_steps_taken += 1
            else:
                nn_steps_taken += 1

            if (React_Choice == -1):
                print("bad reaction choice, using action = -1")
                break

            rxn_flux_path = max_entropy_functions.oddsDiff(
                v_log_counts_matrix[:, t + 1], f_log_counts, mu0, S_mat,
                R_back_mat, P_mat, delta_increment_for_small_concs,
                Keq_constant, states_matrix[:, t + 1])

            if (np.max(rxn_flux_path) < 1.0):
                print("draining flux")
                break

            epr_path = max_entropy_functions.entropy_production_rate(
                KQ_f_matrix[:, t + 1], KQ_r_matrix[:, t + 1],
                states_matrix[:, t + 1])

            sum_reward_episode += reward_vec[t + 1]
            current_state = states_matrix[:, t + 1].copy()

            # We stop the path if we have no more positive loss function
            # values, or if we revisit a state.
            if ((delta_S_metab_matrix[:, t + 1] <= 0.0).all()):
                end_of_path = t + 1  # stops the simulation at step t + 1
                reached_terminal_state = True
                final_state = states_matrix[:, t + 1].copy()
                final_KQ_f = KQ_f_matrix[:, t + 1].copy()
                final_KQ_r = KQ_r_matrix[:, t + 1].copy()
                final_reward = epr_path

                print("**************************************Path Length ds<0******************************************")
                print(end_of_path)
                print("Final STATE")
                print(states_matrix[:, t + 1])
                print(rxn_flux_path)
                print("original epr")
                print(epr_path)
                print("all rewards")
                print(reward_vec[0:t + 1])

        ## BEGIN LEARNING
        tau = t - n_back_step + 1
        if (tau >= 0):
            estimate_value = torch.zeros(1, device=device)
            for i in range(tau + 1, min(tau + n_back_step, end_of_path) + 1):
                estimate_value += (gamma**(i - tau - 1)) * reward_vec[i]

            if ((tau + n_back_step) < end_of_path):
                begin_nn = time.time()
                value_tau_n = state_value(
                    nn_model,
                    torch.from_numpy(
                        states_matrix[:, tau + n_back_step]).float().to(device))
                end_nn = time.time()
                total_time_nn += end_nn - begin_nn
                estimate_value += (gamma**n_back_step) * value_tau_n

            begin_nn = time.time()
            value_tau = state_value(
                nn_model,
                torch.from_numpy(states_matrix[:, tau]).float().to(device))
            end_nn = time.time()
            total_time_nn += end_nn - begin_nn

            if (value_tau.requires_grad == False):
                breakpoint()
            if (estimate_value.requires_grad == True):
                estimate_value.detach_()

            # WARNING: loss_fn expects the input (value_tau) with
            # requires_grad == True, followed by the target (estimate_value)
            # with requires_grad == False.
            begin_nn = time.time()
            loss = loss_fn(value_tau, estimate_value)  # MSE

            optimizer.zero_grad()
            loss.backward()
            clipping_value = 1.0
            torch.nn.utils.clip_grad_norm_(nn_model.parameters(), clipping_value)
            optimizer.step()
            end_nn = time.time()
            total_time_nn += end_nn - begin_nn

            average_loss.append(loss.item())

        if (tau >= (end_of_path - 1)):
            break

    # After the episode is finished, take the average loss.
    average_loss_episode = np.mean(average_loss)

    print("index of max error on path")
    print(average_loss.index(max(average_loss)))

    return [sum_reward_episode, average_loss_episode, max(average_loss),
            final_reward, final_state, final_KQ_f, final_KQ_r,
            reached_terminal_state, random_steps_taken, nn_steps_taken]

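# The estimate_value loop above builds the n-step TD target
#   G(tau) = sum_{i=tau+1}^{min(tau+n, T)} gamma**(i - tau - 1) * r_i
#            + gamma**n * V(s_{tau+n})    (bootstrap only when tau + n < T).
# A minimal standalone sketch of that computation; the function name and its
# value_fn/states arguments are illustrative, not part of the original module.
import numpy as np

def n_step_return(rewards, tau, n, T, gamma, value_fn=None, states=None):
    """n-step return from step tau, given per-step rewards[1..T]."""
    g = 0.0
    for i in range(tau + 1, min(tau + n, T) + 1):
        g += (gamma ** (i - tau - 1)) * rewards[i]
    if value_fn is not None and (tau + n) < T:
        g += (gamma ** n) * value_fn(states[:, tau + n])  # bootstrapped tail
    return g

# e.g. n_step_return(np.array([0.0, 1.0, 1.0]), tau=0, n=2, T=2, gamma=0.9)
# evaluates to 1.0 + 0.9 * 1.0 = 1.9, with no bootstrap term (tau + n == T).
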
# Make calculations to regulate.
rxn_flux = max_entropy_functions.oddsDiff(
    v_log_counts, f_log_counts, mu0, S_mat, R_back_mat, P_mat,
    delta_increment_for_small_concs, Keq_constant, E_regulation)

KQ_f = max_entropy_functions.odds(
    log_metabolites, mu0, S_mat, R_back_mat, P_mat,
    delta_increment_for_small_concs, Keq_constant)
Keq_inverse = np.power(Keq_constant, -1)
KQ_r = max_entropy_functions.odds(
    log_metabolites, mu0, -S_mat, P_mat, R_back_mat,
    delta_increment_for_small_concs, Keq_inverse, -1)

epr = max_entropy_functions.entropy_production_rate(KQ_f, KQ_r, E_regulation)

delta_S_metab = max_entropy_functions.calc_deltaS_metab(
    v_log_counts, target_v_log_counts)
delta_S = max_entropy_functions.calc_deltaS(
    v_log_counts, target_v_log_counts, f_log_counts, S_mat, KQ_f)

[RR, Jac] = max_entropy_functions.calc_Jac2(
    v_log_counts, f_log_counts, S_mat, delta_increment_for_small_concs,
    KQ_f, KQ_r, E_regulation)
A = max_entropy_functions.calc_A(
    v_log_counts, f_log_counts, S_mat, Jac, E_regulation)

def sarsa_n(nn_model, loss_fn, optimizer, scheduler, state_sample,
            n_back_step, epsilon_greedy):
    # NOTE: a second variant of sarsa_n. Unlike the version above, it keeps
    # only the current KQ/v_log_counts vectors instead of full per-step
    # matrices, and falls back across least_squares methods on failure.

    # Reset per episode; the policy will add to these counters.
    random_steps_taken = 0
    nn_steps_taken = 0

    maximum_predicted_value = 0
    layer_weight = torch.zeros(1, device=device)

    final_state = []
    final_KQ_f = []
    final_KQ_r = []
    reached_terminal_state = False
    average_loss = []

    final_reward = 0
    sum_reward_episode = 0
    end_of_path = 1000  # maximum length a path can take

    states_matrix = np.zeros(shape=(num_rxns, end_of_path + 1))
    states_matrix[:, 0] = state_sample

    res_lsq = least_squares(
        max_entropy_functions.derivatives, v_log_counts_static,
        method=Method1, bounds=(-500, 500), xtol=1e-15,
        args=(f_log_counts, mu0, S_mat, R_back_mat, P_mat,
              delta_increment_for_small_concs, Keq_constant,
              states_matrix[:, 0]))
    if (res_lsq.success == False):
        res_lsq = least_squares(
            max_entropy_functions.derivatives, v_log_counts_static,
            method=Method2, xtol=1e-15,
            args=(f_log_counts, mu0, S_mat, R_back_mat, P_mat,
                  delta_increment_for_small_concs, Keq_constant,
                  states_matrix[:, 0]))
        if (res_lsq.success == False):
            res_lsq = least_squares(
                max_entropy_functions.derivatives, v_log_counts_static,
                method=Method3, xtol=1e-15,
                args=(f_log_counts, mu0, S_mat, R_back_mat, P_mat,
                      delta_increment_for_small_concs, Keq_constant,
                      states_matrix[:, 0]))

    v_log_counts_current = res_lsq.x.copy()
    log_metabolites = np.append(v_log_counts_current, f_log_counts)

    rxn_flux_init = max_entropy_functions.oddsDiff(
        v_log_counts_current, f_log_counts, mu0, S_mat, R_back_mat, P_mat,
        delta_increment_for_small_concs, Keq_constant, states_matrix[:, 0])

    KQ_f_current = max_entropy_functions.odds(
        log_metabolites, mu0, S_mat, R_back_mat, P_mat,
        delta_increment_for_small_concs, Keq_constant)
    Keq_inverse = np.power(Keq_constant, -1)
    KQ_r_current = max_entropy_functions.odds(
        log_metabolites, mu0, -S_mat, P_mat, R_back_mat,
        delta_increment_for_small_concs, Keq_inverse, -1)

    delta_S_metab_current = max_entropy_functions.calc_deltaS_metab(
        v_log_counts_current, target_v_log_counts)

    #[ccc, fcc] = max_entropy_functions.conc_flux_control_coeff(
    #    nvar, A_init, S_mat, rxn_flux_init, RR)

    reward_vec = np.zeros(end_of_path + 1)
    reward_vec[0] = 0.0
    rxn_flux_path = rxn_flux_init.copy()
    #A_path = A_init.copy()

    for t in range(0, end_of_path):
        # Once a terminal state shortens end_of_path, this guard skips further
        # policy steps while the learning block below finishes the remaining
        # n-step updates.
        if (t < end_of_path):
            # This represents the choice from the current policy: regulate one
            # reaction.
            [React_Choice, reward_vec[t + 1],
             KQ_f_current, KQ_r_current,
             v_log_counts_current,
             states_matrix[:, t + 1],
             delta_S_metab_current,
             used_random_step] = policy_function(
                 nn_model, states_matrix[:, t], v_log_counts_current,
                 epsilon_greedy)

            if (used_random_step):
                random_steps_taken += 1
            else:
                nn_steps_taken += 1

            if (React_Choice == -1):
                print("out of rewards, final state")
                print(states_matrix[:, t + 1])
                break

            rxn_flux_path = max_entropy_functions.oddsDiff(
                v_log_counts_current, f_log_counts, mu0, S_mat, R_back_mat,
                P_mat, delta_increment_for_small_concs, Keq_constant,
                states_matrix[:, t + 1])

            epr_path = max_entropy_functions.entropy_production_rate(
                KQ_f_current, KQ_r_current, states_matrix[:, t + 1])

            sum_reward_episode += reward_vec[t + 1]
            final_state = states_matrix[:, t + 1].copy()

            # We stop the path if we have no more positive loss function
            # values, or if we revisit a state.
            if ((delta_S_metab_current <= 0.0).all()):
                end_of_path = t + 1  # stops the simulation at step t + 1
                reached_terminal_state = True
                final_state = states_matrix[:, t + 1].copy()
                final_KQ_f = KQ_f_current.copy()
                final_KQ_r = KQ_r_current.copy()
                final_reward = epr_path

                print("**************************************Path Length ds<0******************************************")
                print(end_of_path)
                print("Final STATE")
                print(states_matrix[:, t + 1])
                print(rxn_flux_path)
                print("original epr")
                print(epr_path)
                print("all rewards:")
                #print(reward_vec[0:t+1])

        tau = t - n_back_step + 1
        if (tau >= 0):
            # Forward pass: build the n-step return estimate.
            estimate_value = torch.zeros(1, device=device)
            for i in range(tau + 1, min(tau + n_back_step, end_of_path) + 1):
                estimate_value += (gamma**(i - tau - 1)) * reward_vec[i]

            if ((tau + n_back_step) < end_of_path):
                value_tau_n = state_value(
                    nn_model,
                    torch.from_numpy(
                        states_matrix[:, tau + n_back_step]).float().to(device))
                estimate_value += (gamma**n_back_step) * value_tau_n

            value_tau = state_value(
                nn_model,
                torch.from_numpy(states_matrix[:, tau]).float().to(device))

            if (value_tau.requires_grad == False):
                print('value tau broken')
            if (estimate_value.requires_grad == True):
                estimate_value.detach_()
            # End of the forward pass.

            # WARNING: loss_fn expects the input (value_tau) with
            # requires_grad == True, followed by the target (estimate_value)
            # with requires_grad == False.
            optimizer.zero_grad()
            loss = loss_fn(value_tau, estimate_value)  # currently MSE
            loss.backward()

            clipping_value = 1.0
            #torch.nn.utils.clip_grad_value_(nn_model.parameters(), clipping_value)
            torch.nn.utils.clip_grad_norm_(nn_model.parameters(), clipping_value)
            optimizer.step()

            average_loss.append(loss.item())

        if (tau >= (end_of_path - 1)):
            break

    # After the episode is finished, take the average loss.
    average_loss_episode = np.mean(average_loss)
    #print(average_loss)

    print("index of max error on path")
    print(average_loss.index(max(average_loss)))
    #print("All rewards")
    #print(reward_vec[0:t+1])

    return [sum_reward_episode, average_loss_episode, max(average_loss),
            final_reward, final_state, final_KQ_f, final_KQ_r,
            reached_terminal_state, random_steps_taken, nn_steps_taken]

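# The initializer above retries scipy's least_squares with Method2 and then
# Method3 whenever the previous attempt reports failure. A minimal standalone
# sketch of that fallback pattern on a toy residual; the method names here
# are assumptions standing in for Method1/Method2/Method3.
import numpy as np
from scipy.optimize import least_squares

def solve_with_fallback(residual, x0, methods=('trf', 'dogbox', 'lm')):
    """Try each least_squares method in turn; return the first success."""
    result = None
    for method in methods:
        result = least_squares(residual, x0, method=method, xtol=1e-15)
        if result.success:
            break
    return result

# e.g. solve_with_fallback(lambda x: x**2 - 2.0, np.array([1.0])).x is close
# to sqrt(2).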