Example #1
    def act_Probabilistic(self):
        
        # Note: do not modify q_estimation here; doing so would corrupt the following Q updates.
        # The softmax is applied here (in act) and its result is stored in choice_prob instead.
        if '_CK' in self.forager:
            self.choice_prob[:, self.time] = softmax(np.vstack([self.q_estimation[:, self.time], self.choice_kernel[:, self.time]]),
                                                     np.vstack([self.softmax_temperature, self.choice_softmax_temperature]),
                                                     bias=self.bias_terms)  # This softmax accepts stacked Q and choice-kernel values, each with its own temperature
        else:
            self.choice_prob[:, self.time] = softmax(self.q_estimation[:, self.time], self.softmax_temperature, bias=self.bias_terms)

        if self.if_fit_mode:
            self.predictive_choice_prob[:, self.time] = self.choice_prob[:, self.time]
            choice = None   # No need to make a specific choice in fitting mode
        else:
            choice = choose_ps(self.choice_prob[:, self.time])
            self.choice_history[0, self.time] = choice
            
        return choice
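
The act_Probabilistic method relies on two helpers, softmax and choose_ps, that are not shown in these examples. The sketch below is a minimal, hedged reconstruction inferred only from how they are called above (stacked Q/choice-kernel values with per-row temperatures and a bias term); the actual implementations may differ.

import numpy as np

def softmax(x, softmax_temperature, bias=0):
    """Sketch of the assumed softmax helper.

    If x is 2-D (e.g. Q-values stacked on choice-kernel values), each row is
    scaled by its own temperature and the rows are summed before the softmax.
    `bias` (e.g. [biasL, 0]) is added to the resulting scores.
    """
    x = np.atleast_2d(x)
    temperature = np.atleast_1d(softmax_temperature).reshape(-1, 1)
    score = np.sum(x / temperature, axis=0) + bias
    score -= np.max(score)                      # Subtract the max for numerical stability
    exp_score = np.exp(score)
    return exp_score / np.sum(exp_score)

def choose_ps(ps):
    """Sketch: draw one option index according to the probability vector ps."""
    ps = np.asarray(ps, dtype=float)
    return np.random.choice(len(ps), p=ps / np.sum(ps))
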
Example #2
def negLL_slide_win(fit_value, *args):
    '''Negative log-likelihood function for the sliding window'''

    # Arguments interpretation
    Q_0, choices, rewards = args
    learn_rate, softmax_temperature, biasL = fit_value
    bias_terms = np.array([biasL, 0])

    trial_n_win = np.shape(choices)[1]
    Q_win = np.zeros_like(rewards)  # K_arm * trial_n
    choice_prob_win = np.zeros_like(rewards)

    # -- Do mini-simulation in this sliding window (light version of RW1972) --
    for t in range(trial_n_win):
        Q_old = Q_0 if t == 0 else Q_win[:, t - 1]

        # Update Q
        choice_this = choices[0, t]
        Q_win[choice_this, t] = Q_old[choice_this] + learn_rate * (
            rewards[choice_this, t] - Q_old[choice_this])  # Chosen side
        Q_win[1 - choice_this, t] = Q_old[1 - choice_this]  # Unchosen side

        # Update choice_prob
        choice_prob_win[:, t] = softmax(Q_win[:, t],
                                        softmax_temperature,
                                        bias=bias_terms)

    # Compute negative likelihood
    likelihood_each_trial = choice_prob_win[choices[
        0, :], range(trial_n_win)]  # Get the actual likelihood for each trial

    # Deal with numerical precision:
    # replace zero (or slightly negative, from round-off) likelihoods with a tiny positive value,
    # which avoids -inf from np.log(0) while still penalizing such trials heavily,
    # so the number of (near-)zero likelihoods remains informative.
    likelihood_each_trial[(likelihood_each_trial <= 0) &
                          (likelihood_each_trial > -1e-5)] = 1e-16
    likelihood_each_trial[likelihood_each_trial > 1] = 1

    negLL = -np.sum(np.log(likelihood_each_trial))

    return negLL
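
A hedged usage sketch for the function above: fit one sliding window by minimizing negLL_slide_win with scipy.optimize.minimize. The data, bounds, and starting point below are illustrative only, and a softmax helper (e.g. the one sketched after Example #1) is assumed to be available.

import numpy as np
from scipy import optimize

rng = np.random.default_rng(0)
n_win = 10                                                  # Window length (illustrative)
choices = rng.integers(0, 2, size=(1, n_win))               # 1 x trial_n; 0 = LEFT, 1 = RIGHT
rewards = np.zeros((2, n_win))
rewards[choices[0], range(n_win)] = rng.integers(0, 2, size=n_win)   # Reward only on the chosen side
Q_0 = np.zeros(2)                                           # Initial Q for this window

x0 = [0.4, 0.4, 0.0]                                        # [learn_rate, softmax_temperature, biasL]
bounds = optimize.Bounds([0, 1e-2, -5], [1, 15, 5])         # Illustrative bounds
res = optimize.minimize(negLL_slide_win, x0,
                        args=(Q_0, choices, rewards),
                        method='L-BFGS-B', bounds=bounds)
print(res.x)                                                # Best-fit [learn_rate, softmax_temperature, biasL]
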
Example #3
 def step(self, choice):
        
     # =============================================================================
     #  Generate reward and make the state transition (i.e., prepare reward for the next trial)
     # =============================================================================
     
     # These four lines work for both varying reward probability and amplitude
     reward = self.reward_available[choice, self.time]    
     self.reward_history[choice, self.time] = reward   # Note that according to Sutton & Barto's convention,
                                                       # this update should belong to time t+1, but here I use t for simplicity.
     reward_available_after_choice = self.reward_available[:, self.time].copy()  # An intermediate reward status. Note the .copy()!
     reward_available_after_choice[choice] = 0   # The reward is depleted at the chosen lick port.
     
     self.time += 1   # Time ticks here !!!
     if self.time == self.n_trials: 
         return   # Session terminates
     
     if not self.if_varying_amplitude:  # Varying reward prob.
         # For the next reward status, the "or" statement ensures the baiting property, gated by self.if_baited.
         self.reward_available[:, self.time] = np.logical_or( reward_available_after_choice * self.if_baited,    
                                            np.random.uniform(0,1,self.k) < self.p_reward[:,self.time]).astype(int)  
     else:    # Varying reward amplitude
         # For the chosen side AND the unchosen side: 
     # amplitude = 1 - (1 - amp)^(time since the last choice)  ==>  next_amp = 1 - (1 - previous_amp) * (1 - p_reward)
         self.reward_available[:, self.time] = 1 - (1 - reward_available_after_choice * self.if_baited) * (1 - self.p_reward[:,self.time])
     
         
     # =============================================================================
     #  Update value estimation (or Poisson choice probability)
     # =============================================================================
     # = Forager types:
     #   1. Special foragers
     #       1). 'Random'
     #       2). 'LossCounting': switch to another option when loss count exceeds a threshold drawn from Gaussian [from Shahidi 2019]
     #           - 2.1: loss_count_threshold = inf --> Always One Side
     #           - 2.2: loss_count_threshold = 1 --> win-stay-lose-switch
     #           - 2.3: loss_count_threshold = 0 --> Always switch
     #       3). 'IdealpGreedy': knows p_reward + always chooses the largest one
     #       4). 'IdealpHatGreedy': knows p_reward AND p_hat + always chooses the largest one p_hat ==> {m,1}, analytical
     #       5). 'IdealpHatOptimal': knows p_reward AND p_hat + always chooses the REAL optimal ==> {m,n}, no analytical solution
     #       6). 'pMatching': to show that pMatching is necessary but not sufficient
     #
     #   2. LNP-like (linear-nonlinear-Poisson) foragers
     #       1). 'Sugrue2004':        income  ->   exp filter   ->  fractional                   -> epsilon-Poisson (epsilon = 0 in their paper; I found it essential)
     #       2). 'Corrado2005':     income  ->  2-exp filter  ->  softmax ( = diff + sigmoid)  -> epsilon-Poisson (epsilon = 0 in their paper; has the same effect as tau_long??)
     #       3). 'Iigaya2019':      income  ->  2-exp filter  ->  fractional                   -> epsilon-Poisson (epsilon = 0 in their paper; has the same effect as tau_long??)
     #
     #   3. RL-like foragers
     #       1). 'SuttonBartoRLBook': return  ->   exp filter                                    -> epsilon-greedy  (epsilon > 0 is essential)
     #       2). 'Bari2019':        return/income  ->   exp filter (both forgetting)   -> softmax     -> epsilon-Poisson (epsilon = 0 in their paper; not necessary)
     #       3). 'Hattori2019':     return/income  ->   exp filter (choice-dependent forgetting, reward-dependent step_size)  -> softmax  -> epsilon-Poisson (epsilon = 0 in their paper; not necessary)
             
     if self.forager in ['LossCounting']:
         if self.loss_count[0, self.time - 1] < 0:  # A switch just happened
             self.loss_count[0, self.time - 1] = - self.loss_count[0, self.time - 1]  # Back to normal (Note that this = 0 in Shahidi 2019)
             if reward:
                 self.loss_count[0, self.time] = 0
             else:
                 self.loss_count[0, self.time] = 1
         else:
             if reward:
                 self.loss_count[0, self.time] = self.loss_count[0, self.time - 1]
             else:
                 self.loss_count[0, self.time] = self.loss_count[0, self.time - 1] + 1
             
     elif self.forager in ['SuttonBartoRLBook', 'Bari2019', 'Hattori2019'] or 'PatternMelioration' in self.forager:
         # Local return
         # Note 1: These three foragers only differ in how they handle step size and forget rate.
         # Note 2: It's "return" rather than "income" because the unchosen Q is not updated (when forget_rate = 0 in SuttonBartoRLBook)
         # Note 3: However, if forget_rate > 0, the unchosen one is also updated, and thus it's somewhere between "return" and "income".
         #         In fact, when step_size = forget_rate, the unchosen Q is updated by exactly the same rule as chosen Q, so it becomes exactly "income"
         
         # 'PatternMelioration' = 'SuttonBartoRLBook' (i.e., RW1972) here because it needs to compute the average RETURN.
         
         # Reward-dependent step size ('Hattori2019')
         if reward:   
             step_size_this = self.step_sizes[1]
         else:
             step_size_this = self.step_sizes[0]
         
         # Choice-dependent forgetting rate ('Hattori2019')
         # Chosen:   Q(n+1) = (1- forget_rate_chosen) * Q(n) + step_size * (Reward - Q(n))
         self.q_estimation[choice, self.time] = (1 - self.forget_rates[1]) * self.q_estimation[choice, self.time - 1]  \
                                          + step_size_this * (reward - self.q_estimation[choice, self.time - 1])
                                              
         # Unchosen: Q(n+1) = (1-forget_rate_unchosen) * Q(n)
         unchosen_idx = [cc for cc in range(self.k) if cc != choice]
         self.q_estimation[unchosen_idx, self.time] = (1 - self.forget_rates[0]) * self.q_estimation[unchosen_idx, self.time - 1] 
         
        
         # Softmax in 'Bari2019', 'Hattori2019'
         if self.forager in ['Bari2019', 'Hattori2019']:
             # --- The line below was erroneous: it overwrote q_estimation, which must remain a pure value estimate (fixed 04/08/2020) ---
             #     self.q_estimation[:, self.time] = softmax(self.q_estimation[:, self.time], self.softmax_temperature)
             self.choice_prob[:, self.time] = softmax(self.q_estimation[:, self.time], self.softmax_temperature)
         
     elif self.forager in ['Sugrue2004', 'Iigaya2019']:
         # Fractional local income
         # Note: It's "income" because the following computations do not dependent on the current ~choice~.
         
         # 1. Local income = Reward history + exp filter in Sugrue or 2-exp filter in IIgaya
         valid_reward_history = self.reward_history[:, :self.time]   # History till now
         valid_filter = self.history_filter[-self.time:]    # Corresponding filter
         local_income = np.sum(valid_reward_history * valid_filter, axis = 1)
         
         # 2. Poisson choice probability = Fractional local income
         if np.sum(local_income) == 0:
             # 50%-to-50%
             # self.q_estimation[:, self.time] = [1/self.k] * self.k
             self.choice_prob[:, self.time] = [1/self.k] * self.k
         else:
             # Local fractional income
             # self.q_estimation[:, self.time] = local_income / np.sum(local_income)
             self.choice_prob[:, self.time] = local_income / np.sum(local_income)
             
     elif self.forager == 'Corrado2005':
         # Softmaxed local income
         
         # 1. Local income = Reward history + hyperbolic (2-exps) filter
         valid_reward_history = self.reward_history[:, :self.time]   # History till now
         valid_filter = self.history_filter[-self.time:]    # Corresponding filter
         local_income = np.sum(valid_reward_history * valid_filter, axis = 1)
         
         # 2. Poisson choice probability = Softmaxed local income (Note: equivalent to "difference + sigmoid" in [Corrado et al. 2005] for the 2-lickport case)
         # self.q_estimation[:, self.time] = softmax(local_income, self.softmax_temperature)
         self.choice_prob[:, self.time] = softmax(local_income, self.softmax_temperature)
         
     elif 'FullState' in self.forager:
         # print(', rew = ', reward)
         self.full_state_Qforager.update_Q(reward)  # All the magic is in the class definition
         
         if self.if_plot_Q:
             go_on = self.full_state_Qforager.plot_Q(self.time, reward, self.p_reward[:,self.time], self.description)
             if not go_on:  # No longer plot
                 self.if_plot_Q = False
                 
             if self.if_record_Q and self.time == self.n_trials - 1:  # The last frame, stop recording
                 self.full_state_Qforager.writer.cleanup()    
                 self.full_state_Qforager.writer.finish()
         
     return reward
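
As a worked illustration of the RL-like update in step above (reward-dependent step size plus choice-dependent forgetting, as used for 'Hattori2019'): Q_chosen(t) = (1 - forget_rate_chosen) * Q_chosen(t-1) + step_size * (reward - Q_chosen(t-1)), and Q_unchosen(t) = (1 - forget_rate_unchosen) * Q_unchosen(t-1). A standalone numerical sketch with made-up parameter values:

import numpy as np

forget_rates = [0.10, 0.05]    # [unchosen, chosen] (made-up values)
step_sizes = [0.2, 0.4]        # [after no reward, after reward] (made-up values)

Q = np.array([0.3, 0.6])       # Q(t-1) for [LEFT, RIGHT]
choice, reward = 0, 1          # The forager chose LEFT and got a reward

step_size_this = step_sizes[1] if reward else step_sizes[0]
Q_next = Q.copy()
Q_next[choice] = (1 - forget_rates[1]) * Q[choice] + step_size_this * (reward - Q[choice])
Q_next[1 - choice] = (1 - forget_rates[0]) * Q[1 - choice]
print(Q_next)                  # [0.565, 0.54]: the chosen side moves toward the reward, the unchosen side decays
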
Example #4
 def act(self):
     # =============================================================================
     #  Make the choice for this trial (action selection)
     # =============================================================================
     # = Forager types:
     #   1. Special foragers
     #       1). 'Random'
     #       2). 'LossCounting': switch to another option when loss count exceeds a threshold drawn from Gaussian [from Shahidi 2019]
     #           - 2.1: loss_count_threshold = inf --> Always One Side
     #           - 2.2: loss_count_threshold = 1 --> win-stay-lose-switch
     #           - 2.3: loss_count_threshold = 0 --> Always switch
     #       3). 'IdealpGreedy': knows p_reward + always chooses the largest one
     #       4). 'IdealpHatGreedy': knows p_reward AND p_hat + always chooses the largest one p_hat ==> {m,1}, analytical
     #       5). 'IdealpHatOptimal': knows p_reward AND p_hat + always chooses the REAL optimal ==> {m,n}, no analytical solution
     #       6). 'pMatching': to show that pMatching is necessary but not sufficient
     #
     #   2. LNP-like (linear-nonlinear-Poisson) foragers
     #       1). 'Sugrue2004':        income  ->   exp filter   ->  fractional                   -> epsilon-Poisson (epsilon = 0 in their paper; I found it essential)
     #       2). 'Corrado2005':     income  ->  2-exp filter  ->  softmax ( = diff + sigmoid)  -> epsilon-Poisson (epsilon = 0 in their paper; has the same effect as tau_long??)
     #       3). 'Iigaya2019':      income  ->  2-exp filter  ->  fractional                   -> epsilon-Poisson (epsilon = 0 in their paper; has the same effect as tau_long??)
     #
     #   3. RL-like foragers
     #       1). 'SuttonBartoRLBook': return  ->   exp filter                                    -> epsilon-greedy  (epsilon > 0 is essential)
     #       2). 'Bari2019':        return/income  ->   exp filter (both forgetting)   -> softmax     -> epsilon-Poisson (epsilon = 0 in their paper; not necessary)
     #       3). 'Hattori2019':     return/income  ->   exp filter (choice-dependent forgetting, reward-dependent step_size)  -> softmax  -> epsilon-Poisson (epsilon = 0 in their paper; not necessary)
            
     if self.forager == 'Random': 
         choice = np.random.choice(self.k)
         
     elif self.forager == 'AlwaysLEFT':
         choice = LEFT
         
     elif self.forager in ['IdealpHatOptimal','IdealpHatGreedy','AmB1']:   # Foragers that have the pattern {AmBn}
         choice = self.choice_history[0, self.time]  # Already saved in the optimal sequence
         
     elif self.forager == 'pMatching':  # Probability matching of base probabilities p
         choice = choose_ps(self.p_reward[:,self.time])
         
     elif self.forager == 'LossCounting':
         if self.time == 0:
             choice = np.random.choice(self.k)  # Random on the first trial
         else:
             # Retrieve the last choice
             last_choice = self.choice_history[0, self.time - 1]
             
             if self.loss_count[0, self.time] >= self.loss_threshold_this:
                 # Switch
                 choice = LEFT + RIGHT - last_choice
                 
                 # Reset loss counter threshold
                 self.loss_count[0, self.time] = - self.loss_count[0, self.time]  # Negative value flags that a switch happened here
                 self.loss_threshold_this = np.random.normal(self.loss_count_threshold_mean, self.loss_count_threshold_std)
             else:
                 # Stay
                 choice = last_choice
         
     elif self.forager == 'IdealpGreedy':
         choice = np.random.choice(np.where(self.p_reward[:,self.time] == self.p_reward[:,self.time].max())[0])
         
     elif 'PatternMelioration' in self.forager:
         rich_now = np.argmax(self.pattern_now)
         lean_now = 1 - rich_now
         
         if self.run_length_now[rich_now] < self.pattern_now[rich_now]:     # If rich side is not finished
             choice = rich_now  # Make decision
             self.run_length_now[rich_now] += 1 # Update counter
             
         elif self.run_length_now[lean_now] < self.pattern_now[lean_now]:   # The rich side has just finished, so run the lean side
             # assert(self.pattern_now[lean_now] == 1)  # Only 1 trial for sure
             choice = lean_now
             self.run_length_now[lean_now] += 1 # Update counter
             
         else:                                                              # Otherwise, this pattern has been finished
             if self.forager == 'PatternMelioration':
                 # Update the next pattern
                 if np.abs(np.diff(self.q_estimation[:, self.time])) >= self.pattern_meliorate_threshold: # Update the pattern only when |deltaQ| exceeds a threshold (a step-function update rule)
                     rich_Q = np.argmax(self.q_estimation[:, self.time])  # Better side indicated by Q
                     
                     if np.all(self.pattern_now == 1):  # Already in {1,1}
                         self.pattern_now[rich_Q] += 1
                     else:  # Only modify rich side
                         # -- Estimate p_base by Q (no block structure, direct estimation) --  Doesn't work... Sampling the lean side is not efficient
                         # p_base_est_rich = self.q_estimation[rich_now, self.time]
                         # p_base_est_lean = self.q_estimation[lean_now, self.time] / self.pattern_now[rich_Q]
                     
                         # [m, n], _ = self.get_IdealpHatGreedy_strategy([p_base_est_rich, p_base_est_lean])
                         # m = min(m,15)
                         
                         # if p_base_est_rich > p_base_est_lean:  # Don't change side
                         #     self.pattern_now[[rich_now, lean_now]] = [m, 1]
                         # else:
                         #     self.pattern_now[[rich_now, lean_now]] = [1, m]  # Switch side immediately
                         
                         # -- Block-state enables fast switch
                         if rich_Q == rich_now:
                             self.pattern_now[rich_now] += 1 
                         else:  # Maybe this is a block switch, let's try to make some large modification
                             # self.pattern_now = np.flipud(self.pattern_now)  # Flip
                             self.pattern_now = np.array([1,1])  # Reset
                             self.q_estimation[:, self.time] = 0
                                         
                         # -- Not aware of block structure
                         # pattern_step = 1 if (rich_Q == rich_now) else -1   # If the sign of diff_Q is aligned with rich_pattern, then add 1
                         # self.pattern_now[rich_now] += pattern_step                        
             
             elif self.forager == 'PatternMelioration_softmax':
                 # -- Update_step \propto sigmoid --
                 # deltaQ = self.q_estimation[rich_now, self.time] - self.q_estimation[lean_now, self.time]
                 # update_step = int(self.pattern_meliorate_softmax_max_step * 2 * (1 / (1 + np.exp(- deltaQ / self.pattern_meliorate_softmax_temp)) - 0.5))  # Max = 10
                 # self.pattern_now[rich_now] += update_step
                 # if self.pattern_now[rich_now] < 1: 
                 #     self.pattern_now[lean_now] = 2 - self.pattern_now[rich_now]
                 #     self.pattern_now[rich_now] = 1
                 
                 # -- Softmax -> get p -> use {floor(p/(1-p)), 1} --
                 choice_p = softmax(self.q_estimation[:, self.time], self.pattern_meliorate_softmax_temp)
                 rich_Q = np.argmax(choice_p)
                 m_est = np.floor(choice_p[rich_Q] / choice_p[1-rich_Q])
                 m_est = np.min([m_est, 10])
                 self.pattern_now[[rich_Q, 1-rich_Q]] = [m_est, 1]                    
                 
             
             self.run_length_now = np.array([0,0])  # Reset counter
             
             # Make the first trial for the next pattern
             rich_now = np.argmax(self.pattern_now) # Use the new pattern
             choice = rich_now
             self.run_length_now[rich_now] += 1 # Update counter
             
     elif 'FullState' in self.forager:
         if self.time == 0:
             choice = self.full_state_Qforager.current_state.which[0]
         else:
             choice = self.full_state_Qforager.act()  # All the magic is in the class definition
         # print('\nTime = ', self.time, ': ', choice, end='')
             
     else:
         if np.random.rand() < self.epsilon or np.sum(self.reward_history) < self.random_before_total_reward: 
             # Forced exploration with probability epsilon (to avoid getting stuck on one side, e.g. AlwaysLEFT/RIGHT-like behavior in Sugrue2004)
             choice = np.random.choice(self.k)
             
         else:   # Forager-dependent
             if self.forager == 'SuttonBartoRLBook':   # Greedy
                 choice = np.random.choice(np.where(self.q_estimation[:, self.time] == self.q_estimation[:, self.time].max())[0])
                 
             elif self.forager in ['Sugrue2004', 'Corrado2005', 'Iigaya2019', 'Bari2019', 'Hattori2019' ]:   # Poisson
                 #  choice = choose_ps(self.q_estimation[:,self.time])    
                 choice = choose_ps(self.choice_prob[:,self.time])    
             
     self.choice_history[0, self.time] = int(choice)
     
     return int(choice)
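
To illustrate the 'PatternMelioration_softmax' branch above (softmax over Q, then m ~ floor(p_rich / p_lean), capped at 10), here is a small numerical sketch. It assumes the softmax helper sketched after Example #1 and made-up Q-values and temperature:

import numpy as np

Q = np.array([0.8, 0.3])                         # Made-up Q-values; LEFT looks richer
choice_p = softmax(Q, 0.3)                       # Softmax with an illustrative temperature of 0.3
rich_Q = np.argmax(choice_p)                     # -> 0 (LEFT)
m_est = np.floor(choice_p[rich_Q] / choice_p[1 - rich_Q])
m_est = min(m_est, 10)
print(choice_p, m_est)                           # Probability ratio = exp(0.5 / 0.3) ~ 5.3, so m_est = 5
print('Next pattern: %d LEFT trial(s), 1 RIGHT trial' % m_est)
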
Example #5
 def act_softmax(
         self,
         softmax_temp):  # Generate the next action using softmax(Q) policy
     next_state_idx = choose_ps(
         softmax(self.Q[:len(self.next_states)], softmax_temp))
     return next_state_idx  # Return the index of the next state
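
A minimal usage sketch of the idea in act_softmax: restrict the softmax to the Q-entries of the currently reachable successor states, then sample. The state layout and the helpers (softmax, choose_ps, as sketched after Example #1) are assumptions, not taken from the class definition:

import numpy as np

Q = np.array([0.2, 0.7, 0.0])       # Q-values for up to 3 potential successor states (assumed layout)
next_states = ['Leave', 'Stay']     # Only 2 successors are reachable from the current state
softmax_temp = 0.5

p = softmax(Q[:len(next_states)], softmax_temp)   # Softmax over the reachable successors only
next_state_idx = choose_ps(p)                     # Index into next_states
print(p, next_states[next_state_idx])
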
Example #6
    def plot_Q(self,
               time=np.nan,
               reward=np.nan,
               p_reward=np.nan,
               description=''):  # Visualize value functions (Q(s,a))
        # Initialization
        if np.size(self.ax) == 0:  # First call: the axes have not been created yet
            # Prepare axes
            self.fig, self.ax = plt.subplots(2,
                                             2,
                                             sharey=True,
                                             figsize=[12, 8])
            plt.subplots_adjust(hspace=0.5, top=0.85)
            self.ax2 = self.ax.copy()
            self.annotate = plt.gcf().text(0.05, 0.9, '', fontsize=13)
            for c in [0, 1]:
                for d in [0, 1]:
                    self.ax2[c, d] = self.ax[c, d].twinx()

            # Prepare animation
            if self.if_record_Q:
                metadata = dict(title='FullStateQ', artist='Matplotlib')
                self.writer = FFMpegWriter(fps=25, metadata=metadata)
                self.writer.setup(self.fig,
                                  "..\\results\\%s.mp4" % description, 150)

        direction = ['LEFT', 'RIGHT']
        decision = ['Leave', 'Stay']
        X = np.r_[1:np.shape(self.states)[1] - 0.1]  # Ignore the last run-length state (the agent must leave there)

        # -- Q values and policy --
        for d in [0, 1]:
            # Compute policy p(a|s)
            if self.if_softmax:  # Assumes a softmax policy; Qs and ps below are only defined in this branch
                Qs = np.array([s.Q for s in self.states[d, :-1]])
                ps = []
                for qq in Qs:
                    ps.append(softmax(qq, self.softmax_temperature))
                ps = np.array(ps)

            for c in [0, 1]:
                self.ax[c, d].cla()
                self.ax2[c, d].cla()

                self.ax[c, d].set_xlim([0, max(X) + 1])
                self.ax[c, d].set_ylim([-0.05, max(plt.ylim())])

                bar_color = 'r' if c == 0 else 'g'

                self.ax[c, d].bar(X, Qs[:, c], color=bar_color, alpha=0.5)
                self.ax[c, d].set_title(direction[d] + ', ' + decision[c])
                self.ax[c, d].axhline(0, color='k', ls='--')
                if d == 0: self.ax[c, d].set_ylabel('Q(s,a)', color='k')
                # self.ax[c, d].set_xticks(np.round(self.ax[c, d].get_xticks()))
                self.ax[c, d].set_xticks(X)

                self.ax2[c, d].plot(X, ps[:, c], bar_color + '-o')
                if d == 1: self.ax2[c, d].set_ylabel('P(a|s)', color=bar_color)
                self.ax2[c, d].axhline(0, color=bar_color, ls='--')
                self.ax2[c, d].axhline(1, color=bar_color, ls='--')
                self.ax2[c, d].set_ylim([-0.05, 1.05])

        # -- This state --
        last_state = self.backup_SA[0].which
        current_state = self.current_state.which
        if time > 1:
            self.ax2[0, last_state[0]].plot(last_state[1] + 1,
                                            self.last_reward,
                                            'go',
                                            markersize=10,
                                            alpha=0.5)
        self.ax2[0, current_state[0]].plot(current_state[1] + 1,
                                           reward,
                                           'go',
                                           markersize=15)
        self.last_reward = reward

        # plt.ylim([-1,1])
        self.annotate.set_text(
            '%s\nt = %g, p_reward = %s\n%s --> %s, reward = %g\n' %
            (description, time, p_reward, last_state, current_state, reward))
        if self.if_record_Q:
            print(time)
            self.writer.grab_frame()
            return True
        else:
            plt.gcf().canvas.draw()
            return plt.waitforbuttonpress()
Example #7
def fit_dynamic_learning_rate_session_no_bias_free_Q_0(choice_history,
                                                       reward_history,
                                                       slide_win=10,
                                                       pool='',
                                                       x0=[],
                                                       fixed_sigma='none',
                                                       method='DE'):
    '''
    Fit R-W 1972 with a sliding window of 10 trials (Wang, ..., Botvinick, 2018).
    For each sliding window, Q_0 is a free parameter (one value per side) and there is no bias term.
    '''

    trial_n = np.shape(choice_history)[1]
    if x0 == []: x0 = [0.4, 0.4, 0.5, 0.5]

    # Settings for RW1972
    # ['RW1972_softmax', ['learn_rate', 'softmax_temperature', 'Q_0'],[0, 1e-2, -5],[1, 15, 5]]

    if fixed_sigma == 'global':
        # Fix sigma (softmax temperature) at the globally fitted value
        fit_bounds = [[0, x0[1], 0, 0], [1, x0[1], 1, 1]]
    elif fixed_sigma == 'zeros':
        fit_bounds = [[0, 1e-4, 0, 0], [1, 1e-4, 1, 1]]
    elif fixed_sigma == 'none':
        fit_bounds = [[0, 1e-2, 0, 0], [1, 15, 1, 1]]

    Q = np.zeros(np.shape(
        reward_history))  # Cache of Q values (using the best fit at each step)
    choice_prob = Q.copy()
    fitted_learn_rate = np.zeros(np.shape(choice_history))
    fitted_sigma = np.zeros(np.shape(choice_history))
    # fitted_Q_0 = np.zeros(np.shape(choice_history))

    for t in tqdm(range(1, trial_n - slide_win),
                  desc='Sliding window',
                  total=trial_n - slide_win):
        # for t in range(1, trial_n - slide_win):  # Start from the second trial
        choice_this = choice_history[:, t:t + slide_win]
        reward_this = reward_history[:, t:t + slide_win]

        if method == 'DE':
            fitting_result = optimize.differential_evolution(
                func=negLL_slide_win_no_bias_free_Q_0,
                args=(choice_this, reward_this),
                bounds=optimize.Bounds(fit_bounds[0], fit_bounds[1]),
                mutation=(0.5, 1),
                recombination=0.7,
                popsize=4,
                strategy='best1bin',
                disp=False,
                workers=1 if pool == '' else 8,  # For DE, `pool` only toggles the number of parallel workers; the pool object itself is not used
                updating='immediate' if pool == '' else 'deferred')
        else:
            fitting_result = optimize.minimize(
                negLL_slide_win_no_bias_free_Q_0,
                x0,
                args=(choice_this, reward_this),
                method='L-BFGS-B',
                bounds=optimize.Bounds(fit_bounds[0], fit_bounds[1]))

        # Save parameters
        learn_rate, softmax_temperature, Q_0_L, Q_0_R = fitting_result.x
        fitted_learn_rate[:, t] = learn_rate
        fitted_sigma[:, t] = softmax_temperature
        fitted_Q_0 = np.array([Q_0_L, Q_0_R])  # Note: only the current (and hence, on return, the last) window's Q_0 is kept

        # Simulate one step to get the first Q from this best fit as the initial value of the next window
        choice_0 = choice_this[0, 0]
        Q[choice_0, t] = fitted_Q_0[choice_0] + learn_rate * (
            reward_this[choice_0, 0] - fitted_Q_0[choice_0])  # Chosen side
        Q[1 - choice_0, t] = fitted_Q_0[1 - choice_0]  # Unchosen side

        choice_prob[:, t] = softmax(
            Q[:, t], softmax_temperature)  # Choice prob (just for validation)

    return fitted_learn_rate, fitted_sigma, fitted_Q_0, Q, choice_prob
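
The objective negLL_slide_win_no_bias_free_Q_0 is referenced above but not shown in these examples. Below is a hedged sketch, modeled on negLL_slide_win from Example #2, with the two Q_0 values as free parameters and no bias term; the actual implementation may differ. A softmax helper (e.g. the one sketched after Example #1) is assumed to be available.

import numpy as np

def negLL_slide_win_no_bias_free_Q_0(fit_value, *args):
    '''Sketch: negative log-likelihood for one sliding window, free Q_0, no bias term.'''
    choices, rewards = args
    learn_rate, softmax_temperature, Q_0_L, Q_0_R = fit_value
    Q_0 = np.array([Q_0_L, Q_0_R])

    trial_n_win = np.shape(choices)[1]
    Q_win = np.zeros(np.shape(rewards))
    choice_prob_win = np.zeros(np.shape(rewards))

    for t in range(trial_n_win):
        Q_old = Q_0 if t == 0 else Q_win[:, t - 1]
        c = choices[0, t]
        Q_win[c, t] = Q_old[c] + learn_rate * (rewards[c, t] - Q_old[c])   # Chosen side
        Q_win[1 - c, t] = Q_old[1 - c]                                     # Unchosen side
        choice_prob_win[:, t] = softmax(Q_win[:, t], softmax_temperature)  # No bias term

    likelihood_each_trial = choice_prob_win[choices[0, :], range(trial_n_win)]
    likelihood_each_trial = np.clip(likelihood_each_trial, 1e-16, 1)       # Avoid log(0)
    return -np.sum(np.log(likelihood_each_trial))
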