def train(self, env, episodes, time_steps):
    stats = EpisodeStats(episode_lengths=np.zeros(episodes),
                         episode_rewards=np.zeros(episodes))
    for i_episode in range(1, episodes + 1):
        # Generate an episode.
        # An episode is an array of (state, action, reward) tuples
        s = env.reset()
        compounded_decay = 1  # gamma^t factor for the policy-gradient update
        for t in range(time_steps):
            a, log_prob_a = self.get_action(s)
            ns, r, d, _ = env.step(a)
            stats.episode_rewards[i_episode - 1] += r
            stats.episode_lengths[i_episode - 1] = t

            # One-step bootstrapped target; do not bootstrap past terminal states.
            target = r
            if not d:
                target = target + self._gamma * self._V(tt(ns)).cpu().detach()
            baseline = self._V(tt(s))
            advantage = target - baseline

            self._train_baseline(target, baseline)
            self._train_policy(advantage, compounded_decay, log_prob_a)
            compounded_decay *= self._gamma  # so that gamma^0 applies at t = 0

            if d:
                break
            s = ns
        print(
            f"{stats.episode_lengths[i_episode - 1]} Steps in Episode "
            f"{i_episode}/{episodes}. Reward {stats.episode_rewards[i_episode - 1]}"
        )
    return stats

def random_next_batch(self, batch_size):
    batch_indices = np.random.choice(len(self._data.states), batch_size)
    batch_states = np.array([self._data.states[i] for i in batch_indices])
    batch_actions = np.array([self._data.actions[i] for i in batch_indices])
    batch_next_state = np.array([self._data.next_states[i] for i in batch_indices])
    batch_rewards = np.array([self._data.rewards[i] for i in batch_indices])
    batch_terminal_flags = np.array([self._data.terminal_flags[i] for i in batch_indices])
    return (tt(batch_states), tt(batch_actions), tt(batch_next_state),
            tt(batch_rewards), tt(batch_terminal_flags))

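# Sketch: the per-field gather loops above can collapse into one fancy-indexing
# step when the buffer fields are NumPy arrays. The standalone signature and
# field names here are illustrative assumptions, not the actual buffer API.
import numpy as np

def random_next_batch_vectorized(states, actions, next_states, rewards,
                                 terminal_flags, batch_size):
    idx = np.random.choice(len(states), batch_size)
    return (states[idx], actions[idx], next_states[idx],
            rewards[idx], terminal_flags[idx])
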
def Q1(self, s, a):
    s = tt(s)
    a = tt(a)
    # Concatenate state and action; use the feature dimension for batched input.
    if len(s.shape) == 1:
        x = torch.cat((s, a))
    else:
        x = torch.cat((s, a), dim=1)
    q1 = self._Q1(x)
    return q1

def forward(self, x, mu, sigma):
    x = tt(x)
    mu = tt(mu)
    sigma = tt(sigma)
    self._mu = mu
    self._sigma = sigma
    # Gaussian probability density N(x; mu, sigma)
    p = 1 / (sigma * np.sqrt(2 * np.pi)) * torch.exp(
        (-1 / 2) * (torch.div(mu - x, sigma) ** 2))
    return p

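# Sanity check (sketch): the density computed above should agree with
# scipy.stats.norm.pdf. Pure torch/NumPy here; the tt() wrapper is bypassed.
import numpy as np
import torch
from scipy.stats import norm

x = torch.linspace(-3.0, 3.0, 7)
mu, sigma = torch.tensor(0.0), torch.tensor(1.0)
p = 1 / (sigma * np.sqrt(2 * np.pi)) * torch.exp(-0.5 * ((mu - x) / sigma) ** 2)
assert np.allclose(p.numpy(), norm.pdf(x.numpy(), loc=0.0, scale=1.0), atol=1e-6)
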
def forward(self, x, alpha, beta):
    x = tt(x)
    alpha = tt(alpha)
    beta = tt(beta)
    self._alpha = alpha
    self._beta = beta
    # Beta function B(alpha, beta), computed in log-space for numerical stability
    beta_ab = torch.exp(torch.lgamma(alpha) + torch.lgamma(beta)
                        - torch.lgamma(alpha + beta))
    p = (torch.pow(x, alpha - 1) * torch.pow(1 - x, beta - 1)) / beta_ab
    return p

def mode(self):
    alpha = self._alpha.detach().numpy()
    beta = self._beta.detach().numpy()
    mode = np.zeros(alpha.shape[0])
    indices = np.arange(0, mode.shape[0])

    # Interior mode for alpha > 1 and beta > 1
    idx = indices[(alpha > 1) & (beta > 1)]
    mode[idx] = (alpha[idx] - 1) / (alpha[idx] + beta[idx] - 2)

    # Uniform: every point is a mode, draw one at random
    idx = indices[(alpha == 1) & (beta == 1)]
    mode[idx] = np.random.uniform(0, 1, len(idx))

    # Bi-modal (U-shaped): density peaks at both boundaries, pick one
    idx = indices[(alpha < 1) & (beta < 1)]
    mode[idx] = np.random.choice([0, 1], len(idx))

    # Boundary modes
    idx = indices[(alpha <= 1) & (beta > 1)]
    mode[idx] = 0
    idx = indices[(alpha > 1) & (beta <= 1)]
    mode[idx] = 1
    return tt(mode)

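# Worked example (sketch) for the interior-mode branch above: Beta(2, 5) has
# mode (alpha - 1) / (alpha + beta - 2) = 1/5. Verified numerically via scipy.
import numpy as np
from scipy.stats import beta as beta_dist

a, b = 2.0, 5.0
analytic_mode = (a - 1) / (a + b - 2)  # = 0.2
grid = np.linspace(0.0, 1.0, 100001)
numeric_mode = grid[np.argmax(beta_dist.pdf(grid, a, b))]
assert abs(analytic_mode - numeric_mode) < 1e-3
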
def update(self, X_batch, y_batch):
    self.optimizer.zero_grad()
    y_batch_pred = self.net(tt(X_batch))
    loss = self.criterion(y_batch_pred, _y(y_batch))
    loss.backward()
    self.optimizer.step()
    return loss

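# Usage sketch for update(): a minimal minibatch epoch loop. The wrapper
# object, the _y target transform, and the array shapes are assumptions.
import numpy as np

def fit(model, X, y, batch_size=32, epochs=10):
    for _ in range(epochs):
        perm = np.random.permutation(len(X))
        for start in range(0, len(X), batch_size):
            idx = perm[start:start + batch_size]
            model.update(X[idx], y[idx])
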
def list_task(bdstoken):
    url = CLOUD_DL + '?bdstoken=' + bdstoken + \
        '&need_task_info=1&status=255&start=0&limit=100&method=list_task&app_id=250528&t=' + \
        utils.tt() + '&bdstoken=' + bdstoken + '&channel=chunlei&clienttype=0&web=1&app_id=250528'
    xml = fetch(url, {}, utils.myname(), {})
    j = json.loads(xml.decode("utf-8"))
    return j

def query_task(bdstoken, taskid):
    url = CLOUD_DL + "?bdstoken=" + bdstoken + "&task_ids=" + \
        taskid + "&op_type=1&method=query_task&app_id=250528&t=" + utils.tt() + \
        "&bdstoken=" + bdstoken + "&channel=chunlei&clienttype=0&web=1&app_id=250528"
    xml = fetch(url, {}, utils.myname(), {})
    j = json.loads(xml.decode("utf-8"))
    logger.debug("json: %s " % str(j))
    return (j, taskid)

def forward(self, x):
    if not isinstance(x, torch.Tensor):
        x = tt(x)
    x = self._fc1(x)
    return x

def get_action(self, s):
    mu_action = self._pi(tt(s))
    # Sample from a Gaussian policy around the predicted mean with a fixed
    # exploration std of 0.1, and take the log-probability of that sample.
    dist = torch.distributions.Normal(mu_action, 0.1)
    action_sampled = dist.sample()
    log_prob = dist.log_prob(action_sampled)
    action_sampled = np.clip(action_sampled.detach().numpy(), a_min=-1.0, a_max=1.0)
    return action_sampled, log_prob

def login_check(username, token):
    header = {
        'Host': PASSPORT_HOST,
        'Referer': PAN_INDEX,
    }
    cbs = utils.cbs_token()
    url = PASSPORT_API + '/?loginhistory&token=' + token + \
        '&tpl=netdisk&apiver=v3&tt=' + utils.tt() + '&username=' + username + \
        '&isphone=false&callback=' + cbs
    xml = fetch(url, {}, utils.myname(), header)
    xml = utils.fix_json(xml)
    return xml

def get_action(self, s):
    probs = self._pi(tt(s))
    action = np.random.choice(a=self._action_dim,
                              p=np.squeeze(probs.detach().numpy()))
    log_prob = torch.log(probs.squeeze(0)[action])
    # converting the discrete action [0,1,2,...] to an action in the
    # continuous range (actionspace.low <--> actionspace.high)
    if self.d2c:
        action = self.d2c(action)
    return action, log_prob

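# Sketch of a d2c mapping as assumed above: a uniform grid over the continuous
# action bounds. make_d2c and the bound values are illustrative assumptions.
import numpy as np

def make_d2c(low, high, n_actions):
    grid = np.linspace(low, high, n_actions)
    return lambda a: np.array([grid[a]])

d2c = make_d2c(-2.0, 2.0, 5)  # discrete action 0..4 -> value in [-2, 2]
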
def get_token():
    header = {
        'Host': PASSPORT_HOST,
        'Referer': PAN_INDEX,
    }
    cbs = utils.cbs_token()
    login_init = utils.tt()
    url = PASSPORT_API + '/?getapi&tpl=netdisk&apiver=v3&tt=' + \
        login_init + '&class=login&logintype=basicLogin&callback=' + cbs
    logger.debug('url:: %s ' % url)
    xml = fetch(url, {}, utils.myname(), header)
    xml = utils.fix_json(xml.decode('utf-8'))
    token = json.loads(xml)['data']['token']
    logger.debug("token:%s" % token)
    return token

def forward(self, x):
    if not isinstance(x, torch.Tensor):
        x = tt(x)
    # Hidden layers with optional non-linearity
    for i in range(len(self.layers) - 1):
        x = self.layers[i](x)
        if self._hidden_non_linearity is not None:
            x = self._hidden_non_linearity(x)
    # Output layer with optional non-linearity
    x = self.layers[-1](x)
    if self._output_non_linearity is not None:
        x = self._output_non_linearity(x)
    return x

def login(rsakey, pubkey, username, password, token):
    url = PASSPORT_API + '/?login'
    login_start = utils.tt()
    header = {
        'Host': PASSPORT_HOST,
        'Referer': PAN_INDEX,
        'Origin': PAN_INDEX,
        'Content-Type': 'application/x-www-form-urlencoded',
    }
    logger.debug("encrypted pw: %s " % RSA_encrypt(pubkey, password))
    post = {
        'apiver': 'v3',
        'callback': 'parent.' + utils.cbs_token(),
        'charset': 'utf-8',
        'codestring': '',
        'isPhone': 'false',
        'loginmerge': 'true',
        'logintype': 'basicLogin',
        'mem_pass': '******',
        'password': RSA_encrypt(pubkey, password),  # password,
        'ppui_logintime': str(random.randint(52000, 58535)),  # int(login_start)-int(login_init),
        'quick_user': '******',
        'safeflg': '0',
        'staticpage': 'http://pan.baidu.com/res/static/thirdparty/pass_v3_jump.html',
        'token': token,
        'tpl': 'netdisk',
        'tt': login_start,
        'u': PAN_INDEX,
        'username': username,
        'verifycode': '',
        'subpro': '',
        'logLoginType': 'pc_loginBasic',
        'crypttype': '12',
        'rsakey': rsakey,
        'idc:': '',
    }
    xml = fetch(url, post, utils.myname(), header).decode('utf-8')
    img = re.search('"(err_no=[^"]*)"', xml).group(1)
    import urllib.parse
    idict = dict(urllib.parse.parse_qsl(img))
    logger.debug("idict : %s" % idict)
    return (xml, idict)

def get_public_key(token):
    header = {
        'Host': PASSPORT_HOST,
        'Referer': PAN_INDEX,
    }
    cbs = utils.cbs_token()
    url = 'https://passport.baidu.com/v2/getpublickey?token=' + \
        token + '&tpl=netdisk&apiver=v3&tt=' + utils.tt() + '&callback=' + cbs
    xml = fetch(url, {}, utils.myname(), header).decode('utf-8')
    # Extract the JSONP payload and normalize quotes so it parses cleanly.
    keystr = re.search(r"\(([^)]*)\)", xml).group(1).replace("'", '"').replace('\t', '')
    logger.debug("key str:%s" % keystr)
    keydict = eval(keystr)
    logger.debug("keydict:%s" % keydict)
    rsakey = keydict['key']
    pubkey = keydict['pubkey']
    logger.debug("rsakey:%s" % rsakey)
    logger.debug("pubkey:%s" % pubkey)
    return (rsakey, pubkey)

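# Design note (sketch): after the quote normalization above, the payload is
# plain JSON, so json.loads could replace eval and avoid executing response
# text. The stand-in payload below is illustrative only.
import json

keystr = '{"key": "abc123", "pubkey": "-----BEGIN PUBLIC KEY-----..."}'
keydict = json.loads(keystr)
assert keydict['key'] == 'abc123'
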
def train(self, env, episodes, time_steps):
    stats = EpisodeStats(episode_lengths=np.zeros(episodes),
                         episode_rewards=np.zeros(episodes))
    for i_episode in range(1, episodes + 1):
        # Generate an episode.
        # An episode is an array of (state, action, reward) tuples
        episode = []
        s = env.reset()
        for t in range(time_steps):
            a, log_prob_a = self.get_action(s)
            ns, r, d, _ = env.step(a)
            stats.episode_rewards[i_episode - 1] += r
            stats.episode_lengths[i_episode - 1] = t
            episode.append((s, a, log_prob_a, r))
            if d:
                break
            s = ns

        # Walk the episode backwards, accumulating the discounted return G
        T = len(episode)
        G = 0.0
        for t in reversed(range(T)):
            s, a, log_prob, r = episode[t]
            G = self._gamma * G + r
            baseline = self._V(tt(s))
            advantage = G - baseline
            self._train_baseline(G, baseline)
            self._train_policy(advantage, t, log_prob)

        print("\r{} Steps in Episode {}/{}. Reward {}".format(
            len(episode), i_episode, episodes,
            sum(e[3] for e in episode)))
    return stats

def table_for_robustness(robustness_measure):
    global datasets
    data = df[df['robustness'] == robustness_measure]
    # each count should be 1
    # data.groupby(["metric", "dataset"])['value'].count()
    pivot = data.pivot_table(values="value", index="metric",
                             columns="dataset", aggfunc="first") \
        .rename_axis(None)
    pivot.columns = pivot.columns.astype(list)
    pivot = pivot.reset_index()  # .rename({"index": }, axis=1)
    column_format = "|p{40mm}|" + "|".join(
        "c" * (len(datasets))
        for experiment, datasets in experiment_datasets.items()) + "|"
    float_formatter = ffloat if robustness_measure != "RankInstability" else fffloat
    latex = pivot.to_latex(
        escape=False,
        index=False,
        # index_names=False,
        caption=robustness_measure + " of " + str(len(metrics)) + " metrics on "
                + str(len(datasets)) + " datasets (" + experiments_str + ")",
        label="tab:robustness-" + robustness_measure[4:].lower(),
        column_format=column_format,
        header=[small(bf(robustness_measure))] + [tiny(tt(col)) for col in datasets],
        formatters=[lambda v: small(v)] + [lambda v: small(ffloat(v))] * len(datasets),
    )
    latex = modify_tabular(latex, in_table=False, prefix="\\scalebox{1}{\n", postfix="\n}")
    return latex

def list_path(path, num, dry, bdstoken):
    logger.info("Listing path %s." % path)
    settings.DRY = dry
    header = {
        'Host': PAN_HOST,
        'Referer': DISK_HOME,
    }
    t = utils.tt()
    t2 = str(int(t) + 2)
    if path:
        _path = urllib.parse.urlencode({"dir": path})
    else:
        _path = urllib.parse.urlencode({"dir": '/'})
    url = PAN_INDEX + '/api/list?channel=chunlei&clienttype=0&web=1&num=' + \
        str(num) + '&t=' + t + '&page=1&' + _path + \
        '&showempty=0&order=time&desc=1&_=' + t2 + \
        '&bdstoken=' + bdstoken + "&app_id=250528"
    xml = fetch(url, {}, utils.myname(), header, path)
    list_json = json.loads(xml.decode("utf-8"))
    if list_json:
        return list_json
    else:
        return None

else: return ("%.0f" % n) + " " + name + "s" return ", ".join(filter(None, [comp(days, "day"), comp(hours, "hour"), comp(minutes, "min")])) df = pd.read_excel("perf_experiments.xlsx") \ .rename({"experiment": "Experiment"}, axis=1) df['Total CPU time'] = df['time_user'].apply(format_duration) df['n_graphs_total'] = df['n_datasets'] * df['n_graphs'] df['Avg CPU time per graph'] = (df['time_user'] / df['n_graphs_total']).apply(format_duration) cols = ['Experiment', 'Total CPU time', 'Avg CPU time per graph'] df = df[cols] df['Experiment'] = df['Experiment'].apply(lambda e: tt(e)) # %% with open("perf_experiments_table.tex", "w") as f: f.write("") latex = df.to_latex( index=False, escape=False, column_format="|l|r|r|", caption="CPU Computation time of the 3 experiments evaluated by \\graffs, run on the \\texttt{rio} computing cluster (see \\autoref{sec:computing_cluster}).\n" "\\textsl{Total CPU time} is the sum of all times of individual CPU cores spent on evaluating the experiment, " "and \\textsl{Avg CPU time per graph} is that divided by $(\\text{number of datasets}) \\times (\\text{number of graphs generated from each dataset})$.", label="tab:perf_experiments_table", ) latex = modify_tabular(latex, prefix="\scalebox{0.8}{\n", postfix="\n}")
def train(self):
    for i in range(self.n_episodes):
        state = self.env.reset()
        for step in range(self.time_steps):
            if self.render:
                self.env.render()
            state = tt(state)
            action = self.actor(state).cpu().detach().numpy()
            noise = np.random.normal(0, 0.1, size=self.env.action_space.shape[0])
            action = np.clip(action + noise,
                             self.env.action_space.low[0],
                             self.env.action_space.high[0])
            next_state, reward, done, _ = self.env.step(action)

            # Save step in memory
            self.replay_memory.append(state=state, action=action, reward=reward,
                                      next_state=next_state, done=done)
            res = {
                'episodes': i + 1,
                'states': state.tolist(),
                'rewards': reward,
                'steps': step + 1
            }

            # Start training, if batch size reached
            if len(self.replay_memory) < self.batch_size:
                self.res = self.res.append([res])
                continue

            # Sample batch from memory
            states, actions, rewards, next_states, dones = self.replay_memory.sample_batch()

            # Critic loss
            q1, q2 = self.critic(states, actions)
            # Target policy smoothing: add clipped noise to the target actions
            next_actions = self.actor_target(next_states)
            noise = tt(torch.Tensor(actions.cpu()).data.normal_(0, 0.2))
            noise = noise.clamp(-0.5, 0.5)
            next_actions = (next_actions + noise).clamp(
                self.env.action_space.low[0], self.env.action_space.high[0])

            # Get next state q values by Clipped Double Q-Learning;
            # do not bootstrap past terminal transitions
            q1_ns, q2_ns = self.critic_target(next_states, next_actions.detach())
            q_ns = torch.min(q1_ns, q2_ns)
            td_target = rewards + (1 - dones) * self.gamma * q_ns
            loss_critic = (self.critic_loss_fct(q1, td_target)
                           + self.critic_loss_fct(q2, td_target))
            res['critic_losses'] = float(loss_critic)

            # Optimize critic
            self.critic_optim.zero_grad()
            loss_critic.backward()
            self.critic_optim.step()

            # Delayed Policy Updates
            if step % self.pi_update_steps == 0:
                q1, _ = self.critic(states, self.actor(states))
                # Actor loss
                loss_actor = -q1.mean()
                res['actor_losses'] = float(loss_actor)

                # Optimize actor
                self.actor_optim.zero_grad()
                loss_actor.backward()
                self.actor_optim.step()

                # update target networks
                for param, target_param in zip(self.critic.parameters(),
                                               self.critic_target.parameters()):
                    target_param.data.copy_(self.tau * param.data
                                            + (1 - self.tau) * target_param.data)
                for param, target_param in zip(self.actor.parameters(),
                                               self.actor_target.parameters()):
                    target_param.data.copy_(self.tau * param.data
                                            + (1 - self.tau) * target_param.data)

            self.res = self.res.append([res])
            state = next_state
            if done:
                break

        logging.info(f'Episode {i + 1}:')
        logging.info(f'\t Steps: {self.res.loc[self.res["episodes"] == i + 1]["steps"].max()}')
        logging.info(f'\t Reward: {self.res.loc[self.res["episodes"] == i + 1]["rewards"].sum()}')
    self.env.close()
    return self.res

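# Sketch: the two Polyak-averaging loops above factored into one helper; the
# same pattern recurs in the DDPG loop below. Parameter names are assumptions.
def soft_update(target_net, source_net, tau):
    for target_param, param in zip(target_net.parameters(), source_net.parameters()):
        target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
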
def add_task(bdstoken, t_url, save_path, dia):
    header = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Host': PAN_HOST,
        'Referer': DISK_HOME,
    }
    url = CLOUD_DL + '?bdstoken=' + bdstoken + '&channel=chunlei&clienttype=0&web=1'
    post = {
        'method': 'add_task',
        'app_id': '250528',
        'source_url': t_url,
        'save_path': save_path,
        'type': '3',
    }
    xml = fetch(url, post, utils.myname(), header, save_path)
    j = json.loads(xml.decode("utf-8"))
    logger.debug("json: %s " % str(j))
    if 'error_code' in list(j.keys()):
        logger.info(j['error_msg'])
        if j['error_code'] != 36022:
            while 'vcode' in list(j.keys()):
                vcode = j['vcode']
                logger.info(vcode)
                imgurl = j['img']
                #f=open(vimg,"wb")
                #fp = fetch(imgurl,{},"Input Vcode")
                #f.write(fp)
                #f.close()
                #try:
                #    subprocess.Popen(['xdg-open', vimg])
                #except:
                #    print("please open file %s to check the vcode."%vimg)
                #mag = re.search('(&.*$)',t_url).group(1)
                #task_name = dict(urllib.parse.parse_qsl(mag))['dn']
                #logger.info("Please input vcode for task: %s ."%(task_name))
                vd = VcodeDialog(dia, imgurl)
                vd.new_url(imgurl)
                response = vd.run()
                print(response)
                if response == 22:
                    print("The OK button was clicked")
                    vf = vd.get_user_input()
                    vd.destroy()
                elif response == Gtk.ResponseType.DELETE_EVENT:
                    vd.destroy()
                #input("verification code # ").strip()
                add = {
                    'file_sha1': '',
                    'selected_idx': '1,2,3,4',
                    'task_from': '0',
                    't': utils.tt(),
                    'type': 4,
                    'input': vf,
                    'vcode': vcode,
                }
                print(add)
                post.update(add)
                xml = fetch(url, post, "TryWithVcode", header, save_path)
                j = json.loads(xml.decode("utf-8"))
                logger.debug("json: %s " % str(j))
                if 'error_code' in list(j.keys()):
                    logger.info(j['error_msg'])
            return j
        else:
            return j['error_msg']
    logger.debug("json: %s " % str(j))
    return j

def _train_baseline(self, G, baseline):
    self._V_optimizer.zero_grad()
    loss = self._loss_function(tt(np.array([G])), baseline)
    # retain_graph=True: the baseline value is reused in the policy update
    loss.backward(retain_graph=True)
    self._V_optimizer.step()

def predict(self, X):
    output = self.net(tt(X)).detach()
    return output

def get_action(self, x, epsilon):
    # Epsilon-greedy: act greedily w.r.t. the Q-network with prob. 1 - epsilon
    u = np.argmax(self._q(tt(x)).cpu().detach().numpy())
    r = np.random.uniform()
    if r < epsilon:
        return np.random.randint(self._action_dim)
    return u

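# Usage sketch: a linearly annealed epsilon schedule commonly paired with the
# epsilon-greedy policy above. All constants here are illustrative assumptions.
def linear_epsilon(step, eps_start=1.0, eps_end=0.05, decay_steps=10000):
    frac = min(step / decay_steps, 1.0)
    return eps_start + frac * (eps_end - eps_start)
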
def train(self):
    for i in range(self.n_episodes):
        steps = 0
        state = self.env.reset()
        for step in range(self.time_steps):
            if self.render:
                self.env.render()
            state = tt(state)
            action = self.actor(state).detach().numpy()

            # Exploration
            p = np.random.random()
            if p < self.eps:
                action = np.random.uniform(low=-1, high=1, size=(1,))

            # Do one step in env
            next_state, reward, done, _ = self.env.step(action)
            res = {
                'episodes': i + 1,
                'states': state.tolist(),
                'rewards': reward,
                'steps': step + 1
            }

            # Save step in memory
            self.replay_memory.append(state=state, action=action, reward=reward,
                                      next_state=next_state, done=done)

            # Start training, if batch size reached
            if len(self.replay_memory) < self.batch_size:
                continue

            # Sample batch from memory
            states, actions, rewards, next_states, dones = self.replay_memory.sample_batch()

            # Critic loss; do not bootstrap past terminal transitions
            q_values = self.critic(states, actions)
            next_actions = self.actor_target(next_states)
            q_values_ns = self.critic_target(next_states, next_actions.detach())
            td_target = rewards + (1 - dones) * self.gamma * q_values_ns
            loss_critic = self.critic_loss_fct(q_values, td_target)

            # Actor loss
            loss_actor = -(self.critic(states, self.actor(states)).mean())

            # Optimize actor
            self.actor_optim.zero_grad()
            loss_actor.backward()
            self.actor_optim.step()

            # Optimize critic
            self.critic_optim.zero_grad()
            loss_critic.backward()
            self.critic_optim.step()

            # update target networks
            for target_param, param in zip(self.actor_target.parameters(),
                                           self.actor.parameters()):
                target_param.data.copy_(param.data * self.tau
                                        + target_param.data * (1.0 - self.tau))
            for target_param, param in zip(self.critic_target.parameters(),
                                           self.critic.parameters()):
                target_param.data.copy_(param.data * self.tau
                                        + target_param.data * (1.0 - self.tau))

            self.res = self.res.append([res])
            state = next_state
            steps += 1
            if done:
                break

        logging.info(f'Episode {i + 1}:')
        logging.info(f'\t Steps: {self.res.loc[self.res["episodes"] == i + 1]["steps"].max()}')
        logging.info(f'\t Reward: {self.res.loc[self.res["episodes"] == i + 1]["rewards"].sum()}')
    self.env.close()
    return self.res

def train(self, env, episodes, time_steps, initial_state=None, initial_noise=0.5):
    stats = EpisodeStats(episode_lengths=np.zeros(episodes),
                         episode_rewards=np.zeros(episodes),
                         episode_loss=np.zeros(episodes))
    self._run += 1
    for e in range(episodes):
        # Generate an episode.
        # An episode is an array of (state, action, reward) tuples
        episode = []
        s = env.reset(initial_state=initial_state, noise_amplitude=initial_noise)
        total_r = 0
        for t in range(time_steps):
            a = self._get_action(s)
            ns, r, d, _ = env.step(tn(self._action_fun.act2env(a)))
            stats.episode_rewards[e] += r
            stats.episode_lengths[e] = t
            episode.append((s, a, r))
            total_r += r
            if d:
                break
            s = ns

        gamma_t = 1
        for t in range(len(episode)):
            # Find the first occurrence of the state in the episode
            s, a, r = episode[t]

            # Discounted return from step t onwards
            g = 0
            gamma_kt = 1
            for k in range(t, len(episode)):
                gamma_kt = gamma_kt * self._gamma
                _, _, r_k = episode[k]
                g = g + (gamma_kt * r_k)
            g = float(g)

            p = self._pi(s, a)
            # For numerical stability: clamp so we never exceed probability one
            # (e.g. a delta distribution) and never take the log of 0 in the
            # score function
            eps = 1e-8
            p = p.clamp(eps, 1)
            log_p = torch.log(p)
            gamma_t = gamma_t * self._gamma

            if self._baseline:
                bl = self.baseline_fun(s)
                delta = g - bl
                bl_loss = self._bl_loss_function(self.baseline_fun(s), tt([g]))
                self._bl_optimizer.zero_grad()
                bl_loss.backward()
                self._bl_optimizer.step()
                score_fun = torch.mean(-(gamma_t * delta) * log_p)
            else:
                score_fun = torch.mean(-(gamma_t * g) * log_p)
            stats.episode_loss[e] += score_fun.item()

            self._pi_optimizer.zero_grad()
            score_fun.backward()
            self._pi_optimizer.step()

        pr_stats = {
            'run': self._run,
            'steps': int(stats.episode_lengths[e] + 1),
            'episode': e + 1,
            'episodes': episodes,
            'reward': stats.episode_rewards[e],
            'loss': stats.episode_loss[e]
        }
        print_stats(pr_stats)
    return stats

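# Sketch: the O(T^2) inner return loop above can be replaced by one backward
# pass, mirroring the Monte-Carlo variant earlier in this file. Standalone
# helper with an assumed rewards-only input.
def discounted_returns(rewards, gamma):
    G, out = 0.0, []
    for r in reversed(rewards):
        G = gamma * G + r
        out.append(G)
    return out[::-1]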