# NOTE(review): this chunk is a collapsed/truncated fragment of a larger
# training script. Names used but not defined here (defaultdict import,
# Tensor, env, model, memory, select_action, num_episodes, sample_period)
# come from the surrounding file. The inner loop's final `if done:` has no
# body visible in this chunk (presumably `break` — confirm in the full file).

# Maps a label -> list of values accumulated across episodes; entries are
# created lazily on first access (defaultdict(list)).
sigma_average_dict = defaultdict(list)
# Labels for the network's parameter groups (weights/biases of two hidden
# layers and the output layer) — presumably used as keys for
# sigma_average_dict or as plot legends; verify against the rest of the file.
components = ['W: first hidden', 'b: first hidden', 'W: second hidden', 'b: second hidden', \
              'W: output','b: output']

for i_episode in range(num_episodes):
    # Initialize the environment and state
    env.reset()
    # Wrap the raw state in a Tensor and add a leading batch dimension of 1.
    state = Tensor(env.get_state()).unsqueeze(0)
    score = 0
    # Episode is capped at 500 steps. `xrange` => this file targets Python 2.
    for t in xrange(500):
        # Select and perform an action
        if t % sample_period == 0:
            # Every sample_period steps, draw a fresh weight sample from the
            # model — model.sample() semantics are not visible here
            # (presumably a noisy/Bayesian network); confirm upstream.
            w_sample = model.sample()
        action = select_action(state)
        # env.do_action returns the scalar reward and a done flag;
        # action[0, 0] extracts the scalar action from the 1x1 batch tensor.
        reward, done = env.do_action(action[0, 0])
        score += reward
        # Re-wrap the reward as a 1-element Tensor for storage in the replay
        # memory.
        reward = Tensor([reward])

        # Observe new state
        if not done:
            next_state = Tensor(env.get_state()).unsqueeze(0)
        else:
            # Terminal transitions store None as the successor state, so the
            # learner can mask out the bootstrap target.
            next_state = None

        # Store the transition in memory
        memory.push(state, action, next_state, reward)

        # Move to the next state
        state = next_state

        if done: