def __init__(self, epsilon=0.85, epsilon_increment=0, epsilon_max=0.85, discount=0.95,
             network=NetworkTypes.DRQN, layers=[256, 128], learning_rate=1e-3,
             replace_target_iter=2000, batch_size=100):
    self.name = 'QAgent'
    self.n_actions = 3
    self.n_features = 70
    self.epsilon_max = epsilon_max
    self.epsilon = epsilon
    self.epsilon_backup = epsilon
    self.epsilon_increment = epsilon_increment
    self.last_state = None
    self.action = None
    self.reward = None
    self.state = None
    self.terminal = None
    self.network = network
    # create q learning algorithm
    if network == NetworkTypes.DQN:
        self.q_learning = DQN(self.n_actions, self.n_features, layers, learning_rate,
                              batch_size, replace_target_iter, discount)
    elif network == NetworkTypes.DRQN:
        self.q_learning = DRQN(self.n_actions, self.n_features, layers, learning_rate,
                               batch_size, replace_target_iter, discount)
    else:
        raise ValueError("Not implemented type of network passed to QAgent")
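# Illustrative sketch (not from the original source): one common reading of the epsilon /
# epsilon_increment / epsilon_max fields above -- and it is only an assumption here -- is that
# epsilon is the greedy-action probability, raised by epsilon_increment toward epsilon_max
# after each learning step so that exploration shrinks over time. The starting value and step
# count below are arbitrary.
epsilon, epsilon_increment, epsilon_max = 0.5, 0.01, 0.85
for step in range(50):
    epsilon = min(epsilon + epsilon_increment, epsilon_max)
print(epsilon)  # 0.85 once the schedule saturates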
class MDP:
    def __init__(self, args):
        self.args = args
        self.ACTIONS = ['left', 'right', 'forward', 'backward', 'up', 'down']  # 'open', 'close']
        self.P_START = 0.999
        self.P_END = 0.05
        self.P_DECAY = 500
        self.max_iter = args.max_iter
        self.gripping_force = args.grip_force
        self.breaking_threshold = args.break_thresh
        # Prepare the drawing figure
        fig, (ax1, ax2) = plt.subplots(1, 2)
        self.figure = (fig, ax1, ax2)

    # Function to select an action from our policy or a random one
    def select_action(self, state):
        sample = random.random()
        p_threshold = self.P_END + (self.P_START - self.P_END) * math.exp(
            -1. * self.steps_done / self.P_DECAY)
        self.steps_done += 1
        if sample > p_threshold:
            with torch.no_grad():
                # t.max(1) will return largest column value of each row.
                # second column on max result is index of where max element was
                # found, so we pick action with the larger expected reward.
                self.policy_net_1.eval()
                torch_state = torch.from_numpy(state).float().to(self.args.device)
                action = self.policy_net_1(torch_state.unsqueeze(0)).max(1)[1]
                self.policy_net_1.train()
                return action.item()
        else:
            return random.randrange(self.args.outdim)

    def optimize_model(self):
        args = self.args
        if len(self.memory) < args.batch_size:
            return
        transitions = self.memory.sample(args.batch_size)
        state_batch, action_batch, reward_batch, nextstate_batch = [], [], [], []
        for transition in transitions:
            state_batch.append(transition.state)
            action_batch.append(transition.action)
            reward_batch.append(transition.reward)
            nextstate_batch.append(transition.next_state)
        state_batch = torch.from_numpy(np.array(state_batch)).float().to(args.device)
        action_batch = torch.from_numpy(np.array(action_batch)).to(args.device).unsqueeze(1)
        reward_batch = torch.from_numpy(np.array(reward_batch)).float().to(args.device).unsqueeze(1)
        non_final_mask = torch.tensor(
            tuple(map(lambda s: s is not None, nextstate_batch)),
            device=args.device, dtype=torch.bool).unsqueeze(1)
        non_final_next_states = torch.cat([
            torch.from_numpy(s).float().to(args.device).unsqueeze(0)
            for s in nextstate_batch if s is not None
        ])
        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken. These are the actions which would've been taken
        # for each batch state according to policy_net
        state_action_values_1 = self.policy_net_1(state_batch).gather(1, action_batch)
        state_action_values_2 = self.policy_net_2(state_batch).gather(1, action_batch)
        state_action_values_3 = self.policy_net_3(state_batch).gather(1, action_batch)
        # Compute V(s_{t+1}) for all next states.
        # Expected values of actions for non_final_next_states are computed based
        # on the "older" target_net; selecting their best reward with max(1)[0].
        # This is merged based on the mask, such that we'll have either the expected
        # state value or 0 in case the state was final.
        next_state_values_1 = torch.zeros((args.batch_size, 1), device=args.device)
        next_state_values_2 = torch.zeros((args.batch_size, 1), device=args.device)
        next_state_values_3 = torch.zeros((args.batch_size, 1), device=args.device)
        next_state_values_1[non_final_mask] = self.policy_net_1(non_final_next_states).max(1)[0].detach()
        next_state_values_2[non_final_mask] = self.policy_net_2(non_final_next_states).max(1)[0].detach()
        next_state_values_3[non_final_mask] = self.policy_net_3(non_final_next_states).max(1)[0].detach()
        next_state_values = torch.min(
            torch.min(next_state_values_1, next_state_values_2), next_state_values_3)
        # Compute the expected Q values
        expected_state_action_values = (next_state_values * args.gamma) + reward_batch
        # Compute Huber loss
        loss_1 = F.smooth_l1_loss(state_action_values_1, expected_state_action_values)
        loss_2 = F.smooth_l1_loss(state_action_values_2, expected_state_action_values)
        loss_3 = F.smooth_l1_loss(state_action_values_3, expected_state_action_values)
        # Optimize the model
        self.optimizer_1.zero_grad()
        self.optimizer_2.zero_grad()
        self.optimizer_3.zero_grad()
        loss_1.backward()
        loss_2.backward()
        loss_3.backward()
        for param in self.policy_net_1.parameters():
            param.grad.data.clamp_(-1, 1)
        for param in self.policy_net_2.parameters():
            param.grad.data.clamp_(-1, 1)
        for param in self.policy_net_3.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer_1.step()
        self.optimizer_2.step()
        self.optimizer_3.step()
        return [loss_1, loss_2, loss_3]

    def train_MDP(self):
        args = self.args
        # Create the output directory if it does not exist
        if not os.path.isdir(args.output_dir):
            os.makedirs(args.output_dir)
        # Create our policy net and a target net
        self.policy_net_1 = DQN(args.indim, args.outdim).to(args.device)
        self.policy_net_2 = DQN(args.indim, args.outdim).to(args.device)
        self.policy_net_3 = DQN(args.indim, args.outdim).to(args.device)
        self.target_net = DQN(args.indim, args.outdim).to(args.device)
        self.target_net.load_state_dict(self.policy_net_1.state_dict())
        self.target_net.eval()
        # Set up the optimizer
        self.optimizer_1 = optim.RMSprop(self.policy_net_1.parameters(), args.lr)
        self.optimizer_2 = optim.RMSprop(self.policy_net_2.parameters(), args.lr)
        self.optimizer_3 = optim.RMSprop(self.policy_net_3.parameters(), args.lr)
        self.memory = ReplayMemory(500000)
        self.steps_done = 0
        # Setup the state normalizer
        normalizer = Normalizer(args.indim, device=args.device)
        print_variables = {'durations': [], 'rewards': [], 'loss': []}
        # Load old checkpoint if provided
        start_episode = 0
        if args.checkpoint_file:
            if os.path.exists(args.checkpoint_file):
                checkpoint = torch.load(args.checkpoint_file)
                self.policy_net_1.load_state_dict(checkpoint['model_state_dict'])
                self.policy_net_2.load_state_dict(checkpoint['model_state_dict'])
                self.policy_net_3.load_state_dict(checkpoint['model_state_dict'])
                self.target_net.load_state_dict(checkpoint['model_state_dict'])
                start_episode = checkpoint['epoch']
                self.steps_done = start_episode
                self.optimizer_1.load_state_dict(checkpoint['optimizer_state_dict'])
                self.optimizer_2.load_state_dict(checkpoint['optimizer_state_dict'])
                self.optimizer_3.load_state_dict(checkpoint['optimizer_state_dict'])
                with open(os.path.join(os.path.dirname(args.checkpoint_file),
                                       'results_geom_mdp.pkl'), 'rb') as file:
                    plot_dict = pickle.load(file)
                    print_variables['durations'] = plot_dict['durations']
                    print_variables['rewards'] = plot_dict['rewards']
        if args.normalizer_file:
            if os.path.exists(args.normalizer_file):
                normalizer.restore_state(args.normalizer_file)
        action_space = ActionSpace(dp=0.06, df=10)
        # Main training loop
        for ii in range(start_episode, args.epochs):
            start_time = time.time()
            if args.sim:
                # Create robot, reset simulation and grasp handle
                model, model_params = init_model(args.model_path)
                sim = MjSim(model)
                sim.step()
                viewer = None
                if args.render:
                    viewer = MjViewer(sim)
                else:
                    viewer = None
                sim_param = SimParameter(sim)
                robot = RobotSim(sim, viewer, sim_param, args.render, self.breaking_threshold)
                robot.reset_simulation()
                ret = robot.grasp_handle()
                if not ret:
                    continue
            # Get current state
            state_space = Observation(
                robot.get_gripper_jpos(),
                robot.get_shear_buffer(args.hap_sample),
                robot.get_all_touch_buffer(args.hap_sample))
            broken_so_far = 0
            for t in count():
                if not args.quiet and t % 20 == 0:
                    print("Running training episode: {}, iteration: {}".format(ii, t))
                # Select action
                state = state_space.get_state()
                if args.position:
                    state = state[6:]
                if args.shear:
                    indices = np.ones(len(state), dtype=bool)
                    indices[6:166] = False
                    state = state[indices]
                if args.force:
                    state = state[:166]
                normalizer.observe(state)
                state = normalizer.normalize(state)
                action = self.select_action(state)
                # Perform action
                delta = action_space.get_action(self.ACTIONS[action])['delta'][:3]
                target_position = np.add(state_space.get_current_position(), np.array(delta))
                target_pose = np.hstack((target_position, robot.get_gripper_jpos()[3:]))
                if args.sim:
                    robot.move_joint(target_pose, True, self.gripping_force,
                                     hap_sample=args.hap_sample)
                # Get reward
                done, num = robot.update_tendons()
                failure = robot.check_slippage()
                if num > broken_so_far:
                    reward = num - broken_so_far
                    broken_so_far = num
                else:
                    reward = 0
                # # Add a movement reward
                # reward -= 0.1 * np.linalg.norm(target_position - robot.get_gripper_jpos()[:3]) / np.linalg.norm(delta)
                # Observe new state
                state_space.update(
                    robot.get_gripper_jpos(),
                    robot.get_shear_buffer(args.hap_sample),
                    robot.get_all_touch_buffer(args.hap_sample))
                # Set max number of iterations
                if t >= self.max_iter:
                    done = True
                # Check if done
                if not done and not failure:
                    next_state = state_space.get_state()
                    if args.position:
                        next_state = next_state[6:]
                    if args.shear:
                        indices = np.ones(len(next_state), dtype=bool)
                        indices[6:166] = False
                        next_state = next_state[indices]
                    if args.force:
                        next_state = next_state[:166]
                    normalizer.observe(next_state)
                    next_state = normalizer.normalize(next_state)
                else:
                    next_state = None
                # Push new Transition into memory
                self.memory.push(state, action, next_state, reward)
                # Optimize the model
                loss = self.optimize_model()
                # if loss:
                #     print_variables['loss'].append(loss.item())
                # If we are done, reset the model
                if done or failure:
                    if failure:
                        print_variables['durations'].append(self.max_iter)
                    else:
                        print_variables['durations'].append(t)
                    print_variables['rewards'].append(broken_so_far)
                    plot_variables(self.figure, print_variables, 'Training MDP')
                    print("Model parameters: {}".format(model_params))
                    print("Epoch {} took {}s, total number broken: {}\n\n".format(
                        ii, time.time() - start_time, broken_so_far))
                    break
            # Update the target network, every x iterations
            if ii % 10 == 0:
                self.target_net.load_state_dict(self.policy_net_1.state_dict())
            # Save checkpoints every few iterations
            if ii % args.save_freq == 0:
                save_path = os.path.join(args.output_dir, 'checkpoint_model_' + str(ii) + '.pth')
                torch.save({
                    'epoch': ii,
                    'model_state_dict': self.target_net.state_dict(),
                    'optimizer_state_dict': self.optimizer_1.state_dict(),
                }, save_path)
                # Save normalizer state for inference
                normalizer.save_state(os.path.join(args.output_dir, 'normalizer_state.pickle'))
            if args.savefig_path:
                now = dt.datetime.now()
                self.figure[0].savefig(
                    args.savefig_path + '{}_{}_{}'.format(now.month, now.day, now.hour),
                    format='png')
        print('Training done')
        plt.show()
        return print_variables
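# Standalone sketch (not part of the original source): the clipped target used in
# optimize_model above takes the elementwise minimum over three independent Q-estimates
# before bootstrapping, which damps overestimation bias. Batch size, state dimension,
# action count and the stand-in linear networks below are illustrative assumptions.
import torch
import torch.nn as nn

batch_size, state_dim, n_actions, gamma = 4, 8, 6, 0.99
nets = [nn.Linear(state_dim, n_actions) for _ in range(3)]   # stand-ins for policy_net_1..3
next_states = torch.randn(batch_size, state_dim)
rewards = torch.randn(batch_size, 1)

with torch.no_grad():
    # max_a Q_i(s', a) for each of the three estimators, shape (batch, 1)
    next_values = [net(next_states).max(1, keepdim=True)[0] for net in nets]
    # elementwise minimum across the three estimates (clipped bootstrap value)
    clipped_next_value = torch.min(torch.min(next_values[0], next_values[1]), next_values[2])
    target = rewards + gamma * clipped_next_value   # analogue of expected_state_action_values
print(target.shape)  # torch.Size([4, 1])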
class QAgent(BaseAgent):
    def __init__(self, config, environment):
        super(QAgent, self).__init__(config, environment)
        #self.history = History(config)
        # self.replay_memory = DQNReplayMemory(config)
        self.net = DQN(len(environment.n_actions), config)
        self.net.build()
        self.net.add_summary(["average_reward", "average_loss", "average_q", "ep_max_reward",
                              "ep_min_reward", "ep_num_game", "learning_rate"],
                             ["ep_rewards", "ep_actions"])
        self.account_profit_loss = 0.
        self.forecast_window = config.forecast_window
        self.close_attempts = 0
        self.q_learning_rate = 0.01
        self.policy = self.make_epsilon_greedy_policy(self.QFunc, self.epsilon,
                                                      len(self.env.n_actions))

    def qcalc(self, l, s):
        r = 0
        for i in range(len(s)):
            r = r + l**i * s[i]
        return r

    def QFunc(self, s, theta):
        #print('s:', s)
        #print('theta:', theta)
        l = np.einsum('ji,i->j', np.transpose(theta), s)
        lmax = np.absolute(np.array(l)).max()
        if lmax != 0.:
            l = np.array(l) / lmax
        #print('l:', l)
        # hard code l to test:
        # l[0] = 0.7
        # l[1] = 0.9
        # l[4] = 0.9
        # l[2] = 0.6
        # l[3] = 0.6
        # l[5] = 0.5
        # l[6] = 0.5
        q0 = -0.5 + l[0]                      # -self.qcalc(l[0], s)
        q1 = self.qcalc(l[1], s)              # buy open
        q2 = 0.5 + -1 * self.qcalc(l[2], s)   # sell close
        q3 = -0.5 + self.qcalc(l[3], s)       # hold long
        q4 = -1 * self.qcalc(l[4], s)         # sell open
        q5 = -0.5 + self.qcalc(l[5], s)       # buy close
        q6 = 0.5 + -1 * self.qcalc(l[6], s)   # hold short
        q = [q0, q1, q2, q3, q4, q5, q6]
        n = len(self.env.n_actions)
        actions = self.env.get_valid_actions()
        onehot = np.eye(n) * [1 if i in actions else 0 for i in range(n)]
        q = np.matmul(onehot, q)
        print('S:', s)
        print('Q:', q)
        return q

    def norm_it(self, d):
        x = np.diff(d.flatten())
        return x / np.linalg.norm(x)

    def get_state(self, data):
        # Use a Kalman filter to estimate the true value/price per minute from the
        # observed open, high, low, close prices
        d = np.array([data.get_t_data(-i)[2:6] for i in range(self.config.observation_window)])
        mean = d.mean()
        kf = KalmanFilter(initial_state_mean=mean, n_dim_obs=4)
        v = kf.em(d)
        h = self.norm_it(v.smooth(d)[0])
        return h

    def make_epsilon_greedy_policy(self, Q, epsilon, nA):
        """
        Creates an epsilon-greedy policy based on a given Q-function and epsilon.

        Args:
            Q: A function that returns a numpy array of length nA (see below)
            epsilon: The probability to select a random action. Float between 0 and 1.
            nA: Number of actions in the environment.

        Returns:
            A function that takes the observation as an argument and returns the
            probabilities for each action in the form of a numpy array of length nA.
        """
        def policy_fn(observation, theta):
            A = np.ones(nA, dtype=float) * epsilon / nA
            best_action = np.argmax(Q(observation, theta))
            print("action called:", self.env.action_labels[best_action])
            A[best_action] += (1.0 - epsilon)
            return A
        return policy_fn

    def train(self, steps, eventSource):
        render = False
        self.env.random_past_day()
        num_game, self.update_count, ep_reward = 0, 0, 0.
        total_reward, self.total_loss, self.total_q = 0., 0., 0.
        ep_rewards, actions = [], []
        t = 0
        self.theta = np.zeros([self.config.observation_window - 1, len(self.env.n_actions)],
                              dtype=np.float32)
        self.zetha = np.ones([self.config.observation_window - 1, len(self.env.n_actions)],
                             dtype=np.float32)
        self.discount = 0.99
        s = self.get_state(self.env.data)
        action_probs = self.policy(s, self.theta)
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        while action not in self.env.get_valid_actions():
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        for self.i in tqdm(range(self.i, steps)):
            # End-of-day logic:
            # If it is close to the end of day, we need to try to close out our position on good
            # terms rather than wait to be forced to close at the end of day, which is a hard rule
            # of this algorithm. The logic: 10 allowances once within self.forecast_window minutes
            # of the closing minute; close on any chance during the first nine. If the predicted
            # action agrees (i.e. it closes the position), execute it and stay neutral for the rest
            # of the day; otherwise forcefully close the position on the 10th allowance and then
            # stay neutral.
            grace_period = timedelta(minutes=15)
            end_time = datetime.strptime("16:00", '%H:%M')
            # print(end_time, self.time, end_time - self.time)
            if end_time - self.env.time < grace_period:
                self.close_attempts += 1
                if (self.env.position > 0.):  # a long position
                    if action != self.env.action_labels.index("sell_close"):  # close long
                        if self.close_attempts > 10:
                            action = self.env.action_labels.index("sell_close")
                if (self.env.position < 0.):  # a short position
                    if action != self.env.action_labels.index("buy_close"):  # close a short
                        if self.close_attempts > 10:
                            action = self.env.action_labels.index("buy_close")
                if (self.env.position == 0.):
                    action = self.env.action_labels.index("stay_neutral")
                if self.close_attempts > 10:
                    if (self.env.position > 0.):  # a long position
                        action = self.env.action_labels.index("sell_close")  # close long
                    if (self.env.position < 0.):  # a short position
                        action = self.env.action_labels.index("buy_close")  # close short
            # Beginning-of-day logic: don't trade the first fifteen minutes
            fifteen_minute = timedelta(minutes=15)
            start_time = datetime.strptime("9:30", '%H:%M')
            # print(end_time, self.time, end_time - self.time)
            if self.env.time - start_time <= fifteen_minute:
                action = self.env.action_labels.index("stay_neutral")
            self.env.step(action)
            print('action taken:', self.env.action_labels[action])
            s_prime = self.get_state(self.env.data)
            if action not in (self.env.action_labels.index("stay_neutral"),
                              self.env.action_labels.index("hold_long"),
                              self.env.action_labels.index("hold_short")):
                if action in (self.env.action_labels.index("buy_open"),
                              self.env.action_labels.index("sell_open")):
                    pl = -self.env.open_cost
                if action == self.env.action_labels.index("sell_close"):
                    pl = self.env.unit * (self.env.current_price - self.env.order_price) - self.env.open_cost
                if action == self.env.action_labels.index("buy_close"):
                    pl = -1 * self.env.unit * (self.env.current_price - self.env.order_price) - self.env.open_cost
                self.log_trade(self.env.action_labels[action], self.env.today, self.env.time,
                               self.env.unit, self.env.order_price, self.env.current_price, pl)
                self.account_profit_loss += pl
                forecasts = np.zeros(self.forecast_window, dtype=np.float32)
                forecast_history = np.zeros(self.forecast_window, dtype=np.float32)
                if render:
                    sleep(0.2)
                    eventSource.data_signal.emit(self.env.data, self.env.position,
                                                 self.account_profit_loss, forecasts,
                                                 forecast_history)
            delta = self.env.reward + self.QFunc(s_prime, self.theta)[action] - self.QFunc(s, self.theta)[action]
            #print('delta:', delta)
            # a = np.zeros([len(self.env.n_actions)])
            # a[action] = self.q_learning_rate
            self.theta = self.theta + self.q_learning_rate * delta * self.zetha
            #print('theta 1:', self.theta)
            rsum = np.absolute(self.theta).sum(axis=1)
            rsum[rsum == 0] = 1
            self.theta = self.theta / rsum[:, None]
            #print('theta 2:', self.theta)
            self.zetha = self.discount * self.zetha + np.array(s)[:, None]
            #print('zetha:', self.zetha)
            if action in (self.env.action_labels.index("stay_neutral"),
                          self.env.action_labels.index("sell_close"),
                          self.env.action_labels.index("buy_close")):
                # reset eligibility trace
                self.zetha = np.ones([self.config.observation_window - 1, len(self.env.n_actions)],
                                     dtype=np.float32)
            action_probs = self.policy(s_prime, self.theta)
            #print(action_probs)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            while action not in self.env.get_valid_actions():
                action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            if self.env.terminal:
                t = 0
                self.close_attempts = 0
                self.env.random_past_day()
                num_game += 1
                ep_rewards.append(ep_reward)
                ep_reward = 0.
            else:
                ep_reward += self.env.reward
                t += 1
            actions.append(action)
            total_reward += self.env.reward
            if self.i >= self.config.train_start:
                if self.i % self.config.test_step == self.config.test_step - 1:
                    avg_reward = total_reward / self.config.test_step
                    avg_loss = self.total_loss / self.config.test_step
                    avg_q = self.total_q / self.config.test_step
                    try:
                        max_ep_reward = np.max(ep_rewards)
                        min_ep_reward = np.min(ep_rewards)
                        avg_ep_reward = np.mean(ep_rewards)
                    except:
                        max_ep_reward, min_ep_reward, avg_ep_reward = 0, 0, 0
                    sum_dict = {
                        'average_reward': avg_reward,
                        'average_loss': avg_loss,
                        'average_q': avg_q,
                        'ep_max_reward': max_ep_reward,
                        'ep_min_reward': min_ep_reward,
                        'ep_num_game': num_game,
                        'learning_rate': self.net.learning_rate,
                        'ep_rewards': ep_rewards,
                        'ep_actions': actions
                    }
                    print('log to tensorboard at:', self.i)
                    self.net.inject_summary(sum_dict, self.i)
                    num_game = 0
                    total_reward = 0.
                    self.total_loss = 0.
                    self.total_q = 0.
                    self.update_count = 0
                    ep_reward = 0.
                    ep_rewards = []
                    actions = []
            if self.i % 500000 == 0 and self.i > 0:
                j = 0
                print('Saving the parameters at:', self.i)
                self.save()
            if self.i % 100000 == 0:
                j = 0
                render = True
            if render:
                #self.env_wrapper.env.render()
                j += 1
                if j == 1000:
                    render = False

    def play(self, episodes, eventSource):
        #self.net.restore_session()
        self.env.random_past_day()
        i = 0
        #for _ in range(self.config.history_len):
        #    self.history.add(self.env.data)
        episode_steps = 0
        while i < episodes:
            s = self.get_state(self.env.data)
            action_probs = self.policy(s, self.theta)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            while action not in self.env.get_valid_actions():
                action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            self.env.step(action)
            # End-of-day logic: same grace-period rules as in train() above.
            grace_period = timedelta(minutes=15)
            end_time = datetime.strptime("16:00", '%H:%M')
            # print(end_time, self.time, end_time - self.time)
            if end_time - self.env.time < grace_period:
                if (self.env.position > 0.):  # a long position
                    if action != self.env.action_labels.index("sell_close"):  # close long
                        self.close_attempts += 1
                        if self.close_attempts > 10:
                            action = self.env.action_labels.index("sell_close")
                if (self.env.position < 0.):  # a short position
                    if action != self.env.action_labels.index("buy_close"):  # close a short
                        if self.close_attempts > 10:
                            action = self.env.action_labels.index("buy_close")
                if (self.env.position == 0.):
                    action = self.env.action_labels.index("stay_neutral")
                if self.close_attempts > 10:
                    if (self.env.position > 0.):  # a long position
                        action = self.env.action_labels.index("sell_close")  # close long
                    if (self.env.position < 0.):  # a short position
                        action = self.env.action_labels.index("buy_close")  # close short
            # Beginning-of-day logic: don't trade the first fifteen minutes
            fifteen_minute = timedelta(minutes=15)
            start_time = datetime.strptime("9:30", '%H:%M')
            # print(end_time, self.time, end_time - self.time)
            if self.env.time - start_time <= fifteen_minute:
                action = self.env.action_labels.index("stay_neutral")
            self.env.step(action)
            if action not in (self.env.action_labels.index("stay_neutral"),
                              self.env.action_labels.index("hold_long"),
                              self.env.action_labels.index("hold_short")):
                if action in (self.env.action_labels.index("buy_open"),
                              self.env.action_labels.index("sell_open")):
                    pl = -self.env.open_cost
                if action == self.env.action_labels.index("sell_close"):
                    pl = self.env.unit * (self.env.current_price - self.env.order_price) - self.env.open_cost
                if action == self.env.action_labels.index("buy_close"):
                    pl = -1 * self.env.unit * (self.env.current_price - self.env.order_price) - self.env.open_cost
                self.log_trade(self.env.action_labels[action], self.env.today, self.env.time,
                               self.env.unit, self.env.order_price, self.env.current_price, pl)
                self.account_profit_loss += pl
            forecasts = np.zeros(self.forecast_window, dtype=np.float32)
            forecast_history = np.zeros(self.forecast_window, dtype=np.float32)
            sleep(1)
            eventSource.data_signal.emit(self.env.data, self.env.position,
                                         self.account_profit_loss, forecasts, forecast_history)
            episode_steps += 1
            if episode_steps > self.config.max_steps:
                self.env.terminal = True
            if self.env.terminal:
                episode_steps = 0
                i += 1
                self.env.random_past_day()
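# Standalone sketch (not part of the original source): the epsilon-greedy construction used by
# make_epsilon_greedy_policy above, reduced to plain numpy so it can be run in isolation. The
# toy linear Q-function, theta shape and action count below are illustrative assumptions.
import numpy as np

def make_epsilon_greedy_policy(Q, epsilon, nA):
    def policy_fn(observation, theta):
        A = np.ones(nA, dtype=float) * epsilon / nA   # spread the exploration mass uniformly
        best_action = np.argmax(Q(observation, theta))
        A[best_action] += (1.0 - epsilon)             # remaining mass goes to the greedy action
        return A
    return policy_fn

toy_q = lambda s, theta: theta @ s                    # Q(s, .) over 7 actions, linear in s
theta = np.random.randn(7, 5)
policy = make_epsilon_greedy_policy(toy_q, epsilon=0.85, nA=7)
probs = policy(np.random.randn(5), theta)             # sums to 1 by construction
action = np.random.choice(np.arange(7), p=probs)
print(probs, action)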
movement_cfg_predict.weight_folder = movement_cfg_training.weight_folder
movement_cfg_predict.log_folder = movement_cfg_training.log_folder

angle_gen_weight_path, angle_gen_log_path, load_angle_weights = create_nn_folders(
    angle_cfg_training, angle_weight_path, angle_log_path, False)
angle_cfg_predict.weight_folder = angle_cfg_training.weight_folder
angle_cfg_predict.log_folder = angle_cfg_training.log_folder

# with tf.name_scope("Train"):
with tf.variable_scope("MovementModel", reuse=None, initializer=initializer_movement) as scope:
    training_network_movement = DQN(
        config=movement_cfg_training,
        namespace="DQN",
        is_training=True,
        log_path=movement_cfg_training.log_folder,
        weight_path=movement_cfg_training.weight_folder,
        variable_scope=scope,
        num_actions=env.num_actions,
        num_states=env.num_states)

with tf.variable_scope("MovementModel", reuse=True, initializer=initializer_movement) as scope:
    predict_network_movement = DQN(
        config=movement_cfg_predict,
        namespace="DQN",
        is_training=False,
        log_path=movement_cfg_predict.log_folder,
        weight_path=movement_cfg_predict.weight_folder,
        variable_scope=scope,
        num_actions=env.num_actions,
class DQNAgent(BaseAgent):
    def __init__(self, config, environment):
        super(DQNAgent, self).__init__(config, environment)
        #self.history = History(config)
        self.replay_memory = DQNReplayMemory(config)
        self.net = DQN(len(environment.n_actions), config)
        self.net.build()
        self.net.add_summary([
            "average_reward", "average_loss", "average_q", "ep_max_reward",
            "ep_min_reward", "ep_num_game", "learning_rate"
        ], ["ep_rewards", "ep_actions"])
        self.account_profit_loss = 0.
        self.forecast_window = config.forecast_window

    def observe(self):
        reward = max(self.min_reward, min(self.max_reward, self.env.reward))
        data = self.env.data
        self.stoday = datetime.strftime(self.env.today, '%m/%d/%Y')
        todays = [date == self.stoday for date in data.dates]
        self.replay_memory.add(data.opens, data.highs, data.lows, data.closes, data.volumes,
                               todays, reward, self.env.action, self.env.terminal,
                               self.env.position, self.env.today, self.env.order_price,
                               self.env.current_price, self.env.time_since)
        if self.i < self.config.epsilon_decay_episodes:
            self.epsilon -= self.config.epsilon_decay
        if self.i % self.config.train_freq == 0 and self.i > self.config.train_start:
            opens, highs, lows, closes, volumes, todays, action, reward, \
                opens_, highs_, lows_, closes_, volumes_, todays_, \
                terminal, positions, dates, order_prices, current_prices, \
                time_steps_since = self.replay_memory.sample_batch()
            q, loss = self.net.train_on_batch_target(
                opens, highs, lows, closes, volumes, todays, action, reward,
                opens_, highs_, lows_, closes_, volumes_, todays_, terminal,
                self.i, positions, dates, order_prices, current_prices, time_steps_since)
            self.total_q += q
            self.total_loss += loss
            self.update_count += 1
        if self.i % self.config.update_freq == 0:
            print('update the training target at:', self.i)
            self.net.update_target()

    def policy(self):
        if np.random.rand() < self.epsilon:
            action = np.random.choice(self.env.get_valid_actions())
            return action
        else:
            feed_dict = self.net.get_feed_dict(
                self.env.data, self.env.reward, self.env.action, self.env.terminal,
                self.env.position, self.env.today, self.env.order_price,
                self.env.current_price, self.env.time_since)
            a = self.net.q_action.eval(feed_dict=feed_dict, session=self.net.sess)
            action = a[0]
            while action not in self.env.get_valid_actions():
                #print('invalid action called:', action)
                action = np.random.choice(self.env.get_valid_actions())
            return action

    def train(self, steps, eventSource):
        render = False
        self.env.random_past_day()
        num_game, self.update_count, ep_reward = 0, 0, 0.
        total_reward, self.total_loss, self.total_q = 0., 0., 0.
        ep_rewards, actions = [], []
        t = 0
        #for _ in range(self.config.history_len):
        #    self.history.add(self.env.data)
        for self.i in tqdm(range(self.i, steps)):
            action = self.policy()
            self.env.step(action)
            self.observe()
            if action in (1, 2):
                direction = 1 if action == 2 else -1
                pl = direction * self.env.unit * (
                    self.env.current_price - self.env.order_price) - self.env.open_cost
                self.log_trade(action, self.env.today, self.env.time, self.env.unit,
                               self.env.order_price, self.env.current_price, pl)
                self.account_profit_loss += pl
                forecasts = np.zeros(self.forecast_window, dtype=np.float32)
                forecast_history = np.zeros(self.forecast_window, dtype=np.float32)
                sleep(1)
                eventSource.data_signal.emit(self.env.data, self.env.position,
                                             self.account_profit_loss, forecasts,
                                             forecast_history)
            if self.env.terminal:
                t = 0
                self.env.random_past_day()
                num_game += 1
                ep_rewards.append(ep_reward)
                ep_reward = 0.
            else:
                ep_reward += self.env.reward
                t += 1
            actions.append(action)
            total_reward += self.env.reward
            if self.i >= self.config.train_start:
                if self.i % self.config.test_step == self.config.test_step - 1:
                    avg_reward = total_reward / self.config.test_step
                    avg_loss = self.total_loss / self.update_count
                    avg_q = self.total_q / self.update_count
                    try:
                        max_ep_reward = np.max(ep_rewards)
                        min_ep_reward = np.min(ep_rewards)
                        avg_ep_reward = np.mean(ep_rewards)
                    except:
                        max_ep_reward, min_ep_reward, avg_ep_reward = 0, 0, 0
                    sum_dict = {
                        'average_reward': avg_reward,
                        'average_loss': avg_loss,
                        'average_q': avg_q,
                        'ep_max_reward': max_ep_reward,
                        'ep_min_reward': min_ep_reward,
                        'ep_num_game': num_game,
                        'learning_rate': self.net.learning_rate,
                        'ep_rewards': ep_rewards,
                        'ep_actions': actions
                    }
                    print('log to tensorboard at:', self.i)
                    self.net.inject_summary(sum_dict, self.i)
                    num_game = 0
                    total_reward = 0.
                    self.total_loss = 0.
                    self.total_q = 0.
                    self.update_count = 0
                    ep_reward = 0.
                    ep_rewards = []
                    actions = []
            if self.i % 500000 == 0 and self.i > 0:
                j = 0
                print('Saving the parameters at:', self.i)
                self.save()
            if self.i % 100000 == 0:
                j = 0
                render = True
            if render:
                #self.env_wrapper.env.render()
                j += 1
                if j == 1000:
                    render = False

    def play(self, episodes, net_path):
        self.net.restore_session(path=net_path)
        self.env.new_game()
        i = 0
        #for _ in range(self.config.history_len):
        #    self.history.add(self.env.data)
        episode_steps = 0
        while i < episodes:
            feed_dict = self.net.get_feed_dict(
                self.env.data, self.env.reward, self.env.action, self.env.terminal,
                self.env.position, self.env.today, self.env.order_price,
                self.env.current_price, self.env.time_since)
            a = self.net.q_action.eval(feed_dict=feed_dict, session=self.net.sess)
            action = a[0]
            self.env.step(action)
            #self.history.add(self.env.data)
            episode_steps += 1
            if episode_steps > self.config.max_steps:
                self.env.terminal = True
            if self.env.terminal:
                episode_steps = 0
                i += 1
                self.env.new_play_game()
mov_gen_weight_path, mov_gen_log_path, load_movement_weights = create_nn_folders(
    movement_cfg_training, movement_weight_path, movement_log_path, False)
movement_cfg_predict.weight_folder = movement_cfg_training.weight_folder
movement_cfg_predict.log_folder = movement_cfg_training.log_folder

angle_gen_weight_path, angle_gen_log_path, load_angle_weights = create_nn_folders(
    angle_cfg_training, angle_weight_path, angle_log_path)
angle_cfg_predict.weight_folder = angle_cfg_training.weight_folder
angle_cfg_predict.log_folder = angle_cfg_training.log_folder

# with tf.name_scope("Train"):
with tf.variable_scope("MovementModel", reuse=None, initializer=initializer_movement) as scope:
    predict_network_movement = DQN(
        config=movement_cfg_predict,
        namespace="DQN",
        is_training=False,
        log_path=movement_cfg_predict.log_folder,
        weight_path=movement_cfg_predict.weight_folder,
        variable_scope=scope,
        num_actions=env.num_actions,
        num_states=env.num_states)

# with tf.name_scope("Train"):
with tf.variable_scope("Angle_Model_Discrete", reuse=None, initializer=initializer_angle) as angle_scope:
    angle_training_network = GeneralRRNDiscreteModelMultitaskJointLoss(
        num_drones=sim_cfg.num_drones,
        config=angle_cfg_training,
        namespace="angle",
        is_training=True,
        log_path=angle_cfg_training.log_folder,
        weight_path=angle_cfg_training.weight_folder,
        variable_scope=angle_scope)

with tf.variable_scope("Angle_Model_Discrete", reuse=True, initializer=initializer_angle) as angle_scope:
    angle_predict_network = GeneralRRNDiscreteModelMultitaskJointLoss(
        num_drones=sim_cfg.num_drones,
        config=angle_cfg_predict,