def __init__(self, num_actors, env_kwargs, env_interface, run_name='temp', load_model=False):
    self.num_actors = num_actors
    self.pipes = []
    self.processes = []
    self.threads = []
    self.trajectory_queue = []
    self.name = run_name

    project_root = os.path.dirname(os.path.realpath(__file__))
    self.save_dir = os.path.join(project_root, 'saves', run_name)
    self.weights_dir = os.path.join(self.save_dir, 'weights')
    self.code_dir = os.path.join(self.save_dir, 'code')
    self.weights_path = os.path.join(self.weights_dir, 'model.ckpt')
    if not os.path.exists(self.weights_dir):
        os.makedirs(self.weights_dir)
    if not os.path.exists(self.code_dir):
        os.makedirs(self.code_dir)
    # Snapshot the source files alongside the run for reproducibility.
    os.system('cp -r ' + os.path.join(project_root, './*.py') + ' ' + self.code_dir)
    self.rewards_path = os.path.join(self.save_dir, 'rewards.txt')

    self.epoch = 0
    self.env_kwargs = env_kwargs
    self.discount_factor = 0.95
    self.td_lambda = 0.95
    self.env_interface = env_interface
    self.agent = LSTMAgent(self.env_interface)
    with self.agent.graph.as_default():
        self.rewards_input = tf.placeholder(tf.float32, [None], name="rewards")  # T
        self.behavior_log_probs_input = tf.placeholder(
            tf.float32, [None], name="behavior_log_probs")  # T
        self.loss = self._ac_loss()
        # self.loss = self._impala_loss()
        self.train_op = tf.train.AdamOptimizer(0.0003).minimize(self.loss)
        self.session = self.agent.session
        self.session.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()
        if load_model:
            try:
                self._load_model()
            except Exception:
                print("Could not load model")
class Actor:
    def __init__(self, pipe, env_interface, env_kwargs):
        self.env_interface = env_interface
        self.agent = LSTMAgent(self.env_interface)
        with self.agent.graph.as_default():
            self.session = self.agent.session
            self.session.run(tf.global_variables_initializer())
            self.variable_names = [v.name for v in tf.trainable_variables()]
            self.assign_placeholders = {t.name: tf.placeholder(t.dtype, t.shape)
                                        for t in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)}
            self.assigns = [tf.assign(tensor, self.assign_placeholders[tensor.name])
                            for tensor in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)]
        self.env = SCSingleEnvironment(self.env_interface, env_kwargs=env_kwargs)
        # self.env = MultipleEnvironment(lambda: SCEnvironmentWrapper(self.env_interface, env_kwargs=env_kwargs),
        #                                num_instance=1)
        self.curr_iteration = 0
        self.pipe = pipe

    def generate_trajectory(self):
        """
        Repeatedly generates actions from the agent and steps in the environment until all environments
        have reached a terminal state. Returns each trajectory in the form of rollouts.
        """
        states, masks, _, _ = self.env.reset()
        memory = None
        rollout = Rollout()
        while True:
            action_indices, memory, log_probs = self.agent.step(states, masks, memory)
            new_states, new_masks, rewards, dones = self.env.step(action_indices)
            rollout.add_step(states[0], action_indices[0], rewards[0], masks[0], dones[0], log_probs[0])
            states = new_states
            masks = new_masks
            if all(dones):
                # Add in the done state for rollouts which just finished for calculating the bootstrap value.
                rollout.add_step(states[0])
                break
        self.curr_iteration += 1
        print("=============== Reward on iteration %d is [%.1f]" % (self.curr_iteration, rollout.total_reward()))
        return rollout

    def get_params(self):
        self.pipe.send(("get_params", None))
        names_to_params = self.pipe.recv()
        with self.agent.graph.as_default():
            self.session.run(self.assigns, feed_dict={
                self.assign_placeholders[name]: names_to_params[name] for name in self.variable_names
            })

    def send_trajectory(self, trajectory):
        # print("[ACTOR] sending trajectory:")
        self.pipe.send(("add_trajectory", trajectory))
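# run_actor is used as the process entry point by the learner further below but is
# not shown in this excerpt. The following is a minimal sketch under the protocol
# visible in Actor (pull weights, play one episode, ship the rollout back); the
# name actor_factory is hypothetical, and the learner passes a functools.partial
# of Actor for it.
def run_actor(actor_factory):
    actor = actor_factory()
    while True:
        # Sync the latest weights from the learner, generate one trajectory, send it back.
        actor.get_params()
        rollout = actor.generate_trajectory()
        actor.send_trajectory(rollout)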
def main(_):
    env_interface = EmbeddingInterfaceWrapper(BeaconEnvironmentInterface())
    # env_interface = EmbeddingInterfaceWrapper(TrainMarines())
    # learner = Learner(10, env_kwargs, env_interface, run_name="MineralWithBeacon2", load_name="Beacon2", load_model=True)

    # env_interface = EmbeddingInterfaceWrapper(BeaconEnvironmentInterface())
    # learner = NormalLearner(env_interface, load_model=False)
    # learner.train()

    # Refresh environment every once in a while to deal with memory leak
    environment = MultipleEnvironment(lambda: SCEnvironmentWrapper(env_interface, env_kwargs),
                                      num_instance=1)
    agent = LSTMAgent(env_interface)
    learner = ActorCriticLearner(environment, agent, run_name="SyncMarines", load_model=False)
    i = 0
    while True:
        i += 1
        print(i)
        learner.train_episode()
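# The comment in main() mentions refreshing the environment to work around a
# memory leak, but the loop as written never rebuilds it. A minimal sketch of
# that idea, assuming MultipleEnvironment exposes close() and that the learner
# reads self.environment each episode; main_with_refresh and refresh_every are
# hypothetical names, not part of this repository.
def main_with_refresh(_, refresh_every=100):
    env_interface = EmbeddingInterfaceWrapper(BeaconEnvironmentInterface())
    environment = MultipleEnvironment(lambda: SCEnvironmentWrapper(env_interface, env_kwargs),
                                      num_instance=1)
    agent = LSTMAgent(env_interface)
    learner = ActorCriticLearner(environment, agent, run_name="SyncMarines", load_model=False)
    i = 0
    while True:
        i += 1
        learner.train_episode()
        if i % refresh_every == 0:
            # Tear down the SC2 processes and rebuild them to reclaim leaked memory.
            environment.close()
            environment = MultipleEnvironment(lambda: SCEnvironmentWrapper(env_interface, env_kwargs),
                                              num_instance=1)
            learner.environment = environment  # assumes the learner holds this reference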
class Learner:
    def __init__(self, num_actors, env_kwargs, env_interface, run_name='temp', load_name=None, load_model=False):
        self.num_actors = num_actors
        self.pipes = []
        self.processes = []
        self.threads = []
        self.trajectory_queue = []
        self.name = run_name
        if load_name is None:
            self.load_name = run_name
        else:
            self.load_name = load_name

        project_root = os.path.dirname(os.path.realpath(__file__))
        self.save_dir = os.path.join(project_root, 'saves', run_name)
        self.weights_path_load = os.path.join(project_root, 'saves', self.load_name, 'weights', 'model.ckpt')
        self.code_dir = os.path.join(self.save_dir, 'code')
        self.weights_dir = os.path.join(self.save_dir, 'weights')
        self.weights_path = os.path.join(self.weights_dir, 'model.ckpt')
        # body_keywords = ["pointer_head/dense/", "pointer_head/dense_1/", "shared", "lstm"]
        if not os.path.exists(self.weights_dir):
            os.makedirs(self.weights_dir)
        if not os.path.exists(self.code_dir):
            os.makedirs(self.code_dir)
        os.system('cp -r ' + os.path.join(project_root, './*.py') + ' ' + self.code_dir)
        self.rewards_path = os.path.join(self.save_dir, 'rewards.txt')

        self.epoch = 0
        self.env_kwargs = env_kwargs
        self.discount_factor = 0.95
        self.td_lambda = 0.95
        self.env_interface = env_interface
        self.agent = LSTMAgent(self.env_interface)
        with self.agent.graph.as_default():
            self.rewards_input = tf.placeholder(tf.float32, [None], name="rewards")  # T
            self.behavior_log_probs_input = tf.placeholder(
                tf.float32, [None], name="behavior_log_probs")  # T
            self.loss = self._ac_loss()
            # self.loss = self._impala_loss()

            head_variables = [v for v in tf.trainable_variables() if "shared" not in v.name]
            for var in head_variables:
                print(var)
            print("body variables are")
            body_variables = [v for v in tf.trainable_variables() if "shared" in v.name]
            for var in body_variables:
                print(var)

            self.train_op = tf.train.AdamOptimizer(0.0003).minimize(self.loss)
            self.session = self.agent.session
            # self.session.run(tf.global_variables_initializer())
            self.saver = tf.train.Saver()
            if load_model:
                try:
                    self._load_model()
                except Exception:
                    print("Could not load model")
            # self.session.run(tf.initialize_variables(head_variables))

    def start_children(self):
        for process_id in range(self.num_actors):
            parent_conn, child_conn = Pipe()
            self.pipes.append(parent_conn)
            p = Process(target=run_actor,
                        args=(partial(Actor, child_conn, self.env_interface, self.env_kwargs),))
            self.processes.append(p)
            p.start()
        for i in range(self.num_actors):
            t = Thread(target=learn, args=(self, self.pipes[i]))
            self.threads.append(t)
            t.start()

    def train(self):
        print("Starting training")
        self.start_children()
        print("Finished starting children")
        while True:
            # print("[Learner] Sleeping")
            time.sleep(0.01)
            if len(self.trajectory_queue) >= 1:
                self.update_model(self.trajectory_queue)
                self.trajectory_queue = []

    def add_trajectory(self, trajectory):
        self.update_model([trajectory])
        # self.trajectory_queue.append(trajectory)

    def update_model(self, rollouts):
        for i in range(len(rollouts)):
            rollout = rollouts[i]
            if rollout.done:
                feed_dict = {
                    self.rewards_input: rollout.rewards,
                    # self.behavior_log_probs_input: rollout.log_probs,
                    **self.agent.get_feed_dict(rollout.states, rollout.masks, rollout.actions,
                                               rollout.bootstrap_state)
                }
                loss, _ = self.session.run([self.loss, self.train_op], feed_dict=feed_dict)
                self.epoch += 1
                if self.epoch % 50 == 0:
                    self.save_model()
        with open(self.rewards_path, 'a+') as f:
            for r in rollouts:
                f.write('%d\n' % r.total_reward())

    def save_model(self):
        """
        Saves the current model weights in current `save_path`.
""" save_path = self.saver.save(self.session, self.weights_path) print("Model Saved in %s" % save_path) def _load_model(self): """ Loads the model from weights stored in the current `save_path`. """ self.saver.restore(self.session, self.weights_path_load) print('Model Loaded from ', self.weights_path_load) def _impala_loss(self): num_steps = tf.shape(self.rewards_input)[0] discounts = tf.ones((num_steps, 1)) * self.discount_factor rewards = tf.expand_dims(self.rewards_input, axis=-1) values = tf.expand_dims(self.agent.train_values(), axis=-1) bootstrap = tf.expand_dims(self.agent.bootstrap_value(), axis=-1) train_log_probs = self.agent.train_log_probs() log_rhos = tf.expand_dims(train_log_probs - self.behavior_log_probs_input, axis=-1) vs, advantage = trfl.vtrace_from_importance_weights( log_rhos, discounts, rewards, values, bootstrap) loss_actor = tf.reduce_mean(-tf.stop_gradient(advantage) * train_log_probs) loss_critic = tf.reduce_mean((vs - values)**2) result = loss_actor + 0.5 * loss_critic return result def _ac_loss(self): num_steps = tf.shape(self.rewards_input)[0] discounts = tf.ones((num_steps, 1)) * self.discount_factor rewards = tf.expand_dims(self.rewards_input, axis=1) values = tf.expand_dims(self.agent.train_values(), axis=1) bootstrap = tf.expand_dims(self.agent.bootstrap_value(), axis=0) glr = trfl.generalized_lambda_returns(rewards, discounts, values, bootstrap, lambda_=self.td_lambda) advantage = tf.squeeze(glr - values) loss_actor = tf.reduce_mean(-tf.stop_gradient(advantage) * self.agent.train_log_probs()) loss_critic = tf.reduce_mean(advantage**2) result = loss_actor + 0.5 * loss_critic return result
# def main():
env = Env(trader=trader, symbol=symbol, commission=commission, action_space=action_space,
          share=exe_shares, time_total=execute_time, time_steps=exe_times,
          objPrice=exe_price, close_price_volumn=num_states)
agent = LSTMAgent(sess_=sess, observations_dim=num_states + 2, action_space=action_space,
                  batch_size=batch_size, Q_function=Qf.ann, optimizer=tf.train.AdamOptimizer,
                  GAMMA=GAMMA, EPSILON=EPSILON, LOAD=load, learning_rate=0.001)
pool = SimpleReplayPool(max_pool_size=1000, pop_size=100)

for i in range(EPISODES):
    # Deal with the initialization for each episode
    print("*" * 100)
    print(f'THE {i+1} EPISODE \n\n')
    sum_rew4epi = 0
    if i % 2 == 1:
        bp = trader.get_best_price(symbol)
        exe_price = bp.get_bid_price()
        env.set_objective(share=-exe_shares,
def __init__(self, num_actors, env_kwargs, env_interface, run_name='temp', load_name=None):
    self.num_actors = num_actors
    self.pipes = []
    self.processes = []
    self.threads = []
    self.trajectory_queue = []
    self.name = run_name
    if load_name is None:
        self.load_name = run_name
    else:
        self.load_name = load_name

    project_root = os.path.dirname(os.path.realpath(__file__))
    self.save_dir = os.path.join(project_root, 'saves', run_name)
    self.weights_path_load = os.path.join(project_root, 'saves', self.load_name, 'weights', 'model.ckpt')
    self.code_dir = os.path.join(self.save_dir, 'code')
    self.weights_dir = os.path.join(self.save_dir, 'weights')
    self.weights_path = os.path.join(self.weights_dir, 'model.ckpt')
    # self.replay_dir = os.path.join(self.load_name, 'replays')
    # body_keywords = ["pointer_head/dense/", "pointer_head/dense_1/", "shared", "lstm"]
    if not os.path.exists(self.weights_dir):
        os.makedirs(self.weights_dir)
    if not os.path.exists(self.code_dir):
        os.makedirs(self.code_dir)
    # if not os.path.exists(self.replay_dir):
    #     os.makedirs(self.replay_dir)
    os.system('cp -r ' + os.path.join(project_root, './*.py') + ' ' + self.code_dir)
    self.rewards_path = os.path.join(self.save_dir, 'rewards.txt')

    self.epoch = 0
    self.env_kwargs = env_kwargs
    # self.env_kwargs['replay_dir'] = self.replay_dir
    print(self.env_kwargs)
    self.discount_factor = 0.95
    self.td_lambda = 0.95
    self.env_interface = env_interface
    self.agent = LSTMAgent(self.env_interface)
    with self.agent.graph.as_default():
        self.rewards_input = tf.placeholder(tf.float32, [None], name="rewards")  # T
        self.behavior_log_probs_input = tf.placeholder(tf.float32, [None], name="behavior_log_probs")  # T
        self.loss = self._ac_loss()
        # self.loss = self._impala_loss()
        # head_variables = [v for v in tf.trainable_variables() if "shared" not in v.name]
        # for var in head_variables:
        #     print(var)
        #
        # print("body variables are")
        # body_variables = [v for v in tf.trainable_variables() if "shared" in v.name]
        # for var in body_variables:
        #     print(var)

        self.train_op = tf.train.AdamOptimizer(0.0003).minimize(self.loss)
        self.session = self.agent.session
        self.session.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()
        try:
            self._load_model()
        except Exception as e:
            print(e)
            print("Could not load model")