class Agent(object):

    def __init__(self):
        self.epsilon = EPSILON_START
        self.value_network = ValueNetwork()

    def train(self, current_state, reward, next_state):
        self.value_network.train(current_state, reward, next_state)

    def get_reward(self, state):
        return self.value_network.get_reward(state)

    def get_next_state(self, current_state):
        if random.random() < self.epsilon:
            # explore: sample one successor state, so the logged and returned states match
            next_state = random.choice(CURRENT_STATE_TO_NEXT_STATE[current_state])
            print('random state', next_state)
        else:
            # exploit: follow the value network's best estimate
            next_state = self.value_network.get_best_next_state(current_state)
            print('exploit state', next_state)
        return next_state

    def get_states_value(self):
        return self.value_network.state_value

    def update_epsilon(self):
        if self.epsilon - EPSILON_CHANGE_RATE > MINIMUM_EPSILON:
            self.epsilon -= EPSILON_CHANGE_RATE
        else:
            self.epsilon = MINIMUM_EPSILON
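A minimal usage sketch (not from the original source) of how this Agent's epsilon-greedy loop is meant to be driven. EPSILON_START, EPSILON_CHANGE_RATE, MINIMUM_EPSILON and CURRENT_STATE_TO_NEXT_STATE are assumed module-level constants of the original project; START_STATE, NUM_EPISODES and ENV_REWARDS are hypothetical names introduced here only for illustration.

# hedged sketch: START_STATE, NUM_EPISODES and ENV_REWARDS are hypothetical
agent = Agent()
state = START_STATE                              # hypothetical initial state
for episode in range(NUM_EPISODES):              # hypothetical episode budget
    next_state = agent.get_next_state(state)     # epsilon-greedy transition
    reward = ENV_REWARDS.get(next_state, 0.0)    # hypothetical reward lookup
    agent.train(state, reward, next_state)       # update the value network
    agent.update_epsilon()                       # anneal exploration
    state = next_state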
def _start(self):
    logger.info('start...')
    np.random.seed(os.getpid())
    logger.info('random:%s', [np.random.random() for _ in range(3)])
    value_model = ValueNetwork(hidden_activation='selu',
                               output_activation='sigmoid')
    for i in range(self.begin, 2 ** 32):
        logger.info('simulate %s', i)
        self.episode = i
        self.decay_epsilon()
        board = rule.random_init_board() if self.init_board == 'random' else rule.init_board()
        player = 1
        # value_model_params = self.model_queue.get()
        with self.weight_lock:
            value_model.model.load_weights(self.weights_file)
        ts = MCTS(init_board=board.copy(),
                  player=player,
                  policy_model=None,
                  value_model=value_model,
                  max_search=50,
                  min_search_time=0,
                  scene=Scene.TRAIN)
        ts.decay_t(episode=i)
        records, winner = self.simulate(ts, board, player)
        if records.length() == 0:
            continue
        self.record_queue.put(records)
        if i % 100 == 0:
            records.save('records/train/alpha0/1st_%03d_' % (i // 100))
def expansion(self):
    board, player = self.board, self.player
    actions = rule.valid_actions(board, player)
    # actions_ = list(filter(lambda a: (self.board_str, *a) not in walked, actions))
    # if len(actions_) == 0:  # every action has been tried already, so choose again
    #     actions_ = actions
    if self.player == self.tree.player:
        with self.tree.value_model_lock:
            values = [self.tree.value_model.q(board, from_, act)
                      for from_, act in actions]
    else:
        with self.tree.opp_value_model_lock:
            values = [self.tree.opp_value_model.q(board, from_, act)
                      for from_, act in actions]
    probs = ValueNetwork.value_to_probs(values)
    for a, v, p in zip(actions, values, probs):
        e = Edge(upper_node=self, a=a, v=v, p=p, lambda_=self.tree.lambda_)
        self.add_edge(e)
    self.expanded = True
def __init__(self, player, init_board, first_player, policy_model, value_model,
             expansion_gate=50, lambda_=0.5, min_search=500, max_search=10000,
             min_search_time=15, max_search_time=300, scene=Scene.PLAY,
             t=100, t_decay=0.0002):
    self.player = player
    self.expansion_gate = expansion_gate
    self.lambda_ = lambda_
    self.min_search = min_search            # minimum number of searches
    self.max_search = max_search            # maximum number of searches (stop once exceeded)
    self.max_search_time = max_search_time  # maximum search time in seconds (stop once exceeded)
    self.min_search_time = min_search_time  # minimum search time in seconds (searching stops once both the minimum search count and the minimum search time are satisfied)
    self.searching = False
    self.stop = False
    self.stop_event = Event()
    self.depth = 0
    self.n_node = 0
    self.n_search = 0
    self.scene = scene
    self.t = t            # training temperature, decayed gradually to control the move probabilities
    self._t = t
    self.t_decay = t_decay
    logger.info('value_model: %s', value_model)
    if isinstance(value_model, str):
        value_model = DQN(
            hidden_activation='relu', lr=0.001, weights_file=value_model
        ) if 'DQN' in value_model else ValueNetwork(
            hidden_activation='relu', lr=0.001, weights_file=value_model)
    # self.policy = value_model
    logger.info('value_model: %s', value_model)
    self.value_model = value_model
    self.value_model_lock = threading.Lock()
    # Q values used when simulating the opponent's moves are predicted by opp_value_model,
    # which keeps learning the opponent's playing habits as the game progresses
    self.opp_value_model = type(value_model)(hidden_activation='relu', lr=0.001)
    self.opp_value_model.copy(value_model)
    logger.info('opp_value_model: %s', self.opp_value_model)
    self.opp_value_model_lock = threading.Lock()
    self.root = Node(init_board, first_player, tree=self)
    self.predicted = set()  # moves already taken in the tree: (board_str, from, act)
    self.root.expansion()
    self.opp_actions = {}   # positions the opponent has played, {board: {action: n}}
def train():
    from multiprocessing import Queue, Lock
    import os
    record_queue = Queue()
    model_queue = Queue()
    weight_lock = Lock()
    weights_file = 'model/alpha0/weights'
    model_file = 'model/alpha0/value_network_00067h.model'
    begin = 6700
    _value_model = ValueNetwork(hidden_activation='selu',
                                output_activation='sigmoid',
                                model_file=model_file)
    value_model = ValueNetwork(hidden_activation='selu',
                               output_activation='sigmoid')
    value_model.copy(_value_model)
    if os.path.exists(weights_file):
        value_model.model.load_weights(weights_file)
    else:
        value_model.model.save_weights(weights_file)
    for _ in range(3):
        SimulateProcess(record_queue, model_queue, weight_lock, weights_file,
                        init_board='random', epsilon=1.0, epsilon_decay=2e-3,
                        begin=begin // 3).start()
    for i in range(begin + 1, 2 ** 32):
        records = record_queue.get()
        logger.info('train %s, records:%s', i, len(records))
        value_model.train(records, epochs=5)
        with weight_lock:
            value_model.model.save_weights(weights_file)
        if i % 100 == 0:
            value_model.save_model('model/alpha0/value_network_%05dh.model' % (i // 100))
def load_model(self):
    return ValueNetwork(hidden_activation=self.hidden_activation,
                        output_activation='sigmoid',
                        model_file=self.model_file,
                        weights_file=self.weights_file)
class TRPO(object):

    def __init__(self, alpha, input_size, output_size):
        self.buffer = Buffer()
        self.value_network = ValueNetwork(alpha, input_size=input_size, output_size=1)
        self.policy_network = PolicyNetwork(0.0001, input_size=input_size, output_size=output_size)
        self.old_policy_network = PolicyNetwork(0.0001, input_size=input_size, output_size=output_size)
        # store policy state
        self.buffer.store_parameters(self.policy_network.state_dict())
        self.avg_rewards = []

    def update(self, iter=80):
        observations = self.buffer.get_observations()
        # actions = self.buffer.get_actions()
        rewards = self.buffer.get_rewards()
        advantages = self.buffer.get_advantages()
        log_probs = self.buffer.get_log_probs()
        self.old_policy_network.load_state_dict(self.buffer.old_parameters)
        old_pred = self.old_policy_network.forward(observations)
        old_action_probabilities = torch.distributions.Categorical(old_pred)
        old_action = old_action_probabilities.sample()
        old_probs = old_action_probabilities.log_prob(old_action).reshape(-1, 1)
        self.buffer.store_parameters(self.policy_network.state_dict())
        self.policy_network.optimize(log_probs, old_probs, advantages)
        self.value_network.optimize(observations, rewards, iter=iter)

    def calculate_advantage(self):
        prev_observation = self.buffer.observation_buffer[-2]
        observation = self.buffer.observation_buffer[-1]
        v1 = self.value_network(prev_observation)
        v2 = self.value_network(observation)
        return self.buffer.reward_buffer[-1] + v2 - v1

    def act(self, observation):
        prediction = self.policy_network.forward(observation)
        action_probabilities = torch.distributions.Categorical(prediction)
        action = action_probabilities.sample()
        log_prob = action_probabilities.log_prob(action)
        self.buffer.store_log_prob(log_prob)
        return action.item(), log_prob

    def discount_rewards(self, step):
        for s in reversed(range(1, step + 1)):
            update = 0
            for k in reversed(range(1, s + 1)):
                update += self.buffer.reward_buffer[-k] * (0.99 ** k)
            self.buffer.reward_buffer[-s] += update

    def train(self, env, epochs=1000, steps=4000):
        plt.ion()
        for epoch in range(epochs):
            observation = env.reset()
            self.buffer.store_observation(observation)
            step = 0
            for step in range(steps):
                step += 1
                action, log_prob = self.act(observation)
                self.buffer.store_action(log_prob)
                observation, reward, done, info = env.step(action)
                self.buffer.store_reward(reward / 200 + observation[0] / 2 + (1 * observation[1]) ** 2)
                # env.render()
                self.buffer.store_observation(observation)
                advantage = self.calculate_advantage()
                self.buffer.store_advantage(advantage)
                if done or step == steps - 1:
                    observation = env.reset()
                    self.discount_rewards(step)
                    step = 0
            self.update(iter=5)
            rwrd = self.buffer.get_rewards()
            self.avg_rewards.append((torch.sum(rwrd) / rwrd.shape[0]).numpy())
            self.buffer.clear_buffer()
            print("Average Reward: {}".format(self.avg_rewards[-1]))
            plt.title("Reward per Epoch")
            plt.xlabel("Epoch")
            plt.ylabel("Reward")
            plt.plot(self.avg_rewards, label="average reward")
            plt.legend(loc="upper left")
            plt.draw()
            plt.pause(0.0001)
            plt.clf()
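A minimal usage sketch (not from the original source): the reward shaping in train() indexes observation[0] and observation[1], which matches a two-dimensional observation space such as Gym's MountainCar-v0, and the loop assumes the classic gym API (reset() returning an observation, step() returning a 4-tuple).

# hedged sketch: MountainCar-v0 is an assumed, compatible environment
import gym

env = gym.make('MountainCar-v0')
agent = TRPO(alpha=1e-3,                                  # value-network learning rate
             input_size=env.observation_space.shape[0],   # 2 state features
             output_size=env.action_space.n)              # 3 discrete actions
agent.train(env, epochs=100, steps=1000)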
class PolicyLearner:

    def __init__(self, stock_code, chart_data, training_data=None,
                 policy_model_path=None, value_model_path=None, lr=0.001,
                 discount_factor=0.5, start_epsilon=0, num_past_input=0,
                 load_weight_and_learn=False):
        self.stock_code = stock_code  # stock code
        self.chart_data = chart_data
        self.environment = Environment(chart_data)  # environment object
        # agent object
        self.agent = Agent(self.environment)
        self.training_data = training_data  # training data
        self.training_data_idx = -1
        self.state = None
        self.action_size = self.agent.NUM_ACTIONS
        self.discount_factor = discount_factor
        self.start_epsilon = start_epsilon
        self.num_past_input = num_past_input
        self.load_weight_and_learn = load_weight_and_learn
        # policy/value networks; input size = size of the training data
        #+ agent state size
        self.num_features = self.training_data.shape[1] * (
            1 + num_past_input)  #+ self.agent.STATE_DIM
        self.policy_network_obj = PolicyNetwork(
            input_dim=self.num_features,
            output_dim=self.agent.NUM_ACTIONS,
            lr=lr)
        if load_weight_and_learn is True:
            self.policy_network_obj.model = load_model(policy_model_path)
            self.policy_network = self.policy_network_obj.model
        else:
            self.policy_network = self.policy_network_obj.make_model()
        self.value_network_obj = ValueNetwork(input_dim=self.num_features, lr=lr)
        if load_weight_and_learn is True:
            self.value_network_obj.model = load_model(value_model_path)
            self.value_network = self.value_network_obj.model
        else:
            self.value_network = self.value_network_obj.make_model()
        self.policy_updater = self.policy_optimizer()
        self.value_updater = self.value_optimizer()

    # function that updates the policy network
    def policy_optimizer(self):
        action = K.placeholder(shape=[None, self.action_size])
        advantage = K.placeholder(shape=[None, ])
        action_prob = K.sum(action * self.policy_network.output, axis=1)
        cross_entropy = K.log(action_prob) * advantage
        loss = -K.sum(cross_entropy)
        optimizer = Nadam(lr=self.policy_network_obj.lr)
        updates = optimizer.get_updates(self.policy_network.trainable_weights, [], loss)
        train = K.function([self.policy_network.input, action, advantage], [],
                           updates=updates)
        return train

    # function that updates the value network
    def value_optimizer(self):
        target = K.placeholder(shape=[None, ])
        loss = K.mean(K.square(target - self.value_network.output))
        optimizer = Nadam(lr=self.value_network_obj.lr)
        updates = optimizer.get_updates(self.value_network.trainable_weights, [], loss)
        train = K.function([self.value_network.input, target], [],
                           updates=updates)
        return train

    # update the policy and value networks at every time step
    def train_model(self, state, action, action_idx, reward, next_state, done):
        value = self.value_network_obj.predict(state)[0]
        # next_value = self.value_network_obj.predict(next_state)[0]
        next_value = 0
        act = np.zeros([1, self.action_size])
        # action_idx = np.random.choice(self.action_size, 1, p=action)[0]
        act[0][action_idx] = 1
        # advantage and update target from the Bellman expectation equation
        if done:
            advantage = reward - value
            target = [reward]
        else:
            advantage = (reward + self.discount_factor * next_value) - value
            target = [reward + self.discount_factor * next_value]
        state_list = [state]
        advantage_list = [advantage]
        self.policy_updater([state_list, act, advantage_list])
        self.value_updater([state_list, target])

    def reset(self):
        self.sample = None
        self.training_data_idx = -1 + self.num_past_input

    def trade(self, model_path=None, balance=2000000):
        if model_path is None:
            return
        self.policy_network_obj.model = load_model(model_path)
        self.fit(balance=balance, num_epoches=1, learning=False)

    def fit(self, num_epoches=1000, balance=10000000, learning=True):
        logger.info("LR: {lr}, DF: {discount_factor}, "
                    "TU: all-in, "
                    "DRT: only-use immediate reward".format(
                        lr=self.policy_network_obj.lr,
                        discount_factor=self.discount_factor))
        # set the agent's initial capital
        self.agent.set_balance(balance)
        # initialize training statistics
        max_portfolio_value = 0
        epoch_win_cnt = 0
        portfolio_repeat_cnt = 0
        exploration = False
        episode_results, episodes = [], []
        pylab.clf()
        # training loop
        for epoch in range(num_epoches):
            start_time = time.time()
            # initialize per-epoch statistics
            previous_portfolio_value = self.agent.portfolio_value
            loss = 0.
            itr_cnt = 0
            win_cnt = 0
            exploration_cnt = 0
            batch_size = 0
            pos_learning_cnt = 0
            neg_learning_cnt = 0
            # reset the environment, agent and networks
            self.environment.reset(self.num_past_input)
            self.agent.reset()
            self.policy_network_obj.reset()
            self.value_network_obj.reset()
            self.reset()
            self.environment.observe()
            self.training_data_idx += 1
            self.state = []
            for i in range(self.num_past_input + 1):
                self.state.extend(
                    self.training_data.iloc[self.training_data_idx - i].tolist())
            #self.state.extend(self.agent.get_states())
            done = False
            # decrease the exploration rate as training progresses
            if learning:
                epsilon = self.start_epsilon * (1. - float(epoch) / (num_epoches - 1))
            else:
                epsilon = 0
            # start training
            while True:
                # decide an action with the policy network
                self.action = self.agent.decide_action(self.policy_network_obj,
                                                       self.state)
                # perform the chosen action and obtain the immediate reward
                immediate_reward, exploration, action_idx = self.agent.act(
                    self.action, epsilon)
                if exploration:
                    self.action[self.agent.rand_action] = self.agent.confidence
                    for i in range(self.action_size):
                        if i != self.agent.rand_action:
                            self.action[i] = 1 - self.agent.confidence
                # when not in learning mode
                if learning is False:
                    print(self.environment.chart_data.iloc[
                        self.environment.idx]['date'])
                    print(self.action)
                # update iteration statistics
                itr_cnt += 1
                win_cnt += 1 if immediate_reward > 0 else 0
                exploration_cnt += 1 if exploration is True else 0
                # build the next-state data
                state = self.state  # keep the current state (self.state) in state
                action = self.action
                observation = self.environment.observe()
                if observation is not None:
                    self.training_data_idx += 1
                    self.state = []
                    for i in range(self.num_past_input + 1):
                        self.state.extend(
                            self.training_data.iloc[self.training_data_idx - i].tolist())
                    #self.state = self.training_data.iloc[self.training_data_idx].tolist()
                    #self.state.extend(self.agent.get_states())
                    next_state = self.state
                else:
                    break
                # when learning and the action was not random exploration
                if learning and (exploration is False):
                    if immediate_reward > 0:
                        pos_learning_cnt += 1
                    else:
                        neg_learning_cnt += 1
                    # update the policy and value networks
                    self.train_model(state, action, action_idx, immediate_reward,
                                     next_state, done)
            # display per-epoch information
            print("epoch:", epoch + 1, " / sequence:", itr_cnt,
                  " / portfolio_value:", self.agent.portfolio_value)
            epoch_time = time.time() - start_time
            remain_time = epoch_time * (num_epoches - (epoch + 1))
            print("epoch_time: %s second" % (round(epoch_time, 2)),
                  " / remain_time: %s hour" % (round(remain_time / 3600, 2)))
            if epoch_time > 1:  # only draw the plot when one epoch takes more than a second
                episode_results.append(self.agent.portfolio_value)
                episodes.append(epoch + 1)
                pylab.plot(episodes, episode_results, 'b')
                if not os.path.isdir("./save_graph"):
                    os.makedirs("./save_graph")
                pylab.savefig("./save_graph/result.png")
            # update training statistics
            max_portfolio_value = max(max_portfolio_value,
                                      self.agent.portfolio_value)
            if self.agent.portfolio_value > self.agent.initial_balance:
                epoch_win_cnt += 1
            # decide whether to stop training early
            #if previous_portfolio_value == self.agent.portfolio_value:
            #    portfolio_repeat_cnt += 1
            #else:
            #    portfolio_repeat_cnt = 0
            #if portfolio_repeat_cnt == 10:
            #    break
        # log training statistics
        logger.info("Max PV: %s, \t # Win: %d" % (locale.currency(
            max_portfolio_value, grouping=True), epoch_win_cnt))
import numpy as np
import torch.nn as nn
import crocoddyl
# ValueNetwork and Datagen are project-local modules of the original repository.

# ...... Hyperparameters
BATCHSIZE = 16
TRAJECTORIES = 100
lr = 1e-3
EPOCHS = 3000
HORIZON = 100
DECAY = 0
PRECISION = 1e-9
MAXITERS = 1000
DEVICE = 'cpu'

# ...... Initialize an empty net
neural_net = ValueNetwork(input_dims=3, fc1_dims=18, fc2_dims=10, fc3_dims=1,
                          activation=nn.Tanh(), device=DEVICE)

# ...... Generate dataset
starting_points = Datagen.griddedData(n_points=TRAJECTORIES)
x_train, y_train = [], []
for starting_point in starting_points:
    model = crocoddyl.ActionModelUnicycle()
    model.costWeights = np.array([1., 1.]).T
    problem = crocoddyl.ShootingProblem(starting_point.T, [model] * HORIZON, model)
    ddp = crocoddyl.SolverDDP(problem)
    ddp.th_stop = PRECISION
    ddp.solve([], [], MAXITERS)
    xs = np.array(ddp.xs).tolist()
    for node in xs:
        x_train.append(node)
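The fragment above only fills x_train. A minimal sketch, assuming the value targets in y_train are the optimal DDP cost obtained by re-solving the same unicycle problem from each stored state (the original repository may compute them differently):

# hedged sketch: cost-to-go labels obtained by re-solving from each state
for x0 in x_train:
    sub_model = crocoddyl.ActionModelUnicycle()
    sub_model.costWeights = np.array([1., 1.]).T
    sub_problem = crocoddyl.ShootingProblem(np.array(x0), [sub_model] * HORIZON, sub_model)
    sub_ddp = crocoddyl.SolverDDP(sub_problem)
    sub_ddp.th_stop = PRECISION
    sub_ddp.solve([], [], MAXITERS)
    y_train.append(sub_ddp.cost)  # optimal cost from this state, used as the value label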
action_list = [0, 1, 2, 3]
maze = env.simple_maze(dims, start, goal, action_list)
state_dim = len(maze.state_dim)
action_dim = 1
action_size = len(maze.action_space)

BATCH_SIZE = 3
TAU = 0.001
LRA = 0.0001
LRC = 0.001
gamma = 0.9

actor = PolicyNetwork(3, 6)
critic = ValueNetwork(3, 6)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for episode in range(100):
        R = 0
        state = maze.start
        maze.state = state
        # get critic output and sample action
        state, _, _ = state_action_processing(state)
        scores = actor.predict(state)[0]
        print(scores)
        action = np.where(np.random.multinomial(1, scores))[0][0]