Example #1
class Agent(object):
    def __init__(self):
        self.epsilon = EPSILON_START
        self.value_network = ValueNetwork()

    def train(self, current_state, reward, next_state):
        self.value_network.train(current_state, reward, next_state)

    def get_reward(self, state):
        return self.value_network.get_reward(state)

    def get_next_state(self, current_state):
        if random.random() < self.epsilon:
            # explore: pick a random successor state (sample once so the
            # logged state and the returned state match)
            next_state = random.choice(CURRENT_STATE_TO_NEXT_STATE[current_state])
            print('RANDOM STATE ', next_state)
            return next_state
        else:
            # exploit: return the value network's best next state
            print('EXPLOIT ACTION')
            return self.value_network.get_best_next_state(current_state)

    def get_states_value(self):
        return self.value_network.state_value

    def update_epsilon(self):
        if self.epsilon - EPSILON_CHANGE_RATE > MINIMUM_EPSILON:
            self.epsilon -= EPSILON_CHANGE_RATE
        else:
            self.epsilon = MINIMUM_EPSILON
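
A minimal, self-contained sketch of the epsilon schedule that update_epsilon() implements. The constant values below are hypothetical stand-ins; the real EPSILON_START, EPSILON_CHANGE_RATE, and MINIMUM_EPSILON are defined elsewhere in the source file.

EPSILON_START = 1.0        # hypothetical values, not from the original file
EPSILON_CHANGE_RATE = 0.01
MINIMUM_EPSILON = 0.05

epsilon = EPSILON_START
for episode in range(200):
    # ... one episode of epsilon-greedy play would run here ...
    if epsilon - EPSILON_CHANGE_RATE > MINIMUM_EPSILON:
        epsilon -= EPSILON_CHANGE_RATE
    else:
        epsilon = MINIMUM_EPSILON
print(epsilon)  # decays linearly and bottoms out at MINIMUM_EPSILON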
Example #2
 def _start(self):
     logger.info('start...')
     np.random.seed(os.getpid())
     logger.info('random:%s', [np.random.random() for _ in range(3)])
     value_model = ValueNetwork(hidden_activation='selu',
                                output_activation='sigmoid')
     for i in range(self.begin, 2**32):
         logger.info('simulate %s', i)
         self.episode = i
         self.decay_epsilon()
         board = (rule.random_init_board()
                  if self.init_board == 'random' else rule.init_board())
         player = 1
         # value_model_params = self.model_queue.get()
         with self.weight_lock:
             value_model.model.load_weights(self.weights_file)
         ts = MCTS(init_board=board.copy(),
                   player=player,
                   policy_model=None,
                   value_model=value_model,
                   max_search=50,
                   min_search_time=0,
                   scene=Scene.TRAIN)
         ts.decay_t(episode=i)
         records, winner = self.simulate(ts, board, player)
         if records.length() == 0:
             continue
         self.record_queue.put(records)
         if i % 100 == 0:
             records.save('records/train/alpha0/1st_%03d_' % (i // 100))
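
Why _start() reseeds NumPy with the process id: forked simulator processes inherit the parent's RNG state, so without a per-process seed each worker would play out identical "random" games. A self-contained sketch of that pattern (the worker function below is a stub, not part of the original code):

import os
from multiprocessing import Process

import numpy as np

def report_draws():
    np.random.seed(os.getpid())   # per-process seed, as in _start()
    print(os.getpid(), [round(np.random.random(), 3) for _ in range(3)])

if __name__ == '__main__':
    workers = [Process(target=report_draws) for _ in range(2)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()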
Example #3
    def expansion(self):
        board, player = self.board, self.player
        actions = rule.valid_actions(board, player)
        # actions_ = list(filter(lambda a:(self.board_str, *a) not in walked, actions))
        # if len(actions_) == 0:
        # all of these have already been walked; fall back to the full action set
        # actions_ = actions
        if self.player == self.tree.player:
            with self.tree.value_model_lock:
                values = [
                    self.tree.value_model.q(board, from_, act)
                    for from_, act in actions
                ]
        else:
            with self.tree.opp_value_model_lock:
                values = [
                    self.tree.opp_value_model.q(board, from_, act)
                    for from_, act in actions
                ]

        probs = ValueNetwork.value_to_probs(values)
        for a, v, p in zip(actions, values, probs):
            e = Edge(upper_node=self, a=a, v=v, p=p, lambda_=self.tree.lambda_)
            self.add_edge(e)
        self.expanded = True
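
ValueNetwork.value_to_probs is not shown in this excerpt; a minimal stand-in (an assumption, not the repository's actual implementation) would be a temperature-scaled softmax over the predicted action values:

import numpy as np

def value_to_probs(values, t=1.0):
    # softmax with a max-shift for numerical stability; higher value -> higher probability
    v = np.asarray(values, dtype=float) / t
    e = np.exp(v - v.max())
    return e / e.sum()

print(value_to_probs([0.2, 0.5, 0.1]))  # probabilities that sum to 1.0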
Example #4
    def __init__(self,
                 stock_code,
                 chart_data,
                 training_data=None,
                 policy_model_path=None,
                 value_model_path=None,
                 lr=0.001,
                 discount_factor=0.5,
                 start_epsilon=0,
                 num_past_input=0,
                 load_weight_and_learn=False):
        self.stock_code = stock_code  # stock code
        self.chart_data = chart_data
        self.environment = Environment(chart_data)  # environment object
        # agent object
        self.agent = Agent(self.environment)
        self.training_data = training_data  # training data
        self.training_data_idx = -1
        self.state = None
        self.action_size = self.agent.NUM_ACTIONS
        self.discount_factor = discount_factor
        self.start_epsilon = start_epsilon
        self.num_past_input = num_past_input
        self.load_weight_and_learn = load_weight_and_learn

        # policy/value networks; input size = size of the training data #+ agent state size
        self.num_features = self.training_data.shape[1] * (
            1 + num_past_input)  #+ self.agent.STATE_DIM
        self.policy_network_obj = PolicyNetwork(
            input_dim=self.num_features,
            output_dim=self.agent.NUM_ACTIONS,
            lr=lr)
        if load_weight_and_learn is True:
            self.policy_network_obj.model = load_model(policy_model_path)
            self.policy_network = self.policy_network_obj.model
        else:
            self.policy_network = self.policy_network_obj.make_model()
        self.value_network_obj = ValueNetwork(input_dim=self.num_features,
                                              lr=lr)
        if load_weight_and_learn is True:
            self.value_network_obj.model = load_model(value_model_path)
            self.value_network = self.value_network_obj.model
        else:
            self.value_network = self.value_network_obj.make_model()
        self.policy_updater = self.policy_optimizer()
        self.value_updater = self.value_optimizer()
Example #5
    def __init__(self, alpha, input_size, output_size):

        self.buffer = Buffer()

        self.value_network = ValueNetwork(alpha,
                                          input_size=input_size,
                                          output_size=1)

        self.policy_network = PolicyNetwork(0.0001,
                                            input_size=input_size,
                                            output_size=output_size)

        self.old_policy_network = PolicyNetwork(0.0001,
                                                input_size=input_size,
                                                output_size=output_size)

        # store policy state
        self.buffer.store_parameters(self.policy_network.state_dict())

        self.avg_rewards = []
Example #6
    def __init__(self,
                 player,
                 init_board,
                 first_player,
                 policy_model,
                 value_model,
                 expansion_gate=50,
                 lambda_=0.5,
                 min_search=500,
                 max_search=10000,
                 min_search_time=15,
                 max_search_time=300,
                 scene=Scene.PLAY,
                 t=100,
                 t_decay=0.0002):
        self.player = player
        self.expansion_gate = expansion_gate
        self.lambda_ = lambda_
        self.min_search = min_search  # minimum number of searches
        self.max_search = max_search  # maximum number of searches (stop searching once exceeded)
        self.max_search_time = max_search_time  # maximum search time in seconds (stop searching once exceeded)
        self.min_search_time = min_search_time  # minimum search time in seconds (stop only once both the minimum search count and minimum search time are met)
        self.searching = False
        self.stop = False
        self.stop_event = Event()
        self.depth = 0
        self.n_node = 0
        self.n_search = 0
        self.scene = scene
        self.t = t  # training temperature, decayed gradually to control the move probabilities
        self._t = t
        self.t_decay = t_decay

        logger.info('value_model: %s', value_model)
        if isinstance(value_model, str):
            value_model = DQN(
                hidden_activation='relu', lr=0.001, weights_file=value_model
            ) if 'DQN' in value_model else ValueNetwork(
                hidden_activation='relu', lr=0.001, weights_file=value_model)
        # self.policy = value_model
        logger.info('value_model: %s', value_model)
        self.value_model = value_model
        self.value_model_lock = threading.Lock()
        # Q-values for simulating the opponent's moves are predicted by opp_value_model,
        # which learns the opponent's playing habits as the game progresses
        self.opp_value_model = type(value_model)(hidden_activation='relu',
                                                 lr=0.001)
        self.opp_value_model.copy(value_model)
        logger.info('opp_value_model: %s', self.opp_value_model)
        self.opp_value_model_lock = threading.Lock()
        self.root = Node(init_board, first_player, tree=self)
        self.predicted = set()  # moves already visited in the tree (board_str, from, act)
        self.root.expansion()
        self.opp_actions = {}  # positions the opponent has played, {board: {action: n}}
Example #7
def train():
    from multiprocessing import Queue, Lock
    import os
    record_queue = Queue()
    model_queue = Queue()
    weight_lock = Lock()
    weights_file = 'model/alpha0/weights'
    model_file = 'model/alpha0/value_network_00067h.model'
    begin = 6700
    _value_model = ValueNetwork(hidden_activation='selu',
                                output_activation='sigmoid',
                                model_file=model_file)
    value_model = ValueNetwork(hidden_activation='selu',
                               output_activation='sigmoid')
    value_model.copy(_value_model)

    if os.path.exists(weights_file):
        value_model.model.load_weights(weights_file)
    else:
        value_model.model.save_weights(weights_file)

    for _ in range(3):
        SimulateProcess(record_queue,
                        model_queue,
                        weight_lock,
                        weights_file,
                        init_board='random',
                        epsilon=1.0,
                        epsilon_decay=2e-3,
                        begin=begin // 3).start()

    for i in range(begin + 1, 2**32):
        records = record_queue.get()
        logger.info('train %s, records:%s', i, len(records))
        value_model.train(records, epochs=5)
        with weight_lock:
            value_model.model.save_weights(weights_file)
        if i % 100 == 0:
            value_model.save_model('model/alpha0/value_network_%05dh.model' %
                                   (i // 100))
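
A minimal sketch of the queue/lock layout that train() relies on: several simulator processes put finished game records onto record_queue while the trainer consumes them and saves updated weights under weight_lock. The worker below is a stub standing in for SimulateProcess, not the repository's implementation:

from multiprocessing import Lock, Process, Queue

def fake_simulator(record_queue, weight_lock, weights_file):
    for _ in range(2):
        with weight_lock:
            pass  # the real worker reloads the latest weights_file here
        record_queue.put(['placeholder-record'])

if __name__ == '__main__':
    record_queue, weight_lock = Queue(), Lock()
    workers = [Process(target=fake_simulator,
                       args=(record_queue, weight_lock, 'model/alpha0/weights'))
               for _ in range(3)]
    for w in workers:
        w.start()
    for _ in range(6):                # 3 workers x 2 records each
        records = record_queue.get()  # the trainer blocks until a batch arrives
        # value_model.train(records) and save_weights would happen here
    for w in workers:
        w.join()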
Example #8
 def load_model(self):
     return ValueNetwork(hidden_activation=self.hidden_activation,
                         output_activation='sigmoid',
                         model_file=self.model_file,
                         weights_file=self.weights_file)
Example #9
 def __init__(self):
     self.epsilon = EPSILON_START
     self.value_network = ValueNetwork()
Example #10
class TRPO(object):
    def __init__(self, alpha, input_size, output_size):

        self.buffer = Buffer()

        self.value_network = ValueNetwork(alpha,
                                          input_size=input_size,
                                          output_size=1)

        self.policy_network = PolicyNetwork(0.0001,
                                            input_size=input_size,
                                            output_size=output_size)

        self.old_policy_network = PolicyNetwork(0.0001,
                                                input_size=input_size,
                                                output_size=output_size)

        # store policy state
        self.buffer.store_parameters(self.policy_network.state_dict())

        self.avg_rewards = []

    def update(self, iter=80):

        observations = self.buffer.get_observations()

        #actions = self.buffer.get_actions()
        rewards = self.buffer.get_rewards()
        advantages = self.buffer.get_advantages()
        log_probs = self.buffer.get_log_probs()

        self.old_policy_network.load_state_dict(self.buffer.old_parameters)

        old_pred = self.old_policy_network.forward(observations)
        old_action_probabilities = torch.distributions.Categorical(old_pred)
        old_action = old_action_probabilities.sample()
        old_probs = old_action_probabilities.log_prob(old_action).reshape(
            -1, 1)
        self.buffer.store_parameters(self.policy_network.state_dict())

        self.policy_network.optimize(log_probs, old_probs, advantages)

        self.value_network.optimize(observations, rewards, iter=iter)

    def calculate_advantage(self):

        prev_observation = self.buffer.observation_buffer[-2]
        observation = self.buffer.observation_buffer[-1]

        v1 = self.value_network(prev_observation)
        v2 = self.value_network(observation)

        return self.buffer.reward_buffer[-1] + v2 - v1

    def act(self, observation):
        prediction = self.policy_network.forward(observation)
        action_probabilities = torch.distributions.Categorical(prediction)
        action = action_probabilities.sample()
        log_prob = action_probabilities.log_prob(action)
        self.buffer.store_log_prob(log_prob)
        return action.item(), log_prob

    def discount_rewards(self, step):
        for s in reversed(range(1, step + 1)):
            update = 0
            for k in reversed(range(1, s + 1)):
                update += self.buffer.reward_buffer[-k] * (0.99**k)
            self.buffer.reward_buffer[-s] += update

    def train(self, env, epochs=1000, steps=4000):

        plt.ion()

        for epoch in range(epochs):

            observation = env.reset()
            self.buffer.store_observation(observation)

            step = 0

            for step in range(steps):

                step += 1

                action, log_prob = self.act(observation)
                self.buffer.store_action(log_prob)

                observation, reward, done, info = env.step(action)

                self.buffer.store_reward(reward / 200 + observation[0] / 2 +
                                         (1 * observation[1])**2)

                #env.render()
                self.buffer.store_observation(observation)
                advantage = self.calculate_advantage()
                self.buffer.store_advantage(advantage)

                if done or step == steps - 1:
                    observation = env.reset()
                    self.discount_rewards(step)
                    step = 0

            self.update(iter=5)
            rwrd = self.buffer.get_rewards()
            self.avg_rewards.append((torch.sum(rwrd) / rwrd.shape[0]).numpy())
            self.buffer.clear_buffer()
            print("Average Reward: {}".format(self.avg_rewards[-1]))

            plt.title("Reward per Epoch")
            plt.xlabel("Epoch")
            plt.ylabel("Reward")
            plt.plot(self.avg_rewards, label="average reward")
            plt.legend(loc="upper left")
            plt.draw()
            plt.pause(0.0001)
            plt.clf()
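
A tiny numeric illustration of the one-step advantage that calculate_advantage() computes, A = r + V(s') - V(s); note that no discount factor is applied in that method. The numbers below are made up to stand in for the value network's outputs:

reward = 1.0
v_prev = 0.40   # V(s): value of the previous observation
v_next = 0.55   # V(s'): value of the current observation
advantage = reward + v_next - v_prev
print(advantage)  # 1.15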
Example #11
class PolicyLearner:
    def __init__(self,
                 stock_code,
                 chart_data,
                 training_data=None,
                 policy_model_path=None,
                 value_model_path=None,
                 lr=0.001,
                 discount_factor=0.5,
                 start_epsilon=0,
                 num_past_input=0,
                 load_weight_and_learn=False):
        self.stock_code = stock_code  # stock code
        self.chart_data = chart_data
        self.environment = Environment(chart_data)  # environment object
        # agent object
        self.agent = Agent(self.environment)
        self.training_data = training_data  # training data
        self.training_data_idx = -1
        self.state = None
        self.action_size = self.agent.NUM_ACTIONS
        self.discount_factor = discount_factor
        self.start_epsilon = start_epsilon
        self.num_past_input = num_past_input
        self.load_weight_and_learn = load_weight_and_learn

        # policy/value networks; input size = size of the training data #+ agent state size
        self.num_features = self.training_data.shape[1] * (
            1 + num_past_input)  #+ self.agent.STATE_DIM
        self.policy_network_obj = PolicyNetwork(
            input_dim=self.num_features,
            output_dim=self.agent.NUM_ACTIONS,
            lr=lr)
        if load_weight_and_learn is True:
            self.policy_network_obj.model = load_model(policy_model_path)
            self.policy_network = self.policy_network_obj.model
        else:
            self.policy_network = self.policy_network_obj.make_model()
        self.value_network_obj = ValueNetwork(input_dim=self.num_features,
                                              lr=lr)
        if load_weight_and_learn is True:
            self.value_network_obj.model = load_model(value_model_path)
            self.value_network = self.value_network_obj.model
        else:
            self.value_network = self.value_network_obj.make_model()
        self.policy_updater = self.policy_optimizer()
        self.value_updater = self.value_optimizer()

    # function that updates the policy network
    def policy_optimizer(self):
        action = K.placeholder(shape=[None, self.action_size])
        advantage = K.placeholder(shape=[None, ])

        action_prob = K.sum(action * self.policy_network.output, axis=1)
        cross_entropy = K.log(action_prob) * advantage
        loss = -K.sum(cross_entropy)

        optimizer = Nadam(lr=self.policy_network_obj.lr)
        updates = optimizer.get_updates(self.policy_network.trainable_weights,
                                        [], loss)
        train = K.function([self.policy_network.input, action, advantage], [],
                           updates=updates)
        return train

    # function that updates the value network
    def value_optimizer(self):
        target = K.placeholder(shape=[None, ])

        loss = K.mean(K.square(target - self.value_network.output))

        optimizer = Nadam(lr=self.value_network_obj.lr)
        updates = optimizer.get_updates(self.value_network.trainable_weights,
                                        [], loss)
        train = K.function([self.value_network.input, target], [],
                           updates=updates)

        return train

    # update the policy and value networks at every time step
    def train_model(self, state, action, action_idx, reward, next_state, done):
        value = self.value_network_obj.predict(state)[0]
        #next_value = self.value_network_obj.predict(next_state)[0]
        next_value = 0

        act = np.zeros([1, self.action_size])
        #action_idx = np.random.choice(self.action_size, 1, p=action)[0]
        act[0][action_idx] = 1

        # advantage and update target via the Bellman expectation equation
        if done:
            advantage = reward - value
            target = [reward]
        else:
            advantage = (reward + self.discount_factor * next_value) - value
            target = [reward + self.discount_factor * next_value]

        state_list = [state]
        advantage_list = [advantage]
        self.policy_updater([state_list, act, advantage_list])
        self.value_updater([state_list, target])

    def reset(self):
        self.sample = None
        self.training_data_idx = -1 + self.num_past_input

    def trade(self, model_path=None, balance=2000000):
        if model_path is None:
            return
        self.policy_network_obj.model = load_model(model_path)
        self.fit(balance=balance, num_epoches=1, learning=False)

    def fit(self, num_epoches=1000, balance=10000000, learning=True):
        logger.info("LR: {lr}, DF: {discount_factor}, "
                    "TU: all-in, "
                    "DRT: only-use immediate reward".format(
                        lr=self.policy_network_obj.lr,
                        discount_factor=self.discount_factor))

        # set the agent's initial capital
        self.agent.set_balance(balance)

        # initialize training statistics
        max_portfolio_value = 0
        epoch_win_cnt = 0
        portfolio_repeat_cnt = 0
        exploration = False
        episode_results, episodes = [], []
        pylab.clf()
        # training loop
        for epoch in range(num_epoches):
            start_time = time.time()
            # initialize per-epoch statistics
            previous_portfolio_value = self.agent.portfolio_value
            loss = 0.
            itr_cnt = 0
            win_cnt = 0
            exploration_cnt = 0
            batch_size = 0
            pos_learning_cnt = 0
            neg_learning_cnt = 0

            # reset the environment, the agent, and the networks
            self.environment.reset(self.num_past_input)
            self.agent.reset()
            self.policy_network_obj.reset()
            self.value_network_obj.reset()
            self.reset()

            self.environment.observe()
            self.training_data_idx += 1
            self.state = []
            for i in range(self.num_past_input + 1):
                self.state.extend(
                    self.training_data.iloc[self.training_data_idx -
                                            i].tolist())
            #self.state.extend(self.agent.get_states())
            done = False
            # reduce the exploration ratio as training progresses
            if learning:
                epsilon = self.start_epsilon * (1. - float(epoch) /
                                                (num_epoches - 1))
            else:
                epsilon = 0
            # start training
            while True:
                # decide the action using the policy network
                self.action = self.agent.decide_action(self.policy_network_obj,
                                                       self.state)
                # perform the chosen action and receive the immediate reward
                immediate_reward, exploration, action_idx = self.agent.act(
                    self.action, epsilon)
                if exploration:
                    self.action[self.agent.rand_action] = self.agent.confidence
                    for i in range(self.action_size):
                        if i != self.agent.rand_action:
                            self.action[i] = 1 - self.agent.confidence

                # when running in non-learning mode
                if learning is False:
                    print(self.environment.chart_data.iloc[
                        self.environment.idx]['date'])
                    print(self.action)

                # update iteration statistics
                itr_cnt += 1
                win_cnt += 1 if immediate_reward > 0 else 0
                exploration_cnt += 1 if exploration is True else 0

                # build the next-state data
                state = self.state  # keep the current self.state in state
                action = self.action
                observation = self.environment.observe()
                if observation is not None:
                    self.training_data_idx += 1
                    self.state = []
                    for i in range(self.num_past_input + 1):
                        self.state.extend(
                            self.training_data.iloc[self.training_data_idx -
                                                    i].tolist())
                    #self.state = self.training_data.iloc[self.training_data_idx].tolist()
                    #self.state.extend(self.agent.get_states())
                    next_state = self.state
                else:
                    break
                # when learning and the action was not random exploration
                if learning and (exploration is False):
                    if immediate_reward > 0:
                        pos_learning_cnt += 1
                    else:
                        neg_learning_cnt += 1
                    # update the policy network
                    self.train_model(state, action, action_idx,
                                     immediate_reward, next_state, done)

            # display per-epoch information
            print("epoch:", epoch + 1, " / sequence:", itr_cnt,
                  " / portfolio_value:", self.agent.portfolio_value)
            epoch_time = time.time() - start_time
            remain_time = epoch_time * (num_epoches - (epoch + 1))
            print("epoch_time: %s second" % (round(epoch_time, 2)),
                  " / remain_time: %s hour" % (round(remain_time / 3600, 2)))
            if epoch_time > 1:  # only draw the plot when an epoch takes more than one second
                episode_results.append(self.agent.portfolio_value)
                episodes.append(epoch + 1)
                pylab.plot(episodes, episode_results, 'b')
                if not os.path.isdir("./save_graph"):
                    os.makedirs("./save_graph")
                pylab.savefig("./save_graph/result.png")

            # update training statistics
            max_portfolio_value = max(max_portfolio_value,
                                      self.agent.portfolio_value)
            if self.agent.portfolio_value > self.agent.initial_balance:
                epoch_win_cnt += 1

            # decide whether to end training early
            #if previous_portfolio_value == self.agent.portfolio_value:
            #    portfolio_repeat_cnt += 1
            #else:
            #    portfolio_repeat_cnt = 0
            #if portfolio_repeat_cnt == 10:
            #    break

        # log the training summary
        logger.info("Max PV: %s, \t # Win: %d" % (locale.currency(
            max_portfolio_value, grouping=True), epoch_win_cnt))
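
A tiny numeric illustration of the advantage and target computed in train_model() above, using made-up numbers (discount_factor defaults to 0.5, and next_value is currently fixed to 0 in that method, so the non-terminal case collapses to the immediate reward):

reward = 0.02
value = 0.10              # value network's prediction for the current state
discount_factor = 0.5
next_value = 0.0          # train_model() hard-codes this to 0
advantage = (reward + discount_factor * next_value) - value   # -0.08
target = [reward + discount_factor * next_value]              # [0.02]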
Example #12
#...... Hyperparameters

BATCHSIZE        = 16
TRAJECTORIES     = 100
lr               = 1e-3
EPOCHS           = 3000
HORIZON          = 100
DECAY            = 0
PRECISION        = 1e-9
MAXITERS         = 1000
DEVICE           = 'cpu'


#....... Initialize an empty net

neural_net = ValueNetwork(input_dims=3, fc1_dims=18, fc2_dims=10, fc3_dims=1, activation=nn.Tanh(), device=DEVICE)

#...... Generate Dataset
starting_points         = Datagen.griddedData(n_points=TRAJECTORIES)

x_train, y_train         = [], []
for starting_point in starting_points:
    model               = crocoddyl.ActionModelUnicycle()
    model.costWeights   = np.array([1.,1.]).T
    problem             = crocoddyl.ShootingProblem(starting_point.T, [model]*HORIZON, model)
    ddp                 = crocoddyl.SolverDDP(problem)
    ddp.th_stop         = PRECISION
    ddp.solve([], [], MAXITERS)
    xs = np.array(ddp.xs).tolist()
    for node in xs:
        x_train.append(node)
Example #13
action_list = [0, 1, 2, 3]

maze = env.simple_maze(dims, start, goal, action_list)

state_dim = len(maze.state_dim)
action_dim = 1
action_size = len(maze.action_space)

BATCH_SIZE = 3
TAU = 0.001
LRA = 0.0001
LRC = 0.001
gamma = 0.9

actor = PolicyNetwork(3, 6)
critic = ValueNetwork(3, 6)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    for episode in range(100):
        R = 0
        state = maze.start
        maze.state = state

        # get critic output and sample action
        state, _, _ = state_action_processing(state)
        scores = actor.predict(state)[0]
        print(scores)

        action = np.where(np.random.multinomial(1, scores))[0][0]
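
A small self-contained illustration of the sampling line above: np.random.multinomial with n=1 returns a one-hot vector, and np.where recovers the index of the 1, so the action is drawn with the probabilities given by the actor's scores. The scores below are hypothetical:

import numpy as np

scores = np.array([0.1, 0.2, 0.3, 0.4])                    # must sum to 1
action = np.where(np.random.multinomial(1, scores))[0][0]
print(action)                                              # an index in {0, 1, 2, 3}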