示例#1
0
 def init_value_network(self,
                        shared_network=None,
                        activation='linear',
                        loss='mse'):
     """Create the value network selected by ``self.net`` and optionally reload it.

     ``self.net`` picks the class: ``'dnn'`` -> ``DNN``, ``'lstm'`` ->
     ``LSTMNetwork``, ``'cnn'`` -> ``CNN``.  Any other value leaves
     ``self.value_network`` untouched.  The LSTM and CNN variants also
     receive ``num_steps``.

     Args:
         shared_network: forwarded unchanged to the network constructor.
         activation: forwarded unchanged to the network constructor.
         loss: forwarded unchanged to the network constructor.
     """
     net_kind = self.net
     if net_kind == 'dnn':
         network_cls, takes_num_steps = DNN, False
     elif net_kind == 'lstm':
         network_cls, takes_num_steps = LSTMNetwork, True
     elif net_kind == 'cnn':
         network_cls, takes_num_steps = CNN, True
     else:
         network_cls, takes_num_steps = None, False

     if network_cls is not None:
         # Attributes are only read once a known network kind was selected.
         options = dict(input_dim=self.num_features,
                        output_dim=self.agent.NUM_ACTIONS,
                        lr=self.lr,
                        shared_network=shared_network,
                        activation=activation,
                        loss=loss)
         if takes_num_steps:
             options['num_steps'] = self.num_steps
         self.value_network = network_cls(**options)

     # If reuse_models is set and something exists at value_network_path,
     # load the saved network model from there.
     should_reload = self.reuse_models and \
         os.path.exists(self.value_network_path)
     if should_reload:
         self.value_network.load_model(model_path=self.value_network_path)
示例#2
0
文件: viar.py 项目: GedamuA/VIAR
def build_models(args, device='cuda'):
    """Instantiate every sub-network and move each one to ``device``.

    Args:
        args: options object; ``encoder_cnn_model``, ``encoder_hid_size``
            and ``disable_grl`` are read from it.
        device: target handed to each module's ``.to()`` call.

    Returns:
        dict mapping the names ``'encodercnn'``, ``'encoder'``,
        ``'crossviewdecodercnn'``, ``'crossviewdecoder'``,
        ``'reconstructiondecoder'`` and ``'viewclassifier'`` to the
        constructed modules.
    """
    rgb_cnn = CNN(input_shape=RGB_INPUT_SHAPE,
                  model_name=args.encoder_cnn_model).to(device)

    encoder = Encoder(input_shape=rgb_cnn.out_size,
                      encoder_block='convbilstm',
                      hidden_size=args.encoder_hid_size).to(device)

    depth_cnn = CNN(input_shape=DEPTH_INPUT_SHAPE,
                    model_name=args.encoder_cnn_model,
                    input_channel=1).to(device)

    # The cross-view decoder input is the depth CNN output shape with its
    # first dimension tripled.
    decoder_shape = list(depth_cnn.out_size)
    decoder_shape[0] *= 3
    cross_view_decoder = CrossViewDecoder(
        input_shape=torch.Size(decoder_shape)).to(device)

    reconstruction_decoder = ReconstructionDecoder(
        input_shape=encoder.out_size[1:]).to(device)

    # The classifier input size is the product of the encoder output
    # dimensions after the first one.
    flat_feature_count = reduce(operator.mul, encoder.out_size[1:])
    view_classifier = ViewClassifier(
        input_size=flat_feature_count,
        num_classes=5,
        reverse=(not args.disable_grl)).to(device)

    return {
        'encodercnn': rgb_cnn,
        'encoder': encoder,
        'crossviewdecodercnn': depth_cnn,
        'crossviewdecoder': cross_view_decoder,
        'reconstructiondecoder': reconstruction_decoder,
        'viewclassifier': view_classifier,
    }
示例#3
0
 def init_policy_network(self,
                         shared_network=None,
                         activation='sigmoid',
                         loss='binary_crossentropy'):
     """Create the policy network selected by ``self.net`` and optionally reload it.

     ``self.net`` picks the class: ``'dnn'`` -> ``DNN``, ``'lstm'`` ->
     ``LSTMNetwork``, ``'cnn'`` -> ``CNN``.  Any other value leaves
     ``self.policy_network`` untouched.  The LSTM and CNN variants also
     receive ``num_steps``.

     Args:
         shared_network: forwarded unchanged to the network constructor.
         activation: forwarded unchanged to the network constructor.
         loss: forwarded unchanged to the network constructor.
     """
     net_kind = self.net
     if net_kind == 'dnn':
         network_cls, takes_num_steps = DNN, False
     elif net_kind == 'lstm':
         network_cls, takes_num_steps = LSTMNetwork, True
     elif net_kind == 'cnn':
         network_cls, takes_num_steps = CNN, True
     else:
         network_cls, takes_num_steps = None, False

     if network_cls is not None:
         # Attributes are only read once a known network kind was selected.
         options = dict(input_dim=self.num_features,
                        output_dim=self.agent.NUM_ACTIONS,
                        lr=self.lr,
                        shared_network=shared_network,
                        activation=activation,
                        loss=loss)
         if takes_num_steps:
             options['num_steps'] = self.num_steps
         self.policy_network = network_cls(**options)

     # Reload previously saved weights when requested and available.
     should_reload = self.reuse_models and \
         os.path.exists(self.policy_network_path)
     if should_reload:
         self.policy_network.load_model(model_path=self.policy_network_path)
示例#4
0
    def __init__(self,
                 n_actions,
                 input_shape,
                 save_path=None,
                 load_path=None,
                 action_inverses=None,
                 update_interval=256 * 2,
                 save_interval=5):
        """Set up the CNN-backed agent state.

        Args:
            n_actions: number of actions; also the ``n_out`` of the CNN.
            input_shape: forwarded to the CNN and stored as ``n_features``.
            save_path: stored on the instance as ``save_path``.
            load_path: when not ``None``, passed to ``network.load``.
            action_inverses: mapping stored as ``action_inverses``.  A new
                empty dict is created per instance when omitted, so that
                instances never share (and mutate) one default dict.
            update_interval: stored as ``update_interval``.
            save_interval: stored as ``save_interval``.
        """
        self.n_actions = n_actions
        self.save_path = save_path
        self.network = CNN(n_out=n_actions, input_shape=input_shape)

        # Identity check: ``!= None`` would invoke a custom ``__ne__``.
        if load_path is not None:
            self.network.load(load_path)

        self.n_features = input_shape
        self.data = []
        self.current_episode_count = 1

        self.random_actions = 0

        self.last_action = None
        self.last_action_random = False

        self.action_inverses = {} if action_inverses is None \
            else action_inverses

        self.lifetime = 1
        self.update_interval = update_interval
        self.save_interval = save_interval
        self.n_updates = 1
示例#5
0
    def do_stuff(opt):
        """Train a fresh network with the optimizer named ``opt``.

        Builds a ``CNN`` when ``args.dataset`` is ``'cifar'`` and an ``MLP``
        otherwise, creates and wraps the optimizer through ``misc``, and
        returns whatever ``fit`` returns.
        """
        print(f'\nTraining {opt} for {args.num_epochs} epochs...')
        if args.dataset == 'cifar':
            net = CNN()
        else:
            net = MLP()

        # Only the second element of the split (the keyword arguments) is used.
        _unused, optim_kwargs = misc.split_optim_dict(misc.optim_dict[opt])
        make_optimizer = misc.task_to_optimizer(opt)
        base_optimizer = make_optimizer(params=net.parameters(),
                                        **optim_kwargs)
        wrapped_optimizer = misc.wrap_optimizer(opt, base_optimizer)

        return fit(net,
                   data,
                   wrapped_optimizer,
                   num_epochs=args.num_epochs,
                   lr_schedule=True)
示例#6
0
    def do_stuff(opt):
        """Train a fresh network with the optimizer named ``opt``.

        Builds a ``CNN`` when ``args.dataset`` is ``'cifar'`` and an ``MLP``
        otherwise.  When ``opt`` contains ``'lookahead'`` (case-insensitive)
        the optimizer is wrapped in ``optimizers.Lookahead``.  Returns
        whatever ``fit`` returns.
        """
        print(f'\nTraining {opt} for {args.num_epochs} epochs...')
        if args.dataset == 'cifar':
            net = CNN()
        else:
            net = MLP()

        # Only the second element of the split (the keyword arguments) is used.
        _unused, optim_kwargs = misc.split_optim_dict(misc.optim_dict[opt])
        make_optimizer = misc.task_to_optimizer(opt)
        optimizer = make_optimizer(params=net.parameters(), **optim_kwargs)

        wants_lookahead = 'lookahead' in opt.lower()
        if wants_lookahead:
            optimizer = optimizers.Lookahead(optimizer, k=5, alpha=0.5)

        return fit(net,
                   data,
                   optimizer,
                   num_epochs=args.num_epochs,
                   lr_schedule=True)
示例#7
0
 def define_agent(self, width, height, num_actions):
     """Return an ``NStepDQNAgent`` configured for a ``width`` x ``height`` grid.

     Args:
         width: forwarded to ``LayerEncoder``.
         height: forwarded to ``LayerEncoder``.
         num_actions: number of actions given to the agent ``Config``.
     """
     state_encoder = LayerEncoder(width, height, treasure_position=True)
     shared_optimizer = SharedAdamOptimizer(0.001)
     q_network = CNN(hidden_units=[128])
     # NOTE(review): arguments are presumably start epsilon, final epsilon
     # and decay steps -- confirm against EpsilonGreedyPolicy.
     exploration_policy = EpsilonGreedyPolicy(1, 0.01, 25000)

     agent_config = Config(
         num_actions=num_actions,
         encoder=state_encoder,
         optimizer=shared_optimizer,
         network=q_network,
         policy=exploration_policy,
         discount=0.95,
         n_step=16)
     return NStepDQNAgent(config=agent_config)
示例#8
0
 def define_agent(self, width, height, num_actions):
     """Return a double-Q ``DQNAgent`` configured for a ``width`` x ``height`` grid.

     Args:
         width: forwarded to ``LayerEncoder``.
         height: forwarded to ``LayerEncoder``.
         num_actions: number of actions given to the agent ``Config``.
     """
     state_encoder = LayerEncoder(width, height, treasure_position=True)
     adam = AdamOptimizer(0.001)
     q_network = CNN(hidden_units=[128])
     # NOTE(review): arguments are presumably start epsilon, final epsilon
     # and decay steps -- confirm against EpsilonGreedyPolicy.
     exploration_policy = EpsilonGreedyPolicy(1, 0.01, 50000)

     agent_config = Config(
         num_actions=num_actions,
         encoder=state_encoder,
         optimizer=adam,
         network=q_network,
         policy=exploration_policy,
         discount=0.95,
         capacity=10000,
         batch_size=8,
         target_sync=100,
         double_q=True)
     return DQNAgent(config=agent_config)
示例#9
0
    def init_value_network(self,
                           shared_network=None,
                           activation='linear',
                           loss='mse'):
        """Create the critic / value network and optionally reload saved weights.

        When ``self.rl_method`` is ``'ddpg'`` a ``CriticNetwork`` is stored
        in ``self.critic``.  Otherwise ``self.net`` selects ``DNN``
        (``'dnn'``), ``LSTMNetwork`` (``'lstm'``) or ``CNN`` (``'cnn'``),
        which is stored in ``self.value_network``.  Any other ``self.net``
        value leaves ``self.value_network`` untouched.

        Args:
            shared_network: forwarded to the DNN/LSTM/CNN constructors
                (not used by the DDPG critic).
            activation: forwarded unchanged to the network constructor.
            loss: forwarded unchanged to the network constructor.
        """
        if self.rl_method == 'ddpg':
            self.critic = CriticNetwork(input_dim=self.num_features,
                                        output_dim=self.agent.NUM_ACTIONS,
                                        num_steps=self.num_steps,
                                        activation=activation,
                                        loss=loss,
                                        lr=self.lr)
        elif self.net == 'dnn':
            self.value_network = DNN(input_dim=self.num_features,
                                     output_dim=self.agent.NUM_ACTIONS,
                                     lr=self.lr,
                                     shared_network=shared_network,
                                     activation=activation,
                                     loss=loss)
        elif self.net == 'lstm':
            self.value_network = LSTMNetwork(input_dim=self.num_features,
                                             output_dim=self.agent.NUM_ACTIONS,
                                             lr=self.lr,
                                             num_steps=self.num_steps,
                                             shared_network=shared_network,
                                             activation=activation,
                                             loss=loss)
        elif self.net == 'cnn':
            # A second, identical ``elif self.net == 'cnn'`` branch used to
            # follow this one; it could never run and has been removed.
            self.value_network = CNN(input_dim=self.num_features,
                                     output_dim=self.agent.NUM_ACTIONS,
                                     lr=self.lr,
                                     num_steps=self.num_steps,
                                     shared_network=shared_network,
                                     activation=activation,
                                     loss=loss)

        # Only reload when there is a value network to load into: the DDPG
        # branch (and an unrecognised ``self.net``) assigns none, and calling
        # ``load_model`` on ``None`` would raise AttributeError.
        value_network = getattr(self, 'value_network', None)
        if self.reuse_models and \
                os.path.exists(self.value_network_path) and \
                value_network is not None:
            value_network.load_model(model_path=self.value_network_path)
示例#10
0
                labels.append(label.item())
            p, r, f, _ = report(labels, preds2)
            ps_2 += [p]
            rs_2 += [r]
            fs_2 += [f]
            # '''
            print('1.5+ PRF on', name, ':')
            print('\tprecision:', p)
            print('\trecall:', r)
            print('\tf score:', f)
            # '''
    return ps_1, rs_1, fs_1, ps_2, rs_2, fs_2


if __name__ == '__main__':
    # Script entry point: load the CNN checkpoint of a fixed epoch and run
    # the test pass (training is currently disabled).
    cnn = CNN('./cnn_config_1.5T_optimal.json', 0)
    #valid_optimal_accu = cnn.train()
    cnn.epoch = 146
    checkpoint_file = '{}CNN_{}.pth'.format(cnn.checkpoint_dir, cnn.epoch)
    saved_state = torch.load(checkpoint_file)
    cnn.model.load_state_dict(saved_state)
    cnn.test()
    # valid = 89.02
    # test  = 84.15

    # gan = GAN('./gan_config_optimal.json', 0)
    #gan.train()
    #'''
    # gan.optimal_epoch=105
    # gan.netG.load_state_dict(torch.load('{}G_{}.pth'.format(gan.checkpoint_dir, gan.optimal_epoch)))
    # print(gan.valid_model_epoch())
    #gan.test(False)
    # gan.eval_iqa(zoom=False, metric='brisque')
示例#11
0
class ReinforcementLearner:
    """Base class that drives the epoch loop of a trading RL agent.

    It owns the environment, the agent, the value/policy (critic/actor)
    networks, the visualizer and the per-epoch memory buffers.  Subclasses
    provide ``get_batch`` and ``train``.

    NOTE(review): ``run`` and ``update_networks`` use ``self.critic`` and
    ``self.actor``, while the non-DDPG branches of ``init_value_network`` /
    ``init_policy_network`` only assign ``self.value_network`` /
    ``self.policy_network``.  ``value_network_path`` and
    ``policy_network_path`` are read but never assigned in this class.
    Confirm that subclasses set all of these.
    """
    # NOTE(review): ``__metaclass__`` is only honoured by Python 2; under
    # Python 3 it is a plain class attribute and the ``abstractmethod``
    # markers below are not enforced.  Kept unchanged so that existing code
    # instantiating this class directly keeps working.
    __metaclass__ = abc.ABCMeta
    lock = threading.Lock()

    def __init__(self,
                 rl_method='rl',
                 stock_code=None,
                 chart_data=None,
                 training_data=None,
                 min_trading_unit=1,
                 max_trading_unit=2,
                 delayed_reward_threshold=.05,
                 net='dnn',
                 num_steps=1,
                 lr=0.001,
                 value_network=None,
                 policy_network=None,
                 output_path='',
                 reuse_models=True):
        """Store the configuration and create environment, agent and buffers.

        Args:
            rl_method: name of the RL method; ``'ddpg'`` switches the
                ``init_*`` methods to actor/critic networks.
            stock_code: identifier used in log lines and output folders.
            chart_data: passed to ``Environment``; its length sets the
                visualizer range in ``reset``.
            training_data: optional table; when given, ``shape[1]`` is added
                to the feature count.
            min_trading_unit: must be > 0; forwarded to ``Agent``.
            max_trading_unit: must be > 0 and >= ``min_trading_unit``.
            delayed_reward_threshold: forwarded to ``Agent``.
            net: network kind, one of ``'dnn'``, ``'lstm'``, ``'cnn'``.
            num_steps: must be > 0; window length of the sample queue.
            lr: must be > 0; learning rate given to the networks.
            value_network: pre-built value network; also stored as critic.
            policy_network: pre-built policy network; also stored as actor.
            output_path: base directory for the epoch summary folder.
            reuse_models: when truthy, ``init_*`` reload saved models.

        Raises:
            AssertionError: if one of the numeric constraints is violated.
        """
        # Validate arguments
        assert min_trading_unit > 0
        assert max_trading_unit > 0
        assert max_trading_unit >= min_trading_unit
        assert num_steps > 0
        assert lr > 0
        # Reinforcement-learning method
        self.rl_method = rl_method
        # Environment
        self.stock_code = stock_code
        self.chart_data = chart_data
        self.environment = Environment(chart_data)
        # Agent
        self.agent = Agent(self.environment,
                           min_trading_unit=min_trading_unit,
                           max_trading_unit=max_trading_unit,
                           delayed_reward_threshold=delayed_reward_threshold)
        # Training data
        self.training_data = training_data
        self.sample = None
        self.training_data_idx = -1
        # Feature vector size = training-data vector size + agent state size
        self.num_features = self.agent.STATE_DIM
        if self.training_data is not None:
            self.num_features += self.training_data.shape[1]
        # Network settings
        self.net = net
        self.num_steps = num_steps
        self.lr = lr
        self.value_network = value_network
        self.policy_network = policy_network
        self.reuse_models = reuse_models
        self.critic = value_network
        self.actor = policy_network
        self.tau = 0.001
        # Visualization module
        self.visualizer = Visualizer()
        # Memory buffers
        self.memory_sample = []
        self.memory_action = []
        self.memory_reward = []
        self.memory_value = []
        self.memory_policy = []
        self.memory_target_policy = []
        self.memory_target_value = []
        self.memory_target_action = []
        self.memory_pv = []
        self.memory_num_stocks = []
        self.memory_exp_idx = []
        self.memory_learning_idx = []
        # Epoch-related counters
        self.loss = 0.
        self.itr_cnt = 0
        self.exploration_cnt = 0
        self.batch_size = 0
        self.learning_cnt = 0
        # Output path for logs and other artefacts
        self.output_path = output_path

    def _build_network(self, shared_network, activation, loss):
        """Instantiate the network class selected by ``self.net``.

        Returns ``None`` when ``self.net`` is not ``'dnn'``, ``'lstm'`` or
        ``'cnn'``.  The LSTM and CNN variants also receive ``num_steps``.
        """
        if self.net == 'dnn':
            return DNN(input_dim=self.num_features,
                       output_dim=self.agent.NUM_ACTIONS,
                       lr=self.lr,
                       shared_network=shared_network,
                       activation=activation,
                       loss=loss)
        if self.net == 'lstm':
            network_cls = LSTMNetwork
        elif self.net == 'cnn':
            network_cls = CNN
        else:
            return None
        return network_cls(input_dim=self.num_features,
                           output_dim=self.agent.NUM_ACTIONS,
                           lr=self.lr,
                           num_steps=self.num_steps,
                           shared_network=shared_network,
                           activation=activation,
                           loss=loss)

    @staticmethod
    def _decayed_epsilon(start_epsilon, epoch, num_epoches):
        """Return the exploration rate for ``epoch`` (0-based).

        Epsilon falls linearly from ``start_epsilon`` at epoch 0 to 0 at the
        last epoch.  With a single epoch there is nothing to anneal across,
        so ``start_epsilon`` is returned instead of dividing by zero.
        """
        if num_epoches <= 1:
            return start_epsilon * 1.
        return start_epsilon * (1. - float(epoch) / (num_epoches - 1))

    def init_policy_network(self,
                            shared_network=None,
                            activation='sigmoid',
                            loss='binary_crossentropy'):
        """Create the actor / policy network and optionally reload saved weights.

        With ``rl_method == 'ddpg'`` an ``ActorNetwork`` is stored in
        ``self.actor``; otherwise the network chosen by ``self.net`` is
        stored in ``self.policy_network`` (left untouched for an unknown
        ``self.net``).
        """
        if self.rl_method == 'ddpg':
            print("actor")
            self.actor = ActorNetwork(input_dim=self.num_features,
                                      output_dim=self.agent.NUM_ACTIONS,
                                      num_steps=self.num_steps,
                                      activation=activation,
                                      loss=loss,
                                      lr=self.lr)
            print(self.actor)
        else:
            network = self._build_network(shared_network, activation, loss)
            if network is not None:
                self.policy_network = network
        # Skip the reload when no policy network exists (DDPG or unknown
        # ``net``): calling ``load_model`` on ``None`` would raise.
        if self.reuse_models and \
                self.policy_network is not None and \
                self.policy_network_path is not None and \
                os.path.exists(self.policy_network_path):
            self.policy_network.load_model(model_path=self.policy_network_path)

    def init_value_network(self,
                           shared_network=None,
                           activation='linear',
                           loss='mse'):
        """Create the critic / value network and optionally reload saved weights.

        With ``rl_method == 'ddpg'`` a ``CriticNetwork`` is stored in
        ``self.critic``; otherwise the network chosen by ``self.net`` is
        stored in ``self.value_network`` (left untouched for an unknown
        ``self.net``).
        """
        if self.rl_method == 'ddpg':
            self.critic = CriticNetwork(input_dim=self.num_features,
                                        output_dim=self.agent.NUM_ACTIONS,
                                        num_steps=self.num_steps,
                                        activation=activation,
                                        loss=loss,
                                        lr=self.lr)
        else:
            network = self._build_network(shared_network, activation, loss)
            if network is not None:
                self.value_network = network

        # Skip the reload when no value network exists (DDPG or unknown
        # ``net``): calling ``load_model`` on ``None`` would raise.
        if self.reuse_models and \
                self.value_network is not None and \
                self.value_network_path is not None and \
                os.path.exists(self.value_network_path):
            self.value_network.load_model(model_path=self.value_network_path)

    def reset(self):
        """Reset environment, agent, visualizer, memory and epoch counters."""
        self.sample = None
        self.training_data_idx = -1
        # Reset the environment
        self.environment.reset()
        # Reset the agent
        self.agent.reset()
        # Reset the visualizer
        self.visualizer.clear([0, len(self.chart_data)])
        # Reset the memory buffers
        self.memory_sample = []
        self.memory_action = []
        self.memory_target_policy = []
        self.memory_target_value = []
        self.memory_target_action = []
        self.memory_reward = []
        self.memory_value = []
        self.memory_policy = []
        self.memory_pv = []
        self.memory_num_stocks = []
        self.memory_exp_idx = []
        self.memory_learning_idx = []
        # Reset the epoch-related counters
        self.loss = 0.
        self.itr_cnt = 0
        self.exploration_cnt = 0
        self.batch_size = 0
        self.learning_cnt = 0

    def build_sample(self):
        """Advance to the next training row and return it with the agent state.

        Returns:
            The next training row as a list extended with
            ``self.agent.get_states()``, or ``None`` once the training data
            is exhausted.
        """
        self.environment.observe()
        if len(self.training_data) > self.training_data_idx + 1:
            self.training_data_idx += 1
            self.sample = self.training_data.iloc[
                self.training_data_idx].tolist()
            self.sample.extend(self.agent.get_states())
            return self.sample
        return None

    @abc.abstractmethod
    def get_batch(self, batch_size, delayed_reward, discount_factor):
        """Return ``(x, y_value, y_policy)`` for one update (subclass hook)."""
        pass

    @abc.abstractmethod
    def train(self, batch_size, delayed_reward, discount_factor):
        """Subclass hook for method-specific training."""
        pass

    def update_networks(self, batch_size, delayed_reward, discount_factor):
        """Train critic and actor on one batch and return the summed loss.

        Returns:
            The accumulated loss, or ``None`` when the batch is empty.
        """
        # Build the batch of training data
        x, y_value, y_policy = self.get_batch(batch_size, delayed_reward,
                                              discount_factor)
        if len(x) > 0:
            loss = 0
            if y_value is not None:
                # Update the value network
                loss += self.critic.train_on_batch(x, y_value)
                self.critic.transfer_weights()
            if y_policy is not None:
                # Update the policy network
                loss += self.actor.train_on_batch(x, y_policy)
                self.actor.transfer_weights()
            return loss
        return None

    def fit(self, delayed_reward, discount_factor, full=False):
        """Run one network update over the pending (or the full) batch.

        Args:
            delayed_reward: reward forwarded to ``update_networks``.
            discount_factor: discount forwarded to ``update_networks``.
            full: when true, use every remembered reward as the batch
                instead of the steps accumulated since the last update.
        """
        batch_size = len(self.memory_reward) if full \
            else self.batch_size
        # Build the batch and update the networks
        if batch_size > 0:
            _loss = self.update_networks(batch_size, delayed_reward,
                                         discount_factor)
            if _loss is not None:
                self.loss += abs(_loss)
                self.learning_cnt += 1
                self.memory_learning_idx.append(self.training_data_idx)
            self.batch_size = 0

    def visualize(self, epoch_str, num_epoches, epsilon):
        """Plot the finished epoch and save it into ``self.epoch_summary_dir``.

        The memory lists are left-padded with ``num_steps - 1`` placeholder
        entries (hold action, zero stocks, NaN outputs, initial balance) for
        the steps consumed while the sample queue was filling up.
        """
        pad = self.num_steps - 1
        self.memory_action = [Agent.ACTION_HOLD] * pad + self.memory_action
        self.memory_num_stocks = [0] * pad + self.memory_num_stocks
        if self.value_network is not None:
            self.memory_value = \
                [np.array([np.nan] * len(Agent.ACTIONS))] * pad \
                + self.memory_value
        if self.policy_network is not None:
            self.memory_policy = \
                [np.array([np.nan] * len(Agent.ACTIONS))] * pad \
                + self.memory_policy
        self.memory_pv = [self.agent.initial_balance] * pad + self.memory_pv
        self.visualizer.plot(
            epoch_str=epoch_str,
            num_epoches=num_epoches,
            epsilon=epsilon,
            action_list=Agent.ACTIONS,
            actions=self.memory_action,
            num_stocks=self.memory_num_stocks,
            outvals_value=self.memory_value,
            outvals_policy=self.memory_policy,
            exps=self.memory_exp_idx,
            learning_idxes=self.memory_learning_idx,
            initial_balance=self.agent.initial_balance,
            pvs=self.memory_pv,
        )
        self.visualizer.save(
            os.path.join(self.epoch_summary_dir,
                         'epoch_summary_{}.png'.format(epoch_str)))

    def run(self,
            num_epoches=100,
            balance=10000000,
            discount_factor=0.9,
            start_epsilon=0.5,
            learning=True):
        """Run the epoch loop: act, remember, learn, log and visualize.

        Args:
            num_epoches: number of epochs; a single epoch is supported and
                runs with ``start_epsilon``.
            balance: initial balance given to the agent.
            discount_factor: discount forwarded to ``fit``.
            start_epsilon: initial exploration rate; annealed linearly to 0
                over the epochs when ``learning`` is true.
            learning: when false, no network updates are made and epsilon
                stays at ``start_epsilon``.
        """
        info = "[{code}] RL:{rl} Net:{net} LR:{lr} " \
               "DF:{discount_factor} TU:[{min_trading_unit}," \
               "{max_trading_unit}] DRT:{delayed_reward_threshold}".format(
            code=self.stock_code, rl=self.rl_method, net=self.net,
            lr=self.lr, discount_factor=discount_factor,
            min_trading_unit=self.agent.min_trading_unit,
            max_trading_unit=self.agent.max_trading_unit,
            delayed_reward_threshold=self.agent.delayed_reward_threshold
        )
        with self.lock:
            logging.info(info)

        # Start time
        time_start = time.time()

        # Prepare the visualization; the chart data never changes, so it is
        # handed to the visualizer once up front.
        self.visualizer.prepare(self.environment.chart_data, info)

        # Prepare the folder for the visualization results; files left by a
        # previous run are removed.
        self.epoch_summary_dir = os.path.join(
            self.output_path, 'epoch_summary_{}'.format(self.stock_code))
        if not os.path.isdir(self.epoch_summary_dir):
            os.makedirs(self.epoch_summary_dir)
        else:
            for f in os.listdir(self.epoch_summary_dir):
                os.remove(os.path.join(self.epoch_summary_dir, f))

        # Set the agent's initial balance
        self.agent.set_balance(balance)

        # Training statistics
        max_portfolio_value = 0
        epoch_win_cnt = 0
        # Width used to zero-pad the epoch number in logs and file names
        num_epoches_digit = len(str(num_epoches))

        # Training loop
        for epoch in range(num_epoches):
            time_start_epoch = time.time()

            # Queue holding the last ``num_steps`` samples
            q_sample = collections.deque(maxlen=self.num_steps)

            # Reset environment, agent, visualizer and memory
            self.reset()
            # Decrease the exploration rate as training progresses
            if learning:
                epsilon = self._decayed_epsilon(start_epsilon, epoch,
                                                num_epoches)
                self.agent.reset_exploration()
            else:
                epsilon = start_epsilon
            while True:
                # Build the next sample
                next_sample = self.build_sample()
                if next_sample is None:
                    break

                # Collect ``num_steps`` samples before predicting
                q_sample.append(next_sample)
                if len(q_sample) < self.num_steps:
                    continue

                # Value / policy network predictions
                pred_value = None
                pred_policy = None
                pred_target_policy = None
                pred_target_value = None
                if self.critic is not None:
                    pred_value = self.critic.predict(list(q_sample))
                    pred_target_value = self.critic.target_predict(
                        list(q_sample))
                if self.actor is not None:
                    pred_policy = self.actor.predict(list(q_sample))
                    pred_target_policy = self.actor.target_predict(
                        list(q_sample))

                # Decide the action from the networks or by exploration
                action, confidence, exploration = \
                    self.agent.decide_action(pred_value, pred_policy, epsilon)

                # Decide the action from the target predictions.  Arguments
                # follow the same (value, policy, epsilon) order as the call
                # above; they were previously passed as (policy, value).
                target_action, target_confidence, target_exploration = \
                    self.agent.decide_action(pred_target_value,
                                             pred_target_policy, epsilon)

                # Perform the action; get immediate and delayed reward
                immediate_reward, delayed_reward = \
                    self.agent.act(action, confidence)

                # Remember the action and its outcome
                self.memory_sample.append(list(q_sample))
                self.memory_action.append(action)
                self.memory_reward.append(immediate_reward)
                self.memory_target_action.append(target_action)
                self.memory_target_policy.append(pred_target_policy)
                self.memory_target_value.append(pred_target_value)
                if self.value_network is not None:
                    self.memory_value.append(pred_value)
                if self.policy_network is not None:
                    self.memory_policy.append(pred_policy)
                self.memory_pv.append(self.agent.portfolio_value)
                self.memory_num_stocks.append(self.agent.num_stocks)
                if exploration:
                    self.memory_exp_idx.append(self.training_data_idx)

                # Update the iteration counters
                self.batch_size += 1
                self.itr_cnt += 1
                self.exploration_cnt += 1 if exploration else 0

                # Mini-batch training whenever a delayed reward occurred
                if learning and (delayed_reward != 0):
                    self.fit(delayed_reward, discount_factor)
            # Training after the epoch has finished
            if learning:
                self.fit(self.agent.profitloss, discount_factor, full=True)
            # Log the epoch-related information
            epoch_str = str(epoch + 1).rjust(num_epoches_digit, '0')
            time_end_epoch = time.time()
            elapsed_time_epoch = time_end_epoch - time_start_epoch
            if self.learning_cnt > 0:
                # Guarded by the class lock like the other log calls here.
                with self.lock:
                    logging.info("[{}][Epoch {}/{}] Epsilon:{:.4f} "
                                 "#Expl.:{}/{} #Buy:{} #Sell:{} #Hold:{} "
                                 "#Stocks:{} PV:{:,.0f} "
                                 "LC:{} Loss:{:.6f} ET:{:.4f}".format(
                                     self.stock_code, epoch_str, num_epoches,
                                     epsilon, self.exploration_cnt,
                                     self.itr_cnt,
                                     self.agent.num_buy, self.agent.num_sell,
                                     self.agent.num_hold,
                                     self.agent.num_stocks,
                                     self.agent.portfolio_value,
                                     self.learning_cnt,
                                     self.loss, elapsed_time_epoch))

            # Visualize the epoch-related information
            self.visualize(epoch_str, num_epoches, epsilon)

            # Update the training statistics
            max_portfolio_value = max(max_portfolio_value,
                                      self.agent.portfolio_value)
            if self.agent.portfolio_value > self.agent.initial_balance:
                epoch_win_cnt += 1

        # End time
        time_end = time.time()
        elapsed_time = time_end - time_start

        # Log the overall training information
        with self.lock:
            logging.info("[{code}] Elapsed Time:{elapsed_time:.4f} "
                         "Max PV:{max_pv:,.0f} #Win:{cnt_win}".format(
                             code=self.stock_code,
                             elapsed_time=elapsed_time,
                             max_pv=max_portfolio_value,
                             cnt_win=epoch_win_cnt))

    def save_models(self):
        """Save each network that exists and has a non-``None`` path."""
        if self.value_network is not None and \
                self.value_network_path is not None:
            self.value_network.save_model(self.value_network_path)
        if self.policy_network is not None and \
                self.policy_network_path is not None:
            self.policy_network.save_model(self.policy_network_path)
def pre_train(dataloader,
              test_loader,
              dict_loader,
              dataloader_test,
              mask_labels,
              total_epochs=50,
              learning_rate=1e-4,
              use_gpu=True,
              seed=123):
    """Train the AlexNet-backed ``CNN`` model and checkpoint it every epoch.

    Command-line options are read from the module-level ``parser``
    (``num_bits``, ``num_class``, ``dataset``, ``checkpoint``,
    ``anchor_num`` and ``batch_size`` are used here).

    Args:
        dataloader: Training loader; every item is indexed with the keys
            ``'image'``, ``'labels'`` and ``'index'``.
        test_loader: Only forwarded to ``helpers.validate``.
        dict_loader: Passed to ``generate_anchor_vectors`` once per epoch.
        dataloader_test: Only forwarded to ``helpers.validate``.
        mask_labels: NumPy array, converted to a long CUDA tensor and
            indexed by the batch ``index``; entries ``> 0`` select the rows
            fed to ``CalcSim`` (presumably the labelled samples - confirm).
        total_epochs: Number of epochs to run.
        learning_rate: NOTE(review): overwritten by the ramp schedule at the
            top of every epoch before it is ever read, so the passed value
            has no effect.
        use_gpu: Moves the model, criterion and batch tensors to CUDA.
            NOTE(review): several buffers below call ``.cuda()``
            unconditionally, so CUDA is required even when this is False.
        seed: Seed for ``torch.cuda.manual_seed`` (only applied when
            ``use_gpu`` is true).

    Returns:
        The trained model (wrapped in ``torch.nn.DataParallel`` when
        ``use_gpu`` is true).
    """

    args = parser.parse_args()
    pprint(args)

    num_bits = args.num_bits

    model = CNN(model_name='alexnet', bit=num_bits, class_num=args.num_class)

    criterion = custom_loss(num_bits=num_bits)

    # Checkpoint file name: "cnn_<dataset>_<num_bits>bits.pt" under
    # args.checkpoint.
    arch = 'cnn_'
    filename = arch + args.dataset + '_' + str(num_bits) + "bits"
    checkpoint_filename = os.path.join(args.checkpoint, filename + '.pt')

    if use_gpu:
        model = model.cuda()
        model = torch.nn.DataParallel(model,
                                      device_ids=range(
                                          torch.cuda.device_count()))
        criterion = criterion.cuda()
        torch.cuda.manual_seed(seed)

    # Accumulated below but never reported inside this function.
    running_loss = 0.0

    start_epoch = 0
    # NOTE(review): the two meters are never updated in this function; only
    # their ``.val`` is printed at the end of every epoch.
    batch_time = AverageMeter()
    data_time = AverageMeter()
    end = time.time()  # not read again in this function

    best_prec = -99999

    k = 10500  # not referenced anywhere else in this function
    # Row count of the per-sample buffers below; every value in
    # data['index'] must therefore be smaller than this.
    n_samples = 200000

    # Moving-average decay factors: ``alpha`` for the per-sample buffers
    # (updated once per epoch), ``alpha_1`` for the anchor buffers (updated
    # once per processed batch).
    alpha = 0.4
    alpha_1 = 0.99

    mask_labels = torch.from_numpy(mask_labels).long().cuda()

    Z_h1 = torch.zeros(n_samples,
                       num_bits).float().cuda()  # intermediate values
    z_h1 = torch.zeros(n_samples, num_bits).float().cuda()  # temporal outputs
    h1 = torch.zeros(n_samples, num_bits).float().cuda()  # current outputs

    Z_h2 = torch.zeros(args.anchor_num,
                       num_bits).float().cuda()  # intermediate values
    z_h2 = torch.zeros(args.anchor_num,
                       num_bits).float().cuda()  # temporal outputs
    h2 = torch.zeros(args.anchor_num,
                     num_bits).float().cuda()  # current outputs

    for epoch in range(start_epoch, total_epochs):
        model.train(True)
        # Per-epoch schedule for the learning rate and the Adam betas, driven
        # by the rampup/rampdown/step_rampup helpers defined elsewhere.
        rampup_value = rampup(epoch)
        rampdown_value = rampdown(epoch)
        learning_rate = rampup_value * rampdown_value * 0.00005
        adam_beta1 = rampdown_value * 0.9 + (1.0 - rampdown_value) * 0.5
        adam_beta2 = step_rampup(epoch) * 0.99 + (1 -
                                                  step_rampup(epoch)) * 0.999

        # Weight handed to the criterion as ``u_w_m``: zero in the first
        # epoch, 5 * rampup afterwards.
        if epoch == 0:
            u_w = 0.0
        else:
            u_w = rampup_value

        u_w_m = u_w * 5

        u_w_m = torch.autograd.Variable(torch.FloatTensor([u_w_m]).cuda(),
                                        requires_grad=False)

        # NOTE(review): a brand-new Adam instance is built every epoch, so
        # its moment estimates do not carry over between epochs - confirm
        # this is intended.
        optimizer = Adam(model.parameters(),
                         lr=learning_rate,
                         betas=(adam_beta1, adam_beta2),
                         eps=1e-8,
                         amsgrad=True)

        anchors_data, anchor_Label = generate_anchor_vectors(dict_loader)

        for iteration, data in enumerate(dataloader, 0):

            # Draw 100 distinct anchor indices at random for this batch.
            anchor_index = np.arange(args.anchor_num)
            np.random.shuffle(anchor_index)

            anchor_index = anchor_index[:100]

            anchor_index = torch.from_numpy(anchor_index).long().cuda()

            anchor_inputs = anchors_data[anchor_index, :, :, :]
            anchor_labels = anchor_Label[anchor_index, :]

            inputs, labels, index = data['image'], data['labels'], data[
                'index']

            labels = labels.float()

            mask_flag = Variable(mask_labels[index], requires_grad=False)
            idx = (mask_flag > 0)

            # Only batches with exactly args.batch_size items are trained on;
            # any other batch (e.g. a short final one) is skipped entirely.
            if index.shape[0] == args.batch_size:
                anchor_batch_S, anchor_batch_W = CalcSim(
                    labels[idx, :].cuda(), anchor_labels.cuda())

                # Inputs whose last axis has size 3 are permuted so that axis
                # becomes axis 1 (presumably NHWC -> NCHW - confirm).
                if inputs.size(3) == 3:
                    inputs = inputs.permute(0, 3, 1, 2)
                inputs = inputs.type(torch.FloatTensor)

                # Averaged outputs from earlier steps for this batch's
                # samples and anchors, passed to the criterion.
                zcomp_h1 = z_h1[index.cuda(), :]
                zcomp_h2 = z_h2[anchor_index, :]

                labeled_batch_S, labeled_batch_W = CalcSim(
                    labels[idx, :].cuda(), labels[idx, :].cuda())

                if use_gpu:
                    inputs = Variable(inputs.cuda(), requires_grad=False)
                    anchor_batch_S = Variable(anchor_batch_S.cuda(),
                                              requires_grad=False)
                    anchor_batch_W = Variable(anchor_batch_W.cuda(),
                                              requires_grad=False)
                    labeled_batch_S = Variable(labeled_batch_S.cuda(),
                                               requires_grad=False)
                    labeled_batch_W = Variable(labeled_batch_W.cuda(),
                                               requires_grad=False)

                # zero the parameter gradients
                optimizer.zero_grad()

                y_h1 = model(inputs)
                y_h2 = model(anchor_inputs)

                # Sigmoid of the scaled inner products between batch outputs
                # and anchor outputs.
                y = F.sigmoid(48 / num_bits * 0.4 *
                              torch.matmul(y_h1, y_h2.permute(1, 0)))

                loss, l_batch_loss, m_loss = criterion(
                    y, y_h1, y_h2, anchor_batch_S, anchor_batch_W,
                    labeled_batch_S, labeled_batch_W, zcomp_h1, zcomp_h2,
                    mask_flag, u_w_m, epoch, num_bits)

                # Store detached copies of the current outputs for the
                # moving-average updates.
                h1[index, :] = y_h1.data.clone()
                h2[anchor_index, :] = y_h2.data.clone()

                # backward+optimize
                loss.backward()

                optimizer.step()

                running_loss += loss.item()

                # Exponential moving average of the anchor outputs with
                # startup-bias correction.
                # NOTE(review): this runs once per batch, yet the correction
                # exponent uses the epoch number - confirm this is intended.
                Z_h2 = alpha_1 * Z_h2 + (1. - alpha_1) * h2
                z_h2 = Z_h2 * (1. / (1. - alpha_1**(epoch + 1)))

        # NOTE(review): ``iteration``, ``loss``, ``l_batch_loss`` and
        # ``m_loss`` come from the last processed batch; if no batch has been
        # processed yet these names are unbound and this print raises.
        print(
            "Epoch[{}]({}/{}): Time:(data {:.3f}/ batch {:.3f}) Loss_H: {:.4f}/{:.4f}/{:.4f}"
            .format(epoch, iteration, len(dataloader), data_time.val,
                    batch_time.val, loss.item(), l_batch_loss.item(),
                    m_loss.item()))

        # Per-epoch exponential moving average of the sample outputs with
        # startup-bias correction.
        Z_h1 = alpha * Z_h1 + (1. - alpha) * h1
        z_h1 = Z_h1 * (1. / (1. - alpha**(epoch + 1)))

        # ``epoch % 1`` is always 0, so validation and checkpointing run
        # after every epoch.
        if epoch % 1 == 0:
            MAP = helpers.validate(model, dataloader_test, test_loader)

            print("Test image map is:{}".format(MAP))

            is_best = MAP > best_prec
            best_prec = max(best_prec, MAP)

            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                },
                is_best,
                prefix=arch,
                num_bits=num_bits,
                filename=checkpoint_filename)

    return model
示例#13
0
# Report the compute device chosen elsewhere in this script.
print('Running on', device)

# ---- Training phase -------------------------------------------------------
print("Building Models")

# Plain classifiers trained through buildModel: (label, constructor).
for label, factory in (("SoftMax", SoftMax), ("MLP", MLP)):
    print(label)
    buildModel(factory(), 0.1, True, True)

# Denoising auto-encoder: two layers are trained first, then the
# DAE + softmax model.
print("DAE")
for in_dim, out_dim in ((10304, 1000), (1000, 300)):
    buildDAELayer(DAELayer(in_dim, out_dim),
                  lRate=1e-4,
                  epochs=5000,
                  plot=True)
buildDAESoftmaxModel(DAESoftMax(), lRate=1e-2, epochs=1000, plot=True)

print("CNN")
buildModel(CNN(), 0.001, True, True)

# ---- Attack phase ---------------------------------------------------------
print("\nModel(s) is reconstructed with alpha =", 5000, "beta =", 100,
      "gamma =", 0.01, "delta =", 0.1)
print("Attacking Models")

# Models from the paper, attacked in order: SoftMax, MLP, DAE.
for label, factory in (("Softmax", SoftMax), ("MLP", MLP),
                       ("DAE", DAESoftMax)):
    print(label)
    reconstructionAttack(factory())
示例#14
0
import os
import sys

# Make the parent directory and every directory found beneath it importable,
# so that the project modules imported below can be resolved.
sys.path.append("..")
extra_paths = [
    os.path.join(parent, child) for parent, children, _ in os.walk("../")
    for child in children
]
sys.path.extend(extra_paths)

from _config import NNConfig
from networks import CNN, LSTM, Encoder, Decoder

# Show the network configuration.
nnconfig = NNConfig()
nnconfig.show()

# Build one CNN layer and one LSTM layer by name and show both.
cnn = CNN("cnn_layer1")
lstm = LSTM("lstm_layer1")
cnn.show()
lstm.show()

# Wrap the layers: the encoder takes the CNN, the decoder takes the LSTM.
encoder = Encoder(cnn)
decoder = Decoder(lstm)
示例#15
0
import random

from networks import CNN

# Use CUDA when it is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Checkpoint file, named by the first command-line argument.
path = "checkpoints/%s.pt" % sys.argv[1]

# Breakout environment, passed through the project's wrappers.
env = wrappers.make_env(gym.make("Breakout-v0"))

# Q-network: weights are read onto the CPU first, then loaded into the model
# that already lives on ``device``; eval() switches it to inference mode.
q_network = CNN(4, env.action_space.n).to(device)
state_dict = torch.load(path, map_location=torch.device('cpu'))
q_network.load_state_dict(state_dict)
q_network.eval()


# Greedy Policy
@torch.no_grad()
def select_action(q, state):
    """Pick an action: random 5% of the time, otherwise the arg-max of ``q``.

    Args:
        q: Callable applied to ``state`` after it is moved to ``device``.
        state: Object with a ``.to(device)`` method.

    Returns:
        ``env.action_space.sample()`` on the random branch; otherwise the
        index of the largest value along dim 1 of ``q``'s output, reshaped
        with ``view(1, 1)``. The branch taken and the raw values are printed.
    """
    explore = random.random() < 0.05
    if explore:
        print('Random')
        return env.action_space.sample()

    q_values = q(state.to(device))
    print(q_values)
    best = q_values.max(1)[1]
    return best.view(1, 1)

示例#16
0
from networks import CNN, SoftMax, MLP
from modules import buildModel, reconstructionAttack

print("Building Models")

# Plain classifiers trained through buildModel: (label, constructor).
for label, factory in (("SoftMax", SoftMax), ("MLP", MLP)):
    print(label)
    buildModel(factory(), 0.1, True, True)

# Denoising auto-encoder: two layers are trained first, then the
# DAE + softmax model.
# NOTE(review): buildDAELayer, DAELayer, buildDAESoftmaxModel and DAESoftMax
# are not among the names imported at the top of this snippet - confirm they
# are brought into scope elsewhere.
print("DAE")
for in_dim, out_dim in ((10304, 1000), (1000, 300)):
    buildDAELayer(DAELayer(in_dim, out_dim),
                  lRate=1e-4,
                  epochs=5000,
                  plot=True)
buildDAESoftmaxModel(DAESoftMax(), lRate=1e-2, epochs=1000, plot=True)

print("CNN")
buildModel(CNN(), 0.001, True, True)
示例#17
0
    if args.headless:  # GPU optimization
        mpl.use('Agg')  # Suppress rendering

    import matplotlib.pyplot as plt

    env = gym.make(args.env)
    if not args.toy:
        env = FrameStackWrapper(env, args.frames)
        env = ResetLifeLostWrapper(env)

    epsilon_max = args.epsilon_max
    epsilon_min = args.epsilon_min
    epsilon_decay = args.epsilon_decay
    eps_threshold = 1

    total_moves = 0

    net = CNN(conv_channels=args.frames, n_actions=env.action_space.n) if not args.toy \
        else FCN(n_actions=env.action_space.n)
    if chainer.cuda.available:
        net.to_gpu()
    optim = optimizers.RMSpropGraves(lr=args.alpha, momentum=args.momentum)
    # optim = optimizers.RMSprop(lr=args.alpha)
    # optim = optimizers.Adam(alpha=args.alpha)
    optim.setup(net)

    if not os.path.exists("results/{}".format(args.env)):
        os.makedirs("results/{}".format(args.env))

    train()
示例#18
0
class Mind:
    """Epsilon-greedy Q-learning agent backed by a ``CNN`` value network.

    The agent picks actions with :meth:`action`, collects transitions with
    :meth:`feedback` and retrains the network on everything collected so far
    every ``update_interval`` calls to :meth:`action`.
    """

    def __init__(self,
                 n_actions,
                 input_shape,
                 save_path=None,
                 load_path=None,
                 action_inverses=None,
                 update_interval=256 * 2,
                 save_interval=5):
        """Create the network and the bookkeeping state.

        Args:
            n_actions: Number of discrete actions (network output size).
            input_shape: Passed to ``CNN`` as ``input_shape``.
            save_path: Where :meth:`save` writes the network from
                :meth:`action`; ``None`` disables periodic saving.
            load_path: When not ``None``, the network is loaded from it.
            action_inverses: Mapping stored on the instance (not read by the
                code in this class). ``None`` means a fresh empty dict, so
                instances never share one mutable default.
            update_interval: Number of :meth:`action` calls between updates.
            save_interval: Number of updates between saves.
        """
        self.n_actions = n_actions
        self.save_path = save_path
        self.network = CNN(n_out=n_actions, input_shape=input_shape)

        if load_path is not None:
            self.network.load(load_path)

        self.n_features = input_shape
        self.data = []
        self.current_episode_count = 1

        self.random_actions = 0

        self.last_action = None
        self.last_action_random = False

        self.action_inverses = {} if action_inverses is None \
            else action_inverses

        self.lifetime = 1
        self.update_interval = update_interval
        self.save_interval = save_interval
        self.n_updates = 1

    def q(self, state):
        """Return the network's prediction for a single ``state``."""
        # The network predicts on batches: add a batch axis, strip it again.
        return self.network.predict(np.expand_dims(np.array(state), axis=0))[0]

    def should_explore(self, state):
        """Return True with probability ``1000 / (1000 + lifetime)``."""
        return np.random.random() < 1000 / (1000 + self.lifetime)

    def explore_action(self, state):
        """Return a uniformly random action index in ``[0, n_actions)``."""
        return np.random.randint(0, self.n_actions)

    def action(self, state):
        """Choose the next action and run the periodic update/save.

        Returns:
            The arg-max of the predicted Q-values, or a random action when
            :meth:`should_explore` fires.
        """
        q = self.q(state)
        action = np.argmax(q)

        if self.should_explore(state):
            self.random_actions += 1
            action = self.explore_action(state)
            self.last_action_random = True
        else:
            self.last_action_random = False

        # Retrain every ``update_interval`` calls; save every
        # ``save_interval`` updates when a save path was given.
        if self.lifetime % self.update_interval == 0:
            self.update(alpha=0.9)
            self.n_updates += 1
            if self.n_updates % self.save_interval == 0:
                if self.save_path is not None:
                    self.save(self.save_path)
                    print('saved')

        self.last_action = action
        self.current_episode_count += 1
        self.lifetime += 1

        return action

    def save(self, path):
        """Write the network to ``path``."""
        self.network.save(path)

    def reset(self):
        """Print and clear the random-action counter."""
        # NOTE(review): ``count`` is not read anywhere in this class; this may
        # have been meant to reset ``current_episode_count`` - confirm.
        self.count = 1
        print('Random actions: ', self.random_actions)
        self.random_actions = 0

    def q_target(self, reward, best_next, alpha):
        """Return the training target ``reward + alpha * best_next``."""
        return reward + alpha * best_next

    def feedback(self, old_action, old_state, reward, new_state):
        """Record one transition for the next :meth:`update`.

        The maximum predicted Q-value of ``new_state`` is computed now and
        stored with the transition.
        """
        self.data.append({
            'Q_max': np.max(self.q(new_state)),
            'reward': reward,
            'old_state': old_state,
            'old_action': old_action
        })

    def update(self, alpha=0.6):
        """Train the network on all recorded transitions, then clear them.

        Does nothing when no transition has been recorded, instead of
        handing empty arrays to the network.
        """
        if not self.data:
            return

        np.random.shuffle(self.data)
        samples = self.data
        self.data = []
        states = []
        ys = []

        for sample in samples:
            # Start from the current prediction and overwrite only the entry
            # of the action that was actually taken.
            y = self.q(sample['old_state'])
            y[sample['old_action']] = self.q_target(sample['reward'],
                                                    best_next=sample['Q_max'],
                                                    alpha=alpha)
            states.append(sample['old_state'])
            ys.append(y)
        self.network.train(np.array(states), np.array(ys))
示例#19
0
class ReinforcementLearner:
    __metaclass__ = abc.ABCMeta
    lock = threading.Lock()

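    # rl_method: name of the reinforcement-learning technique; the value depends on the subclass (e.g. dq for DQNLearner, ac for A2CLearner)
    # stock_code: code of the stock being trained on
    # chart_data: daily candlestick chart of the stock (plays the role of the environment)
    # training_data: preprocessed data used for training
    # min_trading_unit: minimum trading unit
    # max_trading_unit: maximum trading unit
    # delayed_reward_threshold: a delayed reward is issued once the profit or loss ratio exceeds this threshold
    # mini_batch_size: ?? (left unexplained in the original notes; not a parameter of __init__)
    # net: type of neural network; decides which network class is used for the value and policy networks
    # num_steps: number of samples grouped together for the LSTM and CNN networks
    # lr: learning rate; too large and training does not progress, too small and training takes a long time
    # value_network, policy_network: when supplied, these models are used as the value network and the policy network
    # output_path: where logs, visualization results and (after training) the saved network model files are written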
    def __init__(self,
                 rl_method='rl',
                 stock_code=None,
                 chart_data=None,
                 training_data=None,
                 min_trading_unit=1,
                 max_trading_unit=2,
                 delayed_reward_threshold=.05,
                 net='dnn',
                 num_steps=1,
                 lr=0.001,
                 value_network=None,
                 policy_network=None,
                 output_path='',
                 reuse_models=True):
        # 인자 확인
        assert min_trading_unit > 0
        assert max_trading_unit > 0
        assert max_trading_unit >= min_trading_unit
        assert num_steps > 0
        assert lr > 0
        # 강화학습 기법 설정
        self.rl_method = rl_method
        # 환경 설정
        self.stock_code = stock_code  # 강화학습 대상이 되는 주식 종목 코드
        self.chart_data = chart_data  # 주식 종목의 차트 데이터
        self.environment = Environment(chart_data)  # 강화학습 환경 객체
        # 에이전트 설정
        self.agent = Agent(self.environment,
                           min_trading_unit=min_trading_unit,
                           max_trading_unit=max_trading_unit,
                           delayed_reward_threshold=delayed_reward_threshold)
        # 학습 데이터
        self.training_data = training_data
        self.sample = None
        self.training_data_idx = -1
        # 벡터 크기 = 학습 데이터 벡터 크기 + 에이전트 상태 크기
        self.num_features = self.agent.STATE_DIM
        if self.training_data is not None:
            self.num_features += self.training_data.shape[1]
        # 신경망 클래스 객체는, 본 클래스의 하위 클래스에서 생성
        # 신경망 설정
        self.net = net
        self.num_steps = num_steps
        self.lr = lr
        self.value_network = value_network  # 가치 신경망
        self.policy_network = policy_network  # 정책 신경망
        self.reuse_models = reuse_models
        # 가시화 모듈
        self.visualizer = Visualizer()
        # 메모리
        # 강화 학습 과정에서 발생하는 각종 데이터를 쌓아두기 위해서, memory라는 변수 정의
        self.memory_sample = []  # 학습 데이터 샘플
        self.memory_action = []  # 수행한 행동
        self.memory_reward = []  # 획득한 보상
        self.memory_value = []  # 행동의 예측 가치
        self.memory_policy = []  # 핻동의 예측?확률?
        self.memory_pv = []  # 포트폴리오 가치
        self.memory_num_stocks = []  # 보유 주식수
        self.memory_exp_idx = []  # 탐험 위치
        self.memory_learning_idx = []  # 학습 위치
        # 에포크 관련 정보
        self.loss = 0.  # 손실
        self.itr_cnt = 0  # 수익발생 횟수
        self.exploration_cnt = 0  # 탐험 횟수
        self.batch_size = 0  # 배치 크기?
        self.learning_cnt = 0  # 학습 횟수
        # 로그 등 출력 경로
        self.output_path = output_path

    # Creates the value network (works like a factory function).
    # The value network can be viewed as a regression model of the
    # profit/loss ratio, hence the linear activation and MSE loss defaults.
    def init_value_network(self,
                           shared_network=None,
                           activation='linear',
                           loss='mse'):
        if self.net == 'dnn':
            self.value_network = DNN(input_dim=self.num_features,
                                     output_dim=self.agent.NUM_ACTIONS,
                                     lr=self.lr,
                                     shared_network=shared_network,
                                     activation=activation,
                                     loss=loss)
        elif self.net == 'lstm':
            self.value_network = LSTMNetwork(input_dim=self.num_features,
                                             output_dim=self.agent.NUM_ACTIONS,
                                             lr=self.lr,
                                             num_steps=self.num_steps,
                                             shared_network=shared_network,
                                             activation=activation,
                                             loss=loss)
        elif self.net == 'cnn':
            self.value_network = CNN(input_dim=self.num_features,
                                     output_dim=self.agent.NUM_ACTIONS,
                                     lr=self.lr,
                                     num_steps=self.num_steps,
                                     shared_network=shared_network,
                                     activation=activation,
                                     loss=loss)
        if self.reuse_models and \
            os.path.exists(self.value_network_path): # reuse_models이 True이고, value_network_path 값이 있으면 신경망 모델 파일을 불러온다...
            self.value_network.load_model(model_path=self.value_network_path)

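    # Creates the policy network.
    # Unlike the value network, the default activation is sigmoid.
    # The policy network is a classification model over the actions that are good for raising the PV (portfolio value).
    # Sigmoid keeps the outputs between 0 and 1 so they can be used as probabilities.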
    def init_policy_network(self,
                            shared_network=None,
                            activation='sigmoid',
                            loss='binary_crossentropy'):
        if self.net == 'dnn':
            self.policy_network = DNN(input_dim=self.num_features,
                                      output_dim=self.agent.NUM_ACTIONS,
                                      lr=self.lr,
                                      shared_network=shared_network,
                                      activation=activation,
                                      loss=loss)
        elif self.net == 'lstm':
            self.policy_network = LSTMNetwork(
                input_dim=self.num_features,
                output_dim=self.agent.NUM_ACTIONS,
                lr=self.lr,
                num_steps=self.num_steps,
                shared_network=shared_network,
                activation=activation,
                loss=loss)
        elif self.net == 'cnn':
            self.policy_network = CNN(input_dim=self.num_features,
                                      output_dim=self.agent.NUM_ACTIONS,
                                      lr=self.lr,
                                      num_steps=self.num_steps,
                                      shared_network=shared_network,
                                      activation=activation,
                                      loss=loss)
        if self.reuse_models and \
            os.path.exists(self.policy_network_path):
            self.policy_network.load_model(model_path=self.policy_network_path)

    # Per-epoch reset.
    # Re-initializes the variables that accumulate fresh data in every epoch.
    def reset(self):
        self.sample = None  # 읽어온 학습 데이터가 샘플에 할당됨(초기화에선 None)
        self.training_data_idx = -1  # 학습 데이터를 처음부터 다시 읽기위해서 -1로 설정
        # 환경 초기화
        self.environment.reset()  # 환경클래스의 reset호출
        # 에이전트 초기화
        self.agent.reset()  # 에이전트가 제공하는 reset호출
        # 가시화 초기화
        self.visualizer.clear([0, len(self.chart_data)])  # 가시화 클래스의 clear호출
        # 메모리 초기화
        self.memory_sample = []
        self.memory_action = []
        self.memory_reward = []
        self.memory_value = []
        self.memory_policy = []
        self.memory_pv = []
        self.memory_num_stocks = []
        self.memory_exp_idx = []
        self.memory_learning_idx = []
        # 에포크 관련 정보 초기화
        self.loss = 0.  # 신경망의 결과가 학습 데이터와 얼마나 차이가 있는지를 저장하는 변수 loss가 줄어야 좋은거임!
        self.itr_cnt = 0  # 수행한 에포크 수를 저장
        self.exploration_cnt = 0  # 탐험 수 저장, epsilon이 0.1dlrh 100번 투자 결정이 있다고 한다면 약 10번의 무작위 투자
        self.batch_size = 0  # 학습할 미니 배치 크기
        self.learning_cnt = 0  # 한 에포크 동안 수행한 미니 배치 학습 횟수

    # Obtains the next sample with the help of the environment object.
    # Builds one sample of the training data.
    def build_sample(self):
        self.environment.observe()  # 차트 데이터의 현재 인덱스에서, 다음 인덱스 데이터를 읽게한다.
        if len(self.training_data
               ) > self.training_data_idx + 1:  # 학습 데이터 존재 확인
            self.training_data_idx += 1
            self.sample = self.training_data.iloc[
                self.training_data_idx].tolist()  # 샘플 가져옴, 샘플은 26개의 값임
            self.sample.extend(
                self.agent.get_states())  # 에이전트에서 2개 값을 추가! (28개값!)
            return self.sample
        return None

    # Builds the batch of training data; an abstract method that subclasses must implement.
    @abc.abstractmethod
    def get_batch(self, batch_size, delayed_reward, discount_factor):
        pass

    # Trains the value network and the policy network.
    # Calls get_batch to build the training batch, then calls
    # train_on_batch on the value network and/or the policy network.
    # Value network: DQN, AC, A2C
    # Policy network: PolicyGradient, AC, A2C
    def update_networks(self, batch_size, delayed_reward, discount_factor):
        # 배치 학습 데이터 생성
        x, y_value, y_policy = self.get_batch(batch_size, delayed_reward,
                                              discount_factor)
        if len(x) > 0:
            loss = 0
            if y_value is not None:
                # 가치 신경망 갱신
                loss += self.value_network.train_on_batch(x, y_value)
            if y_policy is not None:
                # 정책 신경망 갱신
                loss += self.policy_network.train_on_batch(x, y_policy)
            return loss  # 학습 후 손실 반환
        return None

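    # Requests a training step for the value network and the policy network.
    # Decides the size of the training batch and calls update_networks (the method above).
    # The loss it returns (_loss) is accumulated into the running total self.loss.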
    def fit(self, delayed_reward, discount_factor, full=False):
        batch_size = len(self.memory_reward) if full \
            else self.batch_size
        # 배치 학습 데이터 생성 및 신경망 갱신
        if batch_size > 0:
            _loss = self.update_networks(batch_size, delayed_reward,
                                         discount_factor)
            if _loss is not None:
                self.loss += abs(_loss)
                self.learning_cnt += 1  # 학습 횟수 저장, loss / learning_cnt하면 에포크의 학습 손실을 알 수 있음
                self.memory_learning_idx.append(
                    self.training_data_idx)  # 학습 위치 저장
            self.batch_size = 0

    # Visualizes the information gathered during one epoch.
    def visualize(self, epoch_str, num_epoches, epsilon):
        """Plot the finished epoch and save the figure as a PNG file.

        The first ``num_steps - 1`` steps of an epoch produce no decision, so
        every memory is left-padded with that many neutral entries before
        plotting. The padded lists replace the ones stored on the instance.

        Args:
            epoch_str: Zero-padded epoch number, also used in the file name.
            num_epoches: Total number of epochs.
            epsilon: Exploration rate used in this epoch.
        """
        pad = self.num_steps - 1

        self.memory_action = [Agent.ACTION_HOLD] * pad + self.memory_action
        self.memory_num_stocks = [0] * pad + self.memory_num_stocks
        # Network outputs are padded with all-NaN rows, but only for the
        # networks that exist.
        if self.value_network is not None:
            value_filler = np.array([np.nan] * len(Agent.ACTIONS))
            self.memory_value = [value_filler] * pad + self.memory_value
        if self.policy_network is not None:
            policy_filler = np.array([np.nan] * len(Agent.ACTIONS))
            self.memory_policy = [policy_filler] * pad + self.memory_policy
        self.memory_pv = [self.agent.initial_balance] * pad + self.memory_pv

        plot_args = dict(
            epoch_str=epoch_str,
            num_epoches=num_epoches,
            epsilon=epsilon,
            action_list=Agent.ACTIONS,
            actions=self.memory_action,
            num_stocks=self.memory_num_stocks,
            outvals_value=self.memory_value,
            outvals_policy=self.memory_policy,
            exps=self.memory_exp_idx,
            learning_idxes=self.memory_learning_idx,
            initial_balance=self.agent.initial_balance,
            pvs=self.memory_pv,
        )
        self.visualizer.plot(**plot_args)

        figure_name = 'epoch_summary_{}.png'.format(epoch_str)
        self.visualizer.save(
            os.path.join(self.epoch_summary_dir, figure_name))

    # Runs reinforcement learning.
    # This is the core method of the class.
    def run(
        self,
        num_epoches=100,  # 총 수행할 반복 학습 횟수, 너무 크면 학습에 걸리는 시간이 길어짐
        balance=10000000,  # 초기 투자금
        discount_factor=0.9,  # 상태-행동 가치를 구할때 적용할 할인율, 과거로 갈수록 현재 보상을 약하게 적용한다.
        start_epsilon=0.5,  # 초기 탐험 비율
        learning=True  # 학습을 마치면 학습된 가치 신경망모델, 정책 신경망 모델이 생성된다. 이런 신경망 모델으 만들꺼면 True, 이미 학습된 모델로, 투자 시뮬레이션일때는 False
    ):
        info = "[{code}] RL:{rl} Net:{net} LR:{lr} " \
            "DF:{discount_factor} TU:[{min_trading_unit}," \
            "{max_trading_unit}] DRT:{delayed_reward_threshold}".format(
            code=self.stock_code, rl=self.rl_method, net=self.net,
            lr=self.lr, discount_factor=discount_factor,
            min_trading_unit=self.agent.min_trading_unit,
            max_trading_unit=self.agent.max_trading_unit,
            delayed_reward_threshold=self.agent.delayed_reward_threshold
        )
        with self.lock:
            logging.info(info)  # 강화 학습의 설정값을 로깅 한다.

        # 시작 시간
        time_start = time.time()

        # 가시화 준비
        # 차트 데이터는 변하지 않으므로 미리 가시화
        self.visualizer.prepare(self.environment.chart_data, info)

        # 가시화 결과 저장할 폴더 준비
        # epoch_summary_ 라는 폴더에 저장
        self.epoch_summary_dir = os.path.join(
            self.output_path, 'epoch_summary_{}'.format(self.stock_code))
        if not os.path.isdir(self.epoch_summary_dir):
            os.makedirs(self.epoch_summary_dir)
        else:
            for f in os.listdir(self.epoch_summary_dir):
                os.remove(os.path.join(self.epoch_summary_dir, f))

        # 에이전트 초기 자본금 설정
        self.agent.set_balance(balance)

        # 학습에 대한 정보 초기화
        max_portfolio_value = 0
        epoch_win_cnt = 0

        # 학습 반복
        for epoch in range(num_epoches):
            time_start_epoch = time.time()

            # step 샘플을 만들기 위한 큐
            # deque사용 - 참고: https://opensourcedev.tistory.com/3
            q_sample = collections.deque(maxlen=self.num_steps)

            # 환경, 에이전트, 신경망, 가시화, 메모리 초기화
            self.reset()

            # 학습을 진행할 수록 탐험 비율 감소
            if learning:
                epsilon = start_epsilon \
                    * (1. - float(epoch) / (num_epoches - 1))
                self.agent.reset_exploration()
            else:
                epsilon = start_epsilon
                self.agent.reset_exploration(alpha=0)

            while True:
                # 샘플 생성
                next_sample = self.build_sample()
                if next_sample is None:
                    break  # 샘플 만큼 while문 반복

                # num_steps만큼 샘플 저장
                q_sample.append(next_sample)
                if len(q_sample) < self.num_steps:
                    continue

                # 가치, 정책 신경망 예측
                # 각 신경망의 predict함수 호출
                pred_value = None
                pred_policy = None
                if self.value_network is not None:
                    pred_value = self.value_network.predict(list(q_sample))
                if self.policy_network is not None:
                    pred_policy = self.policy_network.predict(list(q_sample))

                # 신경망 또는 탐험에 의한 행동 결정
                # 행동, 결정에 대한 확신도, 무작위 탐험 유무
                action, confidence, exploration = \
                    self.agent.decide_action(
                        pred_value, pred_policy, epsilon)

                # 결정한 행동을 수행하고 즉시 보상과 지연 보상 획득
                immediate_reward, delayed_reward = \
                    self.agent.act(action, confidence)

                # 행동 및 행동에 대한 결과를 기억
                self.memory_sample.append(list(q_sample))
                self.memory_action.append(action)
                self.memory_reward.append(immediate_reward)
                if self.value_network is not None:
                    self.memory_value.append(pred_value)
                if self.policy_network is not None:
                    self.memory_policy.append(pred_policy)
                self.memory_pv.append(self.agent.portfolio_value)
                self.memory_num_stocks.append(self.agent.num_stocks)
                if exploration:
                    self.memory_exp_idx.append(self.training_data_idx)

                # 반복에 대한 정보 갱신
                self.batch_size += 1
                self.itr_cnt += 1
                self.exploration_cnt += 1 if exploration else 0  # 3항연산???

                # 지연 보상 발생된 경우 미니 배치 학습
                # 지연보상은 지연보상 임계치가 넘는 손익률이 발생하면 주어진다.
                if learning and (delayed_reward != 0):
                    self.fit(delayed_reward, discount_factor)

            # 에포크 종료 후 학습 (while문 이후 미니 배치 학습)
            if learning:
                self.fit(self.agent.profitloss, discount_factor, full=True)

            # 에포크 관련 정보 로그 기록
            num_epoches_digit = len(str(num_epoches))
            epoch_str = str(epoch + 1).rjust(num_epoches_digit,
                                             '0')  # 문자열을 자리수에 맞게 정렬(우측) 함수
            time_end_epoch = time.time()
            elapsed_time_epoch = time_end_epoch - time_start_epoch
            if self.learning_cnt > 0:
                self.loss /= self.learning_cnt
            logging.info("[{}][Epoch {}/{}] Epsilon:{:.4f} "
                         "#Expl.:{}/{} #Buy:{} #Sell:{} #Hold:{} "
                         "#Stocks:{} PV:{:,.0f} "
                         "LC:{} Loss:{:.6f} ET:{:.4f}".format(
                             self.stock_code, epoch_str, num_epoches, epsilon,
                             self.exploration_cnt, self.itr_cnt,
                             self.agent.num_buy, self.agent.num_sell,
                             self.agent.num_hold, self.agent.num_stocks,
                             self.agent.portfolio_value, self.learning_cnt,
                             self.loss, elapsed_time_epoch))

            # 에포크 관련 정보 가시화
            self.visualize(epoch_str, num_epoches, epsilon)

            # 학습 관련 정보 갱신
            max_portfolio_value = max(max_portfolio_value,
                                      self.agent.portfolio_value)
            if self.agent.portfolio_value > self.agent.initial_balance:
                epoch_win_cnt += 1

        # 종료 시간
        time_end = time.time()
        elapsed_time = time_end - time_start

        # 학습 관련 정보 로그 기록
        with self.lock:
            logging.info("[{code}] Elapsed Time:{elapsed_time:.4f} "
                         "Max PV:{max_pv:,.0f} #Win:{cnt_win}".format(
                             code=self.stock_code,
                             elapsed_time=elapsed_time,
                             max_pv=max_portfolio_value,
                             cnt_win=epoch_win_cnt))

    # Saves the value network and the policy network.
    def save_models(self):
        if self.value_network is not None and \
                self.value_network_path is not None:
            self.value_network.save_model(self.value_network_path)
        if self.policy_network is not None and \
                self.policy_network_path is not None:
            self.policy_network.save_model(self.policy_network_path)
示例#20
0
if model == 'all':
    # SoftMax Model from paper
    print("Softmax")
    reconstructionAttack(SoftMax(), alpha, beta, gamma, delta, True, False)

    # MLP Model from paper
    print("MLP")
    reconstructionAttack(MLP(), alpha, beta, gamma, delta, True, False)

    # DAE Model from paper
    print("DAE")
    reconstructionAttack(DAESoftMax(), alpha, beta, gamma, delta, True, False)

    # CNN for comparison
    print("CNN")
    reconstructionAttack(CNN(), alpha, beta, gamma, delta, True, False)
else:
    if model == 'Softmax':
        # SoftMax Model from paper
        print("Softmax")
        reconstructionAttack(SoftMax(), alpha, beta, gamma, delta, True, False)

    if model == 'MLP':
        # MLP Model from paper
        print("MLP")
        reconstructionAttack(MLP(), alpha, beta, gamma, delta, True, False)

    if model == 'DAE':
        # DAE Model from paper
        print("DAE")
        reconstructionAttack(DAESoftMax(), alpha, beta, gamma, delta, True,