def init_value_network(self, shared_network=None, activation='linear', loss='mse'):
    if self.net == 'dnn':
        self.value_network = DNN(
            input_dim=self.num_features, output_dim=self.agent.NUM_ACTIONS,
            lr=self.lr, shared_network=shared_network,
            activation=activation, loss=loss)
    elif self.net == 'lstm':
        self.value_network = LSTMNetwork(
            input_dim=self.num_features, output_dim=self.agent.NUM_ACTIONS,
            lr=self.lr, num_steps=self.num_steps,
            shared_network=shared_network, activation=activation, loss=loss)
    elif self.net == 'cnn':
        self.value_network = CNN(
            input_dim=self.num_features, output_dim=self.agent.NUM_ACTIONS,
            lr=self.lr, num_steps=self.num_steps,
            shared_network=shared_network, activation=activation, loss=loss)
    if self.reuse_models and os.path.exists(self.value_network_path):
        # If reuse_models is True and value_network_path exists, load the saved model file.
        self.value_network.load_model(model_path=self.value_network_path)
def build_models(args, device='cuda'):
    models = {}
    models['encodercnn'] = CNN(
        input_shape=RGB_INPUT_SHAPE,
        model_name=args.encoder_cnn_model).to(device)
    models['encoder'] = Encoder(
        input_shape=models['encodercnn'].out_size,
        encoder_block='convbilstm',
        hidden_size=args.encoder_hid_size).to(device)
    models['crossviewdecodercnn'] = CNN(
        input_shape=DEPTH_INPUT_SHAPE,
        model_name=args.encoder_cnn_model,
        input_channel=1).to(device)
    crossviewdecoder_in_size = list(models['crossviewdecodercnn'].out_size)
    crossviewdecoder_in_size[0] = crossviewdecoder_in_size[0] * 3
    crossviewdecoder_in_size = torch.Size(crossviewdecoder_in_size)
    models['crossviewdecoder'] = CrossViewDecoder(
        input_shape=crossviewdecoder_in_size).to(device)
    models['reconstructiondecoder'] = ReconstructionDecoder(
        input_shape=models['encoder'].out_size[1:]).to(device)
    models['viewclassifier'] = ViewClassifier(
        input_size=reduce(operator.mul, models['encoder'].out_size[1:]),
        num_classes=5,
        reverse=(not args.disable_grl)).to(device)
    return models
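# A minimal sketch (not from the original repo) of the shape arithmetic in
# build_models(): only the first (channel) entry of out_size is tripled,
# presumably because three view feature maps are concatenated channel-wise
# before the cross-view decoder. The concrete sizes below are illustrative.
import torch

cnn_out_size = torch.Size([256, 7, 7])   # hypothetical CNN(...).out_size
in_size = list(cnn_out_size)
in_size[0] *= 3                          # three feature maps stacked on the channel axis
print(torch.Size(in_size))               # torch.Size([768, 7, 7])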
def init_policy_network(self, shared_network=None, activation='sigmoid',
                        loss='binary_crossentropy'):
    if self.net == 'dnn':
        self.policy_network = DNN(
            input_dim=self.num_features, output_dim=self.agent.NUM_ACTIONS,
            lr=self.lr, shared_network=shared_network,
            activation=activation, loss=loss)
    elif self.net == 'lstm':
        self.policy_network = LSTMNetwork(
            input_dim=self.num_features, output_dim=self.agent.NUM_ACTIONS,
            lr=self.lr, num_steps=self.num_steps,
            shared_network=shared_network, activation=activation, loss=loss)
    elif self.net == 'cnn':
        self.policy_network = CNN(
            input_dim=self.num_features, output_dim=self.agent.NUM_ACTIONS,
            lr=self.lr, num_steps=self.num_steps,
            shared_network=shared_network, activation=activation, loss=loss)
    if self.reuse_models and os.path.exists(self.policy_network_path):
        self.policy_network.load_model(model_path=self.policy_network_path)
def __init__(self, n_actions, input_shape, save_path=None, load_path=None,
             action_inverses={}, update_interval=256 * 2, save_interval=5):
    self.n_actions = n_actions
    self.save_path = save_path
    self.network = CNN(n_out=n_actions, input_shape=input_shape)
    if load_path is not None:
        self.network.load(load_path)
    self.n_features = input_shape
    self.data = []
    self.current_episode_count = 1
    self.random_actions = 0
    self.last_action = None
    self.last_action_random = False
    self.action_inverses = action_inverses
    self.lifetime = 1
    self.update_interval = update_interval
    self.save_interval = save_interval
    self.n_updates = 1
def do_stuff(opt):
    print(f'\nTraining {opt} for {args.num_epochs} epochs...')
    net = CNN() if args.dataset == 'cifar' else MLP()
    _, kwargs = misc.split_optim_dict(misc.optim_dict[opt])
    optimizer = misc.task_to_optimizer(opt)(params=net.parameters(), **kwargs)
    optimizer = misc.wrap_optimizer(opt, optimizer)
    return fit(net, data, optimizer, num_epochs=args.num_epochs, lr_schedule=True)
def do_stuff(opt):
    print(f'\nTraining {opt} for {args.num_epochs} epochs...')
    net = CNN() if args.dataset == 'cifar' else MLP()
    _, kwargs = misc.split_optim_dict(misc.optim_dict[opt])
    optimizer = misc.task_to_optimizer(opt)(params=net.parameters(), **kwargs)
    if 'lookahead' in opt.lower():
        optimizer = optimizers.Lookahead(optimizer, k=5, alpha=0.5)
    return fit(net, data, optimizer, num_epochs=args.num_epochs, lr_schedule=True)
def define_agent(self, width, height, num_actions):
    return NStepDQNAgent(config=Config(
        num_actions=num_actions,
        encoder=LayerEncoder(width, height, treasure_position=True),
        optimizer=SharedAdamOptimizer(0.001),
        network=CNN(hidden_units=[128]),
        policy=EpsilonGreedyPolicy(1, 0.01, 25000),
        discount=0.95,
        n_step=16))
def define_agent(self, width, height, num_actions):
    return DQNAgent(config=Config(
        num_actions=num_actions,
        encoder=LayerEncoder(width, height, treasure_position=True),
        optimizer=AdamOptimizer(0.001),
        network=CNN(hidden_units=[128]),
        policy=EpsilonGreedyPolicy(1, 0.01, 50000),
        discount=0.95,
        capacity=10000,
        batch_size=8,
        target_sync=100,
        double_q=True))
def init_value_network(self, shared_network=None, activation='linear', loss='mse'):
    if self.rl_method == 'ddpg':
        self.critic = CriticNetwork(
            input_dim=self.num_features, output_dim=self.agent.NUM_ACTIONS,
            num_steps=self.num_steps, activation=activation, loss=loss,
            lr=self.lr)
    elif self.net == 'dnn':
        self.value_network = DNN(
            input_dim=self.num_features, output_dim=self.agent.NUM_ACTIONS,
            lr=self.lr, shared_network=shared_network,
            activation=activation, loss=loss)
    elif self.net == 'lstm':
        self.value_network = LSTMNetwork(
            input_dim=self.num_features, output_dim=self.agent.NUM_ACTIONS,
            lr=self.lr, num_steps=self.num_steps,
            shared_network=shared_network, activation=activation, loss=loss)
    elif self.net == 'cnn':
        self.value_network = CNN(
            input_dim=self.num_features, output_dim=self.agent.NUM_ACTIONS,
            lr=self.lr, num_steps=self.num_steps,
            shared_network=shared_network, activation=activation, loss=loss)
    if self.reuse_models and os.path.exists(self.value_network_path):
        self.value_network.load_model(model_path=self.value_network_path)
        labels.append(label.item())
    p, r, f, _ = report(labels, preds2)
    ps_2 += [p]
    rs_2 += [r]
    fs_2 += [f]
    # '''
    print('1.5+ PRF on', name, ':')
    print('\tprecision:', p)
    print('\trecall:', r)
    print('\tf score:', f)
    # '''
    return ps_1, rs_1, fs_1, ps_2, rs_2, fs_2


if __name__ == '__main__':
    cnn = CNN('./cnn_config_1.5T_optimal.json', 0)
    # valid_optimal_accu = cnn.train()
    cnn.epoch = 146
    cnn.model.load_state_dict(torch.load(
        '{}CNN_{}.pth'.format(cnn.checkpoint_dir, cnn.epoch)))
    cnn.test()  # valid = 89.02  # test = 84.15
    # gan = GAN('./gan_config_optimal.json', 0)
    # gan.train()
    # '''
    # gan.optimal_epoch = 105
    # gan.netG.load_state_dict(torch.load('{}G_{}.pth'.format(gan.checkpoint_dir, gan.optimal_epoch)))
    # print(gan.valid_model_epoch())
    # gan.test(False)
    # gan.eval_iqa(zoom=False, metric='brisque')
class ReinforcementLearner:
    __metaclass__ = abc.ABCMeta
    lock = threading.Lock()

    def __init__(self, rl_method='rl', stock_code=None, chart_data=None,
                 training_data=None, min_trading_unit=1, max_trading_unit=2,
                 delayed_reward_threshold=.05, net='dnn', num_steps=1, lr=0.001,
                 value_network=None, policy_network=None, output_path='',
                 reuse_models=True):
        # Validate arguments
        assert min_trading_unit > 0
        assert max_trading_unit > 0
        assert max_trading_unit >= min_trading_unit
        assert num_steps > 0
        assert lr > 0
        # Reinforcement learning method
        self.rl_method = rl_method
        # Environment
        self.stock_code = stock_code
        self.chart_data = chart_data
        self.environment = Environment(chart_data)
        # Agent
        self.agent = Agent(self.environment,
                           min_trading_unit=min_trading_unit,
                           max_trading_unit=max_trading_unit,
                           delayed_reward_threshold=delayed_reward_threshold)
        # Training data
        self.training_data = training_data
        self.sample = None
        self.training_data_idx = -1
        # Feature vector size = training data vector size + agent state size
        self.num_features = self.agent.STATE_DIM
        if self.training_data is not None:
            self.num_features += self.training_data.shape[1]
        # Network settings
        self.net = net
        self.num_steps = num_steps
        self.lr = lr
        self.value_network = value_network
        self.policy_network = policy_network
        self.reuse_models = reuse_models
        self.critic = value_network
        self.actor = policy_network
        self.tau = 0.001
        # Visualization module
        self.visualizer = Visualizer()
        # Memory
        self.memory_sample = []
        self.memory_action = []
        self.memory_reward = []
        self.memory_value = []
        self.memory_policy = []
        self.memory_target_policy = []
        self.memory_target_value = []
        self.memory_target_action = []
        self.memory_pv = []
        self.memory_num_stocks = []
        self.memory_exp_idx = []
        self.memory_learning_idx = []
        # Per-epoch bookkeeping
        self.loss = 0.
        self.itr_cnt = 0
        self.exploration_cnt = 0
        self.batch_size = 0
        self.learning_cnt = 0
        # Output path for logs and visualizations
        self.output_path = output_path

    def init_policy_network(self, shared_network=None, activation='sigmoid',
                            loss='binary_crossentropy'):
        if self.rl_method == 'ddpg':
            print("actor")
            self.actor = ActorNetwork(
                input_dim=self.num_features, output_dim=self.agent.NUM_ACTIONS,
                num_steps=self.num_steps, activation=activation, loss=loss,
                lr=self.lr)
            print(self.actor)
        elif self.net == 'dnn':
            self.policy_network = DNN(
                input_dim=self.num_features, output_dim=self.agent.NUM_ACTIONS,
                lr=self.lr, shared_network=shared_network,
                activation=activation, loss=loss)
        elif self.net == 'lstm':
            self.policy_network = LSTMNetwork(
                input_dim=self.num_features, output_dim=self.agent.NUM_ACTIONS,
                lr=self.lr, num_steps=self.num_steps,
                shared_network=shared_network, activation=activation, loss=loss)
        elif self.net == 'cnn':
            self.policy_network = CNN(
                input_dim=self.num_features, output_dim=self.agent.NUM_ACTIONS,
                lr=self.lr, num_steps=self.num_steps,
                shared_network=shared_network, activation=activation, loss=loss)
        if self.reuse_models and os.path.exists(self.policy_network_path):
            self.policy_network.load_model(model_path=self.policy_network_path)

    def init_value_network(self, shared_network=None, activation='linear',
                           loss='mse'):
        if self.rl_method == 'ddpg':
            self.critic = CriticNetwork(
                input_dim=self.num_features, output_dim=self.agent.NUM_ACTIONS,
                num_steps=self.num_steps, activation=activation, loss=loss,
                lr=self.lr)
        elif self.net == 'dnn':
            self.value_network = DNN(
                input_dim=self.num_features, output_dim=self.agent.NUM_ACTIONS,
                lr=self.lr, shared_network=shared_network,
                activation=activation, loss=loss)
        elif self.net == 'lstm':
            self.value_network = LSTMNetwork(
                input_dim=self.num_features, output_dim=self.agent.NUM_ACTIONS,
                lr=self.lr, num_steps=self.num_steps,
                shared_network=shared_network, activation=activation, loss=loss)
        elif self.net == 'cnn':
            self.value_network = CNN(
                input_dim=self.num_features, output_dim=self.agent.NUM_ACTIONS,
                lr=self.lr, num_steps=self.num_steps,
                shared_network=shared_network, activation=activation, loss=loss)
        if self.reuse_models and os.path.exists(self.value_network_path):
            self.value_network.load_model(model_path=self.value_network_path)

    def reset(self):
        self.sample = None
        self.training_data_idx = -1
        # Reset the environment
        self.environment.reset()
        # Reset the agent
        self.agent.reset()
        # Reset the visualizer
        self.visualizer.clear([0, len(self.chart_data)])
        # Reset memory
        self.memory_sample = []
        self.memory_action = []
        self.memory_target_policy = []
        self.memory_target_value = []
        self.memory_target_action = []
        self.memory_reward = []
        self.memory_value = []
        self.memory_policy = []
        self.memory_pv = []
        self.memory_num_stocks = []
        self.memory_exp_idx = []
        self.memory_learning_idx = []
        # Reset per-epoch bookkeeping
        self.loss = 0.
        self.itr_cnt = 0
        self.exploration_cnt = 0
        self.batch_size = 0
        self.learning_cnt = 0

    def build_sample(self):
        self.environment.observe()
        if len(self.training_data) > self.training_data_idx + 1:
            self.training_data_idx += 1
            self.sample = self.training_data.iloc[
                self.training_data_idx].tolist()
            self.sample.extend(self.agent.get_states())
            return self.sample
        return None

    @abc.abstractmethod
    def get_batch(self, batch_size, delayed_reward, discount_factor):
        pass

    @abc.abstractmethod
    def train(self, batch_size, delayed_reward, discount_factor):
        pass

    def update_networks(self, batch_size, delayed_reward, discount_factor):
        # Build a mini-batch of training data
        x, y_value, y_policy = self.get_batch(
            batch_size, delayed_reward, discount_factor)
        if len(x) > 0:
            loss = 0
            if y_value is not None:
                # Update the value network (critic)
                loss += self.critic.train_on_batch(x, y_value)
                self.critic.transfer_weights()
            if y_policy is not None:
                # Update the policy network (actor)
                loss += self.actor.train_on_batch(x, y_policy)
                self.actor.transfer_weights()
            return loss
        return None

    def fit(self, delayed_reward, discount_factor, full=False):
        batch_size = len(self.memory_reward) if full else self.batch_size
        # Build the batch and update the networks
        if batch_size > 0:
            _loss = self.update_networks(
                batch_size, delayed_reward, discount_factor)
            if _loss is not None:
                self.loss += abs(_loss)
                self.learning_cnt += 1
                self.memory_learning_idx.append(self.training_data_idx)
            self.batch_size = 0

    def visualize(self, epoch_str, num_epoches, epsilon):
        self.memory_action = [Agent.ACTION_HOLD] \
            * (self.num_steps - 1) + self.memory_action
        self.memory_num_stocks = [0] * (self.num_steps - 1) \
            + self.memory_num_stocks
        if self.value_network is not None:
            self.memory_value = [np.array([np.nan] * len(Agent.ACTIONS))] \
                * (self.num_steps - 1) + self.memory_value
        if self.policy_network is not None:
            self.memory_policy = [np.array([np.nan] * len(Agent.ACTIONS))] \
                * (self.num_steps - 1) + self.memory_policy
        self.memory_pv = [self.agent.initial_balance] \
            * (self.num_steps - 1) + self.memory_pv
        self.visualizer.plot(
            epoch_str=epoch_str, num_epoches=num_epoches, epsilon=epsilon,
            action_list=Agent.ACTIONS, actions=self.memory_action,
            num_stocks=self.memory_num_stocks,
            outvals_value=self.memory_value,
            outvals_policy=self.memory_policy,
            exps=self.memory_exp_idx,
            learning_idxes=self.memory_learning_idx,
            initial_balance=self.agent.initial_balance,
            pvs=self.memory_pv,
        )
        self.visualizer.save(os.path.join(
            self.epoch_summary_dir,
            'epoch_summary_{}.png'.format(epoch_str)))

    def run(self, num_epoches=100, balance=10000000, discount_factor=0.9,
            start_epsilon=0.5, learning=True):
        info = "[{code}] RL:{rl} Net:{net} LR:{lr} " \
            "DF:{discount_factor} TU:[{min_trading_unit}," \
            "{max_trading_unit}] DRT:{delayed_reward_threshold}".format(
                code=self.stock_code, rl=self.rl_method, net=self.net,
                lr=self.lr, discount_factor=discount_factor,
                min_trading_unit=self.agent.min_trading_unit,
                max_trading_unit=self.agent.max_trading_unit,
                delayed_reward_threshold=self.agent.delayed_reward_threshold)
        with self.lock:
            logging.info(info)

        # Start time
        time_start = time.time()

        # Prepare visualization
        # The chart data does not change, so visualize it up front.
        self.visualizer.prepare(self.environment.chart_data, info)

        # Prepare the folder for per-epoch visualization results
        self.epoch_summary_dir = os.path.join(
            self.output_path, 'epoch_summary_{}'.format(self.stock_code))
        if not os.path.isdir(self.epoch_summary_dir):
            os.makedirs(self.epoch_summary_dir)
        else:
            for f in os.listdir(self.epoch_summary_dir):
                os.remove(os.path.join(self.epoch_summary_dir, f))

        # Set the agent's initial balance
        self.agent.set_balance(balance)

        # Training statistics
        max_portfolio_value = 0
        epoch_win_cnt = 0

        # Training loop
        for epoch in range(num_epoches):
            time_start_epoch = time.time()

            # Queue that collects num_steps samples per step
            q_sample = collections.deque(maxlen=self.num_steps)

            # Reset the environment, agent, networks, visualizer and memory
            self.reset()

            # Decay the exploration rate as training progresses
            if learning:
                epsilon = start_epsilon \
                    * (1. - float(epoch) / (num_epoches - 1))
                self.agent.reset_exploration()
            else:
                epsilon = start_epsilon

            while True:
                # Build the next sample
                next_sample = self.build_sample()
                if next_sample is None:
                    break

                # Collect num_steps samples before predicting
                q_sample.append(next_sample)
                if len(q_sample) < self.num_steps:
                    continue

                # Value and policy network predictions
                pred_value = None
                pred_policy = None
                pred_target_policy = None
                pred_target_value = None
                if self.critic is not None:
                    pred_value = self.critic.predict(list(q_sample))
                    pred_target_value = self.critic.target_predict(
                        list(q_sample))
                if self.actor is not None:
                    pred_policy = self.actor.predict(list(q_sample))
                    pred_target_policy = self.actor.target_predict(
                        list(q_sample))

                # Decide an action from the networks or by exploration
                action, confidence, exploration = \
                    self.agent.decide_action(pred_value, pred_policy, epsilon)

                # Decide an action using the target networks' outputs
                target_action, target_confidence, target_exploration = \
                    self.agent.decide_action(
                        pred_target_policy, pred_target_value, epsilon)

                # Perform the action; collect the immediate and delayed rewards
                immediate_reward, delayed_reward = \
                    self.agent.act(action, confidence)

                # Remember the action and its outcome
                self.memory_sample.append(list(q_sample))
                self.memory_action.append(action)
                self.memory_reward.append(immediate_reward)
                self.memory_target_action.append(target_action)
                self.memory_target_policy.append(pred_target_policy)
                self.memory_target_value.append(pred_target_value)
                if self.value_network is not None:
                    self.memory_value.append(pred_value)
                if self.policy_network is not None:
                    self.memory_policy.append(pred_policy)
                self.memory_pv.append(self.agent.portfolio_value)
                self.memory_num_stocks.append(self.agent.num_stocks)
                if exploration:
                    self.memory_exp_idx.append(self.training_data_idx)

                # Update iteration counters
                self.batch_size += 1
                self.itr_cnt += 1
                self.exploration_cnt += 1 if exploration else 0

                # Mini-batch training whenever a delayed reward occurs
                if learning and (delayed_reward != 0):
                    self.fit(delayed_reward, discount_factor)

            # Train once more after the epoch ends
            if learning:
                self.fit(self.agent.profitloss, discount_factor, full=True)

            # Log epoch statistics
            num_epoches_digit = len(str(num_epoches))
            epoch_str = str(epoch + 1).rjust(num_epoches_digit, '0')
            time_end_epoch = time.time()
            elapsed_time_epoch = time_end_epoch - time_start_epoch
            if self.learning_cnt > 0:
                logging.info("[{}][Epoch {}/{}] Epsilon:{:.4f} "
                             "#Expl.:{}/{} #Buy:{} #Sell:{} #Hold:{} "
                             "#Stocks:{} PV:{:,.0f} "
                             "LC:{} Loss:{:.6f} ET:{:.4f}".format(
                                 self.stock_code, epoch_str, num_epoches,
                                 epsilon, self.exploration_cnt, self.itr_cnt,
                                 self.agent.num_buy, self.agent.num_sell,
                                 self.agent.num_hold, self.agent.num_stocks,
                                 self.agent.portfolio_value, self.learning_cnt,
                                 self.loss, elapsed_time_epoch))

            # Visualize epoch information
            self.visualize(epoch_str, num_epoches, epsilon)

            # Update training statistics
            max_portfolio_value = max(
                max_portfolio_value, self.agent.portfolio_value)
            if self.agent.portfolio_value > self.agent.initial_balance:
                epoch_win_cnt += 1

        # End time
        time_end = time.time()
        elapsed_time = time_end - time_start

        # Log overall training statistics
        with self.lock:
            logging.info("[{code}] Elapsed Time:{elapsed_time:.4f} "
                         "Max PV:{max_pv:,.0f} #Win:{cnt_win}".format(
                             code=self.stock_code, elapsed_time=elapsed_time,
                             max_pv=max_portfolio_value,
                             cnt_win=epoch_win_cnt))

    def save_models(self):
        if self.value_network is not None and \
                self.value_network_path is not None:
            self.value_network.save_model(self.value_network_path)
        if self.policy_network is not None and \
                self.policy_network_path is not None:
            self.policy_network.save_model(self.policy_network_path)
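# A minimal, illustrative sketch (not the repository's actual ActorNetwork /
# CriticNetwork code) of what transfer_weights() typically does in DDPG-style
# training: a Polyak (soft) update of the target network toward the online
# network using the small mixing factor tau (self.tau = 0.001 above).
# Keras-style get_weights()/set_weights() lists of numpy arrays are assumed.
def soft_update(online_model, target_model, tau=0.001):
    online_w = online_model.get_weights()
    target_w = target_model.get_weights()
    # Blend each weight tensor: target <- tau * online + (1 - tau) * target
    mixed = [tau * ow + (1.0 - tau) * tw for ow, tw in zip(online_w, target_w)]
    target_model.set_weights(mixed)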
def pre_train(dataloader, test_loader, dict_loader, dataloader_test, mask_labels,
              total_epochs=50, learning_rate=1e-4, use_gpu=True, seed=123):
    args = parser.parse_args()
    pprint(args)
    num_bits = args.num_bits
    model = CNN(model_name='alexnet', bit=num_bits, class_num=args.num_class)
    criterion = custom_loss(num_bits=num_bits)
    arch = 'cnn_'
    filename = arch + args.dataset + '_' + str(num_bits) + "bits"
    checkpoint_filename = os.path.join(args.checkpoint, filename + '.pt')
    if use_gpu:
        model = model.cuda()
        model = torch.nn.DataParallel(
            model, device_ids=range(torch.cuda.device_count()))
        criterion = criterion.cuda()
        torch.cuda.manual_seed(seed)

    running_loss = 0.0
    start_epoch = 0
    batch_time = AverageMeter()
    data_time = AverageMeter()
    end = time.time()
    best_prec = -99999

    k = 10500
    n_samples = 200000
    alpha = 0.4
    alpha_1 = 0.99

    mask_labels = torch.from_numpy(mask_labels).long().cuda()

    Z_h1 = torch.zeros(n_samples, num_bits).float().cuda()  # intermediate values
    z_h1 = torch.zeros(n_samples, num_bits).float().cuda()  # temporal outputs
    h1 = torch.zeros(n_samples, num_bits).float().cuda()    # current outputs

    Z_h2 = torch.zeros(args.anchor_num, num_bits).float().cuda()  # intermediate values
    z_h2 = torch.zeros(args.anchor_num, num_bits).float().cuda()  # temporal outputs
    h2 = torch.zeros(args.anchor_num, num_bits).float().cuda()    # current outputs

    for epoch in range(start_epoch, total_epochs):
        model.train(True)
        rampup_value = rampup(epoch)
        rampdown_value = rampdown(epoch)
        learning_rate = rampup_value * rampdown_value * 0.00005
        adam_beta1 = rampdown_value * 0.9 + (1.0 - rampdown_value) * 0.5
        adam_beta2 = step_rampup(epoch) * 0.99 + (1 - step_rampup(epoch)) * 0.999
        if epoch == 0:
            u_w = 0.0
        else:
            u_w = rampup_value
        u_w_m = u_w * 5
        u_w_m = torch.autograd.Variable(
            torch.FloatTensor([u_w_m]).cuda(), requires_grad=False)

        optimizer = Adam(model.parameters(), lr=learning_rate,
                         betas=(adam_beta1, adam_beta2), eps=1e-8, amsgrad=True)

        anchors_data, anchor_Label = generate_anchor_vectors(dict_loader)

        for iteration, data in enumerate(dataloader, 0):
            anchor_index = np.arange(args.anchor_num)
            np.random.shuffle(anchor_index)
            anchor_index = anchor_index[:100]
            anchor_index = torch.from_numpy(anchor_index).long().cuda()

            anchor_inputs = anchors_data[anchor_index, :, :, :]
            anchor_labels = anchor_Label[anchor_index, :]

            inputs, labels, index = data['image'], data['labels'], data['index']
            labels = labels.float()

            mask_flag = Variable(mask_labels[index], requires_grad=False)
            idx = (mask_flag > 0)

            if index.shape[0] == args.batch_size:
                anchor_batch_S, anchor_batch_W = CalcSim(
                    labels[idx, :].cuda(), anchor_labels.cuda())

                if inputs.size(3) == 3:
                    inputs = inputs.permute(0, 3, 1, 2)
                inputs = inputs.type(torch.FloatTensor)

                zcomp_h1 = z_h1[index.cuda(), :]
                zcomp_h2 = z_h2[anchor_index, :]

                labeled_batch_S, labeled_batch_W = CalcSim(
                    labels[idx, :].cuda(), labels[idx, :].cuda())

                if use_gpu:
                    inputs = Variable(inputs.cuda(), requires_grad=False)
                    anchor_batch_S = Variable(anchor_batch_S.cuda(), requires_grad=False)
                    anchor_batch_W = Variable(anchor_batch_W.cuda(), requires_grad=False)
                    labeled_batch_S = Variable(labeled_batch_S.cuda(), requires_grad=False)
                    labeled_batch_W = Variable(labeled_batch_W.cuda(), requires_grad=False)

                # zero the parameter gradients
                optimizer.zero_grad()

                y_h1 = model(inputs)
                y_h2 = model(anchor_inputs)

                y = F.sigmoid(48 / num_bits * 0.4 * torch.matmul(y_h1, y_h2.permute(1, 0)))

                loss, l_batch_loss, m_loss = criterion(
                    y, y_h1, y_h2, anchor_batch_S, anchor_batch_W,
                    labeled_batch_S, labeled_batch_W, zcomp_h1, zcomp_h2,
                    mask_flag, u_w_m, epoch, num_bits)

                h1[index, :] = y_h1.data.clone()
                h2[anchor_index, :] = y_h2.data.clone()

                # backward + optimize
                loss.backward()
                optimizer.step()
                running_loss += loss.item()

                Z_h2 = alpha_1 * Z_h2 + (1. - alpha_1) * h2
                z_h2 = Z_h2 * (1. / (1. - alpha_1 ** (epoch + 1)))

                print("Epoch[{}]({}/{}): Time:(data {:.3f}/ batch {:.3f}) "
                      "Loss_H: {:.4f}/{:.4f}/{:.4f}".format(
                          epoch, iteration, len(dataloader), data_time.val,
                          batch_time.val, loss.item(), l_batch_loss.item(),
                          m_loss.item()))

        Z_h1 = alpha * Z_h1 + (1. - alpha) * h1
        z_h1 = Z_h1 * (1. / (1. - alpha ** (epoch + 1)))

        if epoch % 1 == 0:
            MAP = helpers.validate(model, dataloader_test, test_loader)
            print("Test image map is:{}".format(MAP))
            is_best = MAP > best_prec
            best_prec = max(best_prec, MAP)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                },
                is_best,
                prefix=arch,
                num_bits=num_bits,
                filename=checkpoint_filename)
    return model
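# A standalone sketch (illustrative only) of the ensembling update used above:
# Z accumulates an exponential moving average of the per-sample outputs h, and
# z divides by (1 - alpha ** (epoch + 1)) to correct the bias introduced by the
# zero initialization, mirroring the Z_h1 / z_h1 updates in pre_train().
import torch

alpha = 0.4
Z = torch.zeros(5, 8)          # running average of outputs (hypothetical shape)
for epoch in range(3):
    h = torch.randn(5, 8)      # stand-in for this epoch's network outputs
    Z = alpha * Z + (1.0 - alpha) * h
    z = Z / (1.0 - alpha ** (epoch + 1))   # bias-corrected temporal ensemble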
print('Running on', device)

print("Building Models")
print("SoftMax")
buildModel(SoftMax(), 0.1, True, True)
print("MLP")
buildModel(MLP(), 0.1, True, True)
print("DAE")
buildDAELayer(DAELayer(10304, 1000), lRate=1e-4, epochs=5000, plot=True)
buildDAELayer(DAELayer(1000, 300), lRate=1e-4, epochs=5000, plot=True)
buildDAESoftmaxModel(DAESoftMax(), lRate=1e-2, epochs=1000, plot=True)
print("CNN")
buildModel(CNN(), 0.001, True, True)

print("\nModel(s) is reconstructed with alpha =", 5000, "beta =", 100,
      "gamma =", 0.01, "delta =", 0.1)

print("Attacking Models")
# SoftMax Model from paper
print("Softmax")
reconstructionAttack(SoftMax())
# MLP Model from paper
print("MLP")
reconstructionAttack(MLP())
# DAE Model from paper
print("DAE")
reconstructionAttack(DAESoftMax())
import sys, os

sys.path.append("..")
sys.path.extend([
    os.path.join(root, name)
    for root, dirs, _ in os.walk("../")
    for name in dirs
])

from _config import NNConfig
from networks import CNN, LSTM, Encoder, Decoder

nnconfig = NNConfig()
nnconfig.show()

cnn = CNN("cnn_layer1")
lstm = LSTM("lstm_layer1")
cnn.show()
lstm.show()

encoder = Encoder(cnn)
decoder = Decoder(lstm)
import random
import sys

import gym
import torch

import wrappers  # assumed to be the project-local module providing make_env()
from networks import CNN

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Network file
path = "checkpoints/%s.pt" % sys.argv[1]

# Environment
env = gym.make("Breakout-v0")
env = wrappers.make_env(env)

# Network
q_network = CNN(4, env.action_space.n).to(device)
q_network.load_state_dict(torch.load(path, map_location=torch.device('cpu')))
q_network.eval()


# Greedy policy (random action with probability 0.05)
@torch.no_grad()
def select_action(q, state):
    if random.random() < 0.05:
        print('Random')
        return env.action_space.sample()
    k = q(state.to(device))
    print(k)
    return k.max(1)[1].view(1, 1)
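# A minimal evaluation rollout (a sketch, not part of the original script) that
# uses select_action() above. It assumes the classic gym step API, in which
# env.step() returns (observation, reward, done, info), and that
# wrappers.make_env produces observations that select_action can consume.
state = env.reset()
done, total_reward = False, 0.0
while not done:
    action = select_action(q_network, state)
    state, reward, done, _ = env.step(int(action))
    total_reward += reward
print('Episode reward:', total_reward)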
from networks import CNN, SoftMax, MLP
from modules import buildModel, reconstructionAttack

print("Building Models")
print("SoftMax")
buildModel(SoftMax(), 0.1, True, True)
print("MLP")
buildModel(MLP(), 0.1, True, True)
print("DAE")
buildDAELayer(DAELayer(10304, 1000), lRate=1e-4, epochs=5000, plot=True)
buildDAELayer(DAELayer(1000, 300), lRate=1e-4, epochs=5000, plot=True)
buildDAESoftmaxModel(DAESoftMax(), lRate=1e-2, epochs=1000, plot=True)
print("CNN")
buildModel(CNN(), 0.001, True, True)
if args.headless:
    # GPU optimization
    mpl.use('Agg')  # Suppress rendering
import matplotlib.pyplot as plt

env = gym.make(args.env)
if not args.toy:
    env = FrameStackWrapper(env, args.frames)
    env = ResetLifeLostWrapper(env)

epsilon_max = args.epsilon_max
epsilon_min = args.epsilon_min
epsilon_decay = args.epsilon_decay
eps_threshold = 1
total_moves = 0

net = CNN(conv_channels=args.frames, n_actions=env.action_space.n) if not args.toy \
    else FCN(n_actions=env.action_space.n)
if chainer.cuda.available:
    net.to_gpu()

optim = optimizers.RMSpropGraves(lr=args.alpha, momentum=args.momentum)
# optim = optimizers.RMSprop(lr=args.alpha)
# optim = optimizers.Adam(alpha=args.alpha)
optim.setup(net)

if not os.path.exists("results/{}".format(args.env)):
    os.makedirs("results/{}".format(args.env))

train()
class Mind:
    def __init__(self, n_actions, input_shape, save_path=None, load_path=None,
                 action_inverses={}, update_interval=256 * 2, save_interval=5):
        self.n_actions = n_actions
        self.save_path = save_path
        self.network = CNN(n_out=n_actions, input_shape=input_shape)
        if load_path is not None:
            self.network.load(load_path)
        self.n_features = input_shape
        self.data = []
        self.current_episode_count = 1
        self.random_actions = 0
        self.last_action = None
        self.last_action_random = False
        self.action_inverses = action_inverses
        self.lifetime = 1
        self.update_interval = update_interval
        self.save_interval = save_interval
        self.n_updates = 1

    def q(self, state):
        return self.network.predict(np.expand_dims(np.array(state), axis=0))[0]

    def should_explore(self, state):
        if np.random.random() < 1000 / (1000 + self.lifetime):
            return True
        return False

    def explore_action(self, state):
        return np.random.randint(0, self.n_actions)

    def action(self, state):
        q = self.q(state)
        # if self.last_action_random:
        #     if self.last_action in self.action_inverses:
        #         q[self.action_inverses[self.last_action]] = float('-inf')
        action = np.argmax(q)
        if self.should_explore(state):
            self.random_actions += 1
            action = self.explore_action(state)
            self.last_action_random = True
        else:
            self.last_action_random = False
        if self.lifetime % self.update_interval == 0:
            self.update(alpha=0.9)
            self.n_updates += 1
            if self.n_updates % self.save_interval == 0:
                if self.save_path is not None:
                    self.save(self.save_path)
                    print('saved')
        self.last_action = action
        self.current_episode_count += 1
        self.lifetime += 1
        return action

    def save(self, path):
        self.network.save(path)

    def reset(self):
        self.count = 1
        print('Random actions: ', self.random_actions)
        self.random_actions = 0

    def q_target(self, reward, best_next, alpha):
        return reward + alpha * best_next

    def feedback(self, old_action, old_state, reward, new_state):
        self.data.append({
            'Q_max': np.max(self.q(new_state)),
            'reward': reward,
            'old_state': old_state,
            'old_action': old_action
        })

    def update(self, alpha=0.6):
        np.random.shuffle(self.data)
        samples = self.data
        self.data = []
        states = []
        ys = []
        for sample in samples:
            y = self.q(sample['old_state'])
            y[sample['old_action']] = self.q_target(
                sample['reward'], best_next=sample['Q_max'], alpha=alpha)
            # y[sample['old_action']] = sarsa_target(sample['reward'], next_action=sample['Q_max'], alpha=alpha)
            states.append(sample['old_state'])
            ys.append(y)
        self.network.train(np.array(states), np.array(ys))
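# A tiny worked example (illustrative, outside the original class) of the
# one-step target computed by Mind.q_target():
#     target = reward + alpha * max_a' Q(s', a')
# using the alpha=0.9 value passed from Mind.action().
reward = 1.0
best_next = 2.5                        # max Q-value of the next state, as stored by feedback()
alpha = 0.9
target = reward + alpha * best_next    # 1.0 + 0.9 * 2.5 = 3.25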
class ReinforcementLearner:
    __metaclass__ = abc.ABCMeta
    lock = threading.Lock()

    # rl_method: the reinforcement learning technique; the value depends on the subclass
    #     (e.g. 'dqn' for DQNLearner, 'ac' for A2CLearner)
    # stock_code: code of the stock being trained on
    # chart_data: daily stock chart data (this is the environment)
    # training_data: preprocessed data used for training
    # min_trading_unit: minimum trading unit
    # max_trading_unit: maximum trading unit
    # delayed_reward_threshold: delayed-reward threshold; a delayed reward occurs when
    #     the profit or loss rate exceeds this threshold
    # mini_batch_size: ??
    # net: network type; determines which class is used for the value and policy networks
    # num_steps: size of the sample window used by the LSTM and CNN networks
    # lr: learning rate; too large and training does not converge, too small and it takes too long
    # value_network, policy_network: if given, use these models as the value / policy networks
    # output_path: where logs, visualization results and the trained network model files are saved
    def __init__(self, rl_method='rl', stock_code=None, chart_data=None,
                 training_data=None, min_trading_unit=1, max_trading_unit=2,
                 delayed_reward_threshold=.05, net='dnn', num_steps=1, lr=0.001,
                 value_network=None, policy_network=None, output_path='',
                 reuse_models=True):
        # Validate arguments
        assert min_trading_unit > 0
        assert max_trading_unit > 0
        assert max_trading_unit >= min_trading_unit
        assert num_steps > 0
        assert lr > 0
        # Reinforcement learning method
        self.rl_method = rl_method
        # Environment
        self.stock_code = stock_code  # stock code being learned
        self.chart_data = chart_data  # chart data of the stock
        self.environment = Environment(chart_data)  # RL environment object
        # Agent
        self.agent = Agent(self.environment,
                           min_trading_unit=min_trading_unit,
                           max_trading_unit=max_trading_unit,
                           delayed_reward_threshold=delayed_reward_threshold)
        # Training data
        self.training_data = training_data
        self.sample = None
        self.training_data_idx = -1
        # Feature vector size = training data vector size + agent state size
        self.num_features = self.agent.STATE_DIM
        if self.training_data is not None:
            self.num_features += self.training_data.shape[1]
        # The network objects themselves are created in subclasses of this class.
        # Network settings
        self.net = net
        self.num_steps = num_steps
        self.lr = lr
        self.value_network = value_network    # value network
        self.policy_network = policy_network  # policy network
        self.reuse_models = reuse_models
        # Visualization module
        self.visualizer = Visualizer()
        # Memory: lists that accumulate the data produced during training
        self.memory_sample = []        # training samples
        self.memory_action = []        # actions taken
        self.memory_reward = []        # rewards obtained
        self.memory_value = []         # predicted action values
        self.memory_policy = []        # predicted action probabilities
        self.memory_pv = []            # portfolio values
        self.memory_num_stocks = []    # number of stocks held
        self.memory_exp_idx = []       # exploration positions
        self.memory_learning_idx = []  # learning positions
        # Per-epoch bookkeeping
        self.loss = 0.             # loss
        self.itr_cnt = 0           # number of iterations
        self.exploration_cnt = 0   # number of exploration steps
        self.batch_size = 0        # batch size
        self.learning_cnt = 0      # number of learning updates
        # Output path for logs etc.
        self.output_path = output_path

    # Value network factory function.
    # The value network can be seen as a regression model of the profit/loss rate,
    # hence the linear activation and MSE loss.
    def init_value_network(self, shared_network=None, activation='linear',
                           loss='mse'):
        if self.net == 'dnn':
            self.value_network = DNN(
                input_dim=self.num_features, output_dim=self.agent.NUM_ACTIONS,
                lr=self.lr, shared_network=shared_network,
                activation=activation, loss=loss)
        elif self.net == 'lstm':
            self.value_network = LSTMNetwork(
                input_dim=self.num_features, output_dim=self.agent.NUM_ACTIONS,
                lr=self.lr, num_steps=self.num_steps,
                shared_network=shared_network, activation=activation, loss=loss)
        elif self.net == 'cnn':
            self.value_network = CNN(
                input_dim=self.num_features, output_dim=self.agent.NUM_ACTIONS,
                lr=self.lr, num_steps=self.num_steps,
                shared_network=shared_network, activation=activation, loss=loss)
        if self.reuse_models and os.path.exists(self.value_network_path):
            # If reuse_models is True and value_network_path exists, load the saved model file.
            self.value_network.load_model(model_path=self.value_network_path)

    # Policy network factory function.
    # The activation differs: the policy network is a classification model of which
    # action is best for increasing the portfolio value, so a sigmoid squashes the
    # output into [0, 1] and it can be read as a probability.
    def init_policy_network(self, shared_network=None, activation='sigmoid',
                            loss='binary_crossentropy'):
        if self.net == 'dnn':
            self.policy_network = DNN(
                input_dim=self.num_features, output_dim=self.agent.NUM_ACTIONS,
                lr=self.lr, shared_network=shared_network,
                activation=activation, loss=loss)
        elif self.net == 'lstm':
            self.policy_network = LSTMNetwork(
                input_dim=self.num_features, output_dim=self.agent.NUM_ACTIONS,
                lr=self.lr, num_steps=self.num_steps,
                shared_network=shared_network, activation=activation, loss=loss)
        elif self.net == 'cnn':
            self.policy_network = CNN(
                input_dim=self.num_features, output_dim=self.agent.NUM_ACTIONS,
                lr=self.lr, num_steps=self.num_steps,
                shared_network=shared_network, activation=activation, loss=loss)
        if self.reuse_models and os.path.exists(self.policy_network_path):
            self.policy_network.load_model(model_path=self.policy_network_path)

    # Epoch reset function: re-initializes the variables that accumulate data
    # during each epoch.
    def reset(self):
        self.sample = None           # the current training sample (None after reset)
        self.training_data_idx = -1  # -1 so the training data is read from the start again
        # Reset the environment
        self.environment.reset()
        # Reset the agent
        self.agent.reset()
        # Reset the visualizer
        self.visualizer.clear([0, len(self.chart_data)])
        # Reset memory
        self.memory_sample = []
        self.memory_action = []
        self.memory_reward = []
        self.memory_value = []
        self.memory_policy = []
        self.memory_pv = []
        self.memory_num_stocks = []
        self.memory_exp_idx = []
        self.memory_learning_idx = []
        # Reset per-epoch bookkeeping
        self.loss = 0.            # how far the network output is from the training data; should decrease
        self.itr_cnt = 0          # number of iterations performed
        self.exploration_cnt = 0  # number of exploration steps; with epsilon 0.1 and 100 decisions, about 10 random ones
        self.batch_size = 0       # size of the mini-batch to train on
        self.learning_cnt = 0     # number of mini-batch updates in this epoch

    # Builds one sample of the training data from the environment.
    def build_sample(self):
        self.environment.observe()  # advance the environment to the next index of the chart data
        if len(self.training_data) > self.training_data_idx + 1:  # is there training data left?
            self.training_data_idx += 1
            self.sample = self.training_data.iloc[
                self.training_data_idx].tolist()          # the sample has 26 values
            self.sample.extend(self.agent.get_states())   # add 2 agent states (28 values)
            return self.sample
        return None

    # Builds the batch training data; abstract method that subclasses must implement.
    @abc.abstractmethod
    def get_batch(self, batch_size, delayed_reward, discount_factor):
        pass

    # Trains the value and policy networks.
    # Calls get_batch to build the batch, then train_on_batch on each network.
    # Value network: DQN, AC, A2C
    # Policy network: PolicyGradient, AC, A2C
    def update_networks(self, batch_size, delayed_reward, discount_factor):
        # Build a mini-batch of training data
        x, y_value, y_policy = self.get_batch(
            batch_size, delayed_reward, discount_factor)
        if len(x) > 0:
            loss = 0
            if y_value is not None:
                # Update the value network
                loss += self.value_network.train_on_batch(x, y_value)
            if y_policy is not None:
                # Update the policy network
                loss += self.policy_network.train_on_batch(x, y_policy)
            return loss  # return the loss after training
        return None

    # Requests training of the value and policy networks: decides the batch size,
    # calls update_networks, and accumulates the total loss in self.loss.
    def fit(self, delayed_reward, discount_factor, full=False):
        batch_size = len(self.memory_reward) if full else self.batch_size
        # Build the batch and update the networks
        if batch_size > 0:
            _loss = self.update_networks(
                batch_size, delayed_reward, discount_factor)
            if _loss is not None:
                self.loss += abs(_loss)
                self.learning_cnt += 1  # loss / learning_cnt gives the per-epoch training loss
                self.memory_learning_idx.append(self.training_data_idx)  # record where learning happened
            self.batch_size = 0

    # Visualizes epoch information.
    def visualize(self, epoch_str, num_epoches, epsilon):
        self.memory_action = [Agent.ACTION_HOLD] \
            * (self.num_steps - 1) + self.memory_action
        self.memory_num_stocks = [0] * (self.num_steps - 1) \
            + self.memory_num_stocks
        if self.value_network is not None:
            self.memory_value = [np.array([np.nan] * len(Agent.ACTIONS))] \
                * (self.num_steps - 1) + self.memory_value
        if self.policy_network is not None:
            self.memory_policy = [np.array([np.nan] * len(Agent.ACTIONS))] \
                * (self.num_steps - 1) + self.memory_policy
        self.memory_pv = [self.agent.initial_balance] \
            * (self.num_steps - 1) + self.memory_pv
        self.visualizer.plot(
            epoch_str=epoch_str, num_epoches=num_epoches, epsilon=epsilon,
            action_list=Agent.ACTIONS, actions=self.memory_action,
            num_stocks=self.memory_num_stocks,
            outvals_value=self.memory_value,
            outvals_policy=self.memory_policy,
            exps=self.memory_exp_idx,
            learning_idxes=self.memory_learning_idx,
            initial_balance=self.agent.initial_balance,
            pvs=self.memory_pv,
        )
        self.visualizer.save(os.path.join(
            self.epoch_summary_dir,
            'epoch_summary_{}.png'.format(epoch_str)))

    # Runs reinforcement learning. This is the core method.
    def run(
            self,
            num_epoches=100,      # total number of training iterations; too large and training takes very long
            balance=10000000,     # initial capital
            discount_factor=0.9,  # discount applied to state-action values; rewards further in the past are weighted less
            start_epsilon=0.5,    # initial exploration rate
            learning=True         # True to produce trained value/policy models; False when simulating with already-trained models
    ):
        info = "[{code}] RL:{rl} Net:{net} LR:{lr} " \
            "DF:{discount_factor} TU:[{min_trading_unit}," \
            "{max_trading_unit}] DRT:{delayed_reward_threshold}".format(
                code=self.stock_code, rl=self.rl_method, net=self.net,
                lr=self.lr, discount_factor=discount_factor,
                min_trading_unit=self.agent.min_trading_unit,
                max_trading_unit=self.agent.max_trading_unit,
                delayed_reward_threshold=self.agent.delayed_reward_threshold)
        with self.lock:
            logging.info(info)  # log the RL settings

        # Start time
        time_start = time.time()

        # Prepare visualization
        # The chart data does not change, so visualize it up front.
        self.visualizer.prepare(self.environment.chart_data, info)

        # Prepare the folder for visualization results
        # Saved under an epoch_summary_ directory
        self.epoch_summary_dir = os.path.join(
            self.output_path, 'epoch_summary_{}'.format(self.stock_code))
        if not os.path.isdir(self.epoch_summary_dir):
            os.makedirs(self.epoch_summary_dir)
        else:
            for f in os.listdir(self.epoch_summary_dir):
                os.remove(os.path.join(self.epoch_summary_dir, f))

        # Set the agent's initial balance
        self.agent.set_balance(balance)

        # Initialize training statistics
        max_portfolio_value = 0
        epoch_win_cnt = 0

        # Training loop
        for epoch in range(num_epoches):
            time_start_epoch = time.time()

            # Queue that collects num_steps samples per step
            # Uses a deque; see https://opensourcedev.tistory.com/3
            q_sample = collections.deque(maxlen=self.num_steps)

            # Reset the environment, agent, networks, visualizer and memory
            self.reset()

            # Decay the exploration rate as training progresses
            if learning:
                epsilon = start_epsilon \
                    * (1. - float(epoch) / (num_epoches - 1))
                self.agent.reset_exploration()
            else:
                epsilon = start_epsilon
                self.agent.reset_exploration(alpha=0)

            while True:
                # Build the next sample
                next_sample = self.build_sample()
                if next_sample is None:
                    break  # the while loop runs as long as samples remain

                # Collect num_steps samples before predicting
                q_sample.append(next_sample)
                if len(q_sample) < self.num_steps:
                    continue

                # Value and policy network predictions
                # (each network's predict function is called)
                pred_value = None
                pred_policy = None
                if self.value_network is not None:
                    pred_value = self.value_network.predict(list(q_sample))
                if self.policy_network is not None:
                    pred_policy = self.policy_network.predict(list(q_sample))

                # Decide an action from the networks or by exploration:
                # the action, the confidence in the decision, and whether it was random exploration
                action, confidence, exploration = \
                    self.agent.decide_action(
                        pred_value, pred_policy, epsilon)

                # Perform the action; collect the immediate and delayed rewards
                immediate_reward, delayed_reward = \
                    self.agent.act(action, confidence)

                # Remember the action and its outcome
                self.memory_sample.append(list(q_sample))
                self.memory_action.append(action)
                self.memory_reward.append(immediate_reward)
                if self.value_network is not None:
                    self.memory_value.append(pred_value)
                if self.policy_network is not None:
                    self.memory_policy.append(pred_policy)
                self.memory_pv.append(self.agent.portfolio_value)
                self.memory_num_stocks.append(self.agent.num_stocks)
                if exploration:
                    self.memory_exp_idx.append(self.training_data_idx)

                # Update iteration counters
                self.batch_size += 1
                self.itr_cnt += 1
                self.exploration_cnt += 1 if exploration else 0  # conditional expression

                # Mini-batch training whenever a delayed reward occurs.
                # A delayed reward is given when the profit/loss rate exceeds
                # the delayed-reward threshold.
                if learning and (delayed_reward != 0):
                    self.fit(delayed_reward, discount_factor)

            # Train once more after the epoch ends (mini-batch training after the while loop)
            if learning:
                self.fit(self.agent.profitloss, discount_factor, full=True)

            # Log epoch statistics
            num_epoches_digit = len(str(num_epoches))
            epoch_str = str(epoch + 1).rjust(num_epoches_digit, '0')  # right-justify the epoch number
            time_end_epoch = time.time()
            elapsed_time_epoch = time_end_epoch - time_start_epoch
            if self.learning_cnt > 0:
                self.loss /= self.learning_cnt
                logging.info("[{}][Epoch {}/{}] Epsilon:{:.4f} "
                             "#Expl.:{}/{} #Buy:{} #Sell:{} #Hold:{} "
                             "#Stocks:{} PV:{:,.0f} "
                             "LC:{} Loss:{:.6f} ET:{:.4f}".format(
                                 self.stock_code, epoch_str, num_epoches,
                                 epsilon, self.exploration_cnt, self.itr_cnt,
                                 self.agent.num_buy, self.agent.num_sell,
                                 self.agent.num_hold, self.agent.num_stocks,
                                 self.agent.portfolio_value, self.learning_cnt,
                                 self.loss, elapsed_time_epoch))

            # Visualize epoch information
            self.visualize(epoch_str, num_epoches, epsilon)

            # Update training statistics
            max_portfolio_value = max(
                max_portfolio_value, self.agent.portfolio_value)
            if self.agent.portfolio_value > self.agent.initial_balance:
                epoch_win_cnt += 1

        # End time
        time_end = time.time()
        elapsed_time = time_end - time_start

        # Log overall training statistics
        with self.lock:
            logging.info("[{code}] Elapsed Time:{elapsed_time:.4f} "
                         "Max PV:{max_pv:,.0f} #Win:{cnt_win}".format(
                             code=self.stock_code, elapsed_time=elapsed_time,
                             max_pv=max_portfolio_value,
                             cnt_win=epoch_win_cnt))

    # Saves the value and policy network models.
    def save_models(self):
        if self.value_network is not None and \
                self.value_network_path is not None:
            self.value_network.save_model(self.value_network_path)
        if self.policy_network is not None and \
                self.policy_network_path is not None:
            self.policy_network.save_model(self.policy_network_path)
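# A short illustration (not from the original source) of the linear epsilon
# decay used in run(): epsilon = start_epsilon * (1 - epoch / (num_epoches - 1)),
# which falls from start_epsilon at epoch 0 to 0 at the last epoch.
start_epsilon, num_epoches = 0.5, 5
for epoch in range(num_epoches):
    epsilon = start_epsilon * (1.0 - float(epoch) / (num_epoches - 1))
    print(epoch, round(epsilon, 3))   # 0.5, 0.375, 0.25, 0.125, 0.0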
if model == 'all':
    # SoftMax Model from paper
    print("Softmax")
    reconstructionAttack(SoftMax(), alpha, beta, gamma, delta, True, False)
    # MLP Model from paper
    print("MLP")
    reconstructionAttack(MLP(), alpha, beta, gamma, delta, True, False)
    # DAE Model from paper
    print("DAE")
    reconstructionAttack(DAESoftMax(), alpha, beta, gamma, delta, True, False)
    # CNN for comparison
    print("CNN")
    reconstructionAttack(CNN(), alpha, beta, gamma, delta, True, False)
else:
    if model == 'Softmax':
        # SoftMax Model from paper
        print("Softmax")
        reconstructionAttack(SoftMax(), alpha, beta, gamma, delta, True, False)
    if model == 'MLP':
        # MLP Model from paper
        print("MLP")
        reconstructionAttack(MLP(), alpha, beta, gamma, delta, True, False)
    if model == 'DAE':
        # DAE Model from paper
        print("DAE")
        reconstructionAttack(DAESoftMax(), alpha, beta, gamma, delta, True, False)