def sample_mv_percents(self, phase):
    """Collect the 'main_mv_percent' value of every usable sample in a phase.

    For each symbol in ``stock_symbols`` the file
    ``<self.movement_path>/<symbol>.txt`` is scanned. The first
    tab-separated field of every line is parsed as a ``%Y-%m-%d`` date, and
    dates inside the phase window are handed to ``self._get_prices_and_ts``.
    The ``'main_mv_percent'`` entry of every truthy result is collected.

    Args:
        phase: forwarded unchanged to ``self._get_start_end_date``.

    Returns:
        A list of the collected ``'main_mv_percent'`` values, ordered by
        symbol and then by line order within each movement file.
    """
    # The window depends only on `phase`, so resolve it once instead of once
    # per symbol.
    # NOTE(review): the bounds are compared against ISO date *strings* below,
    # so they are presumably 'YYYY-MM-DD' strings -- confirm against
    # _get_start_end_date.
    start_date, end_date = self._get_start_end_date(phase)

    main_mv_percents = []
    for s in stock_symbols:
        stock_mv_path = os.path.join(str(self.movement_path), '{}.txt'.format(s))
        main_target_dates = []

        with open(stock_mv_path, 'r') as movement_f:
            for line in movement_f:
                # A blank (e.g. trailing) line has no date field and would
                # make strptime raise; skip it.
                if not line.strip():
                    continue
                date_field = line.split('\t')[0]
                main_target_date = datetime.strptime(date_field, '%Y-%m-%d').date()
                # Half-open window: start inclusive, end exclusive.
                if start_date <= main_target_date.isoformat() < end_date:
                    main_target_dates.append(main_target_date)

        for main_target_date in main_target_dates:
            prices_and_ts = self._get_prices_and_ts(s, main_target_date)
            # A falsy result means no sample could be built for this date.
            if not prices_and_ts:
                continue
            main_mv_percents.append(prices_and_ts['main_mv_percent'])

        logger.info('finished: {}'.format(s))

    return main_mv_percents
def assemble_graph(self):
    """Assemble the graph by running every build step, in order, on GPU 0.

    The steps are the instance's own ``_build_*`` methods followed by
    ``_create_optimizer``; all of them are invoked inside a single
    ``tf.device('/device:GPU:0')`` scope.
    """
    logger.info('Start graph assembling...')

    # Order matters: each name is resolved and called only after the previous
    # step has finished.
    build_steps = (
        '_build_placeholders',
        '_build_embeds',
        '_build_mie',
        '_build_vmd',
        '_build_temporal_att',
        '_build_ata',
        '_create_optimizer',
    )
    with tf.device('/device:GPU:0'):
        for step_name in build_steps:
            getattr(self, step_name)()
def unit_test_train(self):
    """Repeatedly optimise on one training batch as a quick smoke test.

    Opens a fresh session, initialises all variables (feeding the word table
    produced by ``self.pipe.init_word_table()``), draws a single batch from
    the training generator and runs the optimiser on that same batch until
    the local step counter reaches 100. Per-step statistics are printed via
    ``stat_logger.print_batch_stat``.
    """
    model = self.model

    def build_feed(batch):
        # Map every model placeholder to the matching entry of the batch.
        return {
            model.is_training_phase: True,
            model.batch_size: batch['batch_size'],
            model.stock_ph: batch['stock_batch'],
            model.T_ph: batch['T_batch'],
            model.n_words_ph: batch['n_words_batch'],
            model.n_msgs_ph: batch['n_msgs_batch'],
            model.y_ph: batch['y_batch'],
            model.price_ph: batch['price_batch'],
            model.mv_percent_ph: batch['mv_percent_batch'],
            model.word_ph: batch['word_batch'],
            model.ss_index_ph: batch['ss_index_batch'],
        }

    with tf.Session() as sess:
        table = self.pipe.init_word_table()
        init_feed = {self.model.word_table_init: table}
        sess.run(tf.global_variables_initializer(), feed_dict=init_feed)
        logger.info('Word table init: done!')
        logger.info('Model: {0}, start a new session!'.format(self.model.model_name))

        # Local step counter, seeded from the graph's global step.
        n_iter = self.model.global_step.eval()

        # Running totals (accumulated below but not read in this method).
        losses = []
        seen_samples = 0.0
        seen_correct = 0.0

        batch_stream = self.pipe.batch_gen(phase='train')
        batch = next(batch_stream)  # the one batch reused for every step

        while n_iter < 100:
            feed_dict = build_feed(batch)
            fetches = [
                self.model.y_T,
                self.model.y_T_,
                self.model.loss,
                self.model.optimize,
            ]
            batch_y, batch_y_, batch_loss, _ = sess.run(fetches, feed_dict)

            # Per-step bookkeeping.
            seen_samples += float(batch['batch_size'])
            losses.append(batch_loss)
            batch_n_acc = sess.run(metrics.n_accurate(y=batch_y, y_=batch_y_))
            seen_correct += float(batch_n_acc)

            stat_logger.print_batch_stat(n_iter, batch_loss, batch_n_acc,
                                         batch['batch_size'])
            n_iter += 1
def restore_and_test(self):
    """Restore the latest checkpoint and evaluate the model on the test phase.

    The checkpoint state is looked up in the directory that contains
    ``self.model.tf_checkpoint_file_path``. The evaluation result returned by
    ``self.generation(sess, phase='test')`` is printed through
    ``stat_logger.print_eval_res``.

    Raises:
        IOError: if no checkpoint state (or no checkpoint path) is found.
    """
    with tf.Session(config=self.tf_config) as sess:
        ckpt_dir = os.path.dirname(self.model.tf_checkpoint_file_path)
        ckpt_state = tf.train.get_checkpoint_state(ckpt_dir)

        # Guard clause: nothing to restore means nothing to test.
        if not (ckpt_state and ckpt_state.model_checkpoint_path):
            logger.info('Model: {0}: NOT found!'.format(self.model.model_name))
            raise IOError

        logger.info('Model: {0}, session restored!'.format(self.model.model_name))
        self.saver.restore(sess, ckpt_state.model_checkpoint_path)

        eval_res = self.generation(sess, phase='test')
        stat_logger.print_eval_res(eval_res)
def init_word_table(self):
    """Build the initial word-embedding table.

    The table has shape ``(vocab_size, self.word_embed_size)`` and starts
    with uniform random values. Unless ``self.word_embed_type`` equals
    ``'rand'``, rows are then overwritten with vectors read from the text
    file at ``self.glove_path``, where each line is a token followed by
    whitespace-separated floats. Only tokens present in the vocabulary index
    are used; ``'<unk>'`` and ``'unk'`` are both mapped to ``'UNK'``.

    Returns:
        numpy.ndarray: the initial embedding table.
    """
    # np.random.random samples [0.0, 1.0); rescale to [-1.0, 1.0).
    word_table_init = np.random.random((vocab_size, self.word_embed_size)) * 2 - 1

    # Compare by value: `is not` against a string literal tests object
    # identity, which is not guaranteed to match for equal strings.
    if self.word_embed_type != 'rand':
        n_replacement = 0
        vocab_id_dict = self.index_token(vocab, key='token')

        with io.open(self.glove_path, 'r', encoding='utf-8') as f:
            for line in f:
                fields = line.split()
                if not fields:  # blank line: no token to read
                    continue

                word = fields[0]
                if word in ('<unk>', 'unk'):  # unify UNK
                    word = 'UNK'
                if word not in vocab_id_dict:
                    continue

                # Parse the vector only for tokens we actually keep.
                embed = [float(embed_col) for embed_col in fields[1:]]
                word_table_init[vocab_id_dict[word]] = embed
                n_replacement += 1

        logger.info('ASSEMBLE: word table #replacement: {}'.format(n_replacement))

    return word_table_init
def train_and_dev(self):
    """Train the model, periodically checkpointing and evaluating on dev.

    Steps:
      1. Open a session and a summary ``FileWriter`` on
         ``self.model.tf_graph_path``.
      2. Initialise all variables, feeding the word table from
         ``self.pipe.init_word_table()``.
      3. If a checkpoint exists, restore every variable whose name is present
         in it (partial restore); otherwise start fresh.
      4. For each of ``self.model.n_epochs`` epochs, iterate over a new
         training batch generator and run one optimisation step per batch.
         Whenever the global step is at least ``self.silence_step`` and a
         multiple of ``self.skip_step``, print batch stats, save the model
         and evaluate on the dev phase.
      5. Print per-epoch loss/accuracy.

    The summary writer is closed even if training raises.
    """
    with tf.Session(config=self.tf_config) as sess:
        # prep: writer and init
        writer = tf.summary.FileWriter(self.model.tf_graph_path, sess.graph)
        try:
            # init all vars with tables
            feed_table_init = {self.model.word_table_init: self.pipe.init_word_table()}
            sess.run(tf.global_variables_initializer(), feed_dict=feed_table_init)
            logger.info('Word table init: done!')

            # prep: checkpoint
            checkpoint = tf.train.get_checkpoint_state(
                os.path.dirname(self.model.tf_checkpoint_file_path))
            if checkpoint and checkpoint.model_checkpoint_path:
                # restore only the variables that the checkpoint contains
                reader = tf.train.NewCheckpointReader(checkpoint.model_checkpoint_path)
                restore_dict = dict()
                # tf.global_variables() replaces the deprecated tf.all_variables()
                for v in tf.global_variables():
                    tensor_name = v.name.split(':')[0]  # drop the ':<index>' suffix
                    if reader.has_tensor(tensor_name):
                        print('has tensor: {0}'.format(tensor_name))
                        restore_dict[tensor_name] = v

                checkpoint_saver = tf.train.Saver(restore_dict)
                checkpoint_saver.restore(sess, checkpoint.model_checkpoint_path)
                logger.info('Model: {0}, session restored!'.format(self.model.model_name))
            else:
                logger.info('Model: {0}, start a new session!'.format(self.model.model_name))

            for epoch in range(self.model.n_epochs):
                logger.info('Epoch: {0}/{1} start'.format(epoch+1, self.model.n_epochs))

                # training phase
                train_batch_loss_list = list()
                epoch_size, epoch_n_acc = 0.0, 0.0

                train_batch_gen = self.pipe.batch_gen(phase='train')  # a new gen for a new epoch

                for train_batch_dict in train_batch_gen:
                    feed_dict = {
                        self.model.is_training_phase: True,
                        self.model.batch_size: train_batch_dict['batch_size'],
                        self.model.stock_ph: train_batch_dict['stock_batch'],
                        self.model.T_ph: train_batch_dict['T_batch'],
                        self.model.n_words_ph: train_batch_dict['n_words_batch'],
                        self.model.n_msgs_ph: train_batch_dict['n_msgs_batch'],
                        self.model.y_ph: train_batch_dict['y_batch'],
                        self.model.price_ph: train_batch_dict['price_batch'],
                        self.model.mv_percent_ph: train_batch_dict['mv_percent_batch'],
                        self.model.word_ph: train_batch_dict['word_batch'],
                        self.model.ss_index_ph: train_batch_dict['ss_index_batch'],
                    }

                    ops = [self.model.y_T, self.model.y_T_, self.model.loss,
                           self.model.optimize, self.model.global_step]
                    train_batch_y, train_batch_y_, train_batch_loss, _, n_iter = sess.run(
                        ops, feed_dict)

                    # training batch stat
                    epoch_size += float(train_batch_dict['batch_size'])
                    train_batch_loss_list.append(train_batch_loss)
                    # NOTE(review): the value returned by metrics.n_accurate is
                    # passed to sess.run, so it presumably creates new graph ops
                    # on every batch -- confirm against metrics and consider
                    # building it once.
                    train_batch_n_acc = sess.run(
                        metrics.n_accurate(y=train_batch_y, y_=train_batch_y_))
                    epoch_n_acc += float(train_batch_n_acc)

                    # save model and generation
                    if n_iter >= self.silence_step and n_iter % self.skip_step == 0:
                        stat_logger.print_batch_stat(n_iter, train_batch_loss,
                                                     train_batch_n_acc,
                                                     train_batch_dict['batch_size'])
                        self.saver.save(sess, self.model.tf_saver_path, n_iter)
                        res = self.generation(sess, phase='dev')
                        stat_logger.print_eval_res(res)

                # print training epoch stat
                epoch_loss, epoch_acc = metrics.basic_train_stat(
                    train_batch_loss_list, epoch_n_acc, epoch_size)
                stat_logger.print_epoch_stat(epoch_loss=epoch_loss, epoch_acc=epoch_acc)
        finally:
            # Close the summary file even when training fails part-way.
            writer.close()
def __init__(self):
    """Load hyper-parameters from ``config_model`` and derive names and paths.

    Besides copying configuration values onto the instance, this creates the
    weight/bias initializers and the non-trainable ``global_step`` variable,
    builds ``self.model_name`` from a subset of the settings, derives the
    graph/checkpoint paths from ``path_parser``, and finally asserts that the
    categorical options hold supported values.
    """
    logger.info('INIT: #stock: {0}, #vocab+1: {1}'.format(ss_size, vocab_size))

    cfg = config_model  # short local alias

    # --- optimisation settings ---
    self.mode = cfg['mode']
    self.opt = cfg['opt']
    self.lr = cfg['lr']
    self.decay_step = cfg['decay_step']
    self.decay_rate = cfg['decay_rate']
    self.momentum = cfg['momentum']

    # --- KL weighting ---
    self.kl_lambda_anneal_rate = cfg['kl_lambda_anneal_rate']
    self.kl_lambda_start_step = cfg['kl_lambda_start_step']
    self.use_constant_kl_lambda = cfg['use_constant_kl_lambda']
    self.constant_kl_lambda = cfg['constant_kl_lambda']

    self.daily_att = cfg['daily_att']
    self.alpha = cfg['alpha']

    self.clip = cfg['clip']
    self.n_epochs = cfg['n_epochs']
    # Kept only so the batch size can appear in the model name below.
    self.batch_size_for_name = cfg['batch_size']

    # --- input size limits ---
    self.max_n_days = cfg['max_n_days']
    self.max_n_msgs = cfg['max_n_msgs']
    self.max_n_words = cfg['max_n_words']

    # --- initializers ---
    self.weight_init = cfg['weight_init']
    use_uniform = bool(self.weight_init == 'xavier-uniform')
    self.initializer = tf.contrib.layers.xavier_initializer(uniform=use_uniform)
    self.bias_initializer = tf.constant_initializer(0.0, dtype=tf.float32)

    # --- embedding sizes ---
    self.word_embed_type = cfg['word_embed_type']
    self.y_size = cfg['y_size']
    self.word_embed_size = cfg['word_embed_size']
    self.stock_embed_size = cfg['stock_embed_size']
    self.price_embed_size = cfg['word_embed_size']  # read from the word_embed_size key

    # --- cell / variant choices ---
    self.mel_cell_type = cfg['mel_cell_type']
    self.variant_type = cfg['variant_type']
    self.vmd_cell_type = cfg['vmd_cell_type']
    self.vmd_rec = cfg['vmd_rec']

    # --- hidden sizes (several attributes read the same config key) ---
    self.mel_h_size = cfg['mel_h_size']
    self.msg_embed_size = cfg['mel_h_size']
    self.corpus_embed_size = cfg['mel_h_size']
    self.h_size = cfg['h_size']
    self.z_size = cfg['h_size']
    self.g_size = cfg['g_size']

    # --- batch-norm switches ---
    self.use_in_bn = cfg['use_in_bn']
    self.use_o_bn = cfg['use_o_bn']
    self.use_g_bn = cfg['use_g_bn']

    # --- dropout settings ---
    self.dropout_train_mel_in = cfg['dropout_mel_in']
    self.dropout_train_mel = cfg['dropout_mel']
    self.dropout_train_ce = cfg['dropout_ce']
    self.dropout_train_vmd_in = cfg['dropout_vmd_in']
    self.dropout_train_vmd = cfg['dropout_vmd']

    self.global_step = tf.Variable(0, dtype=tf.int32, trainable=False,
                                   name='global_step')

    # --- model name: '_'-joined summary of the main settings ---
    name_max_n = 'days-{0}.msgs-{1}-words-{2}'.format(
        self.max_n_days, self.max_n_msgs, self.max_n_words)
    name_input_type = 'word_embed-{0}.vmd_in-{1}'.format(
        self.word_embed_type, self.variant_type)
    name_key = 'alpha-{0}.anneal-{1}.rec-{2}'.format(
        self.alpha, self.kl_lambda_anneal_rate, self.vmd_rec)
    name_train = 'batch-{0}.opt-{1}.lr-{2}-drop-{3}-cell-{4}'.format(
        self.batch_size_for_name, self.opt, self.lr,
        self.dropout_train_mel_in, self.mel_cell_type)
    self.model_name = '_'.join(
        (self.mode, name_max_n, name_input_type, name_key, name_train))

    # --- paths, all keyed by the model name ---
    self.tf_graph_path = os.path.join(path_parser.graphs, self.model_name)  # summary
    self.tf_checkpoints_path = os.path.join(path_parser.checkpoints, self.model_name)  # checkpoints
    self.tf_checkpoint_file_path = os.path.join(self.tf_checkpoints_path, 'checkpoint')  # for restore
    self.tf_saver_path = os.path.join(self.tf_checkpoints_path, 'sess')  # for save

    # --- verification of categorical options ---
    assert self.opt in ('sgd', 'adam')
    assert self.mel_cell_type in ('ln-lstm', 'gru', 'basic')
    assert self.vmd_cell_type in ('ln-lstm', 'gru')
    assert self.variant_type in ('hedge', 'fund', 'tech', 'discriminative')
    assert self.vmd_rec in ('zh', 'h')
    assert self.weight_init in ('xavier-uniform', 'xavier-normal')