def load_file(filePath, dataName='data', mode='pickle'):
    _logger.add()
    _logger.add('Trying to load %s from %s' % (dataName, filePath))
    data = None
    ifLoad = False
    if os.path.isfile(filePath):
        _logger.add('Have found the file, loading...')
        if mode == 'pickle':
            with open(filePath, 'rb') as f:
                data = pickle.load(f)
                ifLoad = True
        elif mode == 'json':
            with open(filePath, 'r', encoding='utf-8') as f:
                data = json.load(f)
                ifLoad = True
        elif mode == 'txt':
            pass  # plain-text loading not implemented yet
        else:
            raise ValueError('Function load_file does not have mode %s' % mode)
    else:
        _logger.add('Have not found the file')
    _logger.add('Done')
    return ifLoad, data
def build_loss(self):
    # weight decay: add an L2 penalty for every variable registered in 'reg_vars'
    with tf.name_scope('weight_decay'):
        for var in set(tf.get_collection('reg_vars', self.scope)):
            weight_decay = tf.multiply(
                tf.nn.l2_loss(var), cfg.weight_decay,
                name='{}-weight_decay'.format(
                    '-'.join(str(var.op.name).split('/'))))
            tf.add_to_collection('losses', weight_decay)
    # at this point the 'losses' collection holds only the weight-decay terms
    reg_vars = tf.get_collection('losses', self.scope)
    trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)
    _logger.add('regularization var num: %d' % len(reg_vars))
    _logger.add('trainable var num: %d' % len(trainable_vars))

    # cross-entropy loss on the class logits
    losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=tf.stop_gradient(self.gold_label), logits=self.logits)
    tf.add_to_collection('losses',
                         tf.reduce_mean(losses, name='xentropy_loss_mean'))

    # total loss = cross-entropy + all weight-decay terms
    loss = tf.add_n(tf.get_collection('losses', self.scope), name='loss')
    tf.summary.scalar(loss.op.name, loss)
    tf.add_to_collection('ema/scalar', loss)
    return loss
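# A minimal sketch (not called anywhere) of the collection-based weight-decay
# pattern that build_loss consumes. The variable name and the 1e-4 decay rate
# below are illustrative, not taken from cfg.
def _weight_decay_pattern_sketch():
    w = tf.get_variable('sketch_w', shape=[3, 3])
    tf.add_to_collection('reg_vars', w)  # registered when the layer is built
    wd = tf.multiply(tf.nn.l2_loss(w), 1e-4, name='sketch_w-weight_decay')
    tf.add_to_collection('losses', wd)   # pooled with the cross-entropy term
    return tf.add_n(tf.get_collection('losses'))  # what build_loss names 'loss'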
def build_network(self):
    _logger.add()
    _logger.add('building %s neural network structure ...' % cfg.network_type)
    vocab_len = self.vocab_len
    max_sent_len = self.max_sent_len
    word_embedding_len = self.word_embedding_len
    hn = self.hn
    batch_size = self.batch_size
    sent1_len = self.sent1_len  # includes padding
    sent2_len = self.sent2_len

    # --------------------- embedding mat ------------------------------------
    with tf.variable_scope('emb'):
        # token_emb_mat: index 0 --> padding, index 1 --> unknown token
        token_emb_mat = tf.get_variable(
            'token_emb_mat',
            initializer=tf.constant(self.emb_mat, dtype=tf.float32),
            dtype=tf.float32, trainable=True)
        # [batch_size, sent_len, word_embedding_len]
        sent1_emb = tf.nn.embedding_lookup(token_emb_mat, self.sent1_token)
        sent2_emb = tf.nn.embedding_lookup(token_emb_mat, self.sent2_token)
        self.tensor_dict['sent1_emb'] = sent1_emb
        self.tensor_dict['sent2_emb'] = sent2_emb

    # -------------------- sentence encoding ---------------------------------
    with tf.variable_scope('sent_encoding'):
        sent1_rep = disan(sent1_emb, self.sent1_token_mask, 'DiSAN', cfg.dropout,
                          self.is_train, cfg.weight_decay, 'elu',
                          self.tensor_dict, 'sent1')
        # [batch_size, 2 * word_embedding_len]
        self.tensor_dict['sent1_rep'] = sent1_rep
        # share the DiSAN parameters between the two sentences
        tf.get_variable_scope().reuse_variables()
        sent2_rep = disan(sent2_emb, self.sent2_token_mask, 'DiSAN', cfg.dropout,
                          self.is_train, cfg.weight_decay, 'elu',
                          self.tensor_dict, 'sent2')
        self.tensor_dict['sent2_rep'] = sent2_rep

    # -------------------- output ---------------------------------------------
    with tf.variable_scope('output'):
        # [batch_size, 4 * 2 * word_embedding_len]
        out_rep = tf.concat([sent1_rep, sent2_rep,
                             sent1_rep - sent2_rep, sent1_rep * sent2_rep],
                            axis=-1)
        pre_output = tf.nn.elu(
            linear([out_rep], hn, True, 0., scope='pre_output', squeeze=False,
                   weight_decay=cfg.weight_decay, input_keep_prob=cfg.dropout,
                   is_train=self.is_train))
        logits = linear([pre_output], self.output_class, True, 0.,
                        scope='logits', squeeze=False,
                        weight_decay=cfg.weight_decay,
                        input_keep_prob=cfg.dropout, is_train=self.is_train)
        self.tensor_dict['logits'] = logits
    return logits
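# The two disan(...) calls above form a siamese encoder: the second call runs
# after tf.get_variable_scope().reuse_variables(), so tf.get_variable hands back
# the weights created by the first call instead of allocating new ones. A minimal
# sketch of the same sharing pattern (the layer name and width are illustrative):
def _siamese_sharing_sketch(x1, x2):
    with tf.variable_scope('shared_encoder') as scope:
        h1 = tf.layers.dense(x1, 64, name='enc')  # creates shared_encoder/enc/*
        scope.reuse_variables()                   # switch the scope to reuse mode
        h2 = tf.layers.dense(x2, 64, name='enc')  # reuses the same kernel/bias
    return h1, h2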
def update_tensor_add_ema_and_opt(self):
    self.logits = self.build_network()
    self.loss = self.build_loss()
    self.accuracy, self.accuracy_0, self.accuracy_1 = self.build_accuracy()

    # -------------------------- ema -------------------------- TODO
    if True:
        self.var_ema = tf.train.ExponentialMovingAverage(cfg.var_decay)
        self.build_var_ema()
    if cfg.mode == 'train':
        self.ema = tf.train.ExponentialMovingAverage(cfg.decay)
        self.build_ema()
    self.summary = tf.summary.merge_all()

    # -------------------------- optimization ----------------
    if cfg.optimizer.lower() == 'adadelta':
        self.opt = tf.train.AdadeltaOptimizer(self.learning_rate)
    elif cfg.optimizer.lower() == 'adam':
        self.opt = tf.train.AdamOptimizer(self.learning_rate)
    elif cfg.optimizer.lower() == 'rmsprop':
        self.opt = tf.train.RMSPropOptimizer(self.learning_rate)
    else:
        raise AttributeError('no optimizer named as \'%s\'' % cfg.optimizer)

    trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                       self.scope)
    # count trainable parameters, excluding the embedding matrix
    all_params_num = 0
    for elem in trainable_vars:
        var_name = elem.name.split(':')[0]
        if var_name.endswith('emb_mat'):
            continue
        params_num = 1
        for l in elem.get_shape().as_list():
            params_num *= l
        all_params_num += params_num
    _logger.add('Trainable parameters number: %d' % all_params_num)

    self.train_op = self.opt.minimize(
        self.loss, self.global_step,
        var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope))
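# tf.train.ExponentialMovingAverage keeps a shadow value for each tracked tensor,
# updated as shadow = decay * shadow + (1 - decay) * value. A minimal sketch of
# the pattern that build_ema/build_var_ema presumably wrap (the 0.999 decay is
# illustrative, not cfg.decay):
def _ema_sketch(loss, train_vars):
    ema = tf.train.ExponentialMovingAverage(0.999)
    maintain_op = ema.apply(train_vars + [loss])  # creates the shadow variables
    with tf.control_dependencies([maintain_op]):
        tracked_loss = tf.identity(loss)          # reading it refreshes the EMA
    return ema, tracked_loss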
def __init__(self, data_file_path, data_type, dicts=None, language_type='es',
             unlabeled_file_path=None, emb_file_path=None):
    self.data_type = data_type
    _logger.add('building data set object for %s' % data_type)
    assert data_type in ['train', 'dev', 'test', 'infer']
    # check
    if data_type in ['dev', 'test', 'infer']:
        assert dicts is not None

    # build vocab --> deprecated --> dicts is preprocessed
    # if data_type == 'train':
    #     assert unlabeled_file_path is not None
    #     self.dicts = {}
    #     vocab_es_count, vocab_es_token2id, vocab_es_id2token, \
    #         vocab_en_count, vocab_en_token2id, vocab_en_id2token = \
    #         build_vocab(unlabeled_file_path)
    #     self.dicts['es'] = vocab_es_token2id
    #     self.dicts['en'] = vocab_en_token2id
    # else:
    #     self.dicts = dicts
    self.dicts = dicts

    # data
    if data_type == 'train':
        self.nn_data = load_en_train_data(data_file_path, self.dicts['es'],
                                          self.dicts['en'])
    elif data_type == 'dev':
        self.nn_data = load_en_train_data(data_file_path, self.dicts['es'],
                                          self.dicts['en'])
    elif data_type == 'test':
        self.nn_data = load_es_train_data(data_file_path, self.dicts['es'],
                                          self.dicts['en'])
    elif data_type == 'infer':
        self.nn_data = load_es_test_data(data_file_path, self.dicts['es'])
    self.sample_num = len(self.nn_data)

    # generate embedding
    if data_type == 'train':
        if language_type == 'en':
            self.emb_mat_token = load_emb_mat(emb_file_path, self.dicts['en'])
        elif language_type == 'es':
            self.emb_mat_token = load_emb_mat(emb_file_path, self.dicts['es'])
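# The dicts argument above is expected to map language codes to token->id
# vocabularies, with ids 0 and 1 reserved for padding and unknown tokens
# (per the token_emb_mat comment in build_network). An illustrative shape:
#   dicts = {'es': {'<pad>': 0, '<unk>': 1, 'hola': 2, ...},
#            'en': {'<pad>': 0, '<unk>': 1, 'hello': 2, ...}}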
def get_inference(self, sess, dataset_obj):
    _logger.add()
    _logger.add('getting inference result for %s' % dataset_obj.data_type)
    logits_list = []
    prob_list = []
    for sample_batch, _, _, _ in dataset_obj.generate_batch_sample_iter():
        feed_dict = self.model.get_feed_dict(sample_batch, 'infer')
        logits = sess.run(self.model.logits, feed_dict=feed_dict)
        logits_list.append(np.argmax(logits, -1))
        # two-way softmax: probability of the positive class
        prob_list.append(np.exp(logits[:, 1]) /
                         (np.exp(logits[:, 0]) + np.exp(logits[:, 1])))
    logits_array = np.concatenate(logits_list, 0)
    prob_array = np.concatenate(prob_list, 0)
    return logits_array, prob_array
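# The probability above is just a two-way softmax written out:
# exp(l1) / (exp(l0) + exp(l1)) == softmax([l0, l1])[1]. A small numpy check
# with made-up logits (production code should subtract the row max before
# exponentiating to avoid overflow for large logits):
def _binary_softmax_check():
    logits = np.array([[2.0, 0.5], [-1.0, 3.0]])
    manual = np.exp(logits[:, 1]) / (np.exp(logits[:, 0]) + np.exp(logits[:, 1]))
    full = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)
    assert np.allclose(manual, full[:, 1])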
def save_file(data, filePath, dataName='data', mode='pickle'):
    _logger.add()
    _logger.add('Saving %s to %s' % (dataName, filePath))
    if mode == 'pickle':
        with open(filePath, 'wb') as f:
            pickle.dump(obj=data, file=f)
    elif mode == 'json':
        with open(filePath, 'w', encoding='utf-8') as f:
            json.dump(obj=data, fp=f)
    elif mode == 'txt':
        pass  # plain-text saving not implemented yet
    else:
        raise ValueError('Function save_file does not have mode %s' % mode)
    _logger.add('Done')
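# Round-trip usage sketch for the save_file/load_file pair. The cache path and
# payload below are hypothetical (and the ./cache directory must already exist):
def _cache_round_trip_sketch():
    payload = {'vocab_size': 10000, 'tokens': ['<pad>', '<unk>']}
    save_file(payload, './cache/example.pickle', 'example data', 'pickle')
    ok, restored = load_file('./cache/example.pickle', 'example data', 'pickle')
    if ok:
        assert restored == payload  # pickle preserves the dict exactly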
def train():
    # output_model_params()
    # TODO: fix the reusability of processed data
    loadFile = True
    ifLoad, data = False, None
    loaddict, dicts = load_file(cfg.dict_path, 'dict', 'pickle')
    if not loaddict:
        raise ValueError('dict load failed')
    if loadFile:
        ifLoad, data = load_file(cfg.processed_path, 'processed data', 'pickle')
    if not ifLoad or not loadFile:
        train_data_obj = Dataset(cfg.train_data_path, 'train', dicts=dicts,
                                 language_type='es',
                                 unlabeled_file_path=cfg.unlabeled_data_path,
                                 emb_file_path=cfg.emb_es_path)
        dev_data_obj = Dataset(cfg.dev_data_path, 'dev', dicts=dicts)
        test_data_obj = Dataset(cfg.test_data_path, 'test', dicts=dicts)
        save_file({'train_data_obj': train_data_obj,
                   'dev_data_obj': dev_data_obj,
                   'test_data_obj': test_data_obj},
                  cfg.processed_path)
        # train_data_obj.save_dict(cfg.dict_path)
    else:
        train_data_obj = data['train_data_obj']
        dev_data_obj = data['dev_data_obj']
        test_data_obj = data['test_data_obj']

    emb_mat_token = train_data_obj.emb_mat_token
    with tf.variable_scope(network_type) as scope:
        if network_type in model_type_set:
            model = Model(emb_mat_token, len(train_data_obj.dicts['es']), 100,
                          scope.name)  # TODO

    graphHandler = GraphHandler(model)
    evaluator = Evaluator(model)
    performRecorder = PerformRecorder(5)

    if cfg.gpu_mem < 1:
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=cfg.gpu_mem, allow_growth=True)
    else:
        gpu_options = tf.GPUOptions()
    graph_config = tf.ConfigProto(gpu_options=gpu_options,
                                  allow_soft_placement=True)
    sess = tf.Session(config=graph_config)
    graphHandler.initialize(sess)

    # load model
    if cfg.load_model and cfg.load_step:
        saver = tf.train.Saver()
        step = cfg.load_step
        model_path = os.path.join(cfg.ckpt_dir,
                                  'top_result_saver_step_%d.ckpt' % step)
        saver.restore(sess, model_path)

    # begin training
    steps_per_epoch = int(math.ceil(1.0 * train_data_obj.sample_num /
                                    cfg.train_batch_size))
    num_steps = cfg.num_steps or steps_per_epoch * cfg.max_epoch
    global_step = 0
    for sample_batch, batch_num, data_round, idx_b in \
            train_data_obj.generate_batch_sample_iter(num_steps):
        global_step = sess.run(model.global_step) + 1
        if_get_summary = global_step % (cfg.log_period or steps_per_epoch) == 0
        loss, summary, train_op = model.step(sess, sample_batch,
                                             get_summary=if_get_summary)
        if global_step % 100 == 0 or global_step == 1:
            _logger.add('data round: %d: %d/%d, global_step: %d -- loss: %.4f' %
                        (data_round, idx_b, batch_num, global_step, loss))
        if if_get_summary:
            graphHandler.add_summary(summary, global_step)

        # occasional evaluation
        # if global_step > int(cfg.num_steps - 100000) and (global_step % (cfg.eval_period or steps_per_epoch) == 0):
        if global_step % (cfg.eval_period or steps_per_epoch) == 0:
            # ---- dev ----
            dev_loss, dev_accu, dev_accu_0, dev_accu_1 = evaluator.get_evaluation(
                sess, dev_data_obj, global_step)
            _logger.add('==> for dev, loss: %.4f, accuracy: %.4f, '
                        'accuracy_0: %.4f, accuracy_1: %.4f' %
                        (dev_loss, dev_accu, dev_accu_0, dev_accu_1))
            # ---- test ----
            test_loss, test_accu, test_accu_0, test_accu_1 = evaluator.get_evaluation(
                sess, test_data_obj, global_step)
            _logger.add('~~> for test, loss: %.4f, accuracy: %.4f, '
                        'accuracy_0: %.4f, accuracy_1: %.4f' %
                        (test_loss, test_accu, test_accu_0, test_accu_1))
            if global_step > cfg.update_lr_step:
                model.update_learning_rate(dev_loss, cfg.lr_decay)
            if global_step > cfg.record_model_step:
                is_in_top, deleted_step = performRecorder.update_top_list(
                    global_step, test_accu, sess)
        this_epoch_time, mean_epoch_time = cfg.time_counter.update_data_round(
            data_round)
        if this_epoch_time is not None and mean_epoch_time is not None:
            _logger.add('##> this epoch time: %f, mean epoch time: %f' %
                        (this_epoch_time, mean_epoch_time))
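# The `x or default` idiom in train() treats 0/None as "unset": cfg.num_steps,
# cfg.log_period, and cfg.eval_period all fall back to epoch-based defaults.
# A quick check with made-up values:
def _or_fallback_check():
    steps_per_epoch = 500
    assert (0 or steps_per_epoch) == 500    # unset period -> once per epoch
    assert (100 or steps_per_epoch) == 100  # explicit period wins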
def test():
    assert cfg.load_path is not None  # TODO
    loadFile = False
    ifLoad, data = False, None
    # Dataset requires preprocessed dicts (see Dataset.__init__),
    # so load them the same way train() does
    loaddict, dicts = load_file(cfg.dict_path, 'dict', 'pickle')
    if not loaddict:
        raise ValueError('dict load failed')
    if loadFile:
        ifLoad, data = load_file(cfg.processed_path, 'processed data', 'pickle')
    if not ifLoad or not loadFile:
        train_data_obj = Dataset(cfg.train_data_path, 'train', dicts=dicts,
                                 language_type='es',
                                 unlabeled_file_path=cfg.unlabeled_data_path,
                                 emb_file_path=cfg.emb_es_path)
        dev_data_obj = Dataset(cfg.dev_data_path, 'dev',
                               dicts=train_data_obj.dicts)
        test_data_obj = Dataset(cfg.test_data_path, 'test',
                                dicts=train_data_obj.dicts)
        save_file({'train_data_obj': train_data_obj,
                   'dev_data_obj': dev_data_obj,
                   'test_data_obj': test_data_obj},
                  cfg.processed_path)
        train_data_obj.save_dict(cfg.dict_path)
    else:
        train_data_obj = data['train_data_obj']
        dev_data_obj = data['dev_data_obj']
        test_data_obj = data['test_data_obj']

    emb_mat_token = train_data_obj.emb_mat_token
    with tf.variable_scope(network_type) as scope:
        if network_type in model_type_set:
            model = Model(emb_mat_token, len(train_data_obj.dicts['es']), 100,
                          scope.name)  # TODO

    graphHandler = GraphHandler(model)
    evaluator = Evaluator(model)

    graph_config = tf.ConfigProto()
    sess = tf.Session(config=graph_config)
    graphHandler.initialize(sess)

    # get_evaluation returns (loss, accuracy, accuracy_0, accuracy_1)
    # ---- dev ----
    dev_loss, dev_accu, _, _ = evaluator.get_evaluation(sess, dev_data_obj, None)
    _logger.add('==> for dev, loss: %.4f, accuracy: %.4f' % (dev_loss, dev_accu))
    # ---- test ----
    test_loss, test_accu, _, _ = evaluator.get_evaluation(sess, test_data_obj, None)
    _logger.add('~~> for test, loss: %.4f, accuracy: %.4f' % (test_loss, test_accu))
    # ---- train ----
    train_loss, train_accu, _, _ = evaluator.get_evaluation(sess, train_data_obj, None)
    _logger.add('--> for train, loss: %.4f, accuracy: %.4f' %
                (train_loss, train_accu))
def restore(self, sess):
    _logger.add()
    # print(cfg.ckpt_dir)
    if cfg.load_step is None:
        if cfg.load_path is None:
            _logger.add('trying to restore from dir %s' % cfg.ckpt_dir)
            latest_checkpoint_path = tf.train.latest_checkpoint(cfg.ckpt_dir)
        else:
            latest_checkpoint_path = cfg.load_path
    else:
        latest_checkpoint_path = cfg.ckpt_path + '-' + str(cfg.load_step)

    if latest_checkpoint_path is not None:
        _logger.add('trying to restore from ckpt file %s' %
                    latest_checkpoint_path)
        try:
            self.saver.restore(sess, latest_checkpoint_path)
            _logger.add('restored successfully')
        except tf.errors.NotFoundError:
            _logger.add('failed to restore')
            if cfg.mode != 'train':
                raise FileNotFoundError('cannot find model file')
    else:
        _logger.add('No checkpoint file in dir %s' % cfg.ckpt_dir)
        if cfg.mode != 'train':
            raise FileNotFoundError('cannot find model file')
    _logger.done()
def save(self, sess, global_step=None):
    _logger.add()
    _logger.add('saving model to %s' % cfg.ckpt_path)
    self.saver.save(sess, cfg.ckpt_path, global_step)
    _logger.done()
def add_summary(self, summary, global_step):
    _logger.add()
    _logger.add('saving summary...')
    self.writer.add_summary(summary, global_step)
    _logger.done()
def get_evaluation(self, sess, dataset_obj, global_step=None):
    _logger.add()
    _logger.add('getting evaluation result for %s' % dataset_obj.data_type)
    logits_list, loss_list, accu_list, accu_0_list, accu_1_list, gold_label_list = \
        [], [], [], [], [], []
    for sample_batch, _, _, _ in dataset_obj.generate_batch_sample_iter():
        feed_dict = self.model.get_feed_dict(sample_batch, 'dev')
        logits, loss, accu, accu_0, accu_1, gold_label = sess.run(
            [self.model.logits, self.model.loss, self.model.accuracy,
             self.model.accuracy_0, self.model.accuracy_1,
             self.model.gold_label],
            feed_dict=feed_dict)
        logits_list.append(np.argmax(logits, -1))
        loss_list.append(loss)
        accu_list.append(accu)
        accu_0_list.append(accu_0)
        accu_1_list.append(accu_1)
        gold_label_list.append(gold_label)

    logits_array = np.concatenate(logits_list, 0)
    loss_value = np.mean(loss_list)
    accu_array = np.concatenate(accu_list, 0)
    accu_value = np.mean(accu_array)
    accu_0_array = np.concatenate(accu_0_list, 0)
    accu_0_value = np.mean(accu_0_array)
    accu_1_array = np.concatenate(accu_1_list, 0)
    accu_1_value = np.mean(accu_1_array)
    gold_label_array = np.concatenate(gold_label_list, 0)

    # rescale the per-class accuracies from "over all samples" to "over the
    # samples of that class", i.e. per-class recall
    count_gold_label = gold_label_array.size
    count_gold_label_1 = np.count_nonzero(gold_label_array)
    count_gold_label_0 = count_gold_label - count_gold_label_1
    accu_0_value = accu_0_value * count_gold_label / count_gold_label_0
    accu_1_value = accu_1_value * count_gold_label / count_gold_label_1

    if global_step is not None:
        if dataset_obj.data_type == 'train':
            summary_feed_dict = {
                self.train_loss: loss_value,
                self.train_accuracy: accu_value,
                self.train_accuracy_0: accu_0_value,
                self.train_accuracy_1: accu_1_value
            }
            summary = sess.run(self.train_summaries, summary_feed_dict)
            self.writer.add_summary(summary, global_step)
        elif dataset_obj.data_type == 'dev':
            summary_feed_dict = {
                self.dev_loss: loss_value,
                self.dev_accuracy: accu_value,
                self.dev_accuracy_0: accu_0_value,
                self.dev_accuracy_1: accu_1_value
            }
            summary = sess.run(self.dev_summaries, summary_feed_dict)
            self.writer.add_summary(summary, global_step)
        else:
            summary_feed_dict = {
                self.test_loss: loss_value,
                self.test_accuracy: accu_value,
                self.test_accuracy_0: accu_0_value,
                self.test_accuracy_1: accu_1_value
            }
            summary = sess.run(self.test_summaries, summary_feed_dict)
            self.writer.add_summary(summary, global_step)
    return loss_value, accu_value, accu_0_value, accu_1_value
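# The accu_0/accu_1 rescaling above assumes the model reports each per-class
# accuracy averaged over *all* samples (non-members of the class contribute
# zero), so multiplying by N / N_class converts it into per-class recall.
# A numpy sketch with made-up predictions:
def _per_class_recall_sketch():
    gold = np.array([0, 0, 0, 1, 1])
    pred = np.array([0, 1, 0, 1, 1])
    # averaged over all 5 samples: 2 correctly-predicted class-0 samples -> 0.4
    accu_0_over_all = np.mean((gold == 0) & (pred == 0))
    n, n0 = gold.size, np.count_nonzero(gold == 0)
    recall_0 = accu_0_over_all * n / n0  # 0.4 * 5 / 3 = 2/3, i.e. 2 of 3 class-0 hit
    assert np.isclose(recall_0, 2 / 3)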