def __init__(self, input_dim=None, output_dim=1, init_path=None, opt_algo='gd',
             learning_rate=1e-2, l2_weight=0, random_seed=None):
    """Build a logistic-regression CTR model as a standalone TF1 graph.

    Args:
        input_dim: total number of (one-hot) input features.
        output_dim: output width (1 for binary classification).
        init_path: optional checkpoint path passed to ``utils.init_var_map``.
        opt_algo: optimizer name resolved by ``utils.get_optimizer``.
        learning_rate: optimizer learning rate.
        l2_weight: L2 regularization coefficient.
        random_seed: optional graph-level seed for reproducibility.
    """
    Model.__init__(self)
    # Variable specs: weight matrix 'w' (xavier init) and bias 'b' (zeros).
    # `dtype` is a module-level constant — TODO confirm it is defined at file top.
    init_vars = [('w', [input_dim, output_dim], 'xavier', dtype),
                 ('b', [output_dim], 'zero', dtype)]
    self.graph = tf.Graph()
    with self.graph.as_default():
        if random_seed is not None:
            tf.set_random_seed(random_seed)
        # Sparse input: rows are samples, columns are one-hot feature indices.
        self.X = tf.sparse_placeholder(dtype)
        self.y = tf.placeholder(dtype)
        self.vars = utils.init_var_map(init_vars, init_path)  # initialize variables w, b
        w = self.vars['w']
        b = self.vars['b']
        xw = tf.sparse_tensor_dense_matmul(self.X, w)
        logits = tf.reshape(xw + b, [-1])
        self.y_prob = tf.sigmoid(logits)
        # NOTE(review): the L2 term penalizes the activations `xw`, not the
        # weights `w` — looks deliberate in this codebase, but confirm.
        self.loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(labels=self.y, logits=logits)) + \
            l2_weight * tf.nn.l2_loss(xw)
        self.optimizer = utils.get_optimizer(opt_algo, learning_rate, self.loss)
        # Grow GPU memory on demand instead of grabbing it all up front.
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        tf.global_variables_initializer().run(session=self.sess)
def __init__(self, field_sizes=None, embed_size=10, layer_sizes=None, layer_acts=None,
             drop_out=None, embed_l2=None, layer_l2=None, init_path=None, opt_algo='gd',
             learning_rate=1e-2, random_seed=None):
    """Build an FNN-style model: per-field embeddings feeding a fully-connected net.

    Args:
        field_sizes: list with the vocabulary size of each input field.
        embed_size: embedding dimension per field.
        layer_sizes: output widths of the hidden/output layers.
        layer_acts: activation name per layer (resolved by ``utils.activate``).
        drop_out: per-layer dropout rates (converted to keep probabilities).
        embed_l2: L2 coefficient for the concatenated embeddings.
        layer_l2: per-layer L2 coefficients for the dense weights.
        init_path: optional checkpoint path for ``utils.init_var_map``.
        opt_algo: optimizer name for ``utils.get_optimizer``.
        learning_rate: optimizer learning rate.
        random_seed: optional graph-level seed.
    """
    Model.__init__(self)
    init_vars = []
    num_inputs = len(field_sizes)
    print('num_inputs:{0}\\t\tlayer_size:{1}'.format(num_inputs, layer_sizes))
    for i in range(num_inputs):
        # One embedding matrix per field: maps each feature value of the field
        # to an embed_size-dimensional vector.
        init_vars.append(('embed_%d' % i, [field_sizes[i], embed_size], 'xavier', dtype))
    # Each field is embedded into an embed_size-dim vector; the concatenation of
    # all fields is the network input (e.g. 16 fields * 10 dims = 160 inputs,
    # giving a network shape like [160, 500, 1]).
    node_in = num_inputs * embed_size
    for i in range(len(layer_sizes)):
        init_vars.append(('w%d' % i, [node_in, layer_sizes[i]], 'xavier', dtype))
        init_vars.append(('b%d' % i, [layer_sizes[i]], 'zero', dtype))
        node_in = layer_sizes[i]
    print('init_vars:', init_vars)
    self.graph = tf.Graph()
    with self.graph.as_default():
        if random_seed is not None:
            tf.set_random_seed(random_seed)
        # One sparse placeholder per field.
        self.X = [tf.sparse_placeholder(dtype) for i in range(num_inputs)]
        self.y = tf.placeholder(dtype)
        # Keep probabilities for training vs. evaluation; fed into layer_keeps.
        self.keep_prob_train = 1 - np.array(drop_out)
        self.keep_prob_test = np.ones_like(drop_out)
        self.layer_keeps = tf.placeholder(dtype)
        self.vars = utils.init_var_map(init_vars, init_path)
        w0 = [self.vars['embed_%d' % i] for i in range(num_inputs)]
        # Concatenate every field's embedding to form the network input
        # (num_inputs * embed_size wide).
        xw = tf.concat([tf.sparse_tensor_dense_matmul(self.X[i], w0[i])
                        for i in range(num_inputs)], 1)
        l = xw
        for i in range(len(layer_sizes)):
            wi = self.vars['w%d' % i]
            bi = self.vars['b%d' % i]
            print('第{0}个隐藏层l.shape, wi.shape, bi.shape'.format(i),
                  l.shape, wi.shape, bi.shape)
            l = tf.nn.dropout(
                utils.activate(
                    tf.matmul(l, wi) + bi,
                    layer_acts[i]),
                self.layer_keeps[i])
        l = tf.squeeze(l)  # drop all size-1 dimensions from the tensor
        self.y_prob = tf.sigmoid(l)
        self.loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(logits=l, labels=self.y))
        # NOTE(review): embed_l2 is applied under the `layer_l2 is not None`
        # check — passing layer_l2 without embed_l2 would raise; confirm intended.
        if layer_l2 is not None:
            self.loss += embed_l2 * tf.nn.l2_loss(xw)
            for i in range(len(layer_sizes)):
                wi = self.vars['w%d' % i]
                self.loss += layer_l2[i] * tf.nn.l2_loss(wi)
        self.optimizer = utils.get_optimizer(opt_algo, learning_rate, self.loss)
        # Grow GPU memory on demand.
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        tf.global_variables_initializer().run(session=self.sess)
def __init__(self, input_dim=None, output_dim=1, factor_order=10, init_path=None,
             opt_algo='gd', learning_rate=1e-2, l2_w=0, l2_v=0, random_seed=None):
    """Build a Factorization Machine (FM) as a standalone TF1 graph.

    Args:
        input_dim: total number of (one-hot) input features.
        output_dim: output width (1 for binary classification).
        factor_order: dimensionality k of the latent factor vectors.
        init_path: optional checkpoint path for ``utils.init_var_map``.
        opt_algo: optimizer name for ``utils.get_optimizer``.
        learning_rate: optimizer learning rate.
        l2_w: L2 coefficient for the linear term.
        l2_v: L2 coefficient for the factor term.
        random_seed: optional graph-level seed.
    """
    Model.__init__(self)
    # 'w': linear weights, 'v': latent factor matrix, 'b': bias.
    init_vars = [('w', [input_dim, output_dim], 'xavier', dtype),
                 ('v', [input_dim, factor_order], 'xavier', dtype),
                 ('b', [output_dim], 'zero', dtype)]
    self.graph = tf.Graph()
    with self.graph.as_default():
        if random_seed is not None:
            tf.set_random_seed(random_seed)
        self.X = tf.sparse_placeholder(dtype)
        self.y = tf.placeholder(dtype)
        self.vars = utils.init_var_map(init_vars, init_path)
        w = self.vars['w']
        v = self.vars['v']
        b = self.vars['b']
        # Element-wise square of the sparse input, needed for the FM identity
        # sum_{i<j} <v_i,v_j> x_i x_j = 0.5 * ((Xv)^2 - X^2 v^2) summed over k.
        X_square = tf.SparseTensor(self.X.indices, tf.square(self.X.values),
                                   tf.to_int64(tf.shape(self.X)))
        xv = tf.square(tf.sparse_tensor_dense_matmul(self.X, v))
        p = 0.5 * tf.reshape(
            tf.reduce_sum(xv - tf.sparse_tensor_dense_matmul(X_square, tf.square(v)), 1),
            [-1, output_dim])
        xw = tf.sparse_tensor_dense_matmul(self.X, w)
        # logits = linear term + bias + second-order interaction term.
        logits = tf.reshape(xw + b + p, [-1])
        self.y_prob = tf.sigmoid(logits)
        # NOTE(review): L2 is applied to the activations xw / xv rather than
        # the raw parameters w / v — confirm this matches the sibling models.
        self.loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=self.y)) + \
            l2_w * tf.nn.l2_loss(xw) + \
            l2_v * tf.nn.l2_loss(xv)
        self.optimizer = utils.get_optimizer(opt_algo, learning_rate, self.loss)
        # Grow GPU memory on demand.
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        tf.global_variables_initializer().run(session=self.sess)
def __init__(self, train_loader, test_loader, embed_size=10, layer_size=None,
             layer_act=None, layer_keeps=None, opt_algo='gd', learning_rate=0.01,
             epoch=10, early_stop_round=None, l2=None, random_seed=None):
    """Build an embedding + MLP model with its own training bookkeeping.

    Field sizes and dtype come from the module-level ``config``
    (``config.FIELD_SIZES`` / ``config.DTYPE``).

    Args:
        train_loader / test_loader: data loaders kept for the training loop.
        embed_size: embedding dimension per field.
        layer_size: output widths of the dense layers.
        layer_act: activation name per layer.
        layer_keeps: per-layer keep probabilities for dropout.
        opt_algo: optimizer name for ``utils.get_optimizer``.
        learning_rate: optimizer learning rate.
        epoch: number of training epochs.
        early_stop_round: patience for early stopping (None disables it).
        l2: single L2 coefficient shared by embeddings and dense weights.
        random_seed: optional graph-level seed.
    """
    self.graph = tf.Graph()
    self.train_loader = train_loader
    self.test_loader = test_loader
    self.embed_size = embed_size
    self.layer_size = layer_size
    self.layer_act = layer_act
    self.layer_keeps = layer_keeps
    self.num_fields = len(config.FIELD_SIZES)
    # Variable specs: one embedding matrix per field, then w/b per dense layer.
    self.var_list = []
    for idx in range(self.num_fields):
        self.var_list.append([
            'embed_{}'.format(idx),
            [config.FIELD_SIZES[idx], self.embed_size],
            'xavier'
        ])
    in_size = self.num_fields * self.embed_size
    for idx in range(len(layer_size)):
        self.var_list.append(
            ['w_{}'.format(idx), [in_size, layer_size[idx]], 'xavier'])
        self.var_list.append(
            ['b_{}'.format(idx), [layer_size[idx]], 'zero'])
        in_size = layer_size[idx]
    self.var_dict = utils.get_var(self.var_list)
    self.opt_algo = opt_algo
    self.learning_rate = learning_rate
    self.epoch = epoch
    self.early_stop_round = early_stop_round
    self.l2 = l2
    self.random_seed = random_seed
    # Histories filled in by the training loop.
    self.time_scores = []
    self.train_scores = []
    self.test_scores = []
    # NOTE(review): ops below are built in the *default* graph, not self.graph —
    # the `with self.graph.as_default():` line is commented out; confirm intended.
    # with self.graph.as_default():
    if self.random_seed is not None:
        tf.set_random_seed(self.random_seed)
    self.X = [
        tf.sparse_placeholder(config.DTYPE) for n in range(self.num_fields)
    ]
    self.y = tf.placeholder(config.DTYPE)
    with tf.variable_scope('Dense_Real_Layer'):
        # Embed each sparse field and concatenate into one dense input row.
        w_embed = [
            self.var_dict['embed_{}'.format(idx)]
            for idx in range(self.num_fields)
        ]
        xw = tf.concat([
            tf.sparse_tensor_dense_matmul(self.X[idx], w_embed[idx])
            for idx in range(self.num_fields)
        ], 1)
    layer_out = xw
    for idx in range(len(layer_size)):
        with tf.variable_scope('Hiden_Layer_{}'.format(idx)):
            wi = self.var_dict['w_{}'.format(idx)]
            bi = self.var_dict['b_{}'.format(idx)]
            layer_out = tf.nn.dropout(
                utils.activate(
                    tf.matmul(layer_out, wi) + bi, self.layer_act[idx]),
                self.layer_keeps[idx])
    layer_out = tf.squeeze(layer_out)  # drop size-1 dims so logits are rank-1
    self.y_preds = tf.sigmoid(layer_out)
    self.loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(labels=self.y,
                                                logits=layer_out))
    # One shared coefficient regularizes both embeddings and dense weights.
    if self.l2 is not None:
        for idx in range(self.num_fields):
            self.loss += self.l2 * tf.nn.l2_loss(
                self.var_dict['embed_{}'.format(idx)])
        for idx in range(len(self.layer_size)):
            self.loss += self.l2 * tf.nn.l2_loss(
                self.var_dict['w_{}'.format(idx)])
    self.optimizer = utils.get_optimizer(self.opt_algo, self.learning_rate,
                                         self.loss)
    self.sess = tf.Session()
    tf.global_variables_initializer().run(session=self.sess)
def main(_):
    """Entry point: train or test the encoder-decoder (GAN) language model.

    Behavior is driven by the module-level ``cfg``: in training mode it builds
    train/eval models with G/D optimizers and runs epochs with optional
    validation; otherwise it restores a checkpoint and evaluates on the test set.
    """
    vocab = Vocab()
    vocab.load_from_pickle()
    reader = Reader(vocab)
    # Grow GPU memory on demand.
    config_proto = tf.ConfigProto()
    config_proto.gpu_options.allow_growth = True
    with tf.Graph().as_default(), tf.Session(config=config_proto) as session:
        with tf.variable_scope("Model") as scope:
            if cfg.training:
                # Learning rates live in non-trainable variables so they can be
                # reassigned during training (see tf.assign below).
                with tf.variable_scope("LR"):
                    g_lr = tf.get_variable("g_lr", shape=[],
                                           initializer=tf.zeros_initializer,
                                           trainable=False)
                    d_lr = tf.get_variable("d_lr", shape=[],
                                           initializer=tf.zeros_initializer,
                                           trainable=False)
                g_optimizer = utils.get_optimizer(g_lr, cfg.g_optimizer)
                d_optimizer = utils.get_optimizer(d_lr, cfg.d_optimizer)
                model = EncoderDecoderModel(vocab, True, use_gan=cfg.use_gan,
                                            g_optimizer=g_optimizer,
                                            d_optimizer=d_optimizer)
                scope.reuse_variables()
                eval_model = EncoderDecoderModel(vocab, False, use_gan=cfg.use_gan)
            else:
                test_model = EncoderDecoderModel(vocab, False, use_gan=cfg.use_gan)
                scope.reuse_variables()
            # Generator variant shares weights with the model(s) above.
            generator = EncoderDecoderModel(vocab, False, use_gan=cfg.use_gan,
                                            generator=True)
        decode_op = beam_decode_op(generator, vocab, cfg.beam_size)
        saver = tf.train.Saver()
        try:
            # try to restore a saved model file
            saver.restore(session, cfg.load_file)
            print("Model restored from", cfg.load_file)
        except ValueError:
            if cfg.training:
                # NOTE(review): tf.initialize_all_variables is the deprecated
                # spelling of tf.global_variables_initializer.
                tf.initialize_all_variables().run()
                print("No loadable model file, new model initialized.")
            else:
                print("You need to provide a valid model file for testing!")
                sys.exit(1)
        if cfg.training:
            steps = 0
            train_perps = []
            valid_perps = []
            session.run(tf.assign(g_lr, cfg.g_learning_rate))
            session.run(tf.assign(d_lr, cfg.d_learning_rate))
            # Sentinel: a negative min weight disables the KLD-based scheduling.
            if cfg.sc_use_kld_weight:
                min_kld_weight = cfg.anneal_max - 1e-4
            else:
                min_kld_weight = -1
            scheduler = utils.Scheduler(cfg.min_d_acc, cfg.max_d_acc,
                                        cfg.max_perplexity, min_kld_weight,
                                        cfg.sc_list_size, cfg.sc_decay)
            for i in range(cfg.max_epoch):
                print("\nEpoch: %d" % (i + 1))
                perplexity, steps = run_epoch(i, session, model, generator,
                                              reader.training(), vocab, saver,
                                              steps, cfg.max_steps, scheduler,
                                              cfg.use_gan, cfg.gen_every,
                                              decode_op)
                print("Epoch: %d Train Perplexity: %.3f" % (i + 1, perplexity))
                train_perps.append(perplexity)
                if cfg.validate_every > 0 and (i + 1) % cfg.validate_every == 0:
                    # Validation runs with no saver/scheduler and no step limit.
                    perplexity, _ = run_epoch(i, session, eval_model, generator,
                                              reader.validation(), vocab, None,
                                              0, -1, None, cfg.use_gan, -1,
                                              decode_op)
                    print("Epoch: %d Validation Perplexity: %.3f" %
                          (i + 1, perplexity))
                    valid_perps.append(perplexity)
                else:
                    valid_perps.append(None)
                print('Train:', train_perps)
                print('Valid:', valid_perps)
                if steps >= cfg.max_steps:
                    break
        else:
            print('\nTesting')
            perplexity, _ = run_epoch(0, session, test_model, generator,
                                      reader.testing(), vocab, None, 0,
                                      cfg.max_steps, None, cfg.use_gan, -1,
                                      decode_op)
            print("Test Perplexity: %.3f" % perplexity)
def train(args):
    """Train a GCMC-style rating model on MovieLens and log train/valid/test RMSE.

    Builds the dataset and network from ``args``, runs up to
    ``args.train_max_iter`` full-graph iterations with gradient clipping,
    LR decay on validation plateaus, and early stopping.
    """
    print(args)
    dataset = MovieLens(args.data_name, args.device,
                        use_one_hot_fea=args.use_one_hot_fea,
                        symm=args.gcn_agg_norm_symm,
                        test_ratio=args.data_test_ratio,
                        valid_ratio=args.data_valid_ratio)
    print("Loading data finished ...\n")
    args.src_in_units = dataset.user_feature_shape[1]
    args.dst_in_units = dataset.movie_feature_shape[1]
    args.rating_vals = dataset.possible_rating_values
    ### build the net
    net = Net(args=args)
    net = net.to(args.device)
    nd_possible_rating_values = th.FloatTensor(
        dataset.possible_rating_values).to(args.device)
    rating_loss_net = nn.CrossEntropyLoss()
    learning_rate = args.train_lr
    optimizer = get_optimizer(args.train_optimizer)(net.parameters(),
                                                    lr=learning_rate)
    print("Loading network finished ...\n")
    ### perpare training data
    train_gt_labels = dataset.train_labels
    train_gt_ratings = dataset.train_truths
    ### prepare the logger
    train_loss_logger = MetricLogger(
        ['iter', 'loss', 'rmse'], ['%d', '%.4f', '%.4f'],
        os.path.join(args.save_dir, 'train_loss%d.csv' % args.save_id))
    valid_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'],
                                     os.path.join(
                                         args.save_dir,
                                         'valid_loss%d.csv' % args.save_id))
    test_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'],
                                    os.path.join(
                                        args.save_dir,
                                        'test_loss%d.csv' % args.save_id))
    ### declare the loss information
    best_valid_rmse = np.inf
    no_better_valid = 0
    best_iter = -1
    # Running accumulators, reset every log interval.
    count_rmse = 0
    count_num = 0
    count_loss = 0
    dataset.train_enc_graph = dataset.train_enc_graph.to(args.device)
    dataset.train_dec_graph = dataset.train_dec_graph.to(args.device)
    # Validation encodes with the training graph (transductive setting).
    dataset.valid_enc_graph = dataset.train_enc_graph
    dataset.valid_dec_graph = dataset.valid_dec_graph.to(args.device)
    dataset.test_enc_graph = dataset.test_enc_graph.to(args.device)
    dataset.test_dec_graph = dataset.test_dec_graph.to(args.device)
    print("Start training ...")
    dur = []
    for iter_idx in range(1, args.train_max_iter):
        # Skip the first few iterations when timing (warm-up / lazy init).
        if iter_idx > 3:
            t0 = time.time()
        net.train()
        pred_ratings = net(dataset.train_enc_graph, dataset.train_dec_graph,
                           dataset.user_feature, dataset.movie_feature)
        # NOTE(review): CrossEntropyLoss already reduces to a scalar by
        # default, so .mean() here is a no-op — confirm reduction setting.
        loss = rating_loss_net(pred_ratings, train_gt_labels).mean()
        count_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(net.parameters(), args.train_grad_clip)
        optimizer.step()
        if iter_idx > 3:
            dur.append(time.time() - t0)
        if iter_idx == 1:
            print("Total #Param of net: %d" % (torch_total_param_num(net)))
            print(
                torch_net_info(net,
                               save_path=os.path.join(
                                   args.save_dir,
                                   'net%d.txt' % args.save_id)))
        # Expected rating = softmax-weighted average over the rating classes.
        real_pred_ratings = (th.softmax(pred_ratings, dim=1) *
                             nd_possible_rating_values.view(1, -1)).sum(dim=1)
        rmse = ((real_pred_ratings - train_gt_ratings)**2).sum()
        count_rmse += rmse.item()
        count_num += pred_ratings.shape[0]
        if iter_idx % args.train_log_interval == 0:
            # NOTE(review): the CSV logs count_loss/(iter_idx+1) while the
            # console string below uses count_loss/iter_idx — inconsistent;
            # confirm which denominator is intended.
            train_loss_logger.log(iter=iter_idx,
                                  loss=count_loss / (iter_idx + 1),
                                  rmse=count_rmse / count_num)
            logging_str = "Iter={}, loss={:.4f}, rmse={:.4f}, time={:.4f}".format(
                iter_idx, count_loss / iter_idx, count_rmse / count_num,
                np.average(dur))
            count_rmse = 0
            count_num = 0
        if iter_idx % args.train_valid_interval == 0:
            valid_rmse = evaluate(args=args, net=net, dataset=dataset,
                                  segment='valid')
            valid_loss_logger.log(iter=iter_idx, rmse=valid_rmse)
            logging_str += ',\tVal RMSE={:.4f}'.format(valid_rmse)
            if valid_rmse < best_valid_rmse:
                best_valid_rmse = valid_rmse
                no_better_valid = 0
                best_iter = iter_idx
                # Only evaluate on test when validation improves.
                test_rmse = evaluate(args=args, net=net, dataset=dataset,
                                     segment='test')
                best_test_rmse = test_rmse
                test_loss_logger.log(iter=iter_idx, rmse=test_rmse)
                logging_str += ', Test RMSE={:.4f}'.format(test_rmse)
            else:
                no_better_valid += 1
                if no_better_valid > args.train_early_stopping_patience\
                    and learning_rate <= args.train_min_lr:
                    logging.info("Early stopping threshold reached. Stop training.")
                    break
                if no_better_valid > args.train_decay_patience:
                    new_lr = max(learning_rate * args.train_lr_decay_factor,
                                 args.train_min_lr)
                    if new_lr < learning_rate:
                        learning_rate = new_lr
                        logging.info("\tChange the LR to %g" % new_lr)
                        for p in optimizer.param_groups:
                            p['lr'] = learning_rate
                        no_better_valid = 0
        if iter_idx % args.train_log_interval == 0:
            print(logging_str)
    # NOTE(review): best_test_rmse is only bound once validation improves at
    # least once; best_valid_rmse starts at inf so the first validation always
    # improves, but this still assumes train_max_iter >= train_valid_interval.
    print('Best Iter Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}'.
          format(best_iter, best_valid_rmse, best_test_rmse))
    train_loss_logger.close()
    valid_loss_logger.close()
    test_loss_logger.close()
def __init__(self, field_size=None, embed_size=10, layer_sizes=None, layer_acts=None,
             drop_out=None, embed_l2=None, layer_l2=None, init_path=None, opt_algo='gd',
             learning_rate=1e-3, random_seed=None):
    """Build an inner-product PNN: field embeddings plus pairwise inner products
    feeding a fully-connected net.

    Args:
        field_size: list with the vocabulary size of each input field.
        embed_size: embedding dimension k per field.
        layer_sizes: output widths of the dense layers.
        layer_acts: activation name per layer.
        drop_out: per-layer dropout rates (converted to keep probabilities).
        embed_l2: L2 coefficient for the concatenated embeddings.
        layer_l2: L2 coefficient for the dense weights
            (scalar here, unlike the per-layer list used by the FNN sibling —
            TODO confirm this asymmetry is intended).
        init_path: optional checkpoint path for ``utils.init_var_map``.
        opt_algo: optimizer name for ``utils.get_optimizer``.
        learning_rate: optimizer learning rate.
        random_seed: optional graph-level seed.
    """
    Model.__init__(self)
    init_vars = []
    num_inputs = len(field_size)
    for i in range(num_inputs):
        init_vars.append(('embed_%d' % i, [field_size[i], embed_size],
                          'xavier', dtype))
    # All unordered field pairs contribute one inner-product feature each.
    num_pairs = int(num_inputs * (num_inputs - 1) / 2)
    node_in = num_inputs * embed_size + num_pairs
    for i in range(len(layer_sizes)):
        init_vars.append(('w%d' % i, [node_in, layer_sizes[i]], 'xavier', dtype))
        init_vars.append(('b%d' % i, [layer_sizes[i]], 'zero', dtype))
        node_in = layer_sizes[i]
    self.graph = tf.Graph()
    with self.graph.as_default():
        if (random_seed is not None):
            tf.set_random_seed(random_seed)
        self.X = [tf.sparse_placeholder(dtype) for i in range(num_inputs)]
        self.y = tf.placeholder(dtype)
        self.keep_prob_train = 1 - np.array(drop_out)
        self.keep_prob_test = np.ones_like(drop_out)
        self.layer_keeps = tf.placeholder(dtype)
        self.vars = utils.init_var_map(init_vars, init_path)
        w0 = [self.vars['embed_%d' % i]
              for i in range(num_inputs)]  # [num_inputs, field_size[i], k]
        xw = tf.concat([
            tf.sparse_tensor_dense_matmul(self.X[i], w0[i])
            for i in range(num_inputs)
        ], 1)  # [num_inputs*k]
        xw3d = tf.reshape(
            xw, [-1, num_inputs, embed_size])  # [batch, num_inputs, k]
        # Enumerate index pairs (i, j) with i < j once, outside the graph ops.
        row = []  # num_pairs
        col = []  # num_pairs
        for i in range(num_inputs - 1):
            for j in range(i + 1, num_inputs):
                row.append(i)
                col.append(j)
        # Gather the left/right member of every pair via a transpose so the
        # field axis is leading, then restore [batch, num_pairs, k].
        p = tf.transpose(
            tf.gather(
                tf.transpose(xw3d, [1, 0, 2]),  # [num_inputs, batch, k]
                row),  # [num_pairs, batch, k]
            [1, 0, 2])  # [batch, num_pairs, k]
        q = tf.transpose(
            tf.gather(
                tf.transpose(xw3d, [1, 0, 2]),  # [num_inputs, batch, k]
                col),  # [num_pairs, batch, k]
            [1, 0, 2])  # [batch, num_pairs, k]
        p = tf.reshape(
            p, [-1, num_pairs, embed_size])  # [batch, num_pairs, k]
        q = tf.reshape(
            q, [-1, num_pairs, embed_size])  # [batch, num_pairs, k]
        # Inner product of each pair over the embedding axis.
        ip = tf.reshape(tf.reduce_sum(p * q, [-1]), [-1, num_pairs])
        # Network input = raw embeddings + pairwise inner products.
        l = tf.concat([xw, ip], 1)  # [num_inputs*k + num_pairs]
        for i in range(len(layer_sizes)):
            w = self.vars['w%d' % i]
            b = self.vars['b%d' % i]
            l = utils.activate(tf.matmul(l, w) + b, layer_acts[i])
            l = tf.nn.dropout(l, self.layer_keeps[i])
            print('l', l)
        l = tf.squeeze(l)  # drop size-1 dims so logits are rank-1
        self.y_prob = tf.sigmoid(l)
        print('l', l)
        self.loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(logits=l, labels=self.y))
        # NOTE(review): embed_l2 is applied under the `layer_l2` check, same
        # as the FNN sibling — confirm intended.
        if (layer_l2 is not None):
            self.loss += embed_l2 * tf.nn.l2_loss(xw)
            for i in range(len(layer_sizes)):
                w = self.vars['w%d' % i]
                self.loss += layer_l2 * tf.nn.l2_loss(w)
        self.optimizer = utils.get_optimizer(opt_algo, learning_rate, self.loss)
        # Grow GPU memory on demand.
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        tf.global_variables_initializer().run(session=self.sess)
def __init__(self, params):
    """Creates a Trainer.

    Sets up (in order): logging, single-node vs. distributed topology, the data
    reader and model, optional input normalization, DistributedDataParallel,
    optimizer + LR scheduler, optional checkpoint restore, and the optional
    Lipschitz regularization / EMA / adversarial-training / noise /
    stability-training components, all driven by ``params``.
    """
    utils.set_default_param_values_and_env_vars(params)
    self.params = params
    # Setup logging & log the version.
    utils.setup_logging(params.logging_verbosity)
    self.job_name = self.params.job_name  # "" for local training
    self.is_distributed = bool(self.job_name)
    self.task_index = self.params.task_index
    self.local_rank = self.params.local_rank
    self.start_new_model = self.params.start_new_model
    self.train_dir = self.params.train_dir
    self.num_gpus = self.params.num_gpus
    # Local multi-GPU scales the per-process batch; distributed keeps it per-GPU.
    if self.num_gpus and not self.is_distributed:
        self.batch_size = self.params.batch_size * self.num_gpus
    else:
        self.batch_size = self.params.batch_size
    # print self.params parameters (rank 0 only, fresh runs only)
    if self.start_new_model and self.local_rank == 0:
        pp = pprint.PrettyPrinter(indent=2, compact=True)
        logging.info(pp.pformat(params.values()))
    if self.local_rank == 0:
        logging.info("PyTorch version: {}.".format(torch.__version__))
        logging.info("NCCL Version {}".format(torch.cuda.nccl.version()))
        logging.info("Hostname: {}.".format(socket.gethostname()))
    if self.is_distributed:
        self.num_nodes = len(params.worker_hosts.split(';'))
        self.world_size = self.num_nodes * self.num_gpus
        # Global rank = node index * GPUs per node + local GPU index.
        self.rank = self.task_index * self.num_gpus + self.local_rank
        dist.init_process_group(backend='nccl', init_method='env://',
                                timeout=datetime.timedelta(seconds=30))
        if self.local_rank == 0:
            logging.info('World Size={} => Total batch size {}'.format(
                self.world_size, self.batch_size * self.world_size))
        self.is_master = bool(self.rank == 0)
    else:
        self.world_size = 1
        self.is_master = True
    # create a mesage builder for logging
    self.message = utils.MessageBuilder()
    # load reader and model
    self.reader = readers_config[self.params.dataset](self.params,
                                                      self.batch_size,
                                                      self.num_gpus,
                                                      is_training=True)
    # load model
    self.model = model_config.get_model_config(self.params.model,
                                               self.params.dataset,
                                               self.params,
                                               self.reader.n_classes,
                                               is_training=True)
    # add normalization as first layer of model
    if self.params.add_normalization:
        # In order to certify radii in original coordinates rather than standardized coordinates, we
        # add the noise _before_ standardizing, which is why we have standardization be the first
        # layer of the classifier rather than as a part of preprocessing as is typical.
        normalize_layer = self.reader.get_normalize_layer()
        self.model = torch.nn.Sequential(normalize_layer, self.model)
    # define DistributedDataParallel job
    self.model = SyncBatchNorm.convert_sync_batchnorm(self.model)
    torch.cuda.set_device(params.local_rank)
    self.model = self.model.cuda()
    i = params.local_rank
    self.model = DistributedDataParallel(self.model, device_ids=[i],
                                         output_device=i)
    if self.local_rank == 0:
        logging.info('Model defined with DistributedDataParallel')
    # define set for saved ckpt (epoch 0 counts as already saved)
    self.saved_ckpts = set([0])
    # define optimizer
    self.optimizer = utils.get_optimizer(self.params.optimizer,
                                         self.params.optimizer_params,
                                         self.params.init_learning_rate,
                                         self.params.weight_decay,
                                         self.model.parameters())
    # define learning rate scheduler
    self.scheduler = utils.get_scheduler(self.optimizer,
                                         self.params.lr_scheduler,
                                         self.params.lr_scheduler_params)
    # if start_new_model is False, we restart training
    if not self.start_new_model:
        if self.local_rank == 0:
            logging.info('Restarting training...')
        self._load_state()
    # define Lipschitz regularization module
    if self.params.lipschitz_regularization:
        if self.local_rank == 0:
            logging.info(
                "Lipschitz regularization with decay {}, start after epoch {}"
                .format(self.params.lipschitz_decay,
                        self.params.lipschitz_start_epoch))
        self.lipschitz = LipschitzRegularization(self.model, self.params,
                                                 self.reader, self.local_rank)
    # exponential moving average
    self.ema = None
    # NOTE(review): `getattr(..., False) > 0` — False > 0 is False, so a
    # missing/falsy `ema` disables EMA; odd comparison but behaves as a
    # truthy-and-positive check. Confirm `ema` is numeric when set.
    if getattr(self.params, 'ema', False) > 0:
        self.ema = utils.EMA(self.params.ema)
    # if adversarial training, create the attack class
    if self.params.adversarial_training:
        if self.local_rank == 0:
            logging.info('Adversarial Training')
        attack_params = self.params.adversarial_training_params
        # eps_iter == -1 is a sentinel: derive the per-step size from eps/n_iter.
        if 'eps_iter' in attack_params.keys(
        ) and attack_params['eps_iter'] == -1:
            eps = attack_params['eps']
            n_iter = attack_params['nb_iter']
            attack_params['eps_iter'] = eps / n_iter * 2
            if self.local_rank == 0:
                logging.info('Learning rate for attack: {}'.format(
                    attack_params['eps_iter']))
        self.attack = utils.get_attack(
            self.model, self.reader.n_classes,
            self.params.adversarial_training_name, attack_params)
    # init noise
    if self.params.adaptive_noise and self.params.additive_noise:
        raise ValueError(
            "Adaptive and Additive Noise should not be set together")
    if self.params.adaptive_noise:
        if self.local_rank == 0:
            logging.info('Training with Adaptive Noise: {} {}'.format(
                self.params.noise_distribution, self.params.noise_scale))
    elif self.params.additive_noise:
        if self.local_rank == 0:
            logging.info('Training with Noise: {} {}'.format(
                self.params.noise_distribution, self.params.noise_scale))
    if self.params.adaptive_noise or self.params.additive_noise:
        self.noise = utils.Noise(self.params)
    # stability training
    if self.params.stability_training:
        if self.local_rank == 0:
            logging.info("Training with Stability Training: {}".format(
                self.params.stability_training_lambda))
        # Stability training only makes sense on top of a perturbation source.
        if not any([
                self.params.adversarial_training,
                self.params.adaptive_noise,
                self.params.additive_noise
        ]):
            raise ValueError(
                "Adversarial Training or Adaptive Noise should be activated")
if __name__ == "__main__": config = get_config() # Setting seed for reproducability set_seed() # Get data train_dataset, val_dataset = get_datasets(config) # MODEL model = get_model(config['model']) # Define loss loss = CrossEntropy() # Define optimizer optimizer = get_optimizer(config['train']['optimizer'], model, loss) # Main loop train_loss_hist, val_loss_hist, val_acc_hist = list(), list(), list() pbar = tqdm(range(config['train']['epochs'])) lr_decay_config = config['train']['lr_decay'] for i in pbar: # TRAINING model.train() for j in range(len(train_dataset)): x, y = train_dataset[j] train_loss = optimizer.step(x, y) if lr_decay_config['use']: optimizer.decay_learning_rate(i, lr_decay_config['decay_fraction'], lr_decay_config['decay_frequency']) train_loss_hist.append((i + j / len(train_dataset), train_loss))
def main(_):
    """Train a supervised classifier with TF1 queue runners and evaluate periodically.

    Data, model, optimizer, and schedule are all selected via module-level
    ``FLAGS``; unlabeled data is loaded but only batched here (not used in the
    supervised losses below).
    """
    data_tr, labels_tr, data_te, labels_te, unlabeled = input_data.load_data(
        FLAGS.dataset_name, FLAGS.num_labeled)
    print(" train shapes:", data_tr.shape, labels_tr.shape)
    print(" test shapes:", data_te.shape, labels_te.shape)
    print("unlabeled shapes:", unlabeled.shape)
    # Shuffled batches for training; sequential batches for evaluation.
    data_tr_batch, labels_tr_batch = u.load_shuffle_batch(
        data_tr, labels_tr, batch_size=FLAGS.batch_size,
        capacity=FLAGS.batch_size * 100,
        min_after_dequeue=FLAGS.batch_size * 20)
    data_te_batch, labels_te_batch = u.load_batch(data_te, labels_te,
                                                  FLAGS.batch_size)
    # Unlabeled batch gets dummy zero labels (labels unused downstream here).
    data_unlabeled_batch, _ = u.load_batch(unlabeled,
                                           np.zeros(unlabeled.shape[0]),
                                           FLAGS.batch_size)
    with tf.variable_scope('model') as scope:
        model = models.get_model(FLAGS.model_name)
        logits_tr = model(data_tr_batch, is_training=True)
        # Reuse the same weights for the evaluation graph.
        scope.reuse_variables()
        logits_te = model(data_te_batch, is_training=False)
    loss_tr = u.get_supervised_loss(logits=logits_tr, labels=labels_tr_batch)
    loss_te = u.get_supervised_loss(logits=logits_te, labels=labels_te_batch)
    acc_tr = u.get_accuracy(logits_tr, labels_tr_batch)
    acc_te = u.get_accuracy(logits_te, labels_te_batch)
    # Global step drives the LR decay schedule.
    step = tf.Variable(0, trainable=False, dtype=tf.int32)
    optimizer = u.get_optimizer(FLAGS.optimizer_type, FLAGS.learning_rate,
                                step, FLAGS.lr_decay_steps,
                                FLAGS.lr_decay_factor)
    train_op = u.get_train_op(optimizer, loss_tr, step)
    with tf.Session() as sess:

        def eval_test():
            # Average loss/accuracy over full passes of whole test batches
            # (any remainder smaller than batch_size is dropped).
            loss = 0.0
            acc = 0.0
            eval_iters = int(data_te.shape[0] / FLAGS.batch_size)
            for j in range(eval_iters):
                l, a = sess.run([loss_te, acc_te])
                loss += l
                acc += a
            loss /= eval_iters
            acc /= eval_iters
            return loss, acc

        # initialize the variables
        init_op = tf.global_variables_initializer()
        sess.run(init_op)
        # initialize the queue threads to start to shovel data
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)
        for i in tqdm(range(FLAGS.num_iters)):
            _, cur_loss_tr, cur_acc_tr = sess.run([train_op, loss_tr, acc_tr])
            if i % FLAGS.eval_interval == 0:
                print('train loss: %.4f train acc: %.4f' %
                      (cur_loss_tr, cur_acc_tr))
                cur_loss_te, cur_acc_te = eval_test()
                print(' test loss: %.4f test acc: %.4f' %
                      (cur_loss_te, cur_acc_te))
        # stop our queue threads and properly close the session
        coord.request_stop()
        coord.join(threads)
        sess.close()
def TFGAN(inputs,targets):
    """Build and run a TF-GAN pix2pix training job for fiber-image reconstruction.

    Args:
        inputs: tuple ``(fiber_output, fiber_input)`` tensors.
        targets: tuple ``(encoder, label)`` tensors; ``label`` is the real image.

    Side effects: wipes and recreates the train directory, writes summaries and
    checkpoints there, and blocks in ``tfgan.gan_train`` until the step limit.
    """
    traindir = os.path.join(logdir, 'GG12\\PIX2PIX_MINMAX_1024')
    # Start from a clean train directory every run.
    if tf.gfile.Exists(traindir):
        tf.gfile.DeleteRecursively(traindir)
    tf.gfile.MakeDirs(traindir)
    # Create a GANModel tuple.
    fiber_output, fiber_input = inputs
    encoder, label = targets
    # Discriminator sees label and fiber_input stacked on the channel axis.
    real_data = tf.concat((label,fiber_input),-1)
    #######################################################################
    ##########################  GAN MODEL #################################
    #######################################################################
    gan_model = tfgan.gan_model(
        generator_fn=generator_fn,
        discriminator_fn=pix2pix_D,
        real_data=real_data,
        generator_inputs=fiber_output,
        generator_scope='Generator',
        discriminator_scope='Discriminator')
    #######################################################################
    ##########################  GAN SUMMARY ###############################
    #######################################################################
    with tf.name_scope('Train_summary'):
        # Generated data mirrors real_data's channel layout; split it back out.
        generated_data, generated_input = tf.split(gan_model.generated_data,2,-1)
        reshaped_fiber_input = get_summary_image(fiber_input, FLAGS.grid_size)
        reshaped_label = get_summary_image(label, FLAGS.grid_size)
        reshaped_generated_input = get_summary_image(generated_input, FLAGS.grid_size)
        reshaped_generated_data = get_summary_image(generated_data, FLAGS.grid_size)
        tf.summary.image('Input_Fiber', reshaped_fiber_input)
        tf.summary.image('Input_Generator', reshaped_generated_input)
        tf.summary.image('Data_Real', reshaped_label)
        tf.summary.image('Data_Generator', reshaped_generated_data)
    #######################################################################
    ##########################  GAN LOSS  #################################
    #######################################################################
    with tf.name_scope('pixel_loss'):
        # Pixel-wise reconstruction loss, combined with the adversarial loss below.
        pixel_loss = combine_loss(gan_model.generated_data,
                                  gan_model.real_data,
                                  add_summary=True)
    with tf.name_scope('gan_loss'):
        gan_loss = tfgan.gan_loss(
            gan_model,
            generator_loss_fn=tfgan.losses.modified_generator_loss,
            discriminator_loss_fn=tfgan.losses.modified_discriminator_loss,
            gradient_penalty_weight=1.0, # only in wassertein_loss
        )
        tfgan.eval.add_regularization_loss_summaries(gan_model)
    with tf.name_scope('Train_Loss'):
        gan_loss = tfgan.losses.combine_adversarial_loss(
            gan_loss,
            gan_model,
            pixel_loss,
            weight_factor=FLAGS.adversarial_loss_weight)
    #######################################################################
    ##########################  GAN OPS  ##################################
    #######################################################################
    with tf.name_scope('Train_ops'):
        # Discriminator uses a larger LR than the generator; both decay.
        gen_lr = get_lr(1e-5,decay_steps=5000)
        dis_lr = get_lr(5e-5,decay_steps=5000)
        train_ops = tfgan.gan_train_ops(
            gan_model,
            gan_loss,
            generator_optimizer=get_optimizer(gen_lr),
            discriminator_optimizer=get_optimizer(dis_lr),
            # summarize_gradients=False,
            # colocate_gradients_with_ops=True,
            # transform_grads_fn=tf.contrib.training.clip_gradient_norms_fn(1e3),
            # aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N)
        )
    # Image-quality metrics on the generated image vs. the real label.
    psnr = tf.reduce_mean(tf.image.psnr(generated_data, label, max_val = 1.0))
    ssim = tf.reduce_mean(tf.image.ssim(generated_data, label, max_val = 1.0))
    corr = correlation(generated_data, label)
    tf.summary.scalar('PSNR', psnr)
    tf.summary.scalar('SSIM', ssim)
    tf.summary.scalar('Relation', corr)
    tf.summary.scalar('generator_lr', gen_lr)
    # tf.summary.scalar('discriminator_lr', dis_lr)
    #######################################################################
    ##########################  GAN TRAIN  ################################
    #######################################################################
    train_steps = tfgan.GANTrainSteps(generator_train_steps=1,
                                      discriminator_train_steps=1)
    message = tf.string_join([' Train step: ',
                              tf.as_string(tf.train.get_or_create_global_step()),
                              ' PSNR:', tf.as_string(psnr),
                              ' SSIM:', tf.as_string(ssim),
                              ' Correlation:', tf.as_string(corr)
                              ], name='status_message')
    # Warm-start the Generator from a pre-trained CNN checkpoint.
    tfgan.gan_train(train_ops,
                    logdir = traindir,
                    get_hooks_fn=tfgan.get_joint_train_hooks(train_steps),
                    hooks=[tf.train.StopAtStepHook(num_steps=FLAGS.max_iter),
                           tf.train.LoggingTensorHook([message], every_n_iter=FLAGS.log_n_steps),
                           get_tfgan_init_fn('E:\GitHub\MMFI\log\\GG12\\CNN',
                                             'Generator'),
                           # get_tfgan_init_fn('E:\GitHub\MMFI\log\\G2\\pix2pix_D',
                           #                   'Discriminator'),
                           ],
                    save_summaries_steps = FLAGS.save_summaries_steps*2,
                    save_checkpoint_secs = FLAGS.save_interval_secs)
def Generator_all(inputs,targets):
    """Train the chained two-stage generator (G1 -> G2) without a discriminator.

    G1 reconstructs the fiber input from the fiber output; G2 reconstructs the
    label from G1's result. Both stages are masked by a circular aperture. The
    two stages are warm-started from separately pre-trained checkpoints.

    Args:
        inputs: tuple ``(fiber_output, fiber_input)`` tensors.
        targets: tuple ``(encoder, label)`` tensors.

    Side effects: wipes and recreates the train directory, then blocks in
    ``slim.learning.train`` writing summaries/checkpoints there.
    """
    traindir = os.path.join(logdir, 'GG12\\CNN')
    # Start from a clean train directory every run.
    if tf.gfile.Exists(traindir):
        tf.gfile.DeleteRecursively(traindir)
    tf.gfile.MakeDirs(traindir)
    fiber_output,fiber_input = inputs
    encoder, label = targets
    with tf.variable_scope('Generator'):
        # Stage 1: fiber output -> reconstructed fiber input, masked to the
        # circular field of view.
        with tf.variable_scope('G1'):
            generated_input = pix2pix_G(fiber_output) * circle(FLAGS.input_size,FLAGS.input_size)
        # Stage 2: stage-1 result -> reconstructed label, same mask.
        with tf.variable_scope('G2'):
            generated_data = pix2pix_G(generated_input) * circle(FLAGS.input_size,FLAGS.input_size)
    with tf.name_scope('Train_summary'):
        reshaped_fiber_input = get_summary_image(fiber_input,FLAGS.grid_size)
        reshaped_label = get_summary_image(label,FLAGS.grid_size)
        reshaped_generated_input = get_summary_image(generated_input,FLAGS.grid_size)
        reshaped_generated_data = get_summary_image(generated_data,FLAGS.grid_size)
        tf.summary.image('Input_Fiber', reshaped_fiber_input)
        tf.summary.image('Input_Generator', reshaped_generated_input)
        tf.summary.image('Data_Real', reshaped_label)
        tf.summary.image('Data_Generator', reshaped_generated_data)
    with tf.name_scope('g1_loss'):
        G1_loss = combine_loss(generated_input, fiber_input, add_summary=True)
    with tf.name_scope('g2_loss'):
        G2_loss = combine_loss(generated_data, label, add_summary=True)
    with tf.name_scope('Train_Loss'):
        reg_loss = tf.losses.get_regularization_loss()
        total_loss = G1_loss + G2_loss + reg_loss
        # Fail fast on numerically broken losses.
        total_loss = tf.check_numerics(total_loss, 'Loss is inf or nan.')
        tf.summary.scalar('Regularization_loss',reg_loss)
        tf.summary.scalar('G1_loss', G1_loss)
        tf.summary.scalar('G2_loss', G2_loss)
        tf.summary.scalar('Total_loss',total_loss)
    lr = get_lr(1e-5,decay_steps=5000)
    optimizer = get_optimizer(lr)
    # Ensure batch-norm style update ops run with every train step, and only
    # train variables under the 'Generator' scope.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    train_op = slim.learning.create_train_op(total_loss, optimizer,
                                             update_ops =update_ops,
                                             variables_to_train= tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                                                                   scope='Generator')
                                             )
    with tf.name_scope('Train_ops'):
        # Image-quality metrics for monitoring.
        psnr = tf.reduce_mean(tf.image.psnr(generated_data, label, max_val=1.0))
        ssim = tf.reduce_mean(tf.image.ssim(generated_data, label, max_val=1.0))
        corr = correlation(generated_data, label)
        tf.summary.scalar('PSNR', psnr)
        tf.summary.scalar('SSIM', ssim)
        tf.summary.scalar('Relation', corr)
        tf.summary.scalar('Learning_rate', lr)
    # Warm-start G1 and G2 from their individually pre-trained checkpoints.
    slim.learning.train(train_op,
                        traindir,
                        number_of_steps =FLAGS.max_iter,
                        log_every_n_steps=FLAGS.log_n_steps,
                        init_fn=get_multimodel_init_fn(ckpt1='E:\GitHub\MMFI\log\\G1\\pix2pix_G',
                                                       include1='Generator/G1',
                                                       ckpt2='E:\GitHub\MMFI\log\\G2\\pix2pix_G',
                                                       include2='Generator/G2'),
                        save_summaries_secs=FLAGS.save_summaries_secs,
                        save_interval_secs = FLAGS.save_interval_secs)
def main():
    """Build a pix2pix-style GAN (generator + discriminator) and train or test it.

    Mode is taken from the parsed options: 'train' sets up datasets, losses,
    optimizers, schedulers and TensorBoard, then calls ``train_process``;
    'test' is unfinished (see NOTE below).
    """
    opt = get_model_config()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(opt)
    # Model setting: 3-channel in/out generator, conditional discriminator on
    # the concatenated (input, output) pair (3 + 3 channels).
    logger.info('Build Model')
    generator = define_G(3, 3, opt.ngf).to(device)
    total_param = sum([p.numel() for p in generator.parameters()])
    logger.info(f'Generator size: {total_param} tensors')
    discriminator = define_D(3 + 3, opt.ndf, opt.disc).to(device)
    total_param = sum([p.numel() for p in discriminator.parameters()])
    logger.info(f'Discriminator size: {total_param} tensors')
    if torch.cuda.device_count() > 1:
        logger.info(f"Let's use {torch.cuda.device_count()} GPUs!")
        generator = DataParallel(generator)
        discriminator = DataParallel(discriminator)
    if opt.mode == 'train':
        # Timestamped experiment directory, e.g. ./experiments/07011530_name
        dirname = datetime.now().strftime("%m%d%H%M") + f'_{opt.name}'
        log_dir = os.path.join('./experiments', dirname)
        os.makedirs(log_dir, exist_ok=True)
        logger.info(f'LOG DIR: {log_dir}')
        # Dataset setting
        logger.info('Set the dataset')
        image_size: Tuple[int] = (opt.image_h, opt.image_w)
        train_transform, val_transform = get_transforms(
            image_size, augment_type=opt.augment_type, image_norm=opt.image_norm)
        trainset = TrainDataset(image_dir=os.path.join(opt.data_dir, 'train'),
                                transform=train_transform)
        valset = TrainDataset(image_dir=os.path.join(opt.data_dir, 'val'),
                              transform=val_transform)
        train_loader = DataLoader(dataset=trainset, batch_size=opt.batch_size,
                                  shuffle=True, num_workers=opt.num_workers)
        val_loader = DataLoader(dataset=valset, batch_size=opt.batch_size,
                                shuffle=False, num_workers=opt.num_workers)
        # Loss setting: adversarial (LSGAN) + L1 reconstruction
        criterion = {}
        criterion['gan'] = GANLoss(use_lsgan=True).to(device)
        criterion['l1'] = torch.nn.L1Loss().to(device)
        # Optimizer setting: separate optimizers for G and D, same hyper-params
        g_optimizer = get_optimizer(generator.parameters(), opt.optimizer,
                                    opt.lr, opt.weight_decay)
        d_optimizer = get_optimizer(discriminator.parameters(), opt.optimizer,
                                    opt.lr, opt.weight_decay)
        logger.info(
            f'Initial Learning rate(G): {g_optimizer.param_groups[0]["lr"]:.6f}'
        )
        logger.info(
            f'Initial Learning rate(D): {d_optimizer.param_groups[0]["lr"]:.6f}'
        )
        # Scheduler setting
        g_scheduler = get_scheduler(g_optimizer, opt.scheduler, opt)
        d_scheduler = get_scheduler(d_optimizer, opt.scheduler, opt)
        # Tensorboard setting
        writer = SummaryWriter(log_dir=log_dir)
        logger.info('Start to train!')
        train_process(opt, generator, discriminator, criterion, g_optimizer,
                      d_optimizer, g_scheduler, d_scheduler,
                      train_loader=train_loader, val_loader=val_loader,
                      log_dir=log_dir, writer=writer, device=device)
    # TODO: write inference code
    elif opt.mode == 'test':
        # NOTE(review): this branch is broken as written — `model`,
        # `test_loader` and `criterion` are not defined in this scope
        # (NameError at runtime). Presumably `model` should be the trained
        # generator loaded from opt.checkpoint; confirm before relying on it.
        logger.info(f'Model loaded from {opt.checkpoint}')
        model.eval()
        logger.info('Start to test!')
        test_status = inference(model=model, test_loader=test_loader,
                                device=device, criterion=criterion)
def do_train(sess, args):
    """Train (and optionally validate) the network configured by *args*.

    Builds placeholders, input pipelines and the model graph, then runs
    ``args.num_epochs`` epochs of ``args.num_batches`` steps each, logging to
    stdout and TensorBoard and saving a checkpoint after every epoch.

    Args:
        sess: an open tf.Session to run the graph in (closed on exit).
        args: parsed configuration namespace (sizes, policies, paths, ...).
    """
    # set CPU as the default device for the graph. Some of the operations will
    # be moved to GPU later.
    with tf.device('/cpu:0'):
        # Images and labels placeholders
        images_ph = tf.placeholder(tf.float32, shape=(None,) + tuple(args.processed_size),
                                   name='input')
        labels_ph = tf.placeholder(tf.int32, shape=(None), name='label')
        # a placeholder for determining if we train or validate the network.
        # It is used to set dropout rates and batchnorm parameters.
        is_training_ph = tf.placeholder(tf.bool, name='is_training')
        # epoch number and global step live in the graph so they are restored
        # from checkpoints (via the SAVE_VARIABLES collection).
        epoch_number = tf.get_variable('epoch_number', [], dtype=tf.int32,
                                       initializer=tf.constant_initializer(0),
                                       trainable=False,
                                       collections=[tf.GraphKeys.GLOBAL_VARIABLES,
                                                    SAVE_VARIABLES])
        global_step = tf.get_variable('global_step', [], dtype=tf.int32,
                                      initializer=tf.constant_initializer(0),
                                      trainable=False,
                                      collections=[tf.GraphKeys.GLOBAL_VARIABLES,
                                                   SAVE_VARIABLES])
        # Weight Decay policy
        wd = utils.get_policy(args.WD_policy, args.WD_details)
        # Learning rate decay policy (if needed)
        lr = utils.get_policy(args.LR_policy, args.LR_details)
        # Create an optimizer that performs gradient descent.
        optimizer = utils.get_optimizer(args.optimizer, lr)
        # build the computational graph using the provided configuration.
        dnn_model = model(images_ph, labels_ph, utils.loss, optimizer, wd,
                          args.architecture, args.depth, args.num_classes,
                          is_training_ph, args.transfer_mode,
                          num_gpus=args.num_gpus)
        # A placeholder for the input-pipeline batch size: lets the last
        # validation batch shrink so each validation example is fed exactly once.
        # Validation uses 1 GPU, so the batch size is capped at 512.
        batch_size_tf = tf.placeholder_with_default(min(512, args.batch_size), shape=())
        # A data loader pipeline to read training images and their labels
        train_loader = loader(args.train_info, args.delimiter, args.raw_size,
                              args.processed_size, True, args.chunked_batch_size,
                              args.num_prefetch, args.num_threads,
                              args.path_prefix, args.shuffle)
        # The loader returns images, their labels, and their paths
        images, labels, info = train_loader.load()
        # If validation data are provided, create an input pipeline for them too
        if args.run_validation:
            val_loader = loader(args.val_info, args.delimiter, args.raw_size,
                                args.processed_size, False, batch_size_tf,
                                args.num_prefetch, args.num_threads,
                                args.path_prefix)
            val_images, val_labels, val_info = val_loader.load()
        # Get training operations to run from the deep learning model
        train_ops = dnn_model.train_ops()
        # Build an initialization operation to run below.
        init = tf.group(tf.global_variables_initializer(),
                        tf.local_variables_initializer())
        sess.run(init)
        if args.retrain_from is not None:
            dnn_model.load(sess, args.retrain_from)
        # Set the start epoch number
        start_epoch = sess.run(epoch_number + 1)
        # Start the queue runners.
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        # Setup a summary writer
        summary_writer = tf.summary.FileWriter(args.log_dir, sess.graph)
        # FIX: build the merged-summary op exactly once. The original called
        # tf.summary.merge_all() inside the loop, which adds a new op to the
        # graph every 100 steps and grows memory over a long run.
        merged_summaries = tf.summary.merge_all()
        # The main training loop
        for epoch in range(start_epoch, start_epoch + args.num_epochs):
            # update epoch_number
            sess.run(epoch_number.assign(epoch))
            print("Epoch %d started" % (epoch))
            # Training batches
            for step in range(args.num_batches):
                sess.run(global_step.assign(step + epoch * args.num_batches))
                # train the network on a batch of data (also measures time)
                start_time = time.time()
                # load a batch from input pipeline
                img, lbl = sess.run([images, labels], options=args.run_options,
                                    run_metadata=args.run_metadata)
                # train on the loaded batch of data
                _, loss_value, top1_accuracy, topn_accuracy = sess.run(
                    train_ops,
                    feed_dict={images_ph: img, labels_ph: lbl, is_training_ph: True},
                    options=args.run_options, run_metadata=args.run_metadata)
                duration = time.time() - start_time
                # Check for errors
                assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
                # Logging every ten batches and writing tensorboard summaries
                # every hundred batches
                if step % 10 == 0:
                    num_examples_per_step = args.chunked_batch_size * args.num_gpus
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = duration / args.num_gpus
                    # Log
                    format_str = ('%s: epoch %d, step %d, loss = %.2f, Top-1 = %.2f Top-' +
                                  str(args.top_n) +
                                  ' = %.2f (%.1f examples/sec; %.3f sec/batch)')
                    print(format_str % (datetime.now(), epoch, step, loss_value,
                                        top1_accuracy, topn_accuracy,
                                        examples_per_sec, sec_per_batch))
                    sys.stdout.flush()
                if step % 100 == 0:
                    summary_str = sess.run(merged_summaries,
                                           feed_dict={images_ph: img,
                                                      labels_ph: lbl,
                                                      is_training_ph: True})
                    summary_writer.add_summary(summary_str,
                                               args.num_batches * epoch + step)
                    if args.log_debug_info:
                        # FIX: was a bare `run_metadata` (undefined name ->
                        # NameError); the metadata collected above is args.run_metadata.
                        summary_writer.add_run_metadata(
                            args.run_metadata, 'epoch%d step%d' % (epoch, step))
            # Save the model checkpoint periodically after each training epoch
            checkpoint_path = os.path.join(args.log_dir, args.snapshot_prefix)
            dnn_model.save(sess, checkpoint_path, global_step=epoch)
            print("Epoch %d ended. a checkpoint saved at %s" % (epoch, args.log_dir))
            sys.stdout.flush()
            # if validation data are provided, evaluate accuracy on the
            # validation set after the end of each epoch
            if args.run_validation:
                print("Evaluating on validation set")
                true_predictions_count = 0       # number of correct predictions
                true_topn_predictions_count = 0  # number of top-n correct predictions
                total_loss = 0.0                 # accumulated cross-entropy loss
                all_count = 0                    # total number of examples seen
                # The validation loop
                for step in range(args.num_val_batches):
                    # Load a batch of data; on the last batch, shrink the
                    # pipeline batch so each example is fed exactly once.
                    val_img, val_lbl = sess.run(
                        [val_images, val_labels],
                        feed_dict={batch_size_tf:
                                   args.num_val_samples % min(512, args.batch_size)}
                        if step == args.num_val_batches - 1 else None,
                        options=args.run_options, run_metadata=args.run_metadata)
                    # validate the network on the loaded batch
                    val_loss, top1_predictions, topn_predictions = sess.run(
                        [train_ops[1], train_ops[2], train_ops[3]],
                        feed_dict={images_ph: val_img, labels_ph: val_lbl,
                                   is_training_ph: False},
                        options=args.run_options, run_metadata=args.run_metadata)
                    all_count += val_lbl.shape[0]
                    true_predictions_count += int(round(val_lbl.shape[0] * top1_predictions))
                    true_topn_predictions_count += int(round(val_lbl.shape[0] * topn_predictions))
                    total_loss += val_loss * val_lbl.shape[0]
                    if step % 10 == 0:
                        print("Validation step %d of %d" % (step, args.num_val_batches))
                        sys.stdout.flush()
                print("Total number of validation examples %d, Loss %.2f, Top-1 Accuracy %.2f, Top-%d Accuracy %.2f" %
                      (all_count, total_loss / all_count,
                       true_predictions_count / all_count, args.top_n,
                       true_topn_predictions_count / all_count))
                sys.stdout.flush()
        coord.request_stop()
        coord.join(threads)
        sess.close()
def __init__(self, field_sizes=None, embed_size=10, layer_sizes=None, layer_acts=None, drop_out=None, embed_l2=None, layer_l2=None, init_path=None, opt_algo='gd', learning_rate=1e-2, random_seed=None, layer_norm=True):
    """Kernel-product network: per-field embeddings, pairwise kernel products,
    then fully-connected layers with dropout, trained with sigmoid cross-entropy.

    Args:
        field_sizes: vocabulary size of each sparse input field.
        embed_size: embedding dimension k for every field.
        layer_sizes / layer_acts / drop_out: per-hidden-layer width,
            activation name, and dropout rate.
        embed_l2 / layer_l2: L2 penalty weights on the embedding output / layer
            weights (applied only when layer_l2 is not None).
        init_path: optional path for utils.init_var_map to load weights from.
        opt_algo / learning_rate: optimizer selection for utils.get_optimizer.
        random_seed: optional graph-level seed.
        layer_norm: NOTE(review) currently unused — the experimental layer-norm
            code was left commented out by the author and is omitted here.
    """
    Model.__init__(self)
    init_vars = []
    num_inputs = len(field_sizes)
    # One embedding table per field.
    for i in range(num_inputs):
        init_vars.append(('embed_%d' % i, [field_sizes[i], embed_size], 'xavier', dtype))
    # All unordered field pairs; first layer input = embeddings + pair products.
    num_pairs = int(num_inputs * (num_inputs - 1) / 2)
    node_in = num_inputs * embed_size + num_pairs
    # Kernel tensor for the pairwise product: k * pair * k.
    init_vars.append(('kernel', [embed_size, num_pairs, embed_size], 'xavier', dtype))
    for i in range(len(layer_sizes)):
        init_vars.append(('w%d' % i, [node_in, layer_sizes[i]], 'xavier', dtype))
        init_vars.append(('b%d' % i, [layer_sizes[i]], 'zero', dtype))
        node_in = layer_sizes[i]
    self.graph = tf.Graph()
    with self.graph.as_default():
        if random_seed is not None:
            tf.set_random_seed(random_seed)
        # One sparse placeholder per field; y holds the binary labels.
        self.X = [tf.sparse_placeholder(dtype) for i in range(num_inputs)]
        self.y = tf.placeholder(dtype)
        self.keep_prob_train = 1 - np.array(drop_out)
        self.keep_prob_test = np.ones_like(drop_out)
        self.layer_keeps = tf.placeholder(dtype)
        self.vars = utils.init_var_map(init_vars, init_path)
        w0 = [self.vars['embed_%d' % i] for i in range(num_inputs)]
        # Embed each field and concatenate: (batch, num_inputs * k).
        xw = tf.concat([tf.sparse_tensor_dense_matmul(self.X[i], w0[i])
                        for i in range(num_inputs)], 1)
        xw3d = tf.reshape(xw, [-1, num_inputs, embed_size])
        # Index lists selecting the left (row) and right (col) field of each pair.
        row = []
        col = []
        for i in range(num_inputs - 1):
            for j in range(i + 1, num_inputs):
                row.append(i)
                col.append(j)
        # batch * pair * k
        p = tf.transpose(
            # pair * batch * k
            tf.gather(
                # num * batch * k
                tf.transpose(xw3d, [1, 0, 2]),
                row),
            [1, 0, 2])
        # batch * pair * k
        q = tf.transpose(tf.gather(tf.transpose(xw3d, [1, 0, 2]), col), [1, 0, 2])
        # b * p * k
        p = tf.reshape(p, [-1, num_pairs, embed_size])
        # b * p * k
        q = tf.reshape(q, [-1, num_pairs, embed_size])
        # k * p * k
        k = self.vars['kernel']
        # batch * 1 * pair * k
        p = tf.expand_dims(p, 1)
        # Kernel product per pair: sum_k( (sum_k p*k) * q ) -> batch * pair.
        kp = tf.reduce_sum(
            # batch * pair * k
            tf.multiply(
                # batch * pair * k
                tf.transpose(
                    # batch * k * pair
                    tf.reduce_sum(
                        # batch * k * pair * k
                        tf.multiply(p, k),
                        -1),
                    [0, 2, 1]),
                q),
            -1)
        # (Author's commented-out layer-norm experiment removed for clarity.)
        # First layer input: raw embeddings concatenated with kernel products.
        l = tf.concat([xw, kp], 1)
        for i in range(len(layer_sizes)):
            wi = self.vars['w%d' % i]
            bi = self.vars['b%d' % i]
            l = tf.nn.dropout(
                utils.activate(tf.matmul(l, wi) + bi, layer_acts[i]),
                self.layer_keeps[i])
        l = tf.squeeze(l)
        self.y_prob = tf.sigmoid(l)
        self.loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(logits=l, labels=self.y))
        # L2 regularization on embedding outputs and layer weights.
        if layer_l2 is not None:
            self.loss += embed_l2 * tf.nn.l2_loss(xw)  # tf.concat(w0, 0))
            for i in range(len(layer_sizes)):
                wi = self.vars['w%d' % i]
                self.loss += layer_l2[i] * tf.nn.l2_loss(wi)
        self.optimizer = utils.get_optimizer(opt_algo, learning_rate, self.loss)
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        tf.global_variables_initializer().run(session=self.sess)
def __init__(self, field_sizes=None, embed_size=10, filter_sizes=None, layer_acts=None, drop_out=None, init_path=None, opt_algo='gd', learning_rate=1e-2, random_seed=None):
    """CCPM-style model: per-field embeddings, two conv/max-pool stages over the
    field axis, then a single linear output trained with sigmoid cross-entropy.
    """
    Model.__init__(self)
    num_inputs = len(field_sizes)
    # Variable specs: one embedding table per field, two conv filters, and the
    # final linear layer (input width 2 channels * 3 pooled positions * k).
    init_vars = [('embed_%d' % i, [field_sizes[i], embed_size], 'xavier', dtype)
                 for i in range(num_inputs)]
    init_vars += [
        ('f1', [embed_size, filter_sizes[0], 1, 2], 'xavier', dtype),
        ('f2', [embed_size, filter_sizes[1], 2, 2], 'xavier', dtype),
        ('w1', [2 * 3 * embed_size, 1], 'xavier', dtype),
        ('b1', [1], 'zero', dtype),
    ]
    self.graph = tf.Graph()
    with self.graph.as_default():
        if random_seed is not None:
            tf.set_random_seed(random_seed)
        # One sparse input per field plus the label placeholder.
        self.X = [tf.sparse_placeholder(dtype) for _ in range(num_inputs)]
        self.y = tf.placeholder(dtype)
        self.keep_prob_train = 1 - np.array(drop_out)
        self.keep_prob_test = np.ones_like(drop_out)
        self.layer_keeps = tf.placeholder(dtype)
        self.vars = utils.init_var_map(init_vars, init_path)
        embed_tables = [self.vars['embed_%d' % i] for i in range(num_inputs)]
        # Embed each field and concatenate: (batch, num_inputs * embed_size).
        field_embeds = [tf.sparse_tensor_dense_matmul(self.X[i], embed_tables[i])
                        for i in range(num_inputs)]
        xw = tf.concat(field_embeds, 1)
        # Arrange as (batch, embed_size, num_inputs, 1) to convolve over fields.
        net = tf.transpose(tf.reshape(xw, [-1, num_inputs, embed_size, 1]),
                           [0, 2, 1, 3])
        # Stage 1: conv, then max-pool the field axis down to num_inputs // 2.
        net = tf.nn.conv2d(net, self.vars['f1'], [1, 1, 1, 1], 'SAME')
        net = tf.transpose(
            utils.max_pool_4d(tf.transpose(net, [0, 1, 3, 2]),
                              int(num_inputs / 2)),
            [0, 1, 3, 2])
        # Stage 2: conv, then max-pool the field axis down to 3 positions.
        net = tf.nn.conv2d(net, self.vars['f2'], [1, 1, 1, 1], 'SAME')
        net = tf.transpose(
            utils.max_pool_4d(tf.transpose(net, [0, 1, 3, 2]), 3),
            [0, 1, 3, 2])
        # Flatten, activate, and apply dropout before the linear output.
        flat = tf.reshape(net, [-1, embed_size * 3 * 2])
        hidden = tf.nn.dropout(utils.activate(flat, layer_acts[0]),
                               self.layer_keeps[0])
        logits = tf.squeeze(tf.matmul(hidden, self.vars['w1']) + self.vars['b1'])
        self.y_prob = tf.sigmoid(logits)
        self.loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=self.y))
        self.optimizer = utils.get_optimizer(opt_algo, learning_rate, self.loss)
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        tf.global_variables_initializer().run(session=self.sess)
def __init__(self, field_sizes=None, embed_size=10, layer_sizes=None, layer_acts=None, drop_out=None, embed_l2=None, layer_l2=None, init_path=None, opt_algo='gd', learning_rate=1e-2, random_seed=None):
    """Inner-product network: per-field embeddings, pairwise inner products,
    then fully-connected layers with dropout, trained with sigmoid cross-entropy.

    Args:
        field_sizes: vocabulary size of each sparse input field.
        embed_size: embedding dimension k for every field.
        layer_sizes / layer_acts / drop_out: per-hidden-layer width,
            activation name, and dropout rate.
        embed_l2 / layer_l2: L2 penalty weights (applied only when layer_l2
            is not None).
        init_path: optional path for utils.init_var_map to load weights from.
        opt_algo / learning_rate: optimizer selection for utils.get_optimizer.
        random_seed: optional graph-level seed.
    """
    Model.__init__(self)
    init_vars = []
    num_inputs = len(field_sizes)
    # One embedding table per field.
    for i in range(num_inputs):
        init_vars.append(('embed_%d' % i, [field_sizes[i], embed_size], 'xavier', dtype))
    # All unordered field pairs; first layer input = embeddings + inner products.
    num_pairs = int(num_inputs * (num_inputs - 1) / 2)
    node_in = num_inputs * embed_size + num_pairs
    # node_in = num_inputs * (embed_size + num_inputs)
    for i in range(len(layer_sizes)):
        init_vars.append(('w%d' % i, [node_in, layer_sizes[i]], 'xavier', dtype))
        init_vars.append(('b%d' % i, [layer_sizes[i]], 'zero', dtype))
        node_in = layer_sizes[i]
    self.graph = tf.Graph()
    with self.graph.as_default():
        if random_seed is not None:
            tf.set_random_seed(random_seed)
        # One sparse placeholder per field; y holds the binary labels.
        self.X = [tf.sparse_placeholder(dtype) for i in range(num_inputs)]
        self.y = tf.placeholder(dtype)
        self.keep_prob_train = 1 - np.array(drop_out)
        self.keep_prob_test = np.ones_like(drop_out)
        self.layer_keeps = tf.placeholder(dtype)
        self.vars = utils.init_var_map(init_vars, init_path)
        w0 = [self.vars['embed_%d' % i] for i in range(num_inputs)]
        # Embed each field and concatenate: (batch, num_inputs * k).
        xw = tf.concat([tf.sparse_tensor_dense_matmul(self.X[i], w0[i])
                        for i in range(num_inputs)], 1)
        xw3d = tf.reshape(xw, [-1, num_inputs, embed_size])
        # Index lists selecting the left (row) and right (col) field of each pair.
        row = []
        col = []
        for i in range(num_inputs-1):
            for j in range(i+1, num_inputs):
                row.append(i)
                col.append(j)
        # batch * pair * k
        p = tf.transpose(
            # pair * batch * k
            tf.gather(
                # num * batch * k
                tf.transpose(
                    xw3d, [1, 0, 2]),
                row),
            [1, 0, 2])
        # batch * pair * k
        q = tf.transpose(
            tf.gather(
                tf.transpose(
                    xw3d, [1, 0, 2]),
                col),
            [1, 0, 2])
        p = tf.reshape(p, [-1, num_pairs, embed_size])
        q = tf.reshape(q, [-1, num_pairs, embed_size])
        # Inner product per pair: (batch, num_pairs).
        ip = tf.reshape(tf.reduce_sum(p * q, [-1]), [-1, num_pairs])
        # (Author's "simple but redundant" alternative — the full
        # batch*n*n product matrix via expand_dims — removed for clarity.)
        # First layer input: raw embeddings concatenated with the inner products.
        l = tf.concat([xw, ip], 1)
        for i in range(len(layer_sizes)):
            wi = self.vars['w%d' % i]
            bi = self.vars['b%d' % i]
            l = tf.nn.dropout(
                utils.activate(
                    tf.matmul(l, wi) + bi,
                    layer_acts[i]),
                self.layer_keeps[i])
        l = tf.squeeze(l)
        self.y_prob = tf.sigmoid(l)
        self.loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(logits=l, labels=self.y))
        # L2 regularization on embedding outputs and layer weights.
        if layer_l2 is not None:
            self.loss += embed_l2 * tf.nn.l2_loss(xw)
            for i in range(len(layer_sizes)):
                wi = self.vars['w%d' % i]
                self.loss += layer_l2[i] * tf.nn.l2_loss(wi)
        self.optimizer = utils.get_optimizer(opt_algo, learning_rate, self.loss)
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        tf.global_variables_initializer().run(session=self.sess)
def _write_csv(path, rows):
    """Write an iterable of rows to *path* as CSV (helper for main's eval dump)."""
    with open(path, "w") as f:
        csv.writer(f).writerows(rows)


def main():
    """Entry point for imSitu VSRL training/evaluation.

    Parses CLI flags, builds the RelationNetworks model and data loaders,
    selects which parameter groups to train (role only / +verb / +cnn /
    resume), then either evaluates on the dev set (--evaluate) or trains.
    """
    import argparse
    parser = argparse.ArgumentParser(
        description="imsitu VSRL. Training, evaluation and prediction.")
    parser.add_argument("--gpuid", default=-1,
                        help="put GPU id > -1 in GPU mode", type=int)
    #parser.add_argument("--command", choices = ["train", "eval", "resume", 'predict'], required = True)
    parser.add_argument('--resume_training', action='store_true',
                        help='Resume training from the model [resume_model]')
    parser.add_argument('--resume_model', type=str, default='',
                        help='The model we resume')
    parser.add_argument('--verb_module', type=str, default='',
                        help='pretrained verb module')
    parser.add_argument('--train_role', action='store_true',
                        help='cnn fix, verb fix, role train from the scratch')
    parser.add_argument(
        '--finetune_verb', action='store_true',
        help='cnn fix, verb finetune, role train from the scratch')
    parser.add_argument(
        '--finetune_cnn', action='store_true',
        help='cnn finetune, verb finetune, role train from the scratch')
    parser.add_argument('--output_dir', type=str, default='./trained_models',
                        help='Location to output the model')
    parser.add_argument('--evaluate', action='store_true',
                        help='Only use the testing mode')
    #todo: train role module separately with gt verbs
    args = parser.parse_args()

    # Hyper-parameters.
    batch_size = 640  # NOTE(review): unused — the DataLoaders below hard-code their own sizes
    #lr = 5e-6
    lr = 0.0001
    lr_max = 5e-4
    lr_gamma = 0.1
    lr_step = 25
    clip_norm = 50
    weight_decay = 1e-4
    n_epoch = 500
    n_worker = 3
    dataset_folder = 'imSitu'
    imgset_folder = 'resized_256'

    # FIX: json.load(open(...)) leaked file handles; use context managers.
    with open(dataset_folder + "/train.json") as f:
        train_set = json.load(f)
    encoder = imsitu_encoder(train_set)
    model = model_vsrl_small_finetune.RelationNetworks(encoder, args.gpuid)
    # Group parameters so the optimizer can treat CNN / verb / role parts differently.
    cnn_features, verb_features, role_features = utils.group_features(model)
    train_set = imsitu_loader(imgset_folder, train_set, encoder,
                              model.train_preprocess())
    train_loader = torch.utils.data.DataLoader(train_set, batch_size=32,
                                               shuffle=True, num_workers=n_worker)
    with open(dataset_folder + "/dev.json") as f:
        dev_set = json.load(f)
    dev_set = imsitu_loader(imgset_folder, dev_set, encoder,
                            model.train_preprocess())
    dev_loader = torch.utils.data.DataLoader(dev_set, batch_size=32,
                                             shuffle=True, num_workers=n_worker)
    # NOTE(review): traindev also loads dev.json — presumably a small
    # eval-during-training split; confirm that is intentional.
    with open(dataset_folder + "/dev.json") as f:
        traindev_set = json.load(f)
    traindev_set = imsitu_loader(imgset_folder, traindev_set, encoder,
                                 model.train_preprocess())
    traindev_loader = torch.utils.data.DataLoader(traindev_set, batch_size=8,
                                                  shuffle=True,
                                                  num_workers=n_worker)

    # Freeze everything; the chosen mode below re-enables the right parts.
    utils.set_trainable(model, False)
    if args.train_role:
        print('CNN fix, Verb fix, train role from the scratch from: {}'.format(
            args.verb_module))
        args.train_all = False
        if len(args.verb_module) == 0:
            raise Exception('[pretrained verb module] not specified')
        utils.load_net(args.verb_module, [model.conv, model.verb],
                       ['conv', 'verb'])
        optimizer_select = 1
        model_name = 'cfx_vfx_rtrain'
    elif args.finetune_verb:
        print('CNN fix, Verb finetune, train role from the scratch from: {}'.
              format(args.verb_module))
        args.train_all = True
        if len(args.verb_module) == 0:
            raise Exception('[pretrained verb module] not specified')
        utils.load_net(args.verb_module, [model.conv, model.verb],
                       ['conv', 'verb'])
        optimizer_select = 2
        model_name = 'cfx_vft_rtrain'
    elif args.finetune_cnn:
        print(
            'CNN finetune, Verb finetune, train role from the scratch from: {}'
            .format(args.verb_module))
        args.train_all = True
        if len(args.verb_module) == 0:
            raise Exception('[pretrained verb module] not specified')
        utils.load_net(args.verb_module, [model.conv, model.verb],
                       ['conv', 'verb'])
        optimizer_select = 3
        model_name = 'cft_vft_rtrain'
    elif args.resume_training:
        print('Resume training from: {}'.format(args.resume_model))
        args.train_all = True
        if len(args.resume_model) == 0:
            raise Exception('[pretrained verb module] not specified')
        utils.load_net(args.resume_model, [model])
        optimizer_select = 0
        model_name = 'resume_all'
    else:
        if not args.evaluate:
            print('Training from the scratch.')
        optimizer_select = 0
        args.train_all = True
        model_name = 'train_full'

    optimizer = utils.get_optimizer(lr, weight_decay, optimizer_select,
                                    cnn_features, verb_features, role_features)
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)
    if args.gpuid >= 0:
        #print('GPU enabled')
        model.cuda()
    #optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=lr_step,
                                                gamma=lr_gamma)
    #gradient clipping, grad check
    if args.evaluate:
        top1, top5, val_loss = eval(model, dev_loader, encoder, args.gpuid,
                                    write_to_file=True)
        top1_avg = top1.get_average_results()
        top5_avg = top5.get_average_results()
        # Mean over the six top-1/top-5 metrics (verb, value, value-all).
        avg_score = top1_avg["verb"] + top1_avg["value"] + top1_avg["value-all"] + \
            top5_avg["verb"] + top5_avg["value"] + top5_avg["value-all"]
        avg_score /= 8
        print('Dev average :{:.2f} {} {}'.format(
            avg_score * 100,
            utils.format_dict(top1_avg, '{:.2f}', '1-'),
            utils.format_dict(top5_avg, '{:.2f}', '5-')))
        # Dump ground truth / predictions / per-verb counts to CSV files.
        gt_labels = top1.gt_situation
        pred_labels = top1.predicted_situation
        verb_pred = top1.verb_pred
        _write_csv("gt_rn_only.csv", gt_labels)
        _write_csv("pred_rn_only.csv", pred_labels)
        _write_csv("verbpred_rn_only.csv",
                   [['verb', 'total', 'predicted']] +
                   [[key, value[0], value[1]] for key, value in verb_pred.items()])
        print('Writing predictions to file completed !')
    else:
        print('Model training started!')
        train(model, train_loader, dev_loader, traindev_loader, optimizer,
              scheduler, n_epoch, args.output_dir, encoder, args.gpuid,
              clip_norm, lr_max, model_name, args)
def __init__(self, field_sizes=None, embed_size=10, layer_sizes=None, layer_acts=None, drop_out=None, embed_l2=None, layer_l2=None, init_path=None, opt_algo='gd', learning_rate=1e-2, random_seed=None):
    """DNN with self-attention over intermediate hidden layers.

    Per-field embeddings feed a stack of hidden layers; the intermediate
    activations are stacked, self-attended (diagonal masked out), pooled, and
    concatenated with the last hidden layer before the final linear output.

    Args:
        field_sizes: vocabulary size of each sparse input field.
        embed_size: embedding dimension for every field.
        layer_sizes: widths of all layers; the last entry is the output width,
            earlier entries are hidden widths (w_final consumes 2 * last hidden).
        layer_acts / drop_out: per-layer activation name and dropout rate.
        embed_l2 / layer_l2: L2 penalty weights (applied when layer_l2 set).
        init_path: optional path for utils.init_var_map to load weights from.
        opt_algo / learning_rate: optimizer selection for utils.get_optimizer.
        random_seed: optional graph-level seed.
    """
    Model.__init__(self)
    init_vars = []
    num_inputs = len(field_sizes)
    for i in range(num_inputs):
        init_vars.append(('embed_%d' % i, [field_sizes[i], embed_size], 'xavier', dtype))
    node_in = num_inputs * embed_size
    # Hidden layers (all but the final output layer).
    for i in range(len(layer_sizes) - 1):
        init_vars.append(('w%d' % i, [node_in, layer_sizes[i]], 'xavier', dtype))
        init_vars.append(('b%d' % i, [layer_sizes[i]], 'zero', dtype))
        node_in = layer_sizes[i]
    # Final layer takes [last hidden ; attention-pooled] -> 2 * node_in wide.
    init_vars.append(('w_final', [2 * node_in, layer_sizes[-1]], 'xavier', dtype))
    init_vars.append(('b_final', [layer_sizes[-1]], 'zero', dtype))
    self.graph = tf.Graph()
    with self.graph.as_default():
        if random_seed is not None:
            tf.set_random_seed(random_seed)
        # One large sparse input split into per-field blocks, each embedded
        # separately and then concatenated (translated from the original
        # Chinese comment: "38 fields — one big sparse matrix, ~6086-dim").
        self.X = [tf.sparse_placeholder(dtype) for i in range(num_inputs)]
        self.y = tf.placeholder(dtype)
        self.keep_prob_train = 1 - np.array(drop_out)
        self.keep_prob_test = np.ones_like(drop_out)
        self.layer_keeps = tf.placeholder(dtype)
        self.vars = utils.init_var_map(init_vars, init_path)
        w0 = [self.vars['embed_%d' % i] for i in range(num_inputs)]
        xw = tf.concat([
            tf.sparse_tensor_dense_matmul(self.X[i], w0[i])
            for i in range(num_inputs)
        ], 1)
        l = xw
        la = []  # collects the activations of each intermediate hidden layer
        for i in range(len(layer_sizes) - 2):
            wi = self.vars['w%d' % i]
            bi = self.vars['b%d' % i]
            print(l.shape, wi.shape, bi.shape)
            l = tf.nn.dropout(
                utils.activate(tf.matmul(l, wi) + bi, layer_acts[i]),
                self.layer_keeps[i])
            la.append(l)
        # Last hidden layer (not included in the attention stack).
        l_final = tf.nn.dropout(
            utils.activate(
                tf.matmul(l, self.vars['w%d' % (len(layer_sizes) - 2)]) +
                self.vars['b%d' % (len(layer_sizes) - 2)],
                layer_acts[len(layer_sizes) - 2]),
            self.layer_keeps[len(layer_sizes) - 2])
        # Stack intermediate layers; NOTE(review): this assumes every
        # intermediate layer has width layer_sizes[0] — confirm for configs
        # with non-uniform hidden sizes.
        la_new = tf.concat([x for x in la], 0)
        H = tf.reshape(la_new, [-1, len(layer_sizes) - 2, layer_sizes[0]
                                ])  # shape = [batch_size,3,128]
        H_T = tf.transpose(H, [0, 2, 1])  # shape=[batch_size,128,3]
        # Pairwise similarities between layer activations.
        S_0 = tf.matmul(H, H_T)
        # Zero the diagonal so a layer does not attend to itself.
        mask = [x for x in range(len(layer_sizes) - 2)]
        mask_zero = tf.ones([len(layer_sizes) - 2,
                             len(layer_sizes) - 2]) - tf.one_hot(
                                 mask, len(layer_sizes) - 2)
        S = tf.multiply(S_0, mask_zero)
        print(S.shape)
        A = tf.nn.softmax(S, name='attention')  # shape = batch_size *3*3
        G = tf.reduce_sum(tf.matmul(A, H), 1)  # shape = batch_size * 128
        print(G.shape)
        # Concatenate last hidden layer with the attention-pooled summary.
        M = tf.concat([l_final, G], 1)
        w_final = self.vars['w_final']
        b_final = self.vars['b_final']
        l_final = tf.matmul(M, w_final) + b_final
        l_final = tf.squeeze(l_final)
        # NOTE(review): unlike the sibling models, y_prob here is the raw
        # logit (no sigmoid) and the loss uses reduce_sum rather than
        # reduce_mean — confirm both are intentional.
        self.y_prob = l_final
        self.loss = tf.reduce_sum(
            tf.nn.sigmoid_cross_entropy_with_logits(logits=l_final,
                                                    labels=self.y))
        if layer_l2 is not None:
            self.loss += embed_l2 * tf.nn.l2_loss(xw)
            for i in range(len(layer_sizes) - 1):
                wi = self.vars['w%d' % i]
                self.loss += layer_l2[i] * tf.nn.l2_loss(wi)
        self.optimizer = utils.get_optimizer(opt_algo, learning_rate, self.loss)
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        tf.global_variables_initializer().run(session=self.sess)
        # Keep only the three most recent checkpoints.
        self.saver = tf.train.Saver(max_to_keep=3)
def __init__(self, train_loader, test_loader, embed_size=10, product_way='in', layer_size=None, layer_act=None, layer_keeps=None, opt_algo='gd', learning_rate=0.01, epoch=10, early_stop_round=None, l2=None, random_seed=None):
    """Product-based Neural Network (PNN).

    Builds per-field embeddings, a pairwise product layer (inner product when
    ``product_way == 'in'``, kernel/outer product otherwise), and a stack of
    dropout hidden layers, trained with sigmoid cross-entropy.

    Args:
        train_loader / test_loader: data iterators kept for later fit/eval.
        embed_size: embedding dimension per field.
        product_way: 'in' for inner products, anything else for kernel products.
        layer_size / layer_act / layer_keeps: per-hidden-layer width,
            activation name, and keep probability.
        opt_algo / learning_rate: optimizer selection for utils.get_optimizer.
        epoch / early_stop_round / l2 / random_seed: training configuration.
    """
    # NOTE(review): self.graph is created but `with self.graph.as_default():`
    # is commented out below, so all ops land on the default graph — confirm
    # this is intentional (utils.get_var above also builds variables there).
    self.graph = tf.Graph()
    self.train_loader = train_loader
    self.test_loader = test_loader
    self.embed_size = embed_size
    self.product_way = product_way
    self.layer_size = layer_size
    self.layer_act = layer_act
    self.layer_keeps = layer_keeps
    self.num_fields = len(config.FIELD_SIZES)
    # Variable specs: one embedding table per field.
    self.var_list = []
    for idx in range(self.num_fields):
        self.var_list.append([
            'embed_{}'.format(idx),
            [config.FIELD_SIZES[idx], self.embed_size], 'xavier'
        ])
    num_pairs = int(self.num_fields * (self.num_fields - 1) / 2)
    # Kernel tensor only needed for the outer/kernel product variant.
    if self.product_way == 'out':
        self.var_list.append([
            'kernel', [self.embed_size, num_pairs, self.embed_size], 'xavier'
        ])
    # First hidden layer consumes embeddings + pair products.
    in_size = self.num_fields * self.embed_size + num_pairs
    for idx in range(len(layer_size)):
        self.var_list.append(
            ['w_{}'.format(idx), [in_size, layer_size[idx]], 'xavier'])
        self.var_list.append(
            ['b_{}'.format(idx), [layer_size[idx]], 'zero'])
        in_size = layer_size[idx]
    self.var_dict = utils.get_var(self.var_list)
    self.opt_algo = opt_algo
    self.learning_rate = learning_rate
    self.epoch = epoch
    self.early_stop_round = early_stop_round
    self.l2 = l2
    self.random_seed = random_seed
    # Per-epoch bookkeeping filled in during training.
    self.time_scores = []
    self.train_scores = []
    self.test_scores = []
    # with self.graph.as_default():
    if self.random_seed is not None:
        tf.set_random_seed(self.random_seed)
    # One sparse placeholder per field; y holds the binary labels.
    self.X = [
        tf.sparse_placeholder(config.DTYPE)
        for n in range(self.num_fields)
    ]
    self.y = tf.placeholder(config.DTYPE)
    with tf.variable_scope('Embedding_Layer'):
        w_embed = [
            self.var_dict['embed_{}'.format(idx)]
            for idx in range(self.num_fields)
        ]
        # Embed each field and concatenate: (batch, num_fields * embed_size).
        xw = tf.concat([
            tf.sparse_tensor_dense_matmul(self.X[idx], w_embed[idx])
            for idx in range(self.num_fields)
        ], 1)
        layer_out = xw
    xw3d = tf.reshape(xw, [-1, self.num_fields, self.embed_size])
    with tf.variable_scope('Product_Layer'):
        # Index lists selecting the left (row) / right (col) field of each pair.
        row = []
        col = []
        for i in range(self.num_fields - 1):
            for j in range(i + 1, self.num_fields):
                row.append(i)
                col.append(j)
        # p/q: (batch, num_pairs, embed_size) — left and right pair members.
        p = tf.transpose(tf.gather(tf.transpose(xw3d, [1, 0, 2]), row),
                         [1, 0, 2])
        q = tf.transpose(tf.gather(tf.transpose(xw3d, [1, 0, 2]), col),
                         [1, 0, 2])
        p = tf.reshape(p, [-1, num_pairs, self.embed_size])
        q = tf.reshape(q, [-1, num_pairs, self.embed_size])
        if self.product_way == 'in':
            # Inner product per pair: (batch, num_pairs).
            product = tf.reshape(tf.reduce_sum(p * q, [-1]),
                                 [-1, num_pairs])
        else:
            # Kernel product: contract p with the kernel, then with q.
            k = self.var_dict['kernel']
            p = tf.expand_dims(p, 1)
            product = tf.reduce_sum(
                tf.multiply(
                    tf.transpose(tf.reduce_sum(tf.multiply(p, k), -1),
                                 [0, 2, 1]), q), -1)
        layer_out = tf.concat([layer_out, product], 1)
    for idx in range(len(layer_size)):
        # (scope name 'Hiden_Layer' is a historical typo kept for
        # checkpoint/name compatibility)
        with tf.variable_scope('Hiden_Layer_{}'.format(idx)):
            wi = self.var_dict['w_{}'.format(idx)]
            bi = self.var_dict['b_{}'.format(idx)]
            layer_out = tf.nn.dropout(
                utils.activate(
                    tf.matmul(layer_out, wi) + bi, self.layer_act[idx]),
                self.layer_keeps[idx])
    layer_out = tf.squeeze(layer_out)
    self.y_preds = tf.sigmoid(layer_out)
    self.loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(labels=self.y,
                                                logits=layer_out))
    # Single l2 coefficient applied to embeddings and layer weights.
    if self.l2 is not None:
        for idx in range(self.num_fields):
            self.loss += self.l2 * tf.nn.l2_loss(
                self.var_dict['embed_{}'.format(idx)])
        for idx in range(len(self.layer_size)):
            self.loss += self.l2 * tf.nn.l2_loss(
                self.var_dict['w_{}'.format(idx)])
    self.optimizer = utils.get_optimizer(self.opt_algo, self.learning_rate,
                                         self.loss)
    self.sess = tf.Session()
    tf.global_variables_initializer().run(session=self.sess)
# Option DATAROOT = '/home/taey16/storage' if not USE_NSML else os.path.join( DATASET_PATH[1], 'taey16', 'storage') opt = parse_option(DATAROOT, USE_NSML=USE_NSML, print_option=False) # Data Loader dataset_trn, dataset_val = get_dataloader(opt) # Loading model net = create_model(opt) # Loss Function hm_criterion, off_criterion = get_loss_function(opt) # Optimizer optimizer = get_optimizer(net, opt) scheduler = CosineAnnealingLR(optimizer, eta_min=opt.lr * opt.eta_min_ratio, T_max=(opt.max_epoch - opt.lr_warmup_epoch)) # Initial Best Score global_iter, best_nme, best_epoch = [0, 10000, 0] #NOTE: main loop for training if __name__ == "__main__": if USE_NSML: scope = locals() nsml_bind_model(nsml, scope, 0, net, optimizer) else: scope = None
def main():
    """Train a classifier on the Nexperia (or a generic) dataset.

    Reads all configuration from the module-level ``args`` namespace:
    builds loaders/model/criterion/optimizer, optionally resumes from a
    checkpoint, runs the epoch loop with periodic validation and
    checkpointing, then dumps per-epoch timelines (loss/acc/AUC/FPR
    curves) as ``.npy`` files under ``args.save_dir``.
    """
    # Dynamically adjust SAT hyper-parameters for ResNets according to
    # base_width: narrower networks get a rescaled alpha and start epoch Es.
    if args.base_width != 64 and 'sat' in args.loss:
        factor = 64. / args.base_width
        args.sat_alpha = args.sat_alpha**(1. / factor)
        args.sat_es = int(args.sat_es * factor)
        print("Adaptive parameters adjustment: alpha = {:.3f}, Es = {:d}".format(args.sat_alpha, args.sat_es))
    print(args)
    global best_prec1, best_auc

    # Check the save_dir exists or not; also pre-create the per-split
    # subdirectories used by the np.save calls at the end of training.
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
        os.makedirs(os.path.join(args.save_dir, 'train'))
        os.makedirs(os.path.join(args.save_dir, 'val'))
        os.makedirs(os.path.join(args.save_dir, 'test'))

    # prepare dataset: plain 'nexperia' provides no val/test loaders.
    if args.dataset == 'nexperia':
        train_loader, num_classes, targets = get_loader(args)
    else:
        train_loader, val_loaders, test_loader, num_classes, targets = get_loader(args)

    model = get_model(args, num_classes, base_width=args.base_width)
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
    model.cuda()

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            # The best-metric key depends on the dataset variant.
            if args.dataset=='nexperia_split':
                best_auc = checkpoint['best_auc']
            else:
                best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    torch.cuda.manual_seed(args.seed)
    cudnn.benchmark = True

    criterion = get_loss(args, labels=targets, num_classes=num_classes)
    optimizer = get_optimizer(model, args)
    scheduler = get_scheduler(optimizer, args)

    # Per-split containers accumulating per-epoch metrics for later dumping.
    train_timeline = Timeline()
    val_timeline = Timeline()
    test_timeline = Timeline()

    if args.evaluate:
        validate(test_loader, model)
        return

    print("*" * 40)
    start = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        scheduler.step(epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, train_timeline, args.sat_es, args.dataset, args.mod)
        print("*" * 40)

        if args.dataset!='nexperia':
            # evaluate on validation sets
            prec1 = 0
            if args.dataset=='nexperia_split':
                print('val:')
                val_auc = validate(val_loaders, model, epoch, val_timeline, args.dataset, 'val', criterion)
                print("*" * 40)
                print('test:')
                test_auc = validate(test_loader, model, epoch, test_timeline, args.dataset, 'test', criterion)
            else:
                for name, val_loader in zip(args.val_sets, val_loaders):
                    print(name +":", end="\t")
                    prec1 = validate(val_loader, model)
            print("*" * 40)

            if args.dataset=='nexperia_split':
                # remember best auc and save checkpoint
                is_best = val_auc > best_auc
                best_auc = max(val_auc, best_auc)
                # Only keep periodic numbered checkpoints; otherwise the
                # helper decides the filename (filename=None).
                if args.save_freq > 0 and (epoch + 1) % args.save_freq == 0:
                    filename = 'checkpoint_{}.tar'.format(epoch + 1)
                else:
                    filename = None
                save_checkpoint(args.save_dir, {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'best_auc': best_auc,
                }, is_best, filename=filename)
            else:
                # remember best prec@1 and save checkpoint
                is_best = prec1 > best_prec1
                best_prec1 = max(prec1, best_prec1)
                if args.save_freq > 0 and (epoch + 1) % args.save_freq == 0:
                    filename = 'checkpoint_{}.tar'.format(epoch + 1)
                else:
                    filename = None
                save_checkpoint(args.save_dir, {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'best_prec1': best_prec1,
                }, is_best, filename=filename)

        # Record this epoch's per-sample weights for the tracked (true and
        # clean) label indices; the source depends on the criterion type.
        if hasattr(criterion, 'outputs'):
            criterion.weights[epoch] = criterion.outputs[criterion.true_labels.index]
            criterion.clean_weights[epoch] = criterion.outputs[criterion.clean_labels.index]
        else:
            criterion.weights[epoch] = criterion.soft_labels[criterion.true_labels.index]
            criterion.clean_weights[epoch] = criterion.soft_labels[criterion.clean_labels.index]

    if args.dataset!='nexperia':
        # evaluate latest checkpoint
        print("Test acc of latest checkpoint:", end='\t')
        validate(test_loader, model, epoch, test_timeline, args.dataset, last=True)
        print("*" * 40)
        # evaluate best checkpoint
        if args.dataset=='nexperia_split':
            checkpoint = torch.load(os.path.join(args.save_dir, 'checkpoint_best.tar'))
            print("Best validation auc ({}th epoch): {:.2f}%".format(checkpoint['epoch'], best_auc*100.))
            model.load_state_dict(checkpoint['state_dict'])
            print("Test acc of best checkpoint:", end='\t')
            validate(test_loader, model, checkpoint['epoch'], test_timeline, args.dataset, last=True)
            print("*" * 40)
        else:
            if len(val_loaders) > 0:
                checkpoint = torch.load(os.path.join(args.save_dir, 'checkpoint_best.tar'))
                print("Best validation acc ({}th epoch): {:.2f}%".format(checkpoint['epoch'], best_prec1))
                model.load_state_dict(checkpoint['state_dict'])
                print("Test acc of best checkpoint:", end='\t')
                validate(test_loader, model, last=True)
                print("*" * 40)

    time_elapsed = time.time() - start
    print('It takes {:.0f}m {:.0f}s to train.'.format(time_elapsed // 60, time_elapsed % 60))

    # save best result
    filename = 'train_results.tar'
    save_checkpoint(args.save_dir, {
        'num_epochs': args.epochs,
        'state_dict': model.state_dict(),
    }, is_best=True, filename=filename)

    # save soft label
    if hasattr(criterion, 'soft_labels'):
        out_fname = os.path.join(args.save_dir, 'updated_soft_labels.npy')
        np.save(out_fname, criterion.soft_labels.cpu().numpy())
        print("Updated soft labels is saved to {}".format(out_fname))

    # save weights change of 106 images
    if hasattr(criterion, 'weights'):
        out_fname = os.path.join(args.save_dir, 'weights_change.npy')
        np.save(out_fname, criterion.weights.cpu().numpy())
        print("weights change is saved to {}".format(out_fname))
    if hasattr(criterion, 'clean_weights'):
        out_fname = os.path.join(args.save_dir, 'clean_weights_change.npy')
        np.save(out_fname, criterion.clean_weights.cpu().numpy())
        print("clean weights change is saved to {}".format(out_fname))

    # save timelines: concatenate the per-epoch per-class tensors once,
    # then dump every curve as its own .npy file.
    train_acc_class = torch.cat(train_timeline.acc_class, dim=0)
    train_loss_class = torch.cat(train_timeline.loss_class, dim=0)
    train_acc_bi_class = torch.cat(train_timeline.acc_bi_class, dim=0)
    train_loss_bi_class = torch.cat(train_timeline.loss_bi_class, dim=0)
    train_me_class = torch.cat(train_timeline.me_class, dim=0)
    train_me_bi_class = torch.cat(train_timeline.me_bi_class, dim=0)
    val_acc_class = torch.cat(val_timeline.acc_class, dim=0)
    val_loss_class = torch.cat(val_timeline.loss_class, dim=0)
    val_acc_bi_class = torch.cat(val_timeline.acc_bi_class, dim=0)
    val_loss_bi_class = torch.cat(val_timeline.loss_bi_class, dim=0)
    val_me_class = torch.cat(val_timeline.me_class, dim=0)
    val_me_bi_class = torch.cat(val_timeline.me_bi_class, dim=0)
    test_acc_class = torch.cat(test_timeline.acc_class, dim=0)
    test_loss_class = torch.cat(test_timeline.loss_class, dim=0)
    test_acc_bi_class = torch.cat(test_timeline.acc_bi_class, dim=0)
    test_loss_bi_class = torch.cat(test_timeline.loss_bi_class, dim=0)
    test_me_class = torch.cat(test_timeline.me_class, dim=0)
    test_me_bi_class = torch.cat(test_timeline.me_bi_class, dim=0)

    np.save(os.path.join(args.save_dir, 'train', 'loss.npy'), train_timeline.loss)
    np.save(os.path.join(args.save_dir, 'train', 'acc.npy'), train_timeline.acc)
    np.save(os.path.join(args.save_dir, 'train', 'loss_bi.npy'), train_timeline.loss_bi)
    np.save(os.path.join(args.save_dir, 'train', 'acc_bi.npy'), train_timeline.acc_bi)
    np.save(os.path.join(args.save_dir, 'train', 'loss_class.npy'), train_loss_class)
    np.save(os.path.join(args.save_dir, 'train', 'acc_class.npy'), train_acc_class)
    np.save(os.path.join(args.save_dir, 'train', 'loss_bi_class.npy'), train_loss_bi_class)
    np.save(os.path.join(args.save_dir, 'train', 'acc_bi_class.npy'), train_acc_bi_class)
    np.save(os.path.join(args.save_dir, 'train', 'margin_error.npy'), train_timeline.margin_error)
    np.save(os.path.join(args.save_dir, 'train', 'margin_error_bi.npy'), train_timeline.margin_error_bi)
    np.save(os.path.join(args.save_dir, 'train', 'margin_error_class.npy'), train_me_class)
    np.save(os.path.join(args.save_dir, 'train', 'margin_error_bi_class.npy'), train_me_bi_class)
    np.save(os.path.join(args.save_dir, 'train', 'auc.npy'), train_timeline.auc)
    np.save(os.path.join(args.save_dir, 'train', 'fpr_991.npy'), train_timeline.fpr_991)
    np.save(os.path.join(args.save_dir, 'train', 'fpr_993.npy'), train_timeline.fpr_993)
    np.save(os.path.join(args.save_dir, 'train', 'fpr_995.npy'), train_timeline.fpr_995)
    np.save(os.path.join(args.save_dir, 'train', 'fpr_997.npy'), train_timeline.fpr_997)
    np.save(os.path.join(args.save_dir, 'train', 'fpr_999.npy'), train_timeline.fpr_999)
    np.save(os.path.join(args.save_dir, 'train', 'fpr_1.npy'), train_timeline.fpr_1)
    print("other training details are saved to {}".format(os.path.join(args.save_dir, 'train')))

    np.save(os.path.join(args.save_dir, 'val', 'loss.npy'), val_timeline.loss)
    np.save(os.path.join(args.save_dir, 'val', 'acc.npy'), val_timeline.acc)
    np.save(os.path.join(args.save_dir, 'val', 'loss_bi.npy'), val_timeline.loss_bi)
    np.save(os.path.join(args.save_dir, 'val', 'acc_bi.npy'), val_timeline.acc_bi)
    np.save(os.path.join(args.save_dir, 'val', 'loss_class.npy'), val_loss_class)
    np.save(os.path.join(args.save_dir, 'val', 'acc_class.npy'), val_acc_class)
    np.save(os.path.join(args.save_dir, 'val', 'loss_bi_class.npy'), val_loss_bi_class)
    np.save(os.path.join(args.save_dir, 'val', 'acc_bi_class.npy'), val_acc_bi_class)
    # NOTE(review): 'margin_error.npy' is written from margin_error_bi here,
    # unlike the train split above which uses margin_error — looks like a
    # copy-paste slip; confirm against the Timeline class.
    np.save(os.path.join(args.save_dir, 'val', 'margin_error.npy'), val_timeline.margin_error_bi)
    np.save(os.path.join(args.save_dir, 'val', 'margin_error_bi.npy'), val_timeline.margin_error_bi)
    np.save(os.path.join(args.save_dir, 'val', 'margin_error_class.npy'), val_me_class)
    np.save(os.path.join(args.save_dir, 'val', 'margin_error_bi_class.npy'), val_me_bi_class)
    np.save(os.path.join(args.save_dir, 'val', 'auc.npy'), val_timeline.auc)
    np.save(os.path.join(args.save_dir, 'val', 'fpr_991.npy'), val_timeline.fpr_991)
    np.save(os.path.join(args.save_dir, 'val', 'fpr_993.npy'), val_timeline.fpr_993)
    np.save(os.path.join(args.save_dir, 'val', 'fpr_995.npy'), val_timeline.fpr_995)
    np.save(os.path.join(args.save_dir, 'val', 'fpr_997.npy'), val_timeline.fpr_997)
    np.save(os.path.join(args.save_dir, 'val', 'fpr_999.npy'), val_timeline.fpr_999)
    np.save(os.path.join(args.save_dir, 'val', 'fpr_1.npy'), val_timeline.fpr_1)
    print("other validating details are saved to {}".format(os.path.join(args.save_dir, 'val')))

    np.save(os.path.join(args.save_dir, 'test', 'loss.npy'), test_timeline.loss)
    np.save(os.path.join(args.save_dir, 'test', 'acc.npy'), test_timeline.acc)
    np.save(os.path.join(args.save_dir, 'test', 'loss_bi.npy'), test_timeline.loss_bi)
    np.save(os.path.join(args.save_dir, 'test', 'acc_bi.npy'), test_timeline.acc_bi)
    np.save(os.path.join(args.save_dir, 'test', 'loss_class.npy'), test_loss_class)
    np.save(os.path.join(args.save_dir, 'test', 'acc_class.npy'), test_acc_class)
    np.save(os.path.join(args.save_dir, 'test', 'loss_bi_class.npy'), test_loss_bi_class)
    np.save(os.path.join(args.save_dir, 'test', 'acc_bi_class.npy'), test_acc_bi_class)
    # NOTE(review): same margin_error / margin_error_bi mismatch as the val
    # split above — confirm intended source attribute.
    np.save(os.path.join(args.save_dir, 'test', 'margin_error.npy'), test_timeline.margin_error_bi)
    np.save(os.path.join(args.save_dir, 'test', 'margin_error_bi.npy'), test_timeline.margin_error_bi)
    np.save(os.path.join(args.save_dir, 'test', 'margin_error_class.npy'), test_me_class)
    np.save(os.path.join(args.save_dir, 'test', 'margin_error_bi_class.npy'), test_me_bi_class)
    np.save(os.path.join(args.save_dir, 'test', 'auc.npy'), test_timeline.auc)
    np.save(os.path.join(args.save_dir, 'test', 'fpr_991.npy'), test_timeline.fpr_991)
    np.save(os.path.join(args.save_dir, 'test', 'fpr_993.npy'), test_timeline.fpr_993)
    np.save(os.path.join(args.save_dir, 'test', 'fpr_995.npy'), test_timeline.fpr_995)
    np.save(os.path.join(args.save_dir, 'test', 'fpr_997.npy'), test_timeline.fpr_997)
    np.save(os.path.join(args.save_dir, 'test', 'fpr_999.npy'), test_timeline.fpr_999)
    np.save(os.path.join(args.save_dir, 'test', 'fpr_1.npy'), test_timeline.fpr_1)
    print("other testing details are saved to {}".format(os.path.join(args.save_dir, 'test')))
correct = test(model, test_dataloader) state_keeper.update(time, epoch, loss_dict, correct) save_path = "pretrained/{prefix}.{time}.pth".format(prefix=args.prefix, time=time) torch.save(model.state_dict(), f=save_path) print("Current model has been saved under {}.".format(save_path)) if __name__ == "__main__": state_keeper = utils.StateKeeper(args) if args.exname == "TransferLearning": state_keeper_aux = utils.StateKeeper(args, state_keeper_name="aux") for time in range(args.times): model = utils.get_model(args) optimizer = utils.get_optimizer(args.optim, args.lr, model) forward_epoch(model, train_dataloader, test_dataloader, optimizer, state_keeper, time, args.epochs) if args.exname == "TransferLearning": optimizer_aux = utils.get_optimizer(args.optim, args.lr_aux, model) forward_epoch(model, train_dataloader_aux, test_dataloader_aux, optimizer_aux, state_keeper_aux, time, args.epochs_aux) state_keeper.save() if args.exname == "TransferLearning": state_keeper_aux.save() print("Done!")
def train(args):
    """Train a semantic-segmentation network (UNet or TriangleNet).

    Sets up file+console logging, builds train/val datasets and loaders,
    constructs the model and criterion from ``args``, then runs the epoch
    loop with periodic training-loss logging, full-validation passes
    (mPA / mIOU), and checkpointing.
    """
    # set logger: log both to a file under the run directory and to console.
    logging_dir = args.output_dir if args.output_dir else 'train-{}'.format(utils.get_datetime_string())
    os.mkdir('{}'.format(logging_dir))
    logging.basicConfig(
        level=logging.INFO,
        filename='{}/log.txt'.format(logging_dir),
        format='%(asctime)s %(message)s',
        filemode='w'
    )
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s %(message)s')
    console.setFormatter(formatter)
    logging.getLogger('').addHandler(console)
    logging.info('=========== Taks {} started! ==========='.format(args.output_dir))
    for arg in vars(args):
        logging.info('{}: {}'.format(arg, getattr(args, arg)))
    logging.info('========================================')

    # initialize loader; non-UNet networks emit one output per layer
    # (multi-scale supervision), UNet uses a single-scale target.
    multi_scale = len(args.layers) if args.network != 'unet' else 0
    train_set = utils.SegmentationImageFolder(os.sep.join([args.dataroot, 'train']),
                                              image_folder=args.img_dir,
                                              segmentation_folder=args.seg_dir,
                                              labels=args.color_labels,
                                              image_size=(args.image_width, args.image_height),
                                              random_horizontal_flip=args.random_horizontal_flip,
                                              random_rotation=args.random_rotation,
                                              random_crop=args.random_crop,
                                              random_square_crop=args.random_square_crop,
                                              label_regr=args.regression,
                                              multi_scale=multi_scale)
    # Validation set: no augmentation beyond the optional square crop.
    val_set = utils.SegmentationImageFolder(os.sep.join([args.dataroot, 'val']),
                                            image_folder=args.img_dir,
                                            segmentation_folder=args.seg_dir,
                                            labels=args.color_labels,
                                            image_size=(args.image_width, args.image_height),
                                            random_square_crop=args.random_square_crop,
                                            label_regr=args.regression)
    train_loader = torch.utils.data.DataLoader(train_set, batch_size=args.batch_size, shuffle=True)
    val_loader = torch.utils.data.DataLoader(val_set, batch_size=args.val_batch_size)

    # initialize model, input channels need to be calculated by hand
    n_classes = len(args.color_labels)
    if args.network == 'unet':
        network = networks.UNet
        criterion = nn.MSELoss() if args.regression else utils.CrossEntropyLoss2D()
    elif args.network == 'triangle':
        network = networks.TriangleNet
        # Weight the final output 0.15 and split 0.85 across the scales.
        criterion = utils.MSCrossEntropyLoss2D([0.15]+[0.85/float(multi_scale)]*multi_scale)
    else:
        # NOTE(review): any other network name leaves `network`/`criterion`
        # unbound and crashes below with NameError — confirm intended.
        pass
    val_criterion = utils.CrossEntropyLoss2D()
    if args.regression:
        model = network(args.layers, 3, 1, groups=args.groups)
    else:
        model = network(args.layers, 3, n_classes, groups=args.groups)
    if not args.cpu:
        model.cuda()

    # train
    iterations = 0
    for epoch in range(args.epochs):
        model.train()
        # update lr according to lr policy: a fresh optimizer is built at
        # every epoch listed in args.lr_policy (a dict epoch -> lr).
        if epoch in args.lr_policy:
            lr = args.lr_policy[epoch]
            optimizer = utils.get_optimizer(args.optimizer,
                                            model.parameters(),
                                            lr=lr,
                                            momentum=args.momentum,
                                            nesterov=args.nesterov)
            if epoch > 0:
                logging.info('| Learning Rate | Epoch: {: >3d} | Change learning rate to {}'.format(epoch+1, lr))
            else:
                logging.info('| Learning Rate | Initial learning rate: {}'.format(lr))
        # iterate all samples
        losses = utils.AverageMeter()
        for i_batch, (img, seg) in enumerate(train_loader):
            img = Variable(img)
            # Multi-scale targets come as a list of tensors, one per scale.
            seg = Variable(seg) if not multi_scale else [Variable(x) for x in seg]
            if not args.cpu:
                img = img.cuda()
                seg = seg.cuda() if not multi_scale else [x.cuda() for x in seg]
            # compute output
            output = model(img)
            loss = criterion(output, seg)
            # NOTE(review): loss.data[0] is pre-0.4 PyTorch indexing; on
            # modern PyTorch this would be loss.item() — confirm version.
            losses.update(loss.data[0])
            # compute gradient and do SGD step
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # logging training curve; the meter is reset after each report
            # so the average covers only the last interval.
            if iterations % args.print_interval == 0:
                logging.info(
                    '| Iterations: {: >6d} '
                    '| Epoch: {: >3d}/{: >3d} '
                    '| Batch: {: >4d}/{: >4d} '
                    '| Training loss: {:.6f}'.format(
                        iterations, epoch+1, args.epochs,
                        i_batch, len(train_loader)-1, losses.avg
                    )
                )
                losses = utils.AverageMeter()
            # validation on all val samples
            if iterations % args.validation_interval == 0:
                model.eval()
                val_losses = utils.AverageMeter()
                # Per-class pixel tallies for mPA / mIOU.
                gt_pixel_count = [0] * n_classes
                pred_pixel_count = [0] * n_classes
                intersection_pixel_count = [0] * n_classes
                union_pixel_count = [0] * n_classes
                for img, seg in val_loader:
                    img = Variable(img)
                    seg = Variable(seg)
                    if not args.cpu:
                        img = img.cuda()
                        seg = seg.cuda()
                    # compute output
                    output = model(img)
                    loss = val_criterion(output, seg)
                    # Weight the running loss by the fractional batch size.
                    val_losses.update(loss.data[0], float(img.size(0))/float(args.batch_size))
                    output_numpy = output.data.numpy() if args.cpu else output.data.cpu().numpy()
                    pred_labels = numpy.argmax(output_numpy, axis=1)
                    gt_labels = seg.data.numpy() if args.cpu else seg.data.cpu().numpy()
                    pred_labels = pred_labels.flatten()
                    gt_labels = gt_labels.flatten()
                    for i in range(n_classes):
                        pred_pixel_count[i] += (pred_labels == i).sum()
                        gt_pixel_count[i] += (gt_labels == i).sum()
                        # Distinct fill values (-1 vs -2) ensure equality
                        # only where BOTH masks select class i.
                        # NOTE(review): numpy.int is removed in NumPy >= 1.24;
                        # would need int/numpy.int64 there.
                        gt_dumb = numpy.full(gt_labels.shape, -1, dtype=numpy.int)
                        pred_dumb = numpy.full(pred_labels.shape, -2, dtype=numpy.int)
                        gt_dumb[gt_labels == i] = 0
                        pred_dumb[pred_labels == i] = 0
                        intersection_pixel_count[i] += (gt_dumb == pred_dumb).sum()
                        # After also zeroing GT positions, zeros mark the union.
                        pred_dumb[gt_labels == i] = 0
                        union_pixel_count[i] += (pred_dumb == 0).sum()
                # calculate mPA & mIOU
                mPA = 0
                mIOU = 0
                for i in range(n_classes):
                    mPA += float(intersection_pixel_count[i]) / float(gt_pixel_count[i])
                    mIOU += float(intersection_pixel_count[i]) / float(union_pixel_count[i])
                mPA /= float(n_classes)
                mIOU /= float(n_classes)
                logging.info(
                    '| Iterations: {: >6d} '
                    '| Epoch: {: >3d}/{: >3d} '
                    '| Average mPA: {:.4f} '
                    '| Average mIOU: {:.4f} '
                    '| Validation loss: {:.6f} '.format(
                        iterations, epoch+1, args.epochs,
                        mPA, mIOU, val_losses.avg
                    )
                )
                model.train()
            # Periodic checkpoint (skipped at iteration 0).
            if iterations % args.checkpoint_interval == 0 and iterations > 0:
                model_weights_path = '{}/iterations-{:0>6d}-epoch-{:0>3d}.pth'.format(logging_dir, iterations, epoch+1)
                torch.save(model.state_dict(), model_weights_path)
                logging.info('| Checkpoint | {} is saved!'.format(model_weights_path))
            iterations += 1
def run(proc_id, n_gpus, args, devices, dataset):
    """Per-process training entry point for the GCMC rating model.

    Builds DGL edge dataloaders over the training graph (plus valid/test
    loaders on rank 0 only), initializes (optionally distributed) training,
    then runs the epoch loop with RMSE tracking, validation-driven LR decay
    and early stopping.

    Args:
        proc_id: rank of this process (0 does all evaluation/printing).
        n_gpus: number of GPUs; >1 enables DistributedDataParallel.
        args: experiment configuration namespace.
        devices: list mapping process rank -> device id.
        dataset: prepared dataset object holding the graphs and labels.
    """
    dev_id = devices[proc_id]
    train_labels = dataset.train_labels
    train_truths = dataset.train_truths
    num_edges = train_truths.shape[0]
    # Map each rating etype to its reverse etype and vice versa.
    reverse_types = {
        to_etype_name(k): 'rev-' + to_etype_name(k)
        for k in dataset.possible_rating_values
    }
    reverse_types.update({v: k for k, v in reverse_types.items()})
    # Single-layer full-neighborhood sampler ([None] = no fanout limit).
    sampler = dgl.dataloading.MultiLayerNeighborSampler([None], return_eids=True)
    dataloader = dgl.dataloading.EdgeDataLoader(dataset.train_enc_graph, {
        to_etype_name(k): th.arange(
            dataset.train_enc_graph.number_of_edges(etype=to_etype_name(k)))
        for k in dataset.possible_rating_values
    },
        sampler,
        batch_size=args.minibatch_size,
        shuffle=True,
        drop_last=False)
    # Only rank 0 evaluates, so only it needs the valid/test loaders.
    if proc_id == 0:
        valid_dataloader = dgl.dataloading.EdgeDataLoader(
            dataset.valid_dec_graph,
            th.arange(dataset.valid_dec_graph.number_of_edges()),
            sampler,
            g_sampling=dataset.valid_enc_graph,
            batch_size=args.minibatch_size,
            shuffle=False,
            drop_last=False)
        test_dataloader = dgl.dataloading.EdgeDataLoader(
            dataset.test_dec_graph,
            th.arange(dataset.test_dec_graph.number_of_edges()),
            sampler,
            g_sampling=dataset.test_enc_graph,
            batch_size=args.minibatch_size,
            shuffle=False,
            drop_last=False)
    # Multi-GPU: NCCL process group over localhost.
    if n_gpus > 1:
        dist_init_method = 'tcp://{master_ip}:{master_port}'.format(
            master_ip='127.0.0.1', master_port='12345')
        world_size = n_gpus
        th.distributed.init_process_group(backend="nccl",
                                          init_method=dist_init_method,
                                          world_size=world_size,
                                          rank=dev_id)
    if n_gpus > 0:
        th.cuda.set_device(dev_id)
    nd_possible_rating_values = \
        th.FloatTensor(dataset.possible_rating_values)
    nd_possible_rating_values = nd_possible_rating_values.to(dev_id)
    net = Net(args=args, dev_id=dev_id)
    net = net.to(dev_id)
    if n_gpus > 1:
        net = DistributedDataParallel(net,
                                      device_ids=[dev_id],
                                      output_device=dev_id)
    # Ratings are treated as classes; expected rating is recovered via
    # softmax below.
    rating_loss_net = nn.CrossEntropyLoss()
    learning_rate = args.train_lr
    optimizer = get_optimizer(args.train_optimizer)(net.parameters(), lr=learning_rate)
    print("Loading network finished ...\n")

    ### declare the loss information
    best_valid_rmse = np.inf
    no_better_valid = 0
    best_epoch = -1
    count_rmse = 0
    count_num = 0
    count_loss = 0
    print("Start training ...")
    dur = []
    iter_idx = 1
    for epoch in range(1, args.train_max_epoch):
        if epoch > 1:
            t0 = time.time()
        net.train()
        with tqdm.tqdm(dataloader) as tq:
            for step, (input_nodes, pair_graph, blocks) in enumerate(tq):
                head_feat, tail_feat, blocks = load_subtensor(
                    input_nodes, pair_graph, blocks, dataset,
                    dataset.train_enc_graph)
                frontier = blocks[0]
                # Flatten the heterograph minibatch to a single-etype graph.
                compact_g = flatten_etypes(pair_graph, dataset, 'train').to(dev_id)
                true_relation_labels = compact_g.edata['label']
                true_relation_ratings = compact_g.edata['rating']
                head_feat = head_feat.to(dev_id)
                tail_feat = tail_feat.to(dev_id)
                frontier = frontier.to(dev_id)
                pred_ratings = net(compact_g, frontier, head_feat, tail_feat,
                                   dataset.possible_rating_values)
                loss = rating_loss_net(pred_ratings, true_relation_labels.to(dev_id)).mean()
                count_loss += loss.item()
                optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_norm_(net.parameters(), args.train_grad_clip)
                optimizer.step()
                if proc_id == 0 and iter_idx == 1:
                    print("Total #Param of net: %d" % (torch_total_param_num(net)))
                # Expected rating = softmax-weighted sum of possible values.
                real_pred_ratings = (
                    th.softmax(pred_ratings, dim=1) *
                    nd_possible_rating_values.view(1, -1)).sum(dim=1)
                rmse = ((real_pred_ratings - true_relation_ratings.to(dev_id))**2).sum()
                count_rmse += rmse.item()
                count_num += pred_ratings.shape[0]
                tq.set_postfix(
                    {
                        'loss': '{:.4f}'.format(count_loss / iter_idx),
                        'rmse': '{:.4f}'.format(count_rmse / count_num)
                    },
                    refresh=False)
                iter_idx += 1
        if epoch > 1:
            epoch_time = time.time() - t0
            print("Epoch {} time {}".format(epoch, epoch_time))
        if epoch % args.train_valid_interval == 0:
            if n_gpus > 1:
                th.distributed.barrier()
            if proc_id == 0:
                valid_rmse = evaluate(args=args,
                                      dev_id=dev_id,
                                      net=net,
                                      dataset=dataset,
                                      dataloader=valid_dataloader,
                                      segment='valid')
                # NOTE(review): logging_str is appended to but never
                # initialized in this function — likely a dropped
                # `logging_str = ...` line; would raise NameError as-is.
                logging_str += ',\tVal RMSE={:.4f}'.format(valid_rmse)
                if valid_rmse < best_valid_rmse:
                    # New best validation score: evaluate on test too.
                    best_valid_rmse = valid_rmse
                    no_better_valid = 0
                    best_epoch = epoch
                    test_rmse = evaluate(args=args,
                                         dev_id=dev_id,
                                         net=net,
                                         dataset=dataset,
                                         dataloader=test_dataloader,
                                         segment='test')
                    best_test_rmse = test_rmse
                    logging_str += ', Test RMSE={:.4f}'.format(test_rmse)
                else:
                    no_better_valid += 1
                    # Early stop only once LR has already decayed to the floor.
                    if no_better_valid > args.train_early_stopping_patience\
                            and learning_rate <= args.train_min_lr:
                        logging.info(
                            "Early stopping threshold reached. Stop training.")
                        break
                    if no_better_valid > args.train_decay_patience:
                        new_lr = max(
                            learning_rate * args.train_lr_decay_factor,
                            args.train_min_lr)
                        if new_lr < learning_rate:
                            logging.info("\tChange the LR to %g" % new_lr)
                            learning_rate = new_lr
                            for p in optimizer.param_groups:
                                p['lr'] = learning_rate
                            no_better_valid = 0
                            print("Change the LR to %g" % new_lr)
        # sync on evalution
        if n_gpus > 1:
            th.distributed.barrier()
        print(logging_str)
    if proc_id == 0:
        # NOTE(review): best_test_rmse is unbound if validation never
        # improved — verify at least one improvement is guaranteed.
        print(
            'Best epoch Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}'.
            format(best_epoch, best_valid_rmse, best_test_rmse))
# Log all train files shutil.copyfile('experiment.ini', experiment_dir + 'experiment.ini') shutil.copyfile('train.py', experiment_dir + 'train.py') shutil.copyfile('flow_utils.py', experiment_dir + 'flow_utils.py') # Log serialized prior if prior_type != 'normal': shutil.copyfile('centers.mat', experiment_dir + 'centers.mat') utils.log(means, experiment_dir + 'means.pkl') utils.log(covariances, experiment_dir + 'covariances.pkl') utils.log(weights, experiment_dir + 'weights.pkl') # Create optimizer scheduler = utils.get_scheduler(lr, lr_schedule) opt_init, opt_update, get_params = utils.get_optimizer( optimizer, scheduler, b1, b2) opt_state = opt_init(params) update_fn = private_update if private else update best_test_params, best_test_loss = None, None pbar = tqdm(pbar_range) for iteration in pbar: batch, X = utils.get_batch(sampling, key, X, minibatch_size, iteration) # Possible with Poisson subsampling if batch.shape[0] == 0: continue # Perform model update temp_key, key = random.split(key)
def main():
    """Parse args and run the full train/validate/checkpoint experiment.

    Optionally resumes from a checkpoint (restoring architecture-related
    args), supports evaluate-only modes ('train'/'val'/'test'), and
    otherwise trains for ``args.epochs`` epochs with per-epoch validation,
    TSV score logging, optional tensorboard logging, best-checkpoint
    tracking and patience-based early stopping.
    """
    # parse arg and start experiment
    global args
    best_err1 = 100.          # best top-1 error seen so far (lower is better)
    best_epoch = 0
    args = arg_parser.parse_args()
    args.config_of_data = config.datasets[args.data]
    args.num_classes = config.datasets[args.data]['num_classes']
    # tensorboard_logger is optional; disable logging if it failed to import.
    if configure is None:
        args.tensorboard = False
        print(Fore.RED +
              'WARNING: you don\'t have tesnorboard_logger installed' +
              Fore.RESET)

    # optionally resume from a checkpoint
    if args.resume:
        if args.resume and os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            old_args = checkpoint['args']
            print('Old args:')
            print(old_args)
            # set args based on checkpoint
            if args.start_epoch <= 0:
                args.start_epoch = checkpoint['epoch'] + 1
            best_epoch = args.start_epoch - 1
            best_err1 = checkpoint['best_err1']
            # Architecture-defining args must match the checkpoint.
            for name in arch_resume_names:
                if name in vars(args) and name in vars(old_args):
                    setattr(args, name, getattr(old_args, name))
            model = getModel(**vars(args))
            model.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(
                Fore.RED + args.resume + Fore.RESET), file=sys.stderr)
            return
    else:
        # create model
        print("=> creating model '{}'".format(args.arch))
        model = getModel(**vars(args))
    cudnn.benchmark = True

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()

    # define optimizer
    optimizer = get_optimizer(model, args)

    # Trainer class is loaded dynamically from the module named by args.trainer.
    Trainer = import_module(args.trainer).Trainer
    trainer = Trainer(model, criterion, optimizer, args)

    # create dataloader: evaluate-only modes test one split and return.
    if args.evaluate == 'train':
        train_loader, _, _ = getDataloaders(splits=('train'), **vars(args))
        trainer.test(train_loader, best_epoch)
        return
    elif args.evaluate == 'val':
        _, val_loader, _ = getDataloaders(splits=('val'), **vars(args))
        trainer.test(val_loader, best_epoch)
        return
    elif args.evaluate == 'test':
        _, _, test_loader = getDataloaders(splits=('test'), **vars(args))
        trainer.test(test_loader, best_epoch)
        return
    else:
        train_loader, val_loader, _ = getDataloaders(
            splits=('train', 'val'), **vars(args))

    # check if the folder exists
    create_save_folder(args.save, args.force)

    # set up logging: log_print echoes to stdout and to the log file.
    global log_print, f_log
    f_log = open(os.path.join(args.save, 'log.txt'), 'w')

    # NOTE(review): the *args here shadows the module-level args namespace
    # inside this helper — harmless but worth confirming intended.
    def log_print(*args):
        print(*args)
        print(*args, file=f_log)
    log_print('args:')
    log_print(args)
    print('model:', file=f_log)
    print(model, file=f_log)
    log_print('# of params:',
              str(sum([p.numel() for p in model.parameters()])))
    f_log.flush()
    torch.save(args, os.path.join(args.save, 'args.pth'))
    # NOTE(review): header's last column reads 'val_err' while the rows
    # write val_err5 — likely meant '\tval_err5'; runtime string left as-is.
    scores = [
        'epoch\tlr\ttrain_loss\tval_loss\ttrain_err1'
        '\tval_err1\ttrain_err5\tval_err'
    ]
    if args.tensorboard:
        configure(args.save, flush_secs=5)

    for epoch in range(args.start_epoch, args.epochs + 1):
        # train for one epoch
        train_loss, train_err1, train_err5, lr = trainer.train(
            train_loader, epoch)
        if args.tensorboard:
            log_value('lr', lr, epoch)
            log_value('train_loss', train_loss, epoch)
            log_value('train_err1', train_err1, epoch)
            log_value('train_err5', train_err5, epoch)

        # evaluate on validation set
        val_loss, val_err1, val_err5 = trainer.test(val_loader, epoch)
        if args.tensorboard:
            log_value('val_loss', val_loss, epoch)
            log_value('val_err1', val_err1, epoch)
            log_value('val_err5', val_err5, epoch)

        # save scores to a tsv file, rewrite the whole file to prevent
        # accidental deletion
        scores.append(('{}\t{}' + '\t{:.4f}' * 6)
                      .format(epoch, lr, train_loss, val_loss,
                              train_err1, val_err1, train_err5, val_err5))
        with open(os.path.join(args.save, 'scores.tsv'), 'w') as f:
            print('\n'.join(scores), file=f)

        # remember best err@1 and save checkpoint
        is_best = val_err1 < best_err1
        if is_best:
            best_err1 = val_err1
            best_epoch = epoch
            print(Fore.GREEN + 'Best var_err1 {}'.format(best_err1) +
                  Fore.RESET)
            # test_loss, test_err1, test_err1 = validate(
            #     test_loader, model, criterion, epoch, True)
            # save test
        save_checkpoint({
            'args': args,
            'epoch': epoch,
            'best_epoch': best_epoch,
            'arch': args.arch,
            'state_dict': model.state_dict(),
            'best_err1': best_err1,
        }, is_best, args.save)
        # Patience-based early stop (disabled when args.patience <= 0).
        if not is_best and epoch - best_epoch >= args.patience > 0:
            break
    print('Best val_err1: {:.4f} at epoch {}'.format(best_err1, best_epoch))
config = utils.load_config(args.config) global_params = config["globals"] utils.set_seed(global_params["seed"]) device = utils.get_device(global_params) output_dir = global_params["output_dir"] data_conf = config["data"] if args.generate: for c in data_conf.values(): utils.generate_data(c) model = models.get_model(config).to() criterion = utils.get_criterion(config) optimizer = utils.get_optimizer(model, config) scheduler = utils.get_scheduler(optimizer, config) loaders = { phase: utils.get_loader(config, phase) for phase in ["train", "valid3", "valid", "valid12"] } runner = SupervisedRunner(device=device, input_key=["objects", "externals", "triplet"], input_target_key="targets") runner.train(model=model, criterion=criterion, optimizer=optimizer, loaders=loaders, scheduler=scheduler, num_epochs=global_params["num_epochs"],
def get_optimizer(self):
    """Build the optimizer configured for the current network type.

    Looks up the optimizer name in ``self.config[self.network_type]`` and
    instantiates it via ``U.get_optimizer`` with the hyper-parameters held
    on this object.

    Returns:
        The constructed optimizer instance.

    Raises:
        ValueError: if the configured optimizer name is not one of the
            supported names. Previously an unknown name fell through and
            silently returned ``None``, deferring the failure to the
            first use of the optimizer.
    """
    optimizer_name = self.config[self.network_type]["optimizer"]
    if optimizer_name == "DistillOptimizer":
        # Distillation optimizer additionally takes the gradient-clipping norm.
        return U.get_optimizer(optimizer_name)(self.learning_rate,
                                               self.momentum,
                                               self.max_norm)
    elif optimizer_name == "Momentum":
        return U.get_optimizer(optimizer_name)(self.learning_rate,
                                               self.momentum)
    raise ValueError(
        "Unsupported optimizer '{}' for network type '{}'".format(
            optimizer_name, self.network_type))
def main():
    """Entry point for "Deep Thinking" training.

    Parses CLI arguments, builds (or restores) the network and optimizer,
    trains for ``--epochs`` epochs with periodic validation and checkpointing,
    then runs a final train/test evaluation and optionally dumps the stats to
    JSON. Exits the process if the training loss becomes NaN.
    """
    print("\n_________________________________________________\n")
    print(now(), "train_model.py main() running.")

    parser = argparse.ArgumentParser(description="Deep Thinking")
    parser.add_argument("--checkpoint", default="check_default", type=str,
                        help="where to save the network")
    parser.add_argument("--dataset", default="CIFAR10", type=str, help="dataset")
    parser.add_argument("--depth", default=1, type=int, help="depth of the network")
    parser.add_argument("--epochs", default=200, type=int,
                        help="number of epochs for training")
    parser.add_argument("--lr", default=0.1, type=float, help="learning rate")
    parser.add_argument("--lr_factor", default=0.1, type=float,
                        help="learning rate decay factor")
    parser.add_argument("--lr_schedule", nargs="+", default=[100, 150], type=int,
                        help="how often to decrease lr")
    parser.add_argument("--mode", default="default", type=str,
                        help="which testing mode?")
    parser.add_argument("--model", default="resnet18", type=str,
                        help="model for training")
    parser.add_argument("--model_path", default=None, type=str,
                        help="where is the model saved?")
    parser.add_argument("--no_save_log", action="store_true",
                        help="do not save log file")
    parser.add_argument("--optimizer", default="SGD", type=str, help="optimizer")
    parser.add_argument("--output", default="output_default", type=str,
                        help="output subdirectory")
    parser.add_argument("--problem", default="classification", type=str,
                        help="problem type (classification or segmentation)")
    parser.add_argument("--save_json", action="store_true", help="save json")
    parser.add_argument("--save_period", default=None, type=int,
                        help="how often to save")
    parser.add_argument("--test_batch_size", default=50, type=int,
                        help="batch size for testing")
    parser.add_argument("--test_dataset", type=str, default=None,
                        help="name of the testing dataset")
    parser.add_argument("--test_iterations", default=None, type=int,
                        help="how many, if testing with a different "
                             "number iterations than training")
    parser.add_argument("--train_batch_size", default=128, type=int,
                        help="batch size for training")
    parser.add_argument("--train_log", default="train_log.txt", type=str,
                        help="name of the log file")
    parser.add_argument("--val_period", default=20, type=int,
                        help="how often to validate")
    parser.add_argument("--width", default=4, type=int, help="width of the network")
    args = parser.parse_args()

    # Default: only checkpoint once, at the very end.
    if args.save_period is None:
        args.save_period = args.epochs
    print(args)

    # summary writer; the array task id is parsed from log names like
    # "train_log_<id>.txt" (SLURM-array style).
    train_log = args.train_log
    try:
        array_task_id = train_log[:-4].split("_")[-1]
    # FIX: was a bare `except:` that swallowed every exception (including
    # KeyboardInterrupt). Narrowed to the failures this parse can plausibly
    # produce (e.g. train_log is None).
    except (AttributeError, TypeError, IndexError):
        array_task_id = 1
    writer = SummaryWriter(log_dir=f"{args.output}/runs/{train_log[:-4]}")
    if not args.no_save_log:
        to_log_file(args, args.output, train_log)

    # set device
    device = "cuda" if torch.cuda.is_available() else "cpu"

    ####################################################
    #               Dataset and Network and Optimizer
    trainloader, testloader = get_dataloaders(
        args.dataset, args.train_batch_size,
        test_batch_size=args.test_batch_size)

    # load model from path if a path is provided
    if args.model_path is not None:
        print(f"Loading model from checkpoint {args.model_path}...")
        net, start_epoch, optimizer_state_dict = load_model_from_checkpoint(
            args.model, args.model_path, args.dataset, args.width, args.depth)
        start_epoch += 1
    else:
        net = get_model(args.model, args.dataset, args.width, args.depth)
        start_epoch = 0
        optimizer_state_dict = None

    net = net.to(device)
    pytorch_total_params = sum(p.numel() for p in net.parameters())
    optimizer = get_optimizer(args.optimizer, args.model, net, args.lr, args.dataset)
    print(net)
    print(f"This {args.model} has {pytorch_total_params/1e6:0.3f} million parameters.")
    print(f"Training will start at epoch {start_epoch}.")

    # When resuming, restore the optimizer and skip LR warmup (period 0);
    # fresh runs warm up over 5 epochs.
    if optimizer_state_dict is not None:
        print(f"Loading optimizer from checkpoint {args.model_path}...")
        optimizer.load_state_dict(optimizer_state_dict)
        warmup_scheduler = warmup.ExponentialWarmup(optimizer, warmup_period=0)
    else:
        warmup_scheduler = warmup.ExponentialWarmup(optimizer, warmup_period=5)

    lr_scheduler = MultiStepLR(optimizer, milestones=args.lr_schedule,
                               gamma=args.lr_factor, last_epoch=-1)
    optimizer_obj = OptimizerWithSched(optimizer, lr_scheduler, warmup_scheduler)
    np.set_printoptions(precision=2)
    torch.backends.cudnn.benchmark = True
    test_setup = TestingSetup(args.problem.lower(), args.mode.lower())
    ####################################################

    ####################################################
    #        Train
    print(f"==> Starting training for {args.epochs - start_epoch} epochs...")
    for epoch in range(start_epoch, args.epochs):
        loss, acc = train(net, trainloader, args.problem.lower(),
                          optimizer_obj, device)
        print(f"{now()} Training loss at epoch {epoch}: {loss}")
        print(f"{now()} Training accuracy at epoch {epoch}: {acc}")

        # if the loss is nan, then stop the training
        if np.isnan(float(loss)):
            print("Loss is nan, exiting...")
            sys.exit()

        # tensorboard loss writing
        writer.add_scalar("Loss/loss", loss, epoch)
        writer.add_scalar("Accuracy/acc", acc, epoch)
        for i in range(len(optimizer.param_groups)):
            writer.add_scalar(f"Learning_rate/group{i}",
                              optimizer.param_groups[i]["lr"], epoch)

        if (epoch + 1) % args.val_period == 0:
            train_acc = test(net, trainloader, test_setup, device)
            test_acc = test(net, testloader, test_setup, device)
            print(f"{now()} Training accuracy: {train_acc}")
            print(f"{now()} Testing accuracy: {test_acc}")
            stats = [train_acc, test_acc]
            stat_names = ["train_acc", "test_acc"]
            for stat_idx, stat in enumerate(stats):
                stat_name = os.path.join("val", stat_names[stat_idx])
                writer.add_scalar(stat_name, stat, epoch)

        if (epoch + 1) % args.save_period == 0 or (epoch + 1) == args.epochs:
            state = {
                "net": net.state_dict(),
                "epoch": epoch,
                "optimizer": optimizer.state_dict()
            }
            out_str = os.path.join(
                args.checkpoint,
                f"{args.model}_{args.dataset}_{args.optimizer}"
                f"_depth={args.depth}"
                f"_width={args.width}"
                f"_lr={args.lr}"
                f"_batchsize={args.train_batch_size}"
                f"_epoch={args.epochs-1}"
                f"_{array_task_id}.pth")
            print("saving model to: ", args.checkpoint, " out_str: ", out_str)
            if not os.path.isdir(args.checkpoint):
                os.makedirs(args.checkpoint)
            torch.save(state, out_str)

    writer.flush()
    writer.close()
    ####################################################

    ####################################################
    #        Test
    print("==> Starting testing...")
    if args.test_iterations is not None:
        # Only iterative ("deep thinking") models expose an integer `iters`.
        assert isinstance(net.iters, int), \
            "Cannot test feed-forward model with iterations."
        net.iters = args.test_iterations
    train_acc = test(net, trainloader, test_setup, device)
    test_acc = test(net, testloader, test_setup, device)
    print(f"{now()} Training accuracy: {train_acc}")
    print(f"{now()} Testing accuracy: {test_acc}")

    model_name_str = f"{args.model}_depth={args.depth}_width={args.width}"
    stats = OrderedDict([("model", model_name_str),
                         ("num_params", pytorch_total_params),
                         ("learning rate", args.lr),
                         ("lr_factor", args.lr_factor),
                         ("lr", args.lr),
                         ("epochs", args.epochs),
                         ("train_batch_size", args.train_batch_size),
                         ("optimizer", args.optimizer),
                         ("dataset", args.dataset),
                         ("train_acc", train_acc),
                         ("test_acc", test_acc),
                         ("test_iter", args.test_iterations)])
    if args.save_json:
        to_json(stats, args.output)
def main():
    """Entry point: train/evaluate a detection model tracked by average precision.

    Optionally resumes from a checkpoint, builds data loaders, then either runs
    a one-off evaluation (``--evaluate val|test``) or trains in blocks of
    ``eval_freq`` iterations, validating, logging a TSV of scores, and
    checkpointing the best model by val AP. Stops early after ``patience``
    iterations without improvement.
    """
    # parse arg and start experiment
    global args
    best_ap = -1.
    best_iter = 0

    args = parser.parse_args()
    args.config_of_data = config.datasets[args.data]

    if configure is None:
        args.tensorboard = False
        print(Fore.RED +
              'WARNING: you don\'t have tesnorboard_logger installed' +
              Fore.RESET)

    # optionally resume from a checkpoint
    if args.resume:
        if args.resume and os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            old_args = checkpoint['args']
            print('Old args:')
            print(old_args)
            # set args based on checkpoint
            if args.start_iter <= 0:
                args.start_iter = checkpoint['iter'] + 1
                best_iter = args.start_iter - 1
                best_ap = checkpoint['best_ap']
            # carry architecture-defining options over from the old run so the
            # restored weights match the rebuilt model
            for name in arch_resume_names:
                if name in vars(args) and name in vars(old_args):
                    setattr(args, name, getattr(old_args, name))
            model = get_model(**vars(args))
            model.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (iter {})"
                  .format(args.resume, checkpoint['iter']))
        else:
            print("=> no checkpoint found at '{}'".format(
                Fore.RED + args.resume + Fore.RESET), file=sys.stderr)
            return
    else:
        # create model
        print("=> creating model '{}'".format(args.arch))
        model = get_model(**vars(args))

    cudnn.enabled = False

    # create dataloader
    # NOTE(review): splits=('val') is the *string* 'val', not a 1-tuple —
    # getDataloaders may accept either; confirm before "fixing".
    if args.evaluate == 'val':
        train_loader, val_loader, test_loader = getDataloaders(
            splits=('val'), **vars(args))
        validate(val_loader, model, best_iter)
        return
    elif args.evaluate == 'test':
        train_loader, val_loader, test_loader = getDataloaders(
            splits=('test'), **vars(args))
        validate(test_loader, model, best_iter)
        return
    else:
        train_loader, val_loader, test_loader = getDataloaders(
            splits=('train', 'val'), **vars(args))

    # define optimizer
    optimizer = get_optimizer(model, args)

    # check if the folder exists
    if os.path.exists(args.save):
        print(Fore.RED + args.save + Fore.RESET + ' already exists!',
              file=sys.stderr)
        if not args.force:
            ans = input('Do you want to overwrite it? [y/N]:')
            if ans not in ('y', 'Y', 'yes', 'Yes'):
                # FIX: was `os.exit(1)` — the os module has no `exit`, which
                # would raise AttributeError instead of exiting.
                sys.exit(1)
        print('remove existing ' + args.save)
        shutil.rmtree(args.save)
    os.makedirs(args.save)
    print('create folder: ' + Fore.GREEN + args.save + Fore.RESET)

    # copy code to save folder (skipped for debug runs)
    if args.save.find('debug') < 0:
        shutil.copytree(
            '.', os.path.join(args.save, 'src'), symlinks=True,
            ignore=shutil.ignore_patterns(
                '*.pyc', '__pycache__', '*.path.tar', '*.pth', '*.ipynb',
                '.*', 'data', 'save', 'save_backup'))

    # set up logging: everything printed via log_print goes to stdout + log.txt
    global log_print, f_log
    f_log = open(os.path.join(args.save, 'log.txt'), 'w')

    def log_print(*args):
        print(*args)
        print(*args, file=f_log)
    log_print('args:')
    log_print(args)
    print('model:', file=f_log)
    print(model, file=f_log, flush=True)
    log_print('# of params:',
              str(sum([p.numel() for p in model.parameters()])))
    torch.save(args, os.path.join(args.save, 'args.pth'))
    scores = ['iter\tlr\ttrain_loss\tval_ap']

    if args.tensorboard:
        configure(args.save, flush_secs=5)

    for i in range(args.start_iter, args.niters + 1, args.eval_freq):
        # train for args.eval_freq iterations
        train_loss = train(train_loader, model, optimizer, i, args.eval_freq)
        i += args.eval_freq - 1

        # evaluate on validation set
        val_ap = validate(val_loader, model, i)

        # FIX: `lr` was referenced below but never assigned (NameError on the
        # first iteration); read the current value from the optimizer.
        lr = optimizer.param_groups[0]['lr']

        # save scores to a tsv file, rewrite the whole file to prevent
        # accidental deletion
        scores.append(('{}\t{}' + '\t{:.4f}' * 2)
                      .format(i, lr, train_loss, val_ap))
        with open(os.path.join(args.save, 'scores.tsv'), 'w') as f:
            print('\n'.join(scores), file=f)

        # remember best AP and save checkpoint
        is_best = val_ap > best_ap
        if is_best:
            best_ap = val_ap
            best_iter = i
            # FIX: message said 'Best var_err1' but this metric is val AP.
            print(Fore.GREEN + 'Best val_ap {}'.format(best_ap) + Fore.RESET)
        save_checkpoint({
            'args': args,
            'iter': i,
            'best_iter': best_iter,
            'arch': args.arch,
            'state_dict': model.state_dict(),
            'best_ap': best_ap,
        }, is_best, args.save)

        # early stopping once no improvement for `patience` iterations
        if not is_best and i - best_iter >= args.patience > 0:
            break
    print('Best val_ap: {:.4f} at iter {}'.format(best_ap, best_iter))
# --- Training-script fragment (Keras + comet-style logger). ---
# NOTE(review): the first lines are the tail of a constructor call (likely a
# comet Experiment) whose opening is outside this chunk; kept as-is.
    api_key=api_key,
    project_name="sim_real",
    auto_metric_logging=True,
    auto_param_logging=True,
)

# Optional mixed-precision training (fp16 compute, fp32 master weights).
if args.mixed_precision:
    print("Applied: Mixed Precision")
    tf.keras.mixed_precision.set_global_policy("mixed_float16")

train_ds, test_ds = get_dataset(args)
# Log a preview grid built from the first training batch.
grid = image_grid(next(iter(train_ds))[0])[0]
logger.log_image(grid.numpy())

model = get_model(args)
criterion = get_criterion(args)
optimizer = get_optimizer(args)
lr_scheduler = get_lr_scheduler(args)
# Early stopping on best validation accuracy, restoring the best weights.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy',
                                              mode='max',
                                              patience=args.patience,
                                              restore_best_weights=True)
experiment_name = get_experiment_name(args)
logger.set_name(experiment_name)
logger.log_parameters(vars(args))

# NOTE(review): the extent of this `with` block is inferred from the collapsed
# source — everything below is assumed to run inside logger.train(); confirm.
with logger.train():
    filename = f'{args.model_name}.hdf5'
    # Keep only the best model by validation accuracy.
    checkpoint = tf.keras.callbacks.ModelCheckpoint(filename,
                                                    monitor='val_accuracy',
                                                    mode='max',
                                                    save_best_only=True,
                                                    verbose=True)
    model.compile(loss=criterion, optimizer=optimizer, metrics=['accuracy'])
    if args.dry_run:
        # Smoke-test: 2 steps per epoch, no callbacks.
        print("[INFO] Turn off all callbacks")
        model.fit(train_ds, validation_data=test_ds, epochs=args.epochs,
                  steps_per_epoch=2)
    else:
        model.fit(train_ds, validation_data=test_ds, epochs=args.epochs,
                  callbacks=[lr_scheduler, early_stop, checkpoint])
def __init__(self, field_sizes=None, embed_size=10, filter_sizes=None, layer_acts=None, drop_out=None,
             init_path=None, opt_algo='gd', learning_rate=1e-2, random_seed=None):
    """Build a CCPM-style CTR model: per-field embeddings -> two conv+pool
    stages over the field axis -> dropout/activation -> linear logit.

    Args:
        field_sizes: vocabulary size per categorical field (one embedding
            matrix per field).
        embed_size: embedding dimension per field.
        filter_sizes: widths of the two convolution filters (f1, f2).
        layer_acts: activation names, only layer_acts[0] is used here.
        drop_out: per-layer dropout rates (converted to keep probabilities).
        init_path: optional path with pre-trained variable values.
        opt_algo, learning_rate: optimizer selection for utils.get_optimizer.
        random_seed: optional TF graph-level seed.
    """
    Model.__init__(self)
    init_vars = []
    num_inputs = len(field_sizes)
    # One [vocab, embed_size] embedding matrix per field.
    for i in range(num_inputs):
        init_vars.append(('embed_%d' % i, [field_sizes[i], embed_size], 'xavier', dtype))
    # Conv filters: f1 maps 1->2 channels, f2 maps 2->2 channels.
    init_vars.append(('f1', [embed_size, filter_sizes[0], 1, 2], 'xavier', dtype))
    init_vars.append(('f2', [embed_size, filter_sizes[1], 2, 2], 'xavier', dtype))
    # Final linear layer; input size 2*3*embed_size matches the reshape below
    # (2 channels x 3 pooled positions x embed_size).
    init_vars.append(('w1', [2 * 3 * embed_size, 1], 'xavier', dtype))
    init_vars.append(('b1', [1], 'zero', dtype))
    print('init_vars: ', init_vars)
    self.graph = tf.Graph()
    with self.graph.as_default():
        if random_seed is not None:
            tf.set_random_seed(random_seed)
        # One sparse one-hot input per field; y holds the binary labels.
        self.X = [tf.sparse_placeholder(dtype) for i in range(num_inputs)]
        self.y = tf.placeholder(dtype)
        # Dropout keep-probabilities: train uses 1-drop_out, test keeps all.
        self.keep_prob_train = 1 - np.array(drop_out)
        self.keep_prob_test = np.ones_like(drop_out)
        self.layer_keeps = tf.placeholder(dtype)
        self.vars = utils.init_var_map(init_vars, init_path)
        w0 = [self.vars['embed_%d' % i] for i in range(num_inputs)]
        # Sparse lookup per field, concatenated -> [batch, num_inputs*embed_size].
        xw = tf.concat([tf.sparse_tensor_dense_matmul(self.X[i], w0[i])
                        for i in range(num_inputs)], 1)
        l = xw
        # Reshape to an embed_size x num_inputs "image" with 1 channel.
        l = tf.transpose(tf.reshape(l, [-1, num_inputs, embed_size, 1]), [0, 2, 1, 3])
        f1 = self.vars['f1']
        l = tf.nn.conv2d(l, f1, [1, 1, 1, 1], 'SAME')
        # Max-pool over the field axis down to num_inputs/2 positions
        # (transposes move that axis into pooling position and back).
        l = tf.transpose(
            utils.max_pool_4d(
                tf.transpose(l, [0, 1, 3, 2]), int(num_inputs / 2)),
            [0, 1, 3, 2])
        f2 = self.vars['f2']
        l = tf.nn.conv2d(l, f2, [1, 1, 1, 1], 'SAME')
        # Second pooling stage down to 3 positions.
        l = tf.transpose(
            utils.max_pool_4d(
                tf.transpose(l, [0, 1, 3, 2]), 3),
            [0, 1, 3, 2])
        # Flatten to [batch, embed_size*3*2], activate, apply dropout.
        l = tf.nn.dropout(
            utils.activate(
                tf.reshape(l, [-1, embed_size * 3 * 2]), layer_acts[0]),
            self.layer_keeps[0])
        w1 = self.vars['w1']
        b1 = self.vars['b1']
        # Final logit (squeezed to shape [batch]).
        l = tf.matmul(l, w1) + b1
        l = tf.squeeze(l)
        self.y_prob = tf.sigmoid(l)
        # Log-loss on the logits.
        self.loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(logits=l, labels=self.y))
        self.optimizer = utils.get_optimizer(opt_algo, learning_rate, self.loss)
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        tf.global_variables_initializer().run(session=self.sess)
# --- Fragment: build, compile and persist a Keras MLP that combines query and
# document vectors. Starts mid-function: q_vector/d_vector/query/doc and the
# config dicts are defined outside this chunk.
input_vector = concatenate([q_vector, d_vector])
print("Concatenated vector: {iv}".format(iv=input_vector))
# First combine layer on the concatenated [query; doc] representation.
dense = Dense(config_model_param["layers_size"][0],
              activation=config_model_param['hidden_activation'],
              name="MLP_combine_0")(input_vector)
i = 0
# Remaining hidden layers (num_layers includes the first and output layers,
# hence -2 here — presumably; confirm against the config convention).
for i in range(config_model_param["num_layers"]-2):
    # dense = Dropout(0.25)(dense)
    dense = Dense(config_model_param["layers_size"][i+1],
                  activation=config_model_param['hidden_activation'],
                  name="MLP_combine_"+str(i+1))(dense)
    # dense = Dropout(0.5)(dense)
out_size = get_input_label_size(config_data)
out_labels = Dense(out_size,
                   activation=config_model_param['output_activation'],
                   name="MLP_out")(dense)
# model: full classifier; model2: exposes the concatenated features only.
model = Model(inputs=[query, doc], outputs=out_labels)
model2 = Model(inputs=[query, doc], outputs=input_vector)
# get_optimizer returns an optimizer *class*, instantiated with the lr here.
optimizer = get_optimizer(config_model_param["optimizer"])(lr=config_model_param["learning_rate"])
print(optimizer)
model.compile(optimizer=optimizer,
              loss=config_model_train["loss_function"],
              metrics=config_model_train["metrics"])
print(model.summary())
plot_model(model, to_file=join(config_model_train["train_details"],
                               config_model_param['model_name']+".png"))
# save model and resume
# serialize model to JSON
model_json = model.to_json()
with open(join(config_model_train["train_details"],
               config_model_param["model_name"] + ".json"), "w") as json_file:
    json_file.write(model_json)
print("Saved model to disk.")
print("Reading training data:")
print("[First]:\nRead label files to relations...")
relations, relation_labeler = read_lablers_to_relations(config_data["labels"])
def __init__(self, field_sizes=None, embed_size=10, layer_sizes=None, layer_acts=None, drop_out=None,
             embed_l2=None, layer_l2=None, init_path=None, opt_algo='gd', learning_rate=1e-2,
             random_seed=None, layer_norm=True):
    """Build a product-based NN (PNN2/IPNN-style) for CTR prediction: the MLP
    input is the concatenation of the field embeddings with the flattened
    outer product z·zᵀ of their sum, optionally layer-normalized.

    Args:
        field_sizes: vocabulary size per categorical field.
        embed_size: embedding dimension per field.
        layer_sizes / layer_acts / drop_out: per-layer widths, activation
            names and dropout rates for the MLP.
        embed_l2 / layer_l2: L2 penalty weights for the embeddings and each
            MLP weight matrix (applied only when layer_l2 is not None).
        init_path: optional path with pre-trained variable values.
        opt_algo, learning_rate: optimizer selection for utils.get_optimizer.
        random_seed: optional TF graph-level seed.
        layer_norm: whether to layer-normalize the product term.
    """
    Model.__init__(self)
    init_vars = []
    num_inputs = len(field_sizes)
    # One [vocab, embed_size] embedding matrix per field.
    for i in range(num_inputs):
        init_vars.append(('embed_%d' % i, [field_sizes[i], embed_size], 'xavier', dtype))
    # MLP input = concatenated embeddings (num_inputs*embed_size) plus the
    # flattened outer product (embed_size^2).
    node_in = num_inputs * embed_size + embed_size * embed_size
    for i in range(len(layer_sizes)):
        init_vars.append(('w%d' % i, [node_in, layer_sizes[i]], 'xavier', dtype))
        init_vars.append(('b%d' % i, [layer_sizes[i]], 'zero', dtype))
        node_in = layer_sizes[i]
    self.graph = tf.Graph()
    with self.graph.as_default():
        if random_seed is not None:
            tf.set_random_seed(random_seed)
        # One sparse one-hot input per field; y holds the binary labels.
        self.X = [tf.sparse_placeholder(dtype) for i in range(num_inputs)]
        self.y = tf.placeholder(dtype)
        # Dropout keep-probabilities: train uses 1-drop_out, test keeps all.
        self.keep_prob_train = 1 - np.array(drop_out)
        self.keep_prob_test = np.ones_like(drop_out)
        self.layer_keeps = tf.placeholder(dtype)
        self.vars = utils.init_var_map(init_vars, init_path)
        w0 = [self.vars['embed_%d' % i] for i in range(num_inputs)]
        # Sparse lookup per field, concatenated -> [batch, num_inputs*embed_size].
        xw = tf.concat([tf.sparse_tensor_dense_matmul(self.X[i], w0[i])
                        for i in range(num_inputs)], 1)
        # z: sum of field embeddings, shape [batch, embed_size].
        z = tf.reduce_sum(tf.reshape(xw, [-1, num_inputs, embed_size]), 1)
        # op: flattened outer product z zᵀ, shape [batch, embed_size^2].
        op = tf.reshape(
            tf.matmul(tf.reshape(z, [-1, embed_size, 1]),
                      tf.reshape(z, [-1, 1, embed_size])),
            [-1, embed_size * embed_size])
        if layer_norm:
            # (disabled experiment: layer norm on xw as well)
            # x_mean, x_var = tf.nn.moments(xw, [1], keep_dims=True)
            # xw = (xw - x_mean) / tf.sqrt(x_var)
            # x_g = tf.Variable(tf.ones([num_inputs * embed_size]), name='x_g')
            # x_b = tf.Variable(tf.zeros([num_inputs * embed_size]), name='x_b')
            # x_g = tf.Print(x_g, [x_g[:10], x_b])
            # xw = xw * x_g + x_b
            # Layer norm on the product term with learned gain/bias p_g, p_b.
            p_mean, p_var = tf.nn.moments(op, [1], keep_dims=True)
            op = (op - p_mean) / tf.sqrt(p_var)
            p_g = tf.Variable(tf.ones([embed_size**2]), name='p_g')
            p_b = tf.Variable(tf.zeros([embed_size**2]), name='p_b')
            # p_g = tf.Print(p_g, [p_g[:10], p_b])
            op = op * p_g + p_b
        # MLP input: [embeddings, product term].
        l = tf.concat([xw, op], 1)
        for i in range(len(layer_sizes)):
            wi = self.vars['w%d' % i]
            bi = self.vars['b%d' % i]
            l = tf.nn.dropout(
                utils.activate(
                    tf.matmul(l, wi) + bi,
                    layer_acts[i]),
                self.layer_keeps[i])
        # Squeeze the final [batch, 1] logits to [batch].
        l = tf.squeeze(l)
        self.y_prob = tf.sigmoid(l)
        self.loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(logits=l, labels=self.y))
        # L2 regularization on embeddings and MLP weights (gated on layer_l2,
        # which also guards the embed_l2 term — as written in the original).
        if layer_l2 is not None:
            self.loss += embed_l2 * tf.nn.l2_loss(tf.concat(w0, 0))
            for i in range(len(layer_sizes)):
                wi = self.vars['w%d' % i]
                self.loss += layer_l2[i] * tf.nn.l2_loss(wi)
        self.optimizer = utils.get_optimizer(opt_algo, learning_rate, self.loss)
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        tf.global_variables_initializer().run(session=self.sess)
def vae_estimator(hparams):
    """Build a VAE-prior compressed-sensing estimator (TF1 graph + session).

    Constructs the measurement/loss graph once, restores the pretrained VAE
    generator, and returns an `estimator(A_val, y_batch_val, z_batch_val,
    hparams)` closure that optimizes the latent z by gradient descent with
    random restarts and returns the best reconstruction found.
    """
    # Get a session
    sess = tf.Session()

    # Set up placeholders
    A = tf.placeholder(tf.float32,
                       shape=(hparams.n_input, hparams.num_measurements),
                       name='A')
    y_batch = tf.placeholder(tf.float32,
                             shape=(hparams.batch_size, hparams.num_measurements),
                             name='y_batch')

    # Create the generator
    # TODO: Move z_batch definition here
    z_batch, x_hat_batch, restore_path, restore_dict = mnist_model_def.vae_gen(
        hparams)

    # measure the estimate
    if hparams.measurement_type == 'project':
        y_hat_batch = tf.identity(x_hat_batch, name='y_hat_batch')
    else:
        y_hat_batch = tf.abs(tf.matmul(x_hat_batch, A, name='y_hat_batch'))

    # define all losses: L1 measurement loss, L2 measurement loss, and the
    # z-prior (squared norm) regularizer, each per batch element
    m_loss1_batch = tf.reduce_mean(tf.abs(y_batch - y_hat_batch), 1)
    m_loss2_batch = tf.reduce_mean((y_batch - y_hat_batch)**2, 1)
    zp_loss_batch = tf.reduce_sum(z_batch**2, 1)

    # define total loss
    total_loss_batch = hparams.mloss1_weight * m_loss1_batch \
        + hparams.mloss2_weight * m_loss2_batch \
        + hparams.zprior_weight * zp_loss_batch
    total_loss = tf.reduce_mean(total_loss_batch)

    # Compute means for logging
    m_loss1 = tf.reduce_mean(m_loss1_batch)
    m_loss2 = tf.reduce_mean(m_loss2_batch)
    zp_loss = tf.reduce_mean(zp_loss_batch)

    # Set up gradient descent over z only (generator weights stay frozen)
    var_list = [z_batch]
    global_step = tf.Variable(0, trainable=False, name='global_step')
    learning_rate = utils.get_learning_rate(global_step, hparams)
    opt = utils.get_optimizer(learning_rate, hparams)
    update_op = opt.minimize(total_loss, var_list=var_list,
                             global_step=global_step, name='update_op')
    opt_reinit_op = utils.get_opt_reinit_op(opt, var_list, global_step)

    # Intialize and restore model parameters
    init_op = tf.global_variables_initializer()
    sess.run(init_op)
    restorer = tf.train.Saver(var_list=restore_dict)
    restorer.restore(sess, restore_path)

    def estimator(A_val, y_batch_val, z_batch_val, hparams):
        """Function that returns the estimated image"""
        best_keeper = utils.BestKeeper(hparams)
        assign_z_opt_op = z_batch.assign(z_batch_val)
        # 'project' measurements don't involve A, so it's left out of the feed
        if hparams.measurement_type == 'project':
            feed_dict = {y_batch: y_batch_val}
        else:
            feed_dict = {A: A_val, y_batch: y_batch_val}
        for i in range(hparams.num_random_restarts):
            # fresh optimizer state and latent init for each restart
            sess.run(opt_reinit_op)
            sess.run(assign_z_opt_op)
            for j in range(hparams.max_update_iter):
                _, lr_val, total_loss_val, \
                    m_loss1_val, m_loss2_val, zp_loss_val, z_batch_val = sess.run(
                        [update_op, learning_rate, total_loss,
                         m_loss1, m_loss2, zp_loss, z_batch],
                        feed_dict=feed_dict)
                logging_format = ('rr {} iter {} lr {} total_loss {} '
                                  'm_loss1 {} m_loss2 {} zp_loss {}')
                # FIX: was a Python 2 print *statement* (`print fmt.format(...)`),
                # a SyntaxError under Python 3, which the rest of this file uses.
                print(logging_format.format(i, j, lr_val, total_loss_val,
                                            m_loss1_val, m_loss2_val, zp_loss_val))
                if hparams.gif and ((j % hparams.gif_iter) == 0):
                    images = sess.run(x_hat_batch, feed_dict=feed_dict)
                    for im_num, image in enumerate(images):
                        save_dir = '{0}/{1}/'.format(hparams.gif_dir, im_num)
                        utils.set_up_dir(save_dir)
                        save_path = save_dir + '{0}.png'.format(j)
                        image = image.reshape(hparams.image_shape)
                        save_image(image, save_path)
            x_hat_batch_val, z_batch_val, total_loss_batch_val = sess.run(
                [x_hat_batch, z_batch, total_loss_batch], feed_dict=feed_dict)
            best_keeper.report(x_hat_batch_val, z_batch_val,
                               total_loss_batch_val)
        return best_keeper.get_best()

    return estimator
def train_supernet(results_dir, model, task_sampler, train_iter, valid_iter, device, config):
    """Train a supernet by Monte-Carlo sampling sub-tasks each batch.

    For each training batch, samples `n_monte` tasks, averages their losses,
    takes one optimizer step, and (optionally) validates on a single sampled
    task. Saves config, per-epoch metrics (pickle), periodic weight snapshots,
    and optional tensorboard scalars under `results_dir`.

    :param results_dir: output directory for config/weights/metrics.
    :param model: supernet; must support forward(x, task) and none_grad().
    :param task_sampler: provides .sample(...) returning task specs.
    :param train_iter: training data iterator (yields (x, y)).
    :param valid_iter: validation data iterator (yields (x, y)).
    :param device: torch device for inputs.
    :param config: nested dict; only config["TRAIN"][...] is read here.
    :return: dict {"train": [...], "valid": [...]} of per-epoch metric dicts.
    """
    writer = None
    since = time.time()
    # Persist the actually-used seed back into the config before dumping it.
    seed = set_seed(config["TRAIN"]["train_seed"])
    config["TRAIN"]["train_seed"] = seed
    with open(os.path.join(results_dir, "config.yaml"), "w") as f:
        yaml.dump(config, f)

    # metrics
    total_metrics = {
        "train": [],
        "valid": [],
    }

    # data iterators
    iters = {"train": train_iter, "valid": valid_iter}

    # training stuff
    optimizer = get_optimizer(model.parameters(), config["TRAIN"]["OPTIMIZER"])
    scheduler = get_scheduler(optimizer, config["TRAIN"]["SCHEDULER"])
    criterion = get_criterion(config["TRAIN"]["CRITERION"])

    # training
    for epoch in range(config["TRAIN"]["num_epochs"]):
        print("-" * 100)
        print("Iter Epoch {}/{}".format(epoch + 1, config["TRAIN"]["num_epochs"]))
        print("-" * 100)
        epoch_metrics = {
            "train": {
                "learning_rate": [],
                "losses_train": [],
                "accs_train": [],
            },
            "valid": {
                "losses_valid": [],
                "accs_valid": [],
            }
        }
        for phase in ["train", "valid"]:
            for iter_cpt, (x, y) in tqdm(enumerate(iters[phase]), ncols=100,
                                         total=len(iters[phase])):
                # perform an update
                if phase == "train":
                    model.train()
                    # Monte-Carlo estimate: average the loss over n_monte
                    # sampled tasks before a single optimizer step.
                    tasks = task_sampler.sample(
                        n_monte=config["TRAIN"]["GRAPH_SAMPLER"]["n_monte"])
                    loss_t = None
                    accs_t = []
                    for task in tasks:
                        # forward
                        x_t, y_t = x.to(device), y.to(device)
                        preds_t = model.forward(x_t, task)
                        # computing gradient (accumulate the averaged loss)
                        if loss_t is None:
                            loss_t = criterion(preds_t, y_t) / config["TRAIN"]["GRAPH_SAMPLER"]["n_monte"]
                        else:
                            loss_t += criterion(preds_t, y_t) / config["TRAIN"]["GRAPH_SAMPLER"]["n_monte"]
                        # saving accuracies
                        accs_t.append(np.mean(
                            (torch.max(preds_t, dim=1)[1] == y_t).cpu().numpy()))
                    # update
                    # NOTE(review): grads are cleared *after* the step via the
                    # model's own none_grad(), not optimizer.zero_grad(), and
                    # scheduler.step(epoch) runs per *batch* — both look
                    # intentional here but confirm against the scheduler API.
                    loss_t.backward()
                    optimizer.step()
                    scheduler.step(epoch)
                    model.none_grad()
                    # adding metrics
                    epoch_metrics[phase]["learning_rate"].append(scheduler.get_lr())
                    epoch_metrics[phase]["losses_train"].append(loss_t.item())
                    epoch_metrics[phase]["accs_train"].append(np.mean(accs_t))
                elif config["TRAIN"]["perform_valid"]:
                    model.eval()
                    # Validation uses a single sampled task, no grads.
                    task = task_sampler.sample()[0]
                    # forward
                    x_v, y_v = x.to(device), y.to(device)
                    with torch.no_grad():
                        preds_v = model.forward(x_v, task)
                        loss_v = criterion(preds_v, y_v)
                    # adding metrics
                    epoch_metrics[phase]["losses_valid"].append(loss_v.item())
                    epoch_metrics[phase]["accs_valid"].append(
                        np.mean((torch.max(preds_v, dim=1)[1] == y_v).cpu().numpy()))
                else:
                    # Validation disabled: skip the whole valid iterator.
                    break

        # average metrics over epoch (empty lists become None)
        to_print = "\n"
        for phase in ["train", "valid"]:
            to_print += phase.upper() + ":\n"
            for key in epoch_metrics[phase].keys():
                if len(epoch_metrics[phase][key]) > 0:
                    epoch_metrics[phase][key] = np.mean(epoch_metrics[phase][key])
                    to_print += "{}: {:.4f}".format(key, epoch_metrics[phase][key]) + "\n"
                else:
                    epoch_metrics[phase][key] = None
            total_metrics[phase].append(epoch_metrics[phase])
            to_print += "\n"

        # tensorboard integration to plot nice curves (writer created lazily)
        if config["TRAIN"]["use_tensorboard"]:
            if config["TRAIN"]["use_tensorboard"] and writer is None:
                writer = SummaryWriter(results_dir)
            for phase in ["train", "valid"]:
                for key, value in epoch_metrics[phase].items():
                    if value is not None:
                        writer.add_scalar(phase + "/" + key, value, epoch)

        time_elapsed = time.time() - since
        print(to_print + "Time Elapsed: {:.0f}m {:.0f}s".format(
            time_elapsed // 60, time_elapsed % 60))

        # save everything
        if config["TRAIN"]["save"] and ((epoch + 1) % config["TRAIN"]["save_period"] == 0):
            # saving model
            weights_path = os.path.join(
                results_dir,
                "model_weights_epoch_{0}_of_{1}.pth".format(
                    epoch + 1, config["TRAIN"]["num_epochs"]))
            torch.save(model.state_dict(), weights_path)
            # saving stuff to retrieve
            with open(os.path.join(results_dir, "total_metrics.pkl"), "wb") as handle:
                pickle.dump(total_metrics, handle, protocol=pickle.HIGHEST_PROTOCOL)

    time_elapsed = time.time() - since
    print("Training complete in {:.0f}m {:.0f}s".format(
        time_elapsed // 60, time_elapsed % 60))
    return total_metrics
def __init__(self, opt):
    """Set up model, loss, optimization, data loaders, bookkeeping state,
    metric meters, and Visdom plot configuration from the options object."""
    self.opt = opt
    target_device = torch.device("cuda" if opt.ngpu else "cpu")
    self.device = target_device

    # Model + classifier head, moved onto the target device.
    net, head = models.get_model(opt.net_type,
                                 opt.classifier_type,
                                 opt.pretrained,
                                 int(opt.nclasses))
    self.model = net.to(target_device)
    self.classifier = head.to(target_device)
    if opt.ngpu > 1:
        self.model = nn.DataParallel(self.model)

    # Loss and optimization machinery.
    criterion = models.init_loss(opt.loss_type)
    self.loss = criterion.to(target_device)
    self.optimizer = utils.get_optimizer(self.model, self.opt)
    self.lr_scheduler = utils.get_lr_scheduler(self.opt, self.optimizer)
    self.alpha_scheduler = utils.get_margin_alpha_scheduler(self.opt)

    # Data loaders.
    self.train_loader = datasets.generate_loader(opt, 'train')
    self.test_loader = datasets.generate_loader(opt, 'val')

    # Bookkeeping state.
    self.epoch = 0
    self.best_epoch = False
    self.training = False
    self.state = {}

    # Metric meters; best test loss starts at +inf so any result improves it.
    self.train_loss = utils.AverageMeter()
    self.test_loss = utils.AverageMeter()
    self.batch_time = utils.AverageMeter()
    self.test_metrics = utils.ROCMeter()
    self.best_test_loss = utils.AverageMeter()
    self.best_test_loss.update(np.array([np.inf]))

    # Visdom logging: one environment per experiment/fold.
    self.visdom_log_file = os.path.join(self.opt.out_path, 'log_files', 'visdom.log')
    self.vis = Visdom(port=opt.visdom_port,
                      log_to_filename=self.visdom_log_file,
                      env=opt.exp_name + '_' + str(opt.fold))

    # Plot window options (axis labels, titles, legends).
    self.vis_loss_opts = dict(xlabel='epoch',
                              ylabel='loss',
                              title='losses',
                              legend=['train_loss', 'val_loss'])
    self.vis_tpr_opts = dict(xlabel='epoch',
                             ylabel='tpr',
                             title='val_tpr',
                             legend=['tpr@fpr10-2', 'tpr@fpr10-3', 'tpr@fpr10-4'])
    self.vis_epochloss_opts = dict(xlabel='epoch',
                                   ylabel='loss',
                                   title='epoch_losses',
                                   legend=['train_loss', 'val_loss'])