def prepare_dirs(self, hparams):
    """Create the run's log/model directories and persist hyperparameters.

    Builds a timestamped model name, ensures both the log directory and
    the per-run model directory exist, then saves ``hparams`` and a copy
    of ``hparams.py`` into the model directory for reproducibility.
    """
    cfg = self.config
    dataset_desc = 'NER'
    cfg.model_name = "{}_{}".format(dataset_desc, get_time())
    cfg.model_dir = os.path.join(cfg.log_dir, cfg.model_name)
    for target in (cfg.log_dir, cfg.model_dir):
        if not os.path.exists(target):
            os.makedirs(target)
    save_hparams(cfg.model_dir, hparams)
    copy_file("hparams.py", os.path.join(cfg.model_dir, "hparams.py"))
def process_or_load_hparams(out_dir, default_hparams, hparams_path):
    """Build the final hparams for a run.

    Starts from ``default_hparams``, applies overrides from the optional
    hparams file at ``hparams_path``, extends them with training-time
    parameters, validates them, persists them to ``out_dir`` and echoes
    them to stdout.
    """
    # Defaults, possibly overridden by the hparams file if one was given.
    result = utils.maybe_parse_standard_hparams(default_hparams, hparams_path)
    # Extend with parameters needed for training, then sanity-check.
    result = process_hparams(result)
    check_hparams(result)
    # Persist and display the effective configuration.
    utils.save_hparams(out_dir, result)
    print("Print hyperparameters:")
    utils.print_hparams(result)
    return result
def _external_eval(model, global_step, sess, hparams, iterator,
                   iterator_feed_dict, tgt_file, lbl_file, label,
                   summary_writer, save_on_best):
    """External evaluation such as BLEU and ROUGE scores.

    Runs decoding on the given iterator, scores the output against the
    reference target/label files, logs each metric to TensorBoard, and
    (when ``save_on_best``) checkpoints the model whenever a metric
    improves on its recorded best.
    """
    out_dir = hparams.out_dir
    # Only decode once training has actually started (step > 0).
    decode = global_step > 0
    if decode:
        utils.print_out("# External evaluation, global step %d" % global_step)
    sess.run(iterator.initializer, feed_dict=iterator_feed_dict)
    # Output files for the decoded slots/intents of this data split.
    slot_output = os.path.join(out_dir, "slot_output_%s" % label)
    intent_output = os.path.join(out_dir, "intent_output_%s" % label)
    scores = nmt_utils.decode_and_evaluate(
        label,
        model,
        sess,
        slot_output,
        intent_output,
        ref_file=tgt_file,
        ref_lbl_file=lbl_file,
        metrics=hparams.metrics,
        subword_option=hparams.subword_option,
        beam_width=hparams.beam_width,
        tgt_eos=hparams.eos,
        task=hparams.task,
        decode=decode,
        infer_mode=hparams.infer_mode)
    # Save on best metrics
    if decode:
        for metric in hparams.metrics:
            best_metric_label = "best_" + metric
            utils.add_summary(summary_writer, global_step,
                              "%s_%s" % (label, metric), scores[metric])
            # metric: larger is better
            if save_on_best and scores[metric] > getattr(
                    hparams, best_metric_label):
                # New best: record the score on hparams and checkpoint the
                # model into the metric-specific "best" directory.
                setattr(hparams, best_metric_label, scores[metric])
                model.saver.save(
                    sess,
                    os.path.join(
                        getattr(hparams, best_metric_label + "_dir"),
                        "translate.ckpt"),
                    global_step=model.global_step)
            # Persist hparams each iteration so the best-score bookkeeping
            # survives restarts.
            utils.save_hparams(out_dir, hparams)
    return scores
def create_or_load_hparams(out_dir, default_hparams, flags):
    """Create hparams or load hparams from out_dir.

    Fresh runs start from the defaults, apply optional overrides from
    ``flags.hparams_path`` and gain a derived ``x_dim`` entry; resumed
    runs reconcile the stored hparams with the defaults and flags.
    Either way the result is re-saved to *out_dir* and printed.
    """
    stored = utils.load_hparams(out_dir)
    if stored:
        hparams = utils.ensure_compatible_hparams(stored, default_hparams,
                                                  flags)
    else:
        hparams = utils.maybe_parse_standard_hparams(default_hparams,
                                                     flags.hparams_path)
        # Flattened input dimensionality derived from the image size.
        hparams.add_hparam("x_dim", hparams.img_width * hparams.img_height)
    # Save HParams
    utils.save_hparams(out_dir, hparams)
    # Print HParams
    utils.print_hparams(hparams)
    return hparams
def create_or_load_hparams(out_dir, default_hparams, flags):
    """Return the hparams for this run.

    Reuses the hparams already saved in *out_dir* when present,
    otherwise derives fresh ones from the defaults plus the optional
    hparams file named on the command line. The result is re-saved and
    printed either way.
    """
    # If the out_dir already contains an hparams file, load it.
    stored = utils.load_hparams(out_dir)
    if stored:
        # Reconcile stored hparams with defaults/CLI; on conflict the
        # command-line hparams win and the stored ones are overwritten.
        hparams = utils.ensure_compatible_hparams(stored, default_hparams,
                                                  flags)
    else:
        hparams = utils.maybe_parse_standard_hparams(default_hparams,
                                                     flags.hparams_path)
        hparams = extend_hparams(hparams)
    # Save HParams
    utils.save_hparams(out_dir, hparams)
    # Print HParams
    print("Print hyperparameters:")
    utils.print_hparams(hparams)
    return hparams
def create_or_load_hparams(
        out_dir, default_hparams, hparams_path, save_hparams=True):
    """Create hparams or load hparams from out_dir.

    Loads previously-saved hparams when *out_dir* has them, otherwise
    builds them from the defaults plus the optional hparams file. The
    result is always extended; when ``save_hparams`` it is persisted to
    the run directory and to every per-metric best-model directory.
    """
    loaded = utils.load_hparams(out_dir)
    if not loaded:
        # Fresh run: defaults, optionally overridden by the hparams file.
        merged = utils.maybe_parse_standard_hparams(default_hparams,
                                                    hparams_path)
    else:
        # Resumed run: reconcile stored hparams with the current defaults.
        merged = ensure_compatible_hparams(loaded, default_hparams,
                                           hparams_path)
    hparams = extend_hparams(merged)
    # Save HParams
    if save_hparams:
        utils.save_hparams(out_dir, hparams)
        for metric in hparams.metrics:
            utils.save_hparams(
                getattr(hparams, "best_" + metric + "_dir"), hparams)
    # Print HParams
    utils.print_hparams(hparams)
    return hparams
def initialize(self, logdir=None, coolname=False, hparams=None,
               tensorboard=False, no_timestamp=False, global_rank=0,
               eager_flush=True):
    '''
    Initialize logx

    inputs
    - logdir - where to write logfiles
    - tensorboard - whether to write to tensorboard file
    - global_rank - must set this if using distributed training, so we only
      log from rank 0
    - coolname - generate a unique directory name underneath logdir, else
      use logdir as output directory
    - hparams - only use if not launching jobs with runx, which also saves
      the hparams.
    - eager_flush - call `flush` after every tensorboard write
    '''
    # Only rank 0 writes logs/metrics/checkpoint metadata.
    self.rank0 = (global_rank == 0)
    self.initialized = True
    if logdir is not None:
        self.logdir = logdir
    else:
        # No explicit logdir: derive one under the configured log root,
        # optionally with a generated unique slug.
        logroot = get_logroot()
        if coolname:
            from coolname import generate_slug
            self.logdir = os.path.join(logroot, generate_slug(2))
        else:
            self.logdir = os.path.join(logroot, 'default')
    # confirm target log directory exists
    if not os.path.isdir(self.logdir):
        os.makedirs(self.logdir, exist_ok=True)
    if hparams is not None and self.rank0:
        save_hparams(hparams, self.logdir)
    # Tensorboard file (rank 0 only; others get a no-op writer below)
    if self.rank0 and tensorboard:
        self.tb_writer = SummaryWriter(log_dir=self.logdir, flush_secs=1)
    else:
        self.tb_writer = None
    self.eager_flush = eager_flush
    # This allows us to use the tensorboard with automatic checking of both
    # the `tensorboard` condition, as well as ensuring writes only happen
    # on rank0. Any function supported by `SummaryWriter` is supported by
    # `ConditionalProxy`. Additionally, flush will be called after any call
    # to this.
    self.tensorboard = ConditionalProxy(
        self.tb_writer,
        tensorboard and self.rank0,
        post_hook=self._flush_tensorboard,
    )
    if not self.rank0:
        return
    # Metrics file (append mode so resumed runs extend the existing CSV)
    metrics_fn = os.path.join(self.logdir, 'metrics.csv')
    self.metrics_fp = open(metrics_fn, mode='a+')
    self.metrics_writer = csv.writer(self.metrics_fp, delimiter=',')
    # Log file
    log_fn = os.path.join(self.logdir, 'logging.log')
    self.log_file = open(log_fn, mode='a+')
    # save metric bookkeeping for best-checkpoint tracking
    self.save_metric = None
    self.best_metric = None
    self.save_ckpt_fn = ''
    # Find the existing best checkpoint, and update `best_metric`,
    # if available
    self.best_ckpt_fn = self.get_best_checkpoint() or ''
    if self.best_ckpt_fn:
        best_chk = torch.load(self.best_ckpt_fn, map_location='cpu')
        self.best_metric = best_chk.get('__metric', None)
    self.epoch = defaultdict(lambda: 0)
    self.no_timestamp = no_timestamp
    # Initial timestamp, so that epoch time calculation is correct
    phase = 'start'
    csv_line = [phase]
    # add epoch/iter
    csv_line.append('{}/step'.format(phase))
    csv_line.append(0)
    # add timestamp
    if not self.no_timestamp:
        # this feature is useful for testing
        csv_line.append('timestamp')
        csv_line.append(time.time())
    self.metrics_writer.writerow(csv_line)
    self.metrics_fp.flush()
from calc_rouge import calc_rouge
import os
from hparams import Hparams
import math
import logging

logging.basicConfig(level=logging.INFO)

# Pin GPU enumeration order and restrict training to the first GPU.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

logging.info("# hparams")
# Parse hyperparameters from the command line and persist them to the
# model directory.
hparams = Hparams()
parser = hparams.parser
hp = parser.parse_args()
# NOTE(review): `save_hparams`, `get_batch` and `tf` are not imported in
# the visible span -- confirm they are brought into scope elsewhere in
# this module.
save_hparams(hp, hp.modeldir)

logging.info("# Prepare train/eval batches")
# Batched (source, target) datasets plus per-epoch batch/sample counts.
train_batches, num_train_batches, num_train_samples = get_batch(
    hp.train_source, hp.train_target, hp.maxlen_source, hp.maxlen_target,
    hp.vocab, hp.batch_size, shuffle=True)
eval_batches, num_eval_batches, num_eval_samples = get_batch(
    hp.eval_source, hp.eval_target, hp.maxlen_source, hp.maxlen_target,
    hp.vocab, hp.eval_batch_size, shuffle=False)

# Create an iterator of the correct shape and type, shared between the
# train and eval datasets.
iter = tf.data.Iterator.from_structure(train_batches.output_types,
                                       train_batches.output_shapes)
xs, ys = iter.get_next()
def train(conf, project_dir: Path, run_dir: Path) -> torch.nn.Module:
    """Train a neural-process style model on the dataset named by ``conf``.

    Builds the dataset/model pair selected by ``conf.dataset``, then runs
    the training loop: each epoch first evaluates on the test loader
    (MSE on extra-target points) and then optimizes the ELBO
    (-log_p + KL) on the train loader. Metrics go to TensorBoard under
    ``run_dir``; checkpoints are written there too. Returns the trained
    model.
    """
    writer = SummaryWriter(str(run_dir))
    save_hparams(conf, writer)
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')
    if conf.dataset == 'sine':
        # dataset of time-series
        dataset_train = SineData()
        dataloader_train = DataLoader(dataset_train,
                                      batch_size=conf.batch_size,
                                      drop_last=True)
        dataset_test = SineData()
        dataloader_test = DataLoader(dataset_test,
                                     batch_size=conf.batch_size,
                                     shuffle=False,
                                     drop_last=True)
        h_sizes = OmegaConf.to_container(
            conf.hidden_sizes)  # OmegaConf object to list
        model = MLPModel(dim_y=1,
                         dim_r=conf.dim_r,
                         dim_z_prime=conf.dim_z_prime,
                         dim_l=conf.dim_l,
                         hidden_sizes_encoder=h_sizes,
                         hidden_sizes_ode_net=h_sizes,
                         hidden_sizes_decoder=h_sizes,
                         t0=dataset_train.t0,
                         device=device)
    elif conf.dataset == 'sinefreq':
        # dataset of frequency varying sinus time-series
        dataset_train = FreqSineData(amplitude_range=(0.5, 1.),
                                     shift_range=(-.5, .5),
                                     freq_range=(1.0, 2.0),
                                     num_samples=5000)
        dataloader_train = DataLoader(dataset_train,
                                      batch_size=conf.batch_size,
                                      drop_last=True)
        dataset_test = FreqSineData(amplitude_range=(0.5, 1.),
                                    shift_range=(-.5, .5),
                                    freq_range=(1.0, 2.0),
                                    num_samples=1000)
        dataloader_test = DataLoader(dataset_test,
                                     batch_size=conf.batch_size,
                                     shuffle=False,
                                     drop_last=True)
        h_sizes = OmegaConf.to_container(
            conf.hidden_sizes)  # OmegaConf object to list
        model = MLPModel(dim_y=1,
                         dim_r=conf.dim_r,
                         dim_z_prime=conf.dim_z_prime,
                         dim_l=conf.dim_l,
                         hidden_sizes_encoder=h_sizes,
                         hidden_sizes_ode_net=h_sizes,
                         hidden_sizes_decoder=h_sizes,
                         t0=dataset_train.t0,
                         device=device)
    elif conf.dataset == 'noisysine':
        sigma = conf.sigma
        # dataset of noisy sinus time-series
        dataset_train = NoisySineData(sigma,
                                      shift_range=(-0.1, .1),
                                      freq_range=(1.9, 2.0),
                                      num_samples=1000)
        dataloader_train = DataLoader(dataset_train,
                                      batch_size=conf.batch_size,
                                      drop_last=True)
        dataset_test = NoisySineData(sigma,
                                     shift_range=(-0.1, .1),
                                     freq_range=(1.9, 2.0),
                                     num_samples=1000)
        dataloader_test = DataLoader(dataset_test,
                                     batch_size=conf.batch_size,
                                     shuffle=False,
                                     drop_last=True)
        h_sizes = OmegaConf.to_container(
            conf.hidden_sizes)  # OmegaConf object to list
        model = MLPModel(dim_y=1,
                         dim_r=conf.dim_r,
                         dim_z_prime=conf.dim_z_prime,
                         dim_l=conf.dim_l,
                         hidden_sizes_encoder=h_sizes,
                         hidden_sizes_ode_net=h_sizes,
                         hidden_sizes_decoder=h_sizes,
                         t0=dataset_train.t0,
                         device=device)
    elif conf.dataset == 'rotnist':
        # dataset of Rotating MNIST (in the literature)
        dataset_mnist = RotNISTDataset(data_dir=str(project_dir / 'data'))
        # Hold out the last `len_test` sequences for testing.
        len_test = 10
        dataset_train = dataset_mnist[:len(dataset_mnist) - len_test]
        dataset_test = dataset_mnist[len(dataset_mnist) - len_test:]
        dataloader_train = DataLoader(dataset_train,
                                      batch_size=conf.batch_size,
                                      drop_last=True)
        dataloader_test = DataLoader(dataset_test,
                                     batch_size=conf.batch_size,
                                     shuffle=False,
                                     drop_last=True)
        h_sizes = OmegaConf.to_container(conf.hidden_sizes)
        model = ConvNetModel(dim_r=conf.dim_r,
                             dim_z_prime=conf.dim_z_prime,
                             dim_l=conf.dim_l,
                             hidden_sizes_ode_net=h_sizes,
                             t0=dataset_mnist.t0,
                             device=device)
    else:
        raise ValueError(f'Dataset {conf.dataset} not recognized')
    model = model.to(device)
    optimizer = torch.optim.RMSprop(model.parameters(), lr=conf.lr)
    context_range = OmegaConf.to_container(conf.context_range)
    extra_target_range = OmegaConf.to_container(conf.extra_target_range)
    global_train_step = 0
    global_test_step = 0
    for epoch in tqdm(range(conf.epochs)):
        mse_train_list = []
        mse_test_list = []
        # --- evaluation pass (no gradients) ---
        with torch.no_grad():
            for step, (t, y) in enumerate(dataloader_test):
                t, y = t.to(device), y.to(device)
                t_context, y_context, t_extra, y_extra, _, _ = get_split(
                    t, y, test_context_size=conf.test_context_size)
                p_y, _, _ = model(
                    t_context, y_context, t_extra
                )  # for testing, we only need predictions at t_extra
                output = p_y.loc
                mse_test = F.mse_loss(output, y_extra)
                # log test results
                writer.add_scalar('mse_test', mse_test.item(),
                                  global_test_step)
                mse_test_list.append(mse_test.item())
                # Every other epoch, log a qualitative plot of the first
                # test batch.
                if step == 0 and epoch % 2 == 0:
                    if conf.dataset in ['sine', 'sinefreq', 'noisysine']:
                        log_sine_plot(writer, model, t, y, t_context,
                                      y_context, t_extra, epoch)
                    elif conf.dataset == 'rotnist':
                        log_rotnist_plot2(writer, model, t, y, epoch, 'test')
                global_test_step += 1
        # --- training pass ---
        for (t, y) in dataloader_train:
            t, y = t.to(device), y.to(device)
            (t_context, y_context, t_extra, y_extra, t_target,
             y_target) = get_split(t, y, context_range=context_range,
                                   extra_target_range=extra_target_range)
            p_y, q_z_T, q_z_C = model(t_context, y_context, t_target,
                                      y_target=y_target)
            log_p = p_y.log_prob(y_target).sum(dim=(1, 2)).mean(
                dim=0)  # mean on batch dim, sum on time dim/y dim
            output = p_y.loc
            mse_train = F.mse_loss(output, y_target)
            # mean on batch dim, sum on z dim (equivalent to kl_div of the
            # multivariate normal)
            kl_div = kl_divergence(q_z_C, q_z_T).sum(dim=1).mean(dim=0)
            # Negative ELBO: maximize log-likelihood, penalize divergence
            # between context and target posteriors.
            loss = -log_p + kl_div
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # log training metrics
            writer.add_scalar('kl_div', kl_div.item(), global_train_step)
            writer.add_scalar('log_p', log_p.item(), global_train_step)
            writer.add_scalar('train_loss', loss.item(), global_train_step)
            writer.add_scalar('mse_train', mse_train.item(),
                              global_train_step)
            mse_train_list.append(mse_train.item())
            global_train_step += 1
        # log test/train mse epoch-wise to match the paper's figures
        writer.add_scalar('mse_train_epoch', np.mean(mse_train_list), epoch)
        writer.add_scalar('mse_test_epoch', np.mean(mse_test_list), epoch)
        # Periodic checkpoint (skipping epoch 0).
        if epoch % conf.checkpoint_freq == 0 and epoch > 0:
            torch.save(model.state_dict(), run_dir / f'model_ep{epoch}.pth')
    # Final checkpoint after all epochs.
    torch.save(model.state_dict(), run_dir / f'model.pth')
    return model
def main():
    """Train a fully-supervised segmentation model (FCN or DeepLabV3).

    Parses all CLI options, builds the train/val datasets (VOC+SBD, with
    optional extra COCO data, or Landcover), instantiates the chosen
    torchvision segmentation model, and launches training via
    ``ev.train_fully_supervised``.
    """
    # torch.manual_seed(42)
    # ------------
    # args
    # ------------
    parser = ArgumentParser()
    # Learning parameters
    parser.add_argument('--auto_lr', type=U.str2bool, default=False,
                        help="Auto lr finder")
    parser.add_argument('--learning_rate', type=float, default=10e-4)
    parser.add_argument('--scheduler', type=U.str2bool, default=False)
    parser.add_argument('--wd', type=float, default=2e-4)
    parser.add_argument('--moment', type=float, default=0.9)
    parser.add_argument('--batch_size', default=5, type=int)
    parser.add_argument('--n_epochs', default=10, type=int)
    parser.add_argument('--iter_every', default=1, type=int,
                        help="Accumulate compute graph for iter_size step")
    parser.add_argument('--benchmark', default=False, type=U.str2bool,
                        help="enable or disable backends.cudnn")
    # Model and eval
    parser.add_argument('--model', default='FCN', type=str,
                        help="FCN or DLV3 model")
    parser.add_argument('--pretrained', default=False, type=U.str2bool,
                        help="Use pretrained pytorch model")
    parser.add_argument('--eval_angle', default=True, type=U.str2bool, help=\
        "If true, it'll eval the model with different angle input size")
    # Data augmentation
    parser.add_argument('--rotate', default=False, type=U.str2bool,
                        help="Use random rotation as data augmentation")
    parser.add_argument('--pi_rotate', default=True, type=U.str2bool,
                        help="Use only pi/2 rotation angle")
    parser.add_argument('--p_rotate', default=0.25, type=float,
                        help="Probability of rotating the image during the "
                             "training")
    parser.add_argument('--scale', default=True, type=U.str2bool,
                        help="Use scale as data augmentation")
    parser.add_argument('--landcover', default=False, type=U.str2bool,\
        help="Use Landcover dataset instead of VOC and COCO")
    parser.add_argument('--size_img', default=520, type=int,
                        help="Size of input images")
    parser.add_argument('--size_crop', default=480, type=int,
                        help="Size of crop image during training")
    parser.add_argument('--angle_max', default=360, type=int,
                        help="Angle max for data augmentation")
    # Dataloader and gpu
    parser.add_argument('--nw', default=0, type=int,
                        help="Num workers for the data loader")
    parser.add_argument('--pm', default=True, type=U.str2bool,
                        help="Pin memory for the dataloader")
    parser.add_argument('--gpu', default=0, type=int,
                        help="Wich gpu to select for training")
    # Datasets
    parser.add_argument('--split', default=False, type=U.str2bool,
                        help="Split the dataset")
    parser.add_argument('--split_ratio', default=0.3, type=float,
                        help="Amount of data we used for training")
    # FIX: these two arguments were missing although `args.extra_coco` and
    # `args.dataroot_coco` are read below, which raised AttributeError on
    # the VOC/SBD code path. Default False keeps the previous behavior.
    parser.add_argument('--extra_coco', default=False, type=U.str2bool,
                        help="Use coco dataset as extra annotation for "
                             "fully supervised training")
    parser.add_argument('--dataroot_coco', default='/data/coco', type=str)
    parser.add_argument('--dataroot_voc', default='/data/voc2012', type=str)
    parser.add_argument('--dataroot_sbd', default='/data/sbd', type=str)
    parser.add_argument('--dataroot_landcover',
                        default='/share/DEEPLEARNING/datasets/landcover',
                        type=str)
    # Save parameters
    parser.add_argument('--model_name', type=str,
                        help="what name to use for saving")
    parser.add_argument('--save_dir', default='/data/save_model', type=str)
    parser.add_argument('--save_all_ep', default=False, type=U.str2bool, help=\
        "If true it'll save the model every epoch in save_dir")
    parser.add_argument('--save_best', default=False, type=U.str2bool,
                        help="If true will only save the best epoch model")
    args = parser.parse_args()
    # ------------
    # device
    # ------------
    device = torch.device("cuda:" + str(args.gpu)
                          if torch.cuda.is_available() else "cpu")
    print("device used:", device)
    # ------------
    # data
    # ------------
    if args.size_img < args.size_crop:
        raise Exception(
            'Cannot have size of input images less than size of crop')
    size_img = (args.size_img, args.size_img)
    size_crop = (args.size_crop, args.size_crop)
    if not args.landcover:
        train_dataset_VOC = mdset.VOCSegmentation(
            args.dataroot_voc, year='2012', image_set='train',
            download=True, rotate=args.rotate, size_img=size_img,
            size_crop=size_crop)
        test_dataset = mdset.VOCSegmentation(
            args.dataroot_voc, year='2012', image_set='val', download=True)
        train_dataset_SBD = mdset.SBDataset(
            args.dataroot_sbd, image_set='train_noval', mode='segmentation',
            rotate=args.rotate, size_img=size_img, size_crop=size_crop)
        # COCO dataset: optional extra annotations for supervised training.
        if args.extra_coco:
            extra_COCO = cu.get_coco(args.dataroot_coco, 'train',
                                     rotate=args.rotate, size_img=size_img,
                                     size_crop=size_crop)
            # Concatenate datasets
            train_dataset = tud.ConcatDataset(
                [train_dataset_VOC, train_dataset_SBD, extra_COCO])
        else:
            train_dataset = tud.ConcatDataset(
                [train_dataset_VOC, train_dataset_SBD])
        num_classes = 21
    else:
        print('Loading Landscape Dataset')
        train_dataset = mdset.LandscapeDataset(
            args.dataroot_landcover, image_set="trainval",
            rotate=args.rotate, pi_rotate=args.pi_rotate,
            p_rotate=args.p_rotate, size_img=size_img,
            size_crop=size_crop, angle_max=args.angle_max)
        test_dataset = mdset.LandscapeDataset(args.dataroot_landcover,
                                              image_set="test")
        print('Success load Landscape Dataset')
        num_classes = 4
    split = args.split
    if split == True:
        train_dataset = U.split_dataset(train_dataset, args.split_ratio)
    # Print len datasets
    print("There is", len(train_dataset), "images for training and",
          len(test_dataset), "for validation")
    dataloader_train = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, num_workers=args.nw,
        pin_memory=args.pm, shuffle=True,
        drop_last=True)  # ,collate_fn=U.my_collate)
    dataloader_val = torch.utils.data.DataLoader(
        test_dataset, num_workers=args.nw, pin_memory=args.pm,
        batch_size=args.batch_size)
    # ------------
    # model
    # ------------
    if args.model.upper() == 'FCN':
        model = models.segmentation.fcn_resnet101(
            pretrained=args.pretrained, num_classes=num_classes)
    elif args.model.upper() == 'DLV3':
        model = models.segmentation.deeplabv3_resnet101(
            pretrained=args.pretrained, num_classes=num_classes)
    else:
        raise Exception('model must be "FCN" or "DLV3"')
    # model.to(device)
    # ------------
    # save
    # ------------
    save_dir = U.create_save_directory(args.save_dir)
    print('model will be saved in', save_dir)
    U.save_hparams(args, save_dir)
    # ------------
    # training
    # ------------
    # Auto lr finding
    print(args)
    # The border class is ignored by the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=num_classes)
    torch.autograd.set_detect_anomaly(True)
    optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate,
                                momentum=args.moment, weight_decay=args.wd)
    ev.train_fully_supervised(
        model=model, n_epochs=args.n_epochs, train_loader=dataloader_train,
        val_loader=dataloader_val, criterion=criterion, optimizer=optimizer,
        save_folder=save_dir, scheduler=args.scheduler,
        auto_lr=args.auto_lr, model_name=args.model_name,
        benchmark=args.benchmark, save_best=args.save_best,
        save_all_ep=args.save_all_ep, device=device,
        num_classes=num_classes)
import torch.nn as nn
import torch.optim as optim
from data_preprocessing import train_set, train_loader, val_loader
from LeNet import LeNet
from utils import view_bar
from sklearn.metrics import accuracy_score
from validation import validation
import time

# load the hyper-parameters
logging.basicConfig(level=logging.INFO)
logging.info("# Loading hyperparameters")
# NOTE(review): `logging`, `torch`, `Hparams`, `save_hparams` and
# `latest_ckpt` are not imported in the visible span -- confirm they are
# brought into scope elsewhere in this module.
hparams = Hparams()
parser = hparams.parser
hp = parser.parse_args()
save_hparams(hp, hp.train_dir)

# identify the device to use
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
logging.info("Using %s" % DEVICE)

# instantiate the model, the loss function and optimizer
model = LeNet()
xentropy = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=hp.lr, momentum=0.9)

# check the latest checkpoint to resume training
# (the body of this `if` continues past the visible chunk)
ckpt_path = latest_ckpt(hp.ckpt)
if ckpt_path is None:
def main(args):
    """Evaluate a knowledge-grounded dialogue pipeline (discriminator for
    knowledge selection + GPT-2 generator) on the seen/unseen test splits.

    Loads both models and batchers, then runs ``dev_step`` on each split:
    computes perplexity, decodes responses, writes hypothesis/reference
    pairs to a file, and prints BLEU / Distinct / F1 metrics.
    """
    print("\nParameters:")
    for attr, value in sorted(vars(args).items()):
        print("{}={}".format(attr.upper(), value))
    print("")

    # Selecting which GPU(s) to use
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_list
    args.cuda = torch.cuda.is_available() and not args.no_cuda

    # Output directory for models and summaries
    out_dir = os.path.join(args.log, args.exp_name)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    print('Writing to {}\n'.format(out_dir))
    save_hparams(args, os.path.join(out_dir, 'hparams'))

    # Checkpoint directory
    checkpoint_dir = os.path.join(out_dir, 'checkpoints')
    checkpoint_prefix = os.path.join(checkpoint_dir, 'model')
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    # Build dataset (max_knowledge=999 effectively keeps all knowledge
    # sentences at test time).
    time_str = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print("Create training dataset begain... | %s " % time_str)
    test_seen_dataset = KGDataset(args.test_seen_file, max_knowledge=999)
    test_unseen_dataset = KGDataset(args.test_unseen_file, max_knowledge=999)
    test_seen_loader = get_batch_loader(test_seen_dataset,
                                        collate_fn=collate_fn,
                                        batch_size=args.eval_batch_size,
                                        is_test=True)
    test_unseen_loader = get_batch_loader(test_unseen_dataset,
                                          collate_fn=collate_fn,
                                          batch_size=args.eval_batch_size,
                                          is_test=True)
    time_str = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print("Create training dataset end... | %s " % time_str)

    # Batcher
    dis_batcher = DisBatcher(args.bert_truncate, args.bert_config, args.cuda)
    gen_batcher = GenBatcher(args.knowledge_truncate, args.text_truncate,
                             args.gpt2_truncate, args.gpt2_config, args.cuda)

    # Load model
    dis_model = load_dis_net(args.emb_dim, args.lstm_hidden, args.lstm_layer,
                             args.bert_config, args.dis_pretrain_file,
                             args.load_dis, args.cuda)
    gen_model = load_gen_net(gen_batcher.tokenizer, args.segment,
                             args.gpt2_config, args.gen_pretrain_file,
                             args.load_gen, args.cuda)

    # Per-token cross entropy (unreduced) folded into a sequence loss.
    ce = lambda logit, target: F.cross_entropy(logit, target, reduce=False)
    gen_criterion = lambda logits, targets: sequence_loss(
        logits, targets, ce, pad_idx=-1)

    def dev_step(split, global_step):
        # Evaluate one split end-to-end; returns a dict of metrics.
        if split == 'test_seen':
            test_loader = test_seen_loader
        elif split == 'test_unseen':
            test_loader = test_unseen_loader
        else:
            raise ValueError
        dis_model.eval()
        gen_model.eval()
        n_token, test_loss = 0, 0.0  # ppl
        test_hyp, test_ref = [], []
        count = 0
        with torch.no_grad():
            for knowledges, histories, users, responses, knowledge_lens in test_loader:
                knowledges = [know.split('\n\n') for know in knowledges]
                histories = [his.split('\n\n') for his in histories]
                # Discriminator picks the knowledge sentence for each
                # example; the generator conditions on that choice.
                dis_args = dis_batcher(knowledges, histories, knowledge_lens,
                                       args.n_sent)
                dis_out = dis_model(*dis_args)
                dis_knowledges = [[knowledges[bi][dis_out[0][bi].item()]]
                                  for bi in range(len(knowledges))]
                gen_args = gen_batcher(dis_knowledges, histories, users,
                                       responses, args.segment, True)
                # Accumulate token-level loss for perplexity.
                loss = gen_criterion(
                    gen_model(gen_args[0], token_type_ids=gen_args[1])[0],
                    gen_args[2])
                n_token += loss.size(0)
                test_loss += loss.sum().item()
                # Decode one example at a time with beam search.
                for bi in range(len(dis_knowledges)):
                    dec_in = gen_batcher(dis_knowledges[bi:bi + 1],
                                         histories[bi:bi + 1],
                                         users[bi:bi + 1],
                                         segment=args.segment,
                                         training=False)
                    dec_out = gen_model.batch_decode(
                        dec_in, args.max_length, args.min_length,
                        args.early_stopping, args.beam_size,
                        args.repetition_penalty, gen_batcher.eos_id,
                        args.length_penalty, args.no_repeat_ngram_size)
                    # Strip the prompt tokens, keep only the continuation.
                    dec_out = dec_out[0].tolist()[dec_in.size(1):]
                    _hyp = gen_batcher.tokenizer.decode(
                        dec_out, skip_special_tokens=True,
                        clean_up_tokenization_spaces=False)
                    _ref = responses[bi]
                    test_hyp.append(_hyp)
                    test_ref.append(_ref)
                    count += 1
                    if count % 1000 == 0:
                        print(count)
        # Dump hypothesis ||| reference pairs for inspection.
        with open(
                os.path.join(
                    out_dir,
                    '{}-decoded-iter-{}.txt'.format(split, global_step)),
                'w') as f:
            for _hyp, _ref in zip(test_hyp, test_ref):
                f.writelines('{} ||| {}\n'.format(_hyp, _ref))
        MeanLoss = test_loss / n_token
        b1, b2, b3, b4 = bleu_metric(test_hyp, test_ref)
        d1, d2 = distinct_metric(test_hyp)
        f1 = f1_metric(test_hyp, test_ref)
        time_str = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print("**********************************")
        print("{} results..........".format(split))
        print('hypothesis: ', len(test_hyp))
        print("Step: %d \t| ppl: %.3f \t| %s" %
              (global_step, math.exp(MeanLoss), time_str))
        print("BLEU-1/2/3/4: {:.4f}/{:.4f}/{:.4f}/{:.4f}".format(
            b1, b2, b3, b4))
        print("Distinct-1/2: {:.4f}/{:.4f}".format(d1, d2))
        print("F1: {:.4f}".format(f1))
        print("**********************************")
        return {
            'f1': f1,
            'loss': MeanLoss,
            'bleu1': b1,
            'bleu2': b2,
            'bleu3': b3,
            'bleu4': b4,
            'distinct1': d1,
            'distinct2': d2
        }

    dev_step("test_seen", 0)  # test_random_split
    dev_step("test_unseen", 0)  # test_topic_split
from bert_transformer_vae_for_PAGE import VaeModel
from data_load import get_batch_for_train_or_dev_or_test, saveForTfRecord
from utils import save_hparams, get_hypotheses
import os
from hparams import Hparams
import logging

# Run on GPU 5 only.
os.environ['CUDA_VISIBLE_DEVICES'] = '5'
logging.basicConfig(level=logging.INFO)

logging.info("# hparams")
# Parse hyperparameters from the command line and persist them to the
# PAGE output directory.
hparams = Hparams()
parser = hparams.parser
hp = parser.parse_args()
save_hparams(hp, hp.PAGEdir)

# Author's reminder (repeated in the original): the tfRecord files must
# be prepared here before running.
logging.info("# 许海明提醒你: 这里需要准备tfRecord")
logging.info("# 许海明提醒你: 这里需要准备tfRecord")
logging.info("# 许海明提醒你: 这里需要准备tfRecord")
logging.info("# 许海明提醒你: 这里需要准备tfRecord")
# Serialize the test split to a tf_record file.
# (the call's remaining arguments continue past the visible chunk)
saveForTfRecord(hp.test, hp.maxlen_vae_Encoder, hp.maxlen_vae_Decoder_en,
                hp.maxlen_vae_Decoder_de, hp.vocab,
                output_file="./data/PAGE/test.tf_record",
def train(hp):
    """Train a Transformer with TF1 sessions, evaluating once per epoch.

    Each epoch: computes the train loss on the last batch, decodes the
    eval set, writes hypotheses and a BLEU score, checkpoints the model,
    and applies a GL-based early-stopping criterion.
    """
    save_hparams(hp, hp.checkpoints_dir)
    # Data generator
    logging.info("Prepare Train/Eval batches...")
    train_batches, num_train_batches, num_train_samples = get_batch(
        hp.train1, hp.train2, hp.maxlen1, hp.maxlen2, hp.vocab,
        hp.batch_size, shuffle=True)
    # Eval uses very large maxlens so no eval example is filtered out.
    eval_batches, num_eval_batches, num_eval_samples = get_batch(
        hp.eval1, hp.eval2, 10000, 10000, hp.vocab, hp.batch_size,
        shuffle=False)
    # Batch iterator shared by the train and eval datasets.
    iter = tf.data.Iterator.from_structure(train_batches.output_types,
                                           train_batches.output_shapes)
    xs, ys = iter.get_next()
    train_init_op = iter.make_initializer(train_batches)
    eval_init_op = iter.make_initializer(eval_batches)
    # Build model
    logging.info("Build model...")
    model = Transformer(hp)
    logging.info("Model is built!")
    # Session
    logging.info("Session initialize")
    saver = tf.train.Saver(max_to_keep=5)
    with tf.Session() as sess:
        # Check & Load latest version model checkpoint
        ckpt = tf.train.latest_checkpoint(hp.checkpoints_dir)
        if ckpt is None:
            logging.info("Initializing from scratch")
            sess.run(tf.global_variables_initializer())
            save_variable_specs(os.path.join(hp.checkpoints_dir, "specs"))
        else:
            saver.restore(sess, ckpt)
        summary_writer = tf.summary.FileWriter(hp.checkpoints_dir,
                                               sess.graph)
        sess.run(train_init_op)
        total_steps = hp.num_epochs * num_train_batches
        # Resume the step counter from the restored global step.
        _gs = sess.run(model.global_step)
        # Early-stopping bookkeeping: window size k, GL threshold alpha.
        k = 5
        min_dev_loss = 0
        stop_alpha = 20.0
        eval_losses = []
        # Start training
        for i in tqdm(range(_gs, total_steps + 1)):
            _input_x, _decoder_input, _target = sess.run(
                [xs[0], ys[0], ys[1]])
            _, _gs, _summary = sess.run(
                [model.train_op, model.global_step, model.summaries],
                feed_dict={
                    model.input_x: _input_x,
                    model.decoder_input: _decoder_input,
                    model.target: _target,
                    model.is_training: True
                })
            epoch = math.ceil(_gs / num_train_batches)
            summary_writer.add_summary(_summary, _gs)
            # Evaluation at each epoch boundary.
            if _gs and _gs % num_train_batches == 0:
                logging.info("Epoch {} is done".format(epoch))
                # Train loss on the most recent batch (eval mode).
                _loss = sess.run(model.loss,
                                 feed_dict={
                                     model.input_x: _input_x,
                                     model.decoder_input: _decoder_input,
                                     model.target: _target,
                                     model.is_training: False
                                 })
                # evaluation
                y_hat, mean_loss = model.eval(sess, eval_init_op, xs, ys,
                                              num_eval_batches)
                # id to token
                logging.info("# Get hypotheses")
                hypotheses = get_hypotheses(num_eval_samples, y_hat,
                                            model.idx2token)
                # save translation results
                if not os.path.exists(hp.evaldir):
                    os.makedirs(hp.evaldir)
                logging.info("# Write results")
                model_output = "translation_E{:02d}L{:.2f}EL{:.2f}".format(
                    epoch, _loss, mean_loss)
                translation = os.path.join(hp.evaldir, model_output)
                with open(translation, 'w', encoding="utf-8") as fout:
                    fout.write("\n".join(hypotheses))
                logging.info(
                    "# Calculate bleu score and append it to translation")
                # bleu
                calc_bleu_nltk(hp.eval2, translation)
                # save model
                logging.info("# Save models")
                ckpt_name = os.path.join(hp.checkpoints_dir, model_output)
                saver.save(sess, ckpt_name, global_step=_gs)
                logging.info(
                    "After training of {} epochs, {} has been saved.".format(
                        epoch, ckpt_name))
                # calculate early stop
                if len(eval_losses) == 0:
                    # First evaluation seeds the running minimum.
                    min_dev_loss = mean_loss
                eval_losses.append(mean_loss)
                gl, p_k, pq_alpha = calculate_earlystop_baseline(
                    mean_loss, min_dev_loss, eval_losses, k)
                min_dev_loss = mean_loss if mean_loss < min_dev_loss else min_dev_loss
                # Keep only the last k losses for the progress statistic.
                eval_losses = eval_losses[-k:]
                logging.info(
                    "GL(t): {:.4f}, P_k: {:.4f}, PQ_alpha: {:.4f}".format(
                        gl, p_k, pq_alpha))
                if gl > stop_alpha:
                    logging.info(
                        "No optimization for a long time, auto-stopping...")
                    break
                # change data iterator back to train iterator
                sess.run(train_init_op)
        summary_writer.close()
    logging.info("Done")
from model import Transformer
from tqdm import tqdm
from data_load import get_batch
from utils import save_hparams, save_variable_specs, get_hypotheses, calc_bleu
import os
from hparams import Hparams
import math
import logging

logging.basicConfig(level=logging.INFO)

logging.info("# hparams")
hparams = Hparams()  # hyperparameters
parser = hparams.parser
hp = parser.parse_args()
save_hparams(hp, hp.logdir)  # write the hyperparameters to the log dir

logging.info("# Prepare train/eval batches")
train_batches, num_train_batches, num_train_samples = get_batch(
    hp.train1, hp.train2, hp.maxlen1, hp.maxlen2, hp.vocab, hp.batch_size,
    shuffle=True)
# Eval batches use very large maxlens so no eval example is filtered out.
# (the call's remaining arguments continue past the visible chunk)
eval_batches, num_eval_batches, num_eval_samples = get_batch(
    hp.eval1, hp.eval2, 100000, 100000, hp.vocab, hp.batch_size,
def main():
    """CLI entry point: train a segmentation model (FCN or DeepLabV3) with a
    rotation-equivariance auxiliary loss on VOC2012 + SBD (optionally COCO).

    Parses all hyperparameters from the command line, builds the supervised
    and unsupervised (equivariance) datasets, creates or reloads the model,
    and hands everything to ``ev.train_rot_equiv``.
    """
    #torch.manual_seed(42)
    # ------------
    # args
    # ------------
    parser = ArgumentParser()
    parser.add_argument('--auto_lr', type=U.str2bool, default=False,
                        help="Auto lr finder")
    parser.add_argument('--learning_rate', type=float, default=10e-4)
    parser.add_argument('--Loss', type=str, default='KL')
    parser.add_argument('--gamma', type=float, default=0.5,
                        help="gamma balance the two losses")
    parser.add_argument('--scheduler', type=U.str2bool, default=True)
    parser.add_argument('--wd', type=float, default=2e-4)
    parser.add_argument('--moment', type=float, default=0.9)
    parser.add_argument('--batch_size', default=5, type=int)
    parser.add_argument('--iter_every', default=1, type=int,
                        help="Accumulate compute graph for iter_size step")
    parser.add_argument('--n_epochs', default=10, type=int)
    parser.add_argument('--model', default='DLV3', type=str,
                        help="FCN or DLV3 model")
    parser.add_argument('--pretrained', default=False, type=U.str2bool,
                        help="Use pretrained pytorch model")
    parser.add_argument('--eval_angle', default=True, type=U.str2bool,
                        help="If true, it'll eval the model with different angle input size")
    parser.add_argument('--eval_every', default=30, type=int,
                        help="Eval all input rotation angle every n step")
    parser.add_argument('--rotate', default=False, type=U.str2bool,
                        help="Use random rotation as data augmentation")
    parser.add_argument('--angle_max', default=30, type=int,
                        help="Max angle rotation of input image")
    parser.add_argument('--size_img', default=520, type=int,
                        help="Size of input images")
    parser.add_argument('--size_crop', default=480, type=int,
                        help="Size of crop image during training")
    parser.add_argument('--nw', default=0, type=int,
                        help="Num workers for the data loader")
    parser.add_argument('--pm', default=True, type=U.str2bool,
                        help="Pin memory for the dataloader")
    parser.add_argument('--gpu', default=0, type=int,
                        help="Wich gpu to select for training")
    parser.add_argument('--rot_cpu', default=False, type=U.str2bool,
                        help="Apply rotation on the cpu (Help to use less gpu memory)")
    parser.add_argument('--benchmark', default=False, type=U.str2bool,
                        help="enable or disable backends.cudnn")
    parser.add_argument('--split', default=True, type=U.str2bool,
                        help="Split the dataset")
    parser.add_argument('--split_ratio', default=0.3, type=float,
                        help="Amount of data we used for training")
    parser.add_argument('--extra_coco', default=False, type=U.str2bool,
                        help="Use coco dataset as extra annotation for fully supervised training")
    parser.add_argument('--multi_task', default=False, type=U.str2bool,
                        help="Multi task training (same data for equiv and sup)")
    parser.add_argument('--dataroot_voc', default='/share/DEEPLEARNING/datasets/voc2012', type=str)
    parser.add_argument('--dataroot_sbd', default='/share/DEEPLEARNING/datasets/sbd', type=str)
    parser.add_argument('--dataroot_coco', default='/share/DEEPLEARNING/datasets/coco', type=str)
    parser.add_argument('--model_name', type=str,
                        help="what name to use for saving")
    parser.add_argument('--save_dir', default='/data/save_model', type=str)
    parser.add_argument('--save_all_ep', default=False, type=U.str2bool,
                        help="If true it'll save the model every epoch in save_dir")
    parser.add_argument('--save_best', default=False, type=U.str2bool,
                        help="If true will only save the best epoch model")
    parser.add_argument('--load_last_model', default=False, type=U.str2bool,
                        help="If it will load the last model saved with This parameters.")
    args = parser.parse_args()
    # ------------
    # device
    # ------------
    device = torch.device(
        "cuda:" + str(args.gpu) if torch.cuda.is_available() else "cpu")
    print("device used:", device)
    # ------------
    # model
    # ------------
    # ------------
    # data
    # ------------
    if args.size_img < args.size_crop:
        raise Exception(
            'Cannot have size of input images less than size of crop')
    size_img = (args.size_img, args.size_img)
    size_crop = (args.size_crop, args.size_crop)
    # NOTE(review): download=True will fetch VOC if missing — confirm intended.
    train_dataset_VOC = mdset.VOCSegmentation(args.dataroot_voc, year='2012',
                                              image_set='train', download=True,
                                              rotate=args.rotate,
                                              size_img=size_img, size_crop=size_crop)
    val_dataset_VOC = mdset.VOCSegmentation(args.dataroot_voc, year='2012',
                                            image_set='val', download=True)
    train_dataset_SBD = mdset.SBDataset(args.dataroot_sbd, image_set='train_noval',
                                        mode='segmentation', rotate=args.rotate,
                                        size_img=size_img, size_crop=size_crop)
    # COCO dataset
    if args.extra_coco:
        extra_COCO = cu.get_coco(args.dataroot_coco, 'train', rotate=args.rotate,
                                 size_img=size_img, size_crop=size_crop)
    # Concatene dataset
    train_dataset_unsup = tud.ConcatDataset(
        [train_dataset_VOC, train_dataset_SBD])
    # Split dataset
    split = args.split
    if split == True:
        train_dataset_sup = U.split_dataset(train_dataset_unsup,
                                            args.split_ratio)
    else:
        train_dataset_sup = train_dataset_unsup
    # Multi task ?  (supervised and equivariance branches share the same data)
    if args.multi_task:
        train_dataset_unsup = train_dataset_sup
    # If extra coco concatene all dataset for unsupervised training
    if args.extra_coco:
        train_dataset_unsup = tud.ConcatDataset(
            [train_dataset_VOC, train_dataset_SBD, extra_COCO])
    # Print len datasets
    print("There is", len(train_dataset_sup), "images for supervised training",
          len(train_dataset_unsup), "for equivariance loss and",
          len(val_dataset_VOC), "for validation")
    dataloader_train_sup = torch.utils.data.DataLoader(train_dataset_sup,
                                                       batch_size=args.batch_size,
                                                       num_workers=args.nw,
                                                       pin_memory=args.pm,
                                                       shuffle=True, drop_last=True)
    dataloader_val = torch.utils.data.DataLoader(val_dataset_VOC,
                                                 num_workers=args.nw,
                                                 pin_memory=args.pm,
                                                 batch_size=args.batch_size)
    # ---------
    # Load model
    # ---------
    if args.load_last_model:
        # Resume from the best checkpoint matching this configuration.
        model, save_dir = fbm.load_best_model(save_dir=args.save_dir,
                                              model_name=args.model_name,
                                              split=args.split,
                                              split_ratio=args.split_ratio,
                                              batch_size=args.batch_size,
                                              rotate=args.rotate)
        print("Training will continue from this file.", save_dir)
    else:
        save_dir = U.create_save_directory(
            args.save_dir)  # Create a new save directory
        if args.model.upper() == 'FCN':
            model = models.segmentation.fcn_resnet101(
                pretrained=args.pretrained)
        elif args.model.upper() == 'DLV3':
            print('DEEPLAB MODEL')
            model = models.segmentation.deeplabv3_resnet101(
                pretrained=args.pretrained)
        else:
            raise Exception('model must be "FCN" or "DLV3"')
    model.to(device)
    # ------------
    # save
    # ------------
    print('model will be saved in', save_dir)
    U.save_hparams(args, save_dir)
    # ------------
    # training
    # ------------
    # Auto lr finding
    #if args.auto_lr==True:
    criterion_supervised = nn.CrossEntropyLoss(
        ignore_index=21)  # We ignore the border class.
    optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate,
                                momentum=args.moment, weight_decay=args.wd)
    ev.train_rot_equiv(model, args.n_epochs, dataloader_train_sup,
                       train_dataset_unsup, dataloader_val,
                       criterion_supervised, optimizer,
                       scheduler=args.scheduler, Loss=args.Loss,
                       gamma=args.gamma, batch_size=args.batch_size,
                       iter_every=args.iter_every, save_folder=save_dir,
                       model_name=args.model_name, benchmark=args.benchmark,
                       angle_max=args.angle_max, size_img=args.size_img,
                       eval_every=args.eval_every, save_all_ep=args.save_all_ep,
                       dataroot_voc=args.dataroot_voc, save_best=args.save_best,
                       rot_cpu=args.rot_cpu, device=device)
    # Final evaluation
    """
def train_template(class_model, shuffle=True, save_model=True):
    """Generic TF1 training driver.

    Builds train/eval batches, instantiates ``class_model(context)``, trains
    for ``hp.num_epochs`` epochs, evaluates once per epoch, and (optionally)
    saves per-epoch checkpoints plus a frozen inference graph (``.pb``).

    For large datasets turn off ``shuffle`` (it is time-consuming); while
    tuning hyperparameters turn off ``save_model``.
    """
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    logging.info("# hparams")
    hparams = Hparams()
    parser = hparams.parser
    hp = parser.parse_args()
    run_type = hp.run_type
    logdir = hp.logdir
    batch_size = hp.batch_size
    num_epochs = hp.num_epochs
    task_type = hp.task_type
    assert hp.run_type in ("new", "continue", "finetune")
    if "continue" == hp.run_type:
        # Continuing: reload the persisted hparams and sanity-check that the
        # task type (if given on the CLI) matches what was saved.
        load_hparams(hp, logdir)
        batch_size = hp.batch_size
        if task_type is not None:
            assert task_type == hp.task_type
        task_type = hp.task_type
    assert task_type is not None
    context = Context(hp)
    logging.info("# Prepare train/eval batches")
    logging.info("Use %s for training set", hp.train_data)
    logging.info("Use %s for evaluation set", hp.eval_data)
    eval_batches, num_eval_batches, num_eval_samples = get_batch(
        fpath=hp.eval_data,
        task_type=task_type,
        input_indices=context.input_indices,
        vocabs=context.vocabs,
        context=context,
        batch_size=batch_size,
        shuffle=False)
    train_batches, num_train_batches, num_train_samples = get_batch(
        fpath=hp.train_data,
        task_type=task_type,
        input_indices=context.input_indices,
        vocabs=context.vocabs,
        context=context,
        batch_size=batch_size,
        shuffle=shuffle)
    # Create an iterator of the correct shape and type.
    # (Copied as-is; author noted unfamiliarity with these APIs.)
    iterr = tf.data.Iterator.from_structure(train_batches.output_types,
                                            train_batches.output_shapes)
    inputs_and_target = iterr.get_next()
    train_init_op = iterr.make_initializer(train_batches)
    eval_init_op = iterr.make_initializer(eval_batches)
    model = class_model(context)
    # Last element of the batch tuple is the target; the rest are inputs.
    loss, train_op, global_step, train_summaries = model.train(
        inputs=inputs_and_target[:-1], targets=inputs_and_target[-1])
    eval_ouputs, eval_summaries = model.eval(inputs=inputs_and_target[:-1],
                                             targets=inputs_and_target[-1])
    inference_name = model.get_inference_op_name()
    logging.info("inference_node_name:%s" % inference_name)
    logging.info("# Session")
    saver = tf.train.Saver(max_to_keep=num_epochs)
    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        time_sess = time.time()
        ckpt = tf.train.latest_checkpoint(logdir)
        if ckpt is None or "new" == run_type:  # fresh run
            save_hparams(hp, logdir)
            logging.info("Initializing from scratch")
            sess.run(tf.global_variables_initializer())
        else:  # continue OR finetune
            saver.restore(sess, ckpt)
            if "finetune" == hp.run_type:  # finetune
                save_hparams(hp, logdir)
        save_variable_specs(os.path.join(logdir, "var_specs"))
        save_operation_specs(os.path.join(logdir, "op_specs"))
        f_debug = open(os.path.join(logdir, "debug.txt"), "a")
        summary_writer = tf.summary.FileWriter(logdir, sess.graph)
        if hp.zero_step:
            # Restart the step counter (e.g. for finetuning).
            sess.run(global_step.assign(0))
        sess.run(train_init_op)
        total_steps = num_epochs * num_train_batches
        logging.info("total_steps:%s, num_epochs:%s, num_train_batches:%s",
                     total_steps, num_epochs, num_train_batches)
        _gs = sess.run(global_step)
        logging.info("global_step is stated at %s", _gs)
        t_epoch = time.time()
        model_output = 'default'
        for i in tqdm(range(_gs, total_steps + 1)):
            ts = time.time()
            # f_debug.write("loss\n")
            # tensor_tmp = tf.get_default_graph().get_tensor_by_name("loss:0")
            # np.savetxt(f_debug, tensor_tmp.eval().reshape([1]), delimiter=', ', footer="=" * 64)
            _, _gs, _summary = sess.run(
                [train_op, global_step, train_summaries])
            epoch = math.ceil(_gs / num_train_batches)
            f_debug.write("train: epoch %s takes %s\n" %
                          (epoch, time.time() - ts))
            summary_writer.add_summary(_summary, _gs)
            if _gs and _gs % num_train_batches == 0:
                # End of an epoch: log train loss, run eval, maybe checkpoint.
                logging.info("epoch {} is done".format(epoch))
                # train loss
                _loss = sess.run(loss)
                # eval
                logging.info("# eval evaluation")
                _, _eval_summaries = sess.run([eval_init_op, eval_summaries])
                summary_writer.add_summary(_eval_summaries, _gs)
                if save_model:
                    # save checkpoint
                    logging.info("# save models")
                    model_output = "model%02dL%.2f" % (epoch, _loss)
                    ckpt_name = os.path.join(logdir, model_output)
                    saver.save(sess, ckpt_name, global_step=_gs)
                    logging.info(
                        "after training of {} epochs, {} has been saved.".
                        format(epoch, ckpt_name))
                # proceed to next epoch
                logging.info("# fall back to train mode")
                ts = time.time()
                sess.run(train_init_op)
                logging.info("fallback_train: %s\t%s\t%s takes %s" %
                             (i, _gs, epoch, time.time() - ts))
                logging.info("epoch %s takes %s", epoch, time.time() - t_epoch)
                t_epoch = time.time()
        summary_writer.close()
        logging.info("Session runs for %s", time.time() - time_sess)
        if save_model:
            # Freeze variables into constants and export the inference graph.
            inference_node_name = inference_name[:inference_name.find(":")]
            graph_def = tf.graph_util.convert_variables_to_constants(
                sess, sess.graph_def, output_node_names=[inference_node_name])
            tf.train.write_graph(graph_def, logdir, '%s.pb' % model_output,
                                 as_text=False)
        f_debug.close()
    logging.info("Done")
# Training script entry: parse/persist hyperparameters and build the
# train/eval batch iterators.
# NOTE(review): this chunk is truncated mid-call at the end.
from tqdm import tqdm
from data_load import get_batch
from utils import save_hparams, save_variable_specs, get_hypotheses, calc_bleu
import os
from hparams import Hparams
import math
import logging

# NOTE(review): a bare tf.device() call outside a `with` block has no lasting
# placement effect — confirm the intent (probably meant `with tf.device(...)`).
tf.device('/gpu:3')
logging.basicConfig(level=logging.INFO)

logging.info("# hparams")
hparams = Hparams()
parser = hparams.parser
hp = parser.parse_args()
save_hparams(hp, hp.logdir)  # save the hyperparameter settings

logging.info("# Prepare train/eval batches")
# Training batches honour the configured max lengths and are shuffled.
train_batches, num_train_batches, num_train_samples = get_batch(hp.train1, hp.train2,
                                                                hp.maxlen1, hp.maxlen2,
                                                                hp.vocab, hp.batch_size,
                                                                shuffle=True)
# Evaluation uses effectively unbounded max lengths (100000).
eval_batches, num_eval_batches, num_eval_samples = get_batch(hp.eval1, hp.eval2,
                                                             100000, 100000,
                                                             hp.vocab, hp.batch_size,
def main():
    """Fully-supervised training of FCN/DeepLabV3 on VOC2012 + SBD.

    Parses hyperparameters from the CLI, builds model/datasets/loaders,
    runs ``ev.train_fully_supervised``, then (optionally) evaluates
    robustness to input rotation angles and saves the results.
    """
    #torch.manual_seed(42)
    # ------------
    # args
    # ------------
    parser = ArgumentParser()
    parser.add_argument('--auto_lr', type=U.str2bool, default=False,
                        help="Auto lr finder")
    parser.add_argument('--learning_rate', type=float, default=10e-4)  # == 1e-3
    parser.add_argument('--scheduler', type=U.str2bool, default=False)
    parser.add_argument('--wd', type=float, default=2e-4)
    parser.add_argument('--moment', type=float, default=0.9)
    parser.add_argument('--batch_size', default=5, type=int)
    parser.add_argument('--n_epochs', default=10, type=int)
    parser.add_argument('--model', default='FCN', type=str,
                        help="FCN or DLV3 model")
    parser.add_argument('--pretrained', default=False, type=U.str2bool,
                        help="Use pretrained pytorch model")
    parser.add_argument('--eval_angle', default=True, type=U.str2bool,
                        help="If true, it'll eval the model with different angle input size")
    parser.add_argument('--rotate', default=False, type=U.str2bool,
                        help="Use random rotation as data augmentation")
    parser.add_argument('--scale', default=True, type=U.str2bool,
                        help="Use scale as data augmentation")
    parser.add_argument('--size_img', default=520, type=int,
                        help="Size of input images")
    parser.add_argument('--size_crop', default=480, type=int,
                        help="Size of crop image during training")
    parser.add_argument('--nw', default=0, type=int,
                        help="Num workers for the data loader")
    parser.add_argument('--pm', default=True, type=U.str2bool,
                        help="Pin memory for the dataloader")
    parser.add_argument('--gpu', default=0, type=int,
                        help="Wich gpu to select for training")
    parser.add_argument('--benchmark', default=False, type=U.str2bool,
                        help="enable or disable backends.cudnn")
    parser.add_argument('--split', default=False, type=U.str2bool,
                        help="Split the dataset")
    parser.add_argument('--split_ratio', default=0.3, type=float,
                        help="Amount of data we used for training")
    parser.add_argument('--dataroot_voc',
                        default='/share/DEEPLEARNING/datasets/voc2012/', type=str)
    parser.add_argument('--dataroot_sbd',
                        default='/share/DEEPLEARNING/datasets/sbd/', type=str)
    parser.add_argument('--model_name', type=str,
                        help="what name to use for saving")
    parser.add_argument('--save_dir', default='/data/save_model', type=str)
    parser.add_argument('--save_all_ep', default=False, type=U.str2bool,
                        help="If true it'll save the model every epoch in save_dir")
    parser.add_argument('--save_best', default=False, type=U.str2bool,
                        help="If true will only save the best epoch model")
    args = parser.parse_args()

    # ------------
    # save
    # ------------
    save_dir = U.create_save_directory(args.save_dir)
    print('model will be saved in', save_dir)
    U.save_hparams(args, save_dir)

    # ------------
    # device
    # ------------
    device = torch.device(
        "cuda:" + str(args.gpu) if torch.cuda.is_available() else "cpu")
    print("device used:", device)

    # ------------
    # model
    # ------------
    if args.model.upper() == 'FCN':
        model = models.segmentation.fcn_resnet101(pretrained=args.pretrained)
    elif args.model.upper() == 'DLV3':
        model = models.segmentation.deeplabv3_resnet101(
            pretrained=args.pretrained)
    else:
        raise Exception('model must be "FCN" or "DLV3"')
    model.to(device)

    # ------------
    # data
    # ------------
    if args.size_img < args.size_crop:
        raise Exception(
            'Cannot have size of input images less than size of crop')
    size_img = (args.size_img, args.size_img)
    size_crop = (args.size_crop, args.size_crop)
    train_dataset_VOC = mdset.VOCSegmentation(
        args.dataroot_voc, year='2012', image_set='train', download=True,
        rotate=args.rotate, scale=args.scale,
        size_img=size_img, size_crop=size_crop)
    val_dataset_VOC = mdset.VOCSegmentation(
        args.dataroot_voc, year='2012', image_set='val', download=True)
    train_dataset_SBD = mdset.SBDataset(
        args.dataroot_sbd, image_set='train_noval', mode='segmentation',
        rotate=args.rotate, scale=args.scale,
        size_img=size_img, size_crop=size_crop)
    # Concatenate VOC and SBD into one training set.
    train_dataset = tud.ConcatDataset([train_dataset_VOC, train_dataset_SBD])
    # Optionally keep only a fraction of the training data
    # (idiom fix: was `split = args.split; if split == True:`).
    if args.split:
        train_dataset = U.split_dataset(train_dataset, args.split_ratio)
    print("There is", len(train_dataset), "images for training and",
          len(val_dataset_VOC), "for validation")
    dataloader_train = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, num_workers=args.nw,
        pin_memory=args.pm, shuffle=True, drop_last=True)
    dataloader_val = torch.utils.data.DataLoader(
        val_dataset_VOC, num_workers=args.nw, pin_memory=args.pm,
        batch_size=args.batch_size)

    # ------------
    # training
    # ------------
    # Class index 21 is the border class; it is excluded from the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=21)
    optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate,
                                momentum=args.moment, weight_decay=args.wd)
    ev.train_fully_supervised(
        model=model, n_epochs=args.n_epochs, train_loader=dataloader_train,
        val_loader=dataloader_val, criterion=criterion, optimizer=optimizer,
        save_folder=save_dir, scheduler=args.scheduler,
        model_name=args.model_name, benchmark=args.benchmark,
        save_best=args.save_best, save_all_ep=args.save_all_ep,
        device=device, num_classes=21)

    # Final evaluation: mIoU across input rotation angles, on train and val.
    if args.eval_angle:
        d_iou = ev.eval_model_all_angle(model, args.size_img,
                                        args.dataroot_voc,
                                        train=True, device=device)
        U.save_eval_angle(d_iou, save_dir)
        d_iou = ev.eval_model_all_angle(model, args.size_img,
                                        args.dataroot_voc,
                                        train=False, device=device)
        U.save_eval_angle(d_iou, save_dir)
def __init__(self):
    """Parse command-line hyperparameters, persist them to the log
    directory, and keep them on the instance as ``self.hp``."""
    hparams = Hparams()
    parser = hparams.parser
    hp = parser.parse_args()
    save_hparams(hp, hp.logdir)  # snapshot the hparams next to the logs
    self.hp = hp
def run_training(args):
    """Train the ImageTransformNet for fast neural style transfer.

    Builds the transform network and the (frozen) loss network, restores the
    latest checkpoint if one exists, then minimizes a weighted
    content + style (Gram-matrix) perceptual loss over the dataset,
    checkpointing and logging scalars/images to TensorBoard every
    ``args.ckpt_interval`` steps.

    Fixes vs. the original: ``save_hparams(args.name)`` was called twice;
    it is now called once. Dead commented-out code removed.
    """
    it_network = ImageTransformNet(
        input_shape=hparams['input_size'],
        residual_layers=hparams['residual_layers'],
        residual_filters=hparams['residual_filters'],
        initializer=hparams['initializer'])
    loss_network = LossNetwork(hparams['style_layers'])
    optimizer = tf.keras.optimizers.Adam(
        learning_rate=hparams['learning_rate'])
    # Wrap the optimizer for mixed-precision loss scaling.
    optimizer = mixed_precision.LossScaleOptimizer(optimizer)

    ckpt_dir = os.path.join(args.name, 'pretrained')
    ckpt = tf.train.Checkpoint(network=it_network, optimizer=optimizer,
                               step=tf.Variable(0))
    ckpt_manager = tf.train.CheckpointManager(
        ckpt, directory=ckpt_dir, max_to_keep=args.max_ckpt_to_keep)
    ckpt.restore(ckpt_manager.latest_checkpoint)

    log_dir = os.path.join(args.name, 'log_dir')
    writer = tf.summary.create_file_writer(logdir=log_dir)
    print('\n####################################################')
    print('Perceptual Losses for Real-Time Style Transfer Train')
    print('####################################################\n')
    if ckpt_manager.latest_checkpoint:
        print('Restored {} from: {}'.format(args.name,
                                            ckpt_manager.latest_checkpoint))
    else:
        print('Initializing {} from scratch'.format(args.name))
    # Persist the hyperparameters once (bug fix: was called a second time
    # further below).
    save_hparams(args.name)
    print('Style image: {}'.format(args.style_img))
    print('Start TensorBoard with: $ tensorboard --logdir ./\n')

    total_loss_avg = tf.keras.metrics.Mean()
    style_loss_avg = tf.keras.metrics.Mean()
    content_loss_avg = tf.keras.metrics.Mean()

    # Precompute the style targets: Gram matrices of the style image features.
    style_img = convert(args.style_img)
    target_feature_maps = loss_network(style_img[tf.newaxis, :])
    target_gram_matrices = [gram_matrix(x) for x in target_feature_maps]
    num_style_layers = len(target_feature_maps)

    dataset = create_ds(args)
    test_content_batch = create_test_batch(args)

    @tf.function
    def test_step(batch):
        # Forward pass only; deprocess back to displayable images.
        prediction = it_network(batch, training=False)
        return deprocess(prediction)

    @tf.function
    def train_step(batch):
        with tf.GradientTape() as tape:
            output_batch = it_network(batch, training=True)
            output_batch = 255 * (output_batch + 1.0) / 2.0  # float deprocess
            # Feed target and output batch through loss_network
            target_batch_feature_maps = loss_network(batch)
            output_batch_feature_maps = loss_network(output_batch)
            c_loss = content_loss(
                target_batch_feature_maps[hparams['content_layer_index']],
                output_batch_feature_maps[hparams['content_layer_index']])
            c_loss *= hparams['content_weight']
            # Get output gram_matrix
            output_gram_matrices = [
                gram_matrix(x) for x in output_batch_feature_maps
            ]
            s_loss = style_loss(target_gram_matrices, output_gram_matrices)
            s_loss *= hparams['style_weight'] / num_style_layers
            total_loss = c_loss + s_loss
            # Scale the loss so mixed-precision gradients do not underflow.
            scaled_loss = optimizer.get_scaled_loss(total_loss)
        scaled_gradients = tape.gradient(scaled_loss,
                                         it_network.trainable_variables)
        gradients = optimizer.get_unscaled_gradients(scaled_gradients)
        optimizer.apply_gradients(
            zip(gradients, it_network.trainable_variables))
        total_loss_avg(total_loss)
        content_loss_avg(c_loss)
        style_loss_avg(s_loss)

    total_start = time.time()
    for batch_image in dataset:
        start = time.time()
        train_step(batch_image)
        ckpt.step.assign_add(1)
        step_int = int(ckpt.step)  # cast ckpt.step
        if step_int % args.ckpt_interval == 0:
            print('Time taken for step {} is {} sec'.format(
                step_int, time.time() - start))
            ckpt_manager.save(step_int)
            prediction_norm = test_step(test_content_batch)
            with writer.as_default():
                tf.summary.scalar('total loss', total_loss_avg.result(),
                                  step=step_int)
                tf.summary.scalar('content loss', content_loss_avg.result(),
                                  step=step_int)
                tf.summary.scalar('style loss', style_loss_avg.result(),
                                  step=step_int)
                images = np.reshape(prediction_norm,
                                    (-1, hparams['input_size'][0],
                                     hparams['input_size'][1], 3))
                tf.summary.image('generated image', images, step=step_int,
                                 max_outputs=len(test_content_batch))
            print('Total loss: {:.4f}'.format(total_loss_avg.result()))
            print('Content loss: {:.4f}'.format(content_loss_avg.result()))
            print('Style loss: {:.4f}'.format(style_loss_avg.result()))
            print('Total time: {} sec\n'.format(time.time() - total_start))
            total_loss_avg.reset_states()
            content_loss_avg.reset_states()
            style_loss_avg.reset_states()
x1, x2, score = iterr.get_next() # 照抄即可,目前不是很熟悉这些接口 train_init_op = iterr.make_initializer(train_batches) model = DSSM(context) loss, train_op, global_step, train_summaries = model.train(x1, x2, score) logging.info("# Session") saver = tf.train.Saver(max_to_keep=num_epochs) config = tf.ConfigProto(allow_soft_placement=True) with tf.Session(config=config) as sess: time_sess = time.time() ckpt = tf.train.latest_checkpoint(logdir) if ckpt is None or "new" == run_type: # 新建 save_hparams(hp, logdir) logging.info("Initializing from scratch") sess.run(tf.global_variables_initializer()) else: # continue OR finetune saver.restore(sess, ckpt) if "finetune" == hp.run_type: # finetune save_hparams(hp, logdir) else: # continue batch_size = hp.batch_size save_variable_specs(os.path.join(logdir, "var_specs")) save_operation_specs(os.path.join(logdir, "op_specs")) f_debug = open(os.path.join(logdir, "debug.txt"), "a") summary_writer = tf.summary.FileWriter(logdir, sess.graph) if hp.zero_step: sess.run(global_step.assign(0))
# Training script entry: pin the visible GPU, parse/persist hyperparameters,
# and build the train/eval batch iterators.
# NOTE(review): this chunk is truncated mid-call at the end.
from tqdm import tqdm
from data_load import get_batch
from utils import save_hparams, save_variable_specs, get_hypotheses, calc_bleu
import os
from hparams import Hparams
import math
import logging

logging.basicConfig(level=logging.INFO)
os.environ['CUDA_VISIBLE_DEVICES'] = "5"  # restrict this process to GPU 5

logging.info("# hparams")
hparams = Hparams()
parser = hparams.parser
hp = parser.parse_args()
save_hparams(hp, hp.logdir)  # snapshot hyperparameters into the log dir

logging.info("# Prepare train/eval batches")
# Training batches honour the configured max lengths and are shuffled.
train_batches, num_train_batches, num_train_samples = get_batch(hp.train1, hp.train2,
                                                                hp.maxlen1, hp.maxlen2,
                                                                hp.vocab, hp.batch_size,
                                                                shuffle=True)
# Evaluation uses effectively unbounded max lengths (100000).
eval_batches, num_eval_batches, num_eval_samples = get_batch(hp.eval1, hp.eval2,
                                                             100000, 100000,
                                                             hp.vocab, hp.batch_size,
def main():
    """Train FCN/DeepLabV3 with a scale-equivariance auxiliary loss on
    VOC2012 + SBD.

    Bug fix: when ``--split`` is false, ``train_dataset_sup`` was never
    assigned and the script crashed with a NameError; it now falls back to
    the full (unsplit) dataset, matching the sibling rotation-equivariance
    script.
    """
    #torch.manual_seed(42)
    # ------------
    # args
    # ------------
    parser = ArgumentParser()
    parser.add_argument('--auto_lr', type=U.str2bool, default=False,
                        help="Auto lr finder")
    parser.add_argument('--learning_rate', type=float, default=10e-4)  # == 1e-3
    parser.add_argument('--Loss', type=str, default='KL')
    parser.add_argument('--gamma', type=float, default=0.5,
                        help="gamma balance the two losses")
    parser.add_argument('--scheduler', type=U.str2bool, default=False)
    parser.add_argument('--wd', type=float, default=2e-4)
    parser.add_argument('--moment', type=float, default=0.9)
    parser.add_argument('--batch_size', default=5, type=int)
    parser.add_argument('--n_epochs', default=10, type=int)
    parser.add_argument('--model', default='FCN', type=str,
                        help="FCN or DLV3 model")
    parser.add_argument('--pretrained', default=False, type=U.str2bool,
                        help="Use pretrained pytorch model")
    parser.add_argument('--eval_every', default=30, type=int,
                        help="Eval all input rotation angle every n step")
    parser.add_argument('--rotate', default=False, type=U.str2bool,
                        help="Use random rotation as data augmentation")
    parser.add_argument('--scale', default=True, type=U.str2bool,
                        help="Use scale as data augmentation")
    parser.add_argument('--scale_factor', default=30, type=float, nargs='+',
                        help="Scale image between min*size - max*size")
    parser.add_argument('--size_img', default=520, type=int,
                        help="Size of input images")
    parser.add_argument('--size_crop', default=480, type=int,
                        help="Size of crop image during training")
    parser.add_argument('--nw', default=0, type=int,
                        help="Num workers for the data loader")
    parser.add_argument('--pm', default=True, type=U.str2bool,
                        help="Pin memory for the dataloader")
    parser.add_argument('--gpu', default=0, type=int,
                        help="Wich gpu to select for training")
    parser.add_argument('--benchmark', default=False, type=U.str2bool,
                        help="enable or disable backends.cudnn")
    parser.add_argument('--split', default=True, type=U.str2bool,
                        help="Split the dataset")
    parser.add_argument('--split_ratio', default=0.3, type=float,
                        help="Amount of data we used for training")
    parser.add_argument('--multi_task', default=False, type=U.str2bool,
                        help="Multi task training (same data for equiv and sup)")
    parser.add_argument('--dataroot_voc', default='/data/voc2012', type=str)
    parser.add_argument('--dataroot_sbd', default='/data/sbd', type=str)
    parser.add_argument('--model_name', type=str,
                        help="what name to use for saving")
    parser.add_argument('--save_dir', default='/data/save_model', type=str)
    parser.add_argument('--save_all_ep', default=False, type=U.str2bool,
                        help="If true it'll save the model every epoch in save_dir")
    parser.add_argument('--save_best', default=False, type=U.str2bool,
                        help="If true will only save the best epoch model")
    args = parser.parse_args()

    # ------------
    # save
    # ------------
    save_dir = U.create_save_directory(args.save_dir)
    print('model will be saved in', save_dir)
    U.save_hparams(args, save_dir)

    # ------------
    # device
    # ------------
    device = torch.device(
        "cuda:" + str(args.gpu) if torch.cuda.is_available() else "cpu")
    print("device used:", device)

    # ------------
    # model
    # ------------
    if args.model.upper() == 'FCN':
        model = models.segmentation.fcn_resnet101(pretrained=args.pretrained)
    elif args.model.upper() == 'DLV3':
        model = models.segmentation.deeplabv3_resnet101(
            pretrained=args.pretrained)
    else:
        raise Exception('model must be "FCN" or "DLV3"')
    model.to(device)

    # ------------
    # data
    # ------------
    if args.size_img < args.size_crop:
        raise Exception(
            'Cannot have size of input images less than size of crop')
    size_img = (args.size_img, args.size_img)
    size_crop = (args.size_crop, args.size_crop)
    train_dataset_VOC = mdset.VOCSegmentation(
        args.dataroot_voc, year='2012', image_set='train', download=True,
        rotate=args.rotate, scale=args.scale,
        size_img=size_img, size_crop=size_crop)
    val_dataset_VOC = mdset.VOCSegmentation(
        args.dataroot_voc, year='2012', image_set='val', download=True)
    train_dataset_SBD = mdset.SBDataset(
        args.dataroot_sbd, image_set='train_noval', mode='segmentation',
        rotate=args.rotate, scale=args.scale,
        size_img=size_img, size_crop=size_crop)
    # Concatenate VOC and SBD as the pool for the equivariance branch.
    train_dataset_unsup = tud.ConcatDataset(
        [train_dataset_VOC, train_dataset_SBD])
    # Split dataset for the supervised branch.
    if args.split:
        train_dataset_sup = U.split_dataset(train_dataset_unsup,
                                            args.split_ratio)
    else:
        # BUGFIX: previously left unset when --split was false, causing a
        # NameError below; use the full dataset instead.
        train_dataset_sup = train_dataset_unsup
    # Multi task: supervised and equivariance losses share the same data.
    if args.multi_task:
        train_dataset_unsup = train_dataset_sup
    print("There is", len(train_dataset_sup), "images for supervised training",
          len(train_dataset_unsup), "for equivariance loss and",
          len(val_dataset_VOC), "for validation")
    dataloader_train_sup = torch.utils.data.DataLoader(
        train_dataset_sup, batch_size=args.batch_size, num_workers=args.nw,
        pin_memory=args.pm, shuffle=True, drop_last=True)
    dataloader_val = torch.utils.data.DataLoader(
        val_dataset_VOC, num_workers=args.nw, pin_memory=args.pm,
        batch_size=args.batch_size)

    # ------------
    # training
    # ------------
    # NOTE(review): the --scale_factor CLI value is ignored; the range is
    # hard-coded here exactly as in the original — confirm before honoring
    # the flag (its default of 30 is not a valid (min, max) pair anyway).
    scale_factor = (0.2, 0.8)
    # Class index 21 is the border class; it is excluded from the loss.
    criterion_supervised = nn.CrossEntropyLoss(ignore_index=21)
    optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate,
                                momentum=args.moment, weight_decay=args.wd)
    ev.train_scale_equiv(
        model, args.n_epochs, dataloader_train_sup, train_dataset_unsup,
        dataloader_val, criterion_supervised, optimizer,
        scheduler=args.scheduler, Loss=args.Loss, gamma=args.gamma,
        batch_size=args.batch_size, save_folder=save_dir,
        model_name=args.model_name, benchmark=args.benchmark,
        scale_factor=scale_factor, size_img=args.size_img,
        save_all_ep=args.save_all_ep, dataroot_voc=args.dataroot_voc,
        save_best=args.save_best, device=device)