parser.add_argument('--single_gpu', default=False, action='store_true', help='use single GPU') args = parser.parse_args() if args.nhidlast < 0: args.nhidlast = args.emsize if args.dropoutl < 0: args.dropoutl = args.dropouth if args.small_batch_size < 0: args.small_batch_size = args.batch_size if not args.continue_train: args.save = '{}-{}'.format(args.save, time.strftime("%Y%m%d-%H%M%S")) create_exp_dir(args.save, scripts_to_save=['main.py', 'model.py']) def logging(s, print_=True, log_=True): if print_: print(s) if log_: with open(os.path.join(args.save, 'log.txt'), 'a+') as f_log: f_log.write(s + '\n') # Set the random seed manually for reproducibility. np.random.seed(args.seed) torch.manual_seed(args.seed) if torch.cuda.is_available(): if not args.cuda:
help='rank of process') parser.add_argument('--world_size', type=int, default=1, help='number of gpus') parser.add_argument('--seed', type=int, default=1, help='seed used for initialization') parser.add_argument('--master_address', type=str, default='127.0.0.1', help='address for master') args = parser.parse_args() utils.create_exp_dir(args.save) size = args.world_size if size > 1: args.distributed = True processes = [] for rank in range(size): args.local_rank = rank p = Process(target=init_processes, args=(rank, size, main, args)) p.start() processes.append(p) for p in processes: p.join() else:
def __init__(self, args, sub_dir_path=None):
    """Set up experiment directories, logging, random seeding, and the
    supernet train/eval procedures for this search policy.

    Args:
        args: parsed experiment namespace (reads continue_train, seed,
            main_path, supernet_train_method, visualize, tensorboard,
            tboard_dir, debug, gpus, ...).
        sub_dir_path: optional experiment sub-directory name; defaults to
            "<supernet_train_method>_SEED_<seed>".
    """
    super(CNNSearchPolicy, self).__init__()
    self.args = args
    # Initialize path and logger: create the experiment dir and dump args.
    # NOTE(review): when args.continue_train is True, self.exp_dir and
    # self.sub_directory_path are never assigned here but are read below
    # (visualize / tensorboard / logger paths) -- confirm they are restored
    # elsewhere before this runs in continue mode.
    if not self.args.continue_train:
        self.sub_directory_path = sub_dir_path or '{}_SEED_{}'.format(
            self.args.supernet_train_method, self.args.seed)
        self.exp_dir = os.path.join(self.args.main_path,
                                    self.sub_directory_path)
        utils.create_exp_dir(self.exp_dir)
        utils.save_json(args, self.exp_dir + '/args.json')
    if self.args.visualize:
        self.viz_dir_path = utils.create_viz_dir(self.exp_dir)
    if self.args.tensorboard:
        self.tb_dir = self.exp_dir
        tboard_dir = os.path.join(self.args.tboard_dir,
                                  self.sub_directory_path)
        self.writer = SummaryWriter(tboard_dir)
    if self.args.debug:
        # Anomaly detection is slow; only enabled in debug runs.
        torch.autograd.set_detect_anomaly(True)
    # Set logger and directory.
    self.logger = utils.get_logger(
        "train_search",
        file_handler=utils.get_file_handler(
            os.path.join(self.exp_dir, 'log.txt')),
        level=logging.INFO if not args.debug else logging.DEBUG)
    # Random seed should be set once the Policy is created.
    logging.info(f"setting random seed as {args.seed}")
    utils.torch_random_seed(args.seed)
    logging.info('gpu number = %d' % args.gpus)
    logging.info("args = %s", args)
    # Metrics / state to track during search.
    # self.ranking_per_epoch = OrderedDict()
    self.search_space = None  # store the search space.
    self.model = None  # store the model
    self.model_fn = None
    self.running_stats = OrderedDict()  # store all running status.
    # to log the training results.
    self.logging_fn = self.logging_at_epoch
    # Bind the train/eval procedures matching the chosen supernet method.
    if args.supernet_train_method in ['darts', 'spos']:
        """
        Fundamental baseline training methods
            sample 1 architecture per batch
            train supernet
            Conv op has maximum possible filter channels (== output size of cell)
            Random a chunk of it.
        """
        train_fn = procedure_ops.darts_train_model
        self.train_fn = partial(train_fn, args=self.args, architect=None,
                                sampler=self.random_sampler)
        self.eval_fn = partial(procedure_ops.darts_model_validation,
                               args=self.args)
    elif args.supernet_train_method == 'fairnas':
        """
        Extend darts training method with FairNas strategy.
        It is not possible to use directly the FairNAS, but we can extend
        it into 2 method.
        """
        train_fn = procedure_ops.fairnas_train_model_v1
        self.train_fn = partial(train_fn, args=self.args, architect=None,
                                topology_sampler=self.random_sampler,
                                op_sampler=self.op_sampler)
        self.eval_fn = partial(procedure_ops.darts_model_validation,
                               args=self.args)
    else:
        # Unknown method: leave train_fn/eval_fn unset.
        pass
def run(net, init_ch=32, layers=20, auxiliary=True, lr=0.025, momentum=0.9,
        wd=3e-4, cutout=True, cutout_length=16, data='../data', batch_size=96,
        epochs=600, drop_path_prob=0.2, auxiliary_weight=0.4):
    """Train the sampled genotype `net` on CIFAR-10 and return best valid accuracy.

    A per-genotype checkpoint directory is derived from the MD5 of the
    JSON-serialized genotype. Training is capped at epoch 100 by the
    explicit `break` below regardless of `epochs`.

    Args:
        net: genotype description (must be JSON-serializable).
        init_ch / layers / auxiliary: Network construction parameters.
        lr / momentum / wd: SGD hyper-parameters.
        cutout / cutout_length: CIFAR-10 augmentation settings.
        data: dataset root directory.
        batch_size / epochs: training schedule.
        drop_path_prob: max drop-path probability (linearly ramped).
        auxiliary_weight: weight of the auxiliary-head loss.

    Returns:
        float: best validation accuracy observed (model checkpointed only
        from epoch 50 onward).
    """
    save = '/checkpoint/linnanwang/nasnet/' + hashlib.md5(
        json.dumps(net).encode()).hexdigest()
    utils.create_exp_dir(save, scripts_to_save=glob.glob('*.py'))
    log_format = '%(asctime)s %(message)s'
    logging.basicConfig(stream=sys.stdout, level=logging.INFO,
                        format=log_format, datefmt='%m/%d %I:%M:%S %p')
    fh = logging.FileHandler(os.path.join(save, 'log.txt'))
    fh.setFormatter(logging.Formatter(log_format))
    logging.getLogger().addHandler(fh)
    # Fixed seeds: every genotype is evaluated under identical conditions.
    np.random.seed(0)
    torch.cuda.set_device(0)
    cudnn.benchmark = True
    cudnn.enabled = True
    torch.manual_seed(0)
    logging.info('gpu device = %d' % 0)

    genotype = net
    model = Network(init_ch, 10, layers, auxiliary, genotype).cuda()
    logging.info("param size = %fMB", utils.count_parameters_in_MB(model))

    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(), lr, momentum=momentum,
                                weight_decay=wd)
    model, optimizer = apex.amp.initialize(model, optimizer, opt_level="O3")

    train_transform, valid_transform = utils._data_transforms_cifar10(
        cutout, cutout_length)
    train_data = dset.CIFAR10(root=data, train=True, download=True,
                              transform=train_transform)
    valid_data = dset.CIFAR10(root=data, train=False, download=True,
                              transform=valid_transform)
    train_queue = torch.utils.data.DataLoader(train_data,
                                              batch_size=batch_size,
                                              shuffle=True, pin_memory=True,
                                              num_workers=2)
    valid_queue = torch.utils.data.DataLoader(valid_data,
                                              batch_size=batch_size,
                                              shuffle=False, pin_memory=True,
                                              num_workers=2)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, float(epochs))

    best_acc = 0.0
    for epoch in range(epochs):
        # FIX: the original called scheduler.step() at the TOP of the loop,
        # which skips the initial learning rate and triggers PyTorch's
        # "lr_scheduler.step() before optimizer.step()" warning. We log the
        # lr actually used this epoch and step the scheduler after training.
        logging.info('epoch %d lr %e', epoch, scheduler.get_last_lr()[0])
        # Linearly ramp drop-path probability over the schedule.
        model.drop_path_prob = drop_path_prob * epoch / epochs

        train_acc, train_obj = train(train_queue, model, criterion, optimizer,
                                     auxiliary=auxiliary,
                                     auxiliary_weight=auxiliary_weight)
        logging.info('train_acc: %f', train_acc)

        valid_acc, valid_obj = infer(valid_queue, model, criterion)
        logging.info('valid_acc: %f', valid_acc)

        # Checkpoint only after a 50-epoch warm-up, before best_acc updates.
        if valid_acc > best_acc and epoch >= 50:
            print('this model is the best')
            torch.save(model.state_dict(), os.path.join(save, 'model.pt'))
        if valid_acc > best_acc:
            best_acc = valid_acc
        print('current best acc is', best_acc)

        scheduler.step()

        # Hard training budget: stop after 100 epochs.
        if epoch == 100:
            break

    print('saved to: model.pt')
    return best_acc
parser.add_argument('--max_seq_len_delta', type=int, default=40, help='max sequence length') parser.add_argument('--single_gpu', default=False, action='store_true', help='use single GPU') args = parser.parse_args() if args.nhidlast < 0: args.nhidlast = args.emsize if args.dropoutl < 0: args.dropoutl = args.dropouth if args.small_batch_size < 0: args.small_batch_size = args.batch_size if not args.continue_train: args.save = '{}-{}'.format(args.save, time.strftime("%Y%m%d-%H%M%S")) create_exp_dir(args.save, scripts_to_save=['main.py', 'model.py']) def logging(s, print_=True, log_=True): if print_: print(s) if log_: with open(os.path.join(args.save, 'log.txt'), 'a+') as f_log: f_log.write(s + '\n') # Set the random seed manually for reproducibility. np.random.seed(args.seed) torch.manual_seed(args.seed) if torch.cuda.is_available(): if not args.cuda: print("WARNING: You have a CUDA device, so you should probably run with --cuda") else:
parser.add_argument('--net1_name', type=str, required=True, help='name of net1') # resnet20/resnet110 parser.add_argument('--net2_name', type=str, required=True, help='name of net2') # resnet20/resnet110 # hyperparameter lambda parser.add_argument('--lambda_kd', type=float, default=1.0) args, unparsed = parser.parse_known_args() args.save_root = os.path.join(args.save_root, args.note) create_exp_dir(args.save_root) log_format = '%(message)s' logging.basicConfig(stream=sys.stdout, level=logging.INFO, format=log_format) fh = logging.FileHandler(os.path.join(args.save_root, 'log.txt')) fh.setFormatter(logging.Formatter(log_format)) logging.getLogger().addHandler(fh) def main(): np.random.seed(args.seed) torch.manual_seed(args.seed) if args.cuda: torch.cuda.manual_seed(args.seed) cudnn.enabled = True cudnn.benchmark = True
def __init__(self, args):
    """Initialize the (optionally distributed) search trainer: device/rank
    setup, seeding, experiment dir + logging on rank 0, resource-lambda
    defaults, loss, model, optimizer and data loaders.

    Args:
        args: experiment namespace (distributed, port, gpu, seed,
            fix_seedcudnn, save, remark, resource_efficient, method,
            log_penalty, resource_lambda, ...).
    """
    self.args = args
    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)
    if self.args.distributed:
        # Init distributed environment; per-rank seed avoids identical
        # sampling across workers.
        self.rank, self.world_size, self.device = init_dist(
            port=self.args.port)
        self.seed = self.rank * self.args.seed
    else:
        torch.cuda.set_device(self.args.gpu)
        self.device = torch.device("cuda")
        self.rank = 0
        self.seed = self.args.seed
        self.world_size = 1
    if self.args.fix_seedcudnn:
        # Fully deterministic mode: fixed seeds + deterministic cuDNN
        # (disables benchmark autotuning, may be slower).
        random.seed(self.seed)
        torch.backends.cudnn.deterministic = True
        np.random.seed(self.seed)
        cudnn.benchmark = False
        torch.manual_seed(self.seed)
        cudnn.enabled = True
        torch.cuda.manual_seed(self.seed)
        torch.cuda.manual_seed_all(self.seed)
    else:
        # Seeded but non-deterministic: benchmark autotuning enabled.
        np.random.seed(self.seed)
        cudnn.benchmark = True
        torch.manual_seed(self.seed)
        cudnn.enabled = True
        torch.cuda.manual_seed(self.seed)
        torch.cuda.manual_seed_all(self.seed)
    # Experiment dir / file logging / tensorboard only on rank 0.
    # `generate_date` and `log_format` are module-level globals here.
    self.path = os.path.join(generate_date, self.args.save)
    if self.rank == 0:
        utils.create_exp_dir(generate_date, self.path,
                             scripts_to_save=glob.glob('*.py'))
        logging.basicConfig(stream=sys.stdout, level=logging.INFO,
                            format=log_format, datefmt='%m/%d %I:%M:%S %p')
        fh = logging.FileHandler(os.path.join(self.path, 'log.txt'))
        fh.setFormatter(logging.Formatter(log_format))
        logging.getLogger().addHandler(fh)
        logging.info("self.args = %s", self.args)
        self.logger = tensorboardX.SummaryWriter(
            './runs/' + generate_date + '/nas_{}'.format(self.args.remark))
    else:
        self.logger = None
    # set default resource_lambda for different methods
    # NOTE(review): `default_lambda` in the comparison below is not defined
    # in this method (presumably a module-level sentinel holding the CLI
    # default of --resource_lambda) -- verify, otherwise this is a
    # NameError.  Also `default_resource_lambda` stays unbound when
    # args.method is none of the three handled values.
    if self.args.resource_efficient:
        if self.args.method == 'policy_gradient':
            if self.args.log_penalty:
                default_resource_lambda = 1e-4
            else:
                default_resource_lambda = 1e-5
        if self.args.method == 'reparametrization':
            if self.args.log_penalty:
                default_resource_lambda = 1e-2
            else:
                default_resource_lambda = 1e-5
        if self.args.method == 'discrete':
            if self.args.log_penalty:
                default_resource_lambda = 1e-2
            else:
                default_resource_lambda = 1e-4
        if self.args.resource_lambda == default_lambda:
            self.args.resource_lambda = default_resource_lambda
    #initialize loss function
    self.criterion = nn.CrossEntropyLoss().to(self.device)
    #initialize model
    self.init_model()
    #calculate model param size
    if self.rank == 0:
        logging.info("param size = %fMB",
                     utils.count_parameters_in_MB(self.model))
    # Attach logging handles to the model (logger is None on ranks > 0).
    self.model._logger = self.logger
    self.model._logging = logging
    #initialize optimizer
    self.init_optimizer()
    #iniatilize dataset loader
    self.init_loaddata()
    self.update_theta = True
    self.update_alpha = True
def main(args):
    """Main training function: device/seed setup, data loading, model
    creation (fresh, from init checkpoint, or resumed), optional
    fp16/distributed wrapping, training with periodic validation, and
    optional test/valid prediction writing.

    Args:
        args: experiment namespace (device_id, distributed, seed, resume,
            model_dir, fp16, epochs, dataset, ...). Mutated in place with
            derived fields (num_class, model_path, train_steps, ...).
    """
    torch.cuda.set_device(args.device_id)
    if args.distributed:
        args.distributed_rank = args.device_id
        distributed_init(args)
    if args.seed is not None:
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)
    options.setup_device(args)

    ############################################################################
    # Experiment & Logging
    ############################################################################
    # NOTE: `logging` is a callable returned by the project utils here,
    # shadowing the stdlib module inside this function.
    if is_master(args):
        if args.resume:
            # rank-0 device creates experiment dir and log to the file
            logging = utils.get_logger(os.path.join(args.model_dir, "log.txt"),
                                       log_=not args.debug)
        else:
            # rank-0 device creates experiment dir and log to the file
            logging = utils.create_exp_dir(args.model_dir, debug=args.debug)
    else:
        # other devices only log to console (print) but not the file
        logging = utils.get_logger(log_path=None, log_=False)

    ############################################################################
    # Load data
    ############################################################################
    logging("Loading data..")
    loaded_data, label_dict = data.load_data(args)
    args.num_class = len(label_dict)
    logging("Loading finish")
    tr_data, va_data, te_data = loaded_data
    va_loader = data.BucketIterator(va_data, args.valid_bsz, args.pad_id,
                                    args.seg_id_pad, args.device,
                                    args.max_length)
    te_loader = data.BucketIterator(te_data, args.test_bsz, args.pad_id,
                                    args.seg_id_pad, args.device,
                                    args.max_length)
    options.setup_device(args)

    args.model_path = os.path.join(args.model_dir, "model.pt")
    args.var_path = os.path.join(args.model_dir, "var.pt")
    args.config_path = os.path.join(args.model_dir, "net_config.json")

    train_step = 0
    best_accuracy = -float("inf")

    # create model
    if args.resume:
        logging("Resuming from {}...".format(args.model_dir))
        net_config = modeling.ModelConfig.init_from_json(
            args.config_path, args)
        model = modeling.FunnelTFM(net_config, args)
        model_param, optimizer = torch.load(args.model_path,
                                            map_location="cpu")
        logging(model.load_state_dict(model_param, strict=False))
        model = model.to(args.device)
        # Move resumed optimizer state tensors onto the target device.
        for state in optimizer.state.values():
            for k, v in state.items():
                if isinstance(v, torch.Tensor):
                    state[k] = v.to(args.device)
        best_accuracy, train_step = torch.load(args.var_path)
        # FIX: the original never bound `amp_model` on the resume path, so
        # `para_model = amp_model` below raised NameError when resuming.
        # NOTE(review): resuming an fp16 run still skips amp.initialize --
        # confirm resume is only used for non-fp16 runs.
        amp_model = model
    else:
        # create new model
        if args.init_ckpt:
            logging("Init from ckpt {}".format(args.init_ckpt))
            net_config = modeling.ModelConfig.init_from_json(
                args.init_ckpt_config, args)
            model = modeling.FunnelTFM(net_config, args)
            print(
                model.load_state_dict(torch.load(args.init_ckpt),
                                      strict=False))
        else:
            logging("init model")
            net_config = modeling.ModelConfig.init_from_args(args)
            model = modeling.FunnelTFM(net_config, args)
        net_config.to_json(args.config_path)
        model = model.to(args.device)
        # create new optimizer (FusedAdam when apex is available)
        if args.fp16:
            from apex.optimizers import FusedAdam
            import apex.amp as amp
            optimizer = FusedAdam(model.parameters(), lr=args.lr,
                                  weight_decay=args.weight_decay)
            amp_model, optimizer = amp.initialize(model, optimizer,
                                                  opt_level=args.amp_opt)
        else:
            try:
                from apex.optimizers import FusedAdam
                optimizer = FusedAdam(model.parameters(), lr=args.lr,
                                      betas=(0.9, 0.99), eps=1e-6,
                                      weight_decay=args.weight_decay)
            except ImportError as e:
                logging("use pytorch optimizer")
                optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr,
                                              betas=(0.9, 0.99), eps=1e-6,
                                              weight_decay=args.weight_decay)
            amp_model = model

    if args.distributed:
        if args.ddp_backend == "apex":
            from apex.parallel import DistributedDataParallel as DDP
            para_model = DDP(amp_model)
        else:
            from torch.nn.parallel import DistributedDataParallel as DDP
            para_model = DDP(amp_model, device_ids=[args.device_id],
                             find_unused_parameters=True)
    else:
        para_model = amp_model

    ############################################################################
    # Log args
    ############################################################################
    logging("=" * 100)
    for k, v in args.__dict__.items():
        logging(" - {} : {}".format(k, v))
    logging("=" * 100)

    ############################################################################
    # Training
    ############################################################################
    if not args.test_only:
        tr_loader = data.BucketIterator(tr_data, args.train_bsz, args.pad_id,
                                        args.seg_id_pad, args.device,
                                        args.max_length)
        if args.distributed:
            num_data = len(tr_data) // args.distributed_world_size
        else:
            num_data = len(tr_data)
        num_tr_batch = (num_data + args.train_bsz - 1) // args.train_bsz
        args.train_steps = num_tr_batch * args.epochs
        args.warmup_steps = int(args.train_steps * args.warmup_prop)

        # Running counters as device tensors so they can be all-reduced.
        num_example = torch.Tensor([0]).to(args.device)
        num_correct = torch.Tensor([0]).to(args.device)
        if args.dataset in ["CoLA"]:
            # CoLA reports Matthews-style correlation from a confusion matrix.
            num_tp = torch.Tensor([0]).to(args.device)
            num_fp = torch.Tensor([0]).to(args.device)
            num_tn = torch.Tensor([0]).to(args.device)
            num_fn = torch.Tensor([0]).to(args.device)

        for epoch in range(args.epochs):
            #### One epoch
            for i, (sent, seg_id, label) in enumerate(
                    tr_loader.get_iter(epoch, distributed=args.distributed)):
                optimizer.zero_grad()
                _, ret_dict = para_model(sent, seg_id=seg_id,
                                         cls_target=label)
                cls_loss = ret_dict["cls_loss"]
                cls_corr = ret_dict["cls_corr"]
                if args.fp16:
                    with amp.scale_loss(cls_loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    cls_loss.backward()
                num_correct += cls_corr.detach()
                num_example += len(sent)
                if args.dataset in ["CoLA"]:
                    tp, fp, tn, fn = confusion_matrix(ret_dict["cls_pred"],
                                                      label)
                    num_tp = num_tp + tp
                    num_fp = num_fp + fp
                    num_tn = num_tn + tn
                    num_fn = num_fn + fn
                # Gradient clipping (amp master params under fp16); when
                # clipping is off, compute the total grad norm manually for
                # logging only.
                if args.clip > 0:
                    if args.fp16:
                        gnorm = torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), args.clip)
                    else:
                        gnorm = torch.nn.utils.clip_grad_norm_(
                            model.parameters(), args.clip)
                else:
                    gnorm = 0
                    for p in model.parameters():
                        if p.grad is not None:
                            param_gnorm = p.grad.data.norm(2)
                            gnorm += param_gnorm.item()**2
                    gnorm = gnorm**(1. / 2)

                train_step += 1
                adjust_lr(args, train_step, optimizer)
                optimizer.step()

                ##### training stat
                # NOTE(review): assumes num_tr_batch >= n_log_epoch and
                # train_steps >= 10, otherwise the modulo below divides by 0.
                if (i + 1) % (num_tr_batch // args.n_log_epoch) == 0:
                    if args.distributed:
                        torch.distributed.all_reduce(
                            num_correct, op=torch.distributed.ReduceOp.SUM)
                        torch.distributed.all_reduce(
                            num_example, op=torch.distributed.ReduceOp.SUM)
                        if args.dataset in ["CoLA"]:
                            torch.distributed.all_reduce(
                                num_tp, op=torch.distributed.ReduceOp.SUM)
                            torch.distributed.all_reduce(
                                num_fp, op=torch.distributed.ReduceOp.SUM)
                            torch.distributed.all_reduce(
                                num_tn, op=torch.distributed.ReduceOp.SUM)
                            torch.distributed.all_reduce(
                                num_fn, op=torch.distributed.ReduceOp.SUM)
                    if is_master(args):
                        if args.dataset in ["CoLA"]:
                            corref = _compute_metric_based_on_keys(
                                "corr", num_tp.item(), num_fp.item(),
                                num_tn.item(), num_fn.item())
                            logging(
                                "[{:>02d}/{:>08d}] Train | corref {:.4f} | gnorm {:.2f} "
                                "| lr {:.6f}".format(
                                    epoch, train_step, corref, gnorm,
                                    optimizer.param_groups[0]["lr"]))
                        else:
                            accuracy = num_correct.item() / num_example.item()
                            logging(
                                "[{:>02d}/{:>08d}] Train | accu {:.4f} | gnorm {:.2f} "
                                "| lr {:.6f}".format(
                                    epoch, train_step, accuracy, gnorm,
                                    optimizer.param_groups[0]["lr"]))
                    num_example.zero_()
                    num_correct.zero_()
                    if args.dataset in ["CoLA"]:
                        num_tp.zero_()
                        num_fp.zero_()
                        num_tn.zero_()
                        num_fn.zero_()

                ##### validation (10 times over the whole run)
                if train_step % (args.train_steps // 10) == 0:
                    accuracy = evaluate(args, model, va_loader)
                    if is_master(args):
                        if accuracy > best_accuracy:
                            torch.save([model.state_dict(), optimizer],
                                       args.model_path)
                            # NOTE(review): var.pt is written with the OLD
                            # best_accuracy (updated on the next line) --
                            # confirm this off-by-one is intended.
                            torch.save([best_accuracy, train_step],
                                       args.var_path)
                        best_accuracy = max(accuracy, best_accuracy)
                        logging(
                            "[{}] Valid | curr accu {:.4f} | best accu {:.4f}".
                            format(train_step // (args.train_steps // 10),
                                   accuracy, best_accuracy))

    ##### make prediction
    if is_master(args) and args.write_prediction:
        rev_label_dict = dict((v, k) for k, v in label_dict.items())
        # Reload the best checkpoint before writing predictions.
        model.load_state_dict(torch.load(args.model_path,
                                         map_location="cpu")[0],
                              strict=False)
        model = model.to(args.device)
        predict(args, model, te_loader,
                os.path.join(args.model_dir, "test_results.txt"),
                rev_label_dict)
        predict(args, model, va_loader,
                os.path.join(args.model_dir, "valid_results.txt"),
                rev_label_dict)
# Build the log directory path from dataset/sampling options, configure
# console + per-run file logging, then load the dataset.
if args.save:
    # Datasets with an explicit train_num encode it in the path.
    if args.dataset in ["CoraFull", "Computers", "Photo", "CS"]:
        nsave = "log/{}-{}/sample-{}/{}".format(args.dataset, args.train_num,
                                                args.sample, args.complete)
    else:
        if not args.keep_train_num:
            nsave = "log/{}/sample-{}/{}".format(args.dataset, args.sample,
                                                 args.complete)
        else:
            nsave = "log/{}-keep/sample-{}/{}".format(args.dataset,
                                                      args.sample,
                                                      args.complete)
else:
    # Throwaway location when saving is disabled.
    print("not saving file")
    nsave = "log/trash/{}".format(args.complete)
create_exp_dir(nsave)  #, scripts_to_save=glob.glob('*.py'))
log_format = '%(asctime)s %(message)s'
logging.basicConfig(stream=sys.stdout, level=logging.INFO, format=log_format,
                    datefmt='%m/%d %I:%M:%S %p', filemode="w")
# File name encodes every hyper-parameter so runs never collide.
nfile = "para{}-nhid{}-lr{}-lrg{}-hidg{}-wd{}-dr{}-layer{}-norm{}-seed{}-{}".format(
    args.compl_param, args.nhid, args.lr, args.lr_graph, args.hid_graph,
    args.wd, args.dropout, args.layertype, args.normalize, args.seed,
    args.dataseed)
fh = logging.FileHandler(os.path.join(nsave, nfile + ".txt"), "w")
fh.setFormatter(logging.Formatter(log_format))
logging.getLogger().addHandler(fh)
dataset = load_dataset(args.dataset)
def __init__(self, args):
    """Initialize the single-GPU trainer: device setup, seeding/cuDNN mode,
    experiment dir + logging, loss, model (with optional resume),
    optimizer and data loaders.

    Args:
        args: experiment namespace (gpu, seed, fix_cudnn, save, save_log,
            resume, ...).
    """
    self.args = args
    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)
    torch.cuda.set_device(self.args.gpu)
    self.device = torch.device("cuda")
    # Single-process run: rank 0 / world size 1 keep the code path shared
    # with the distributed variant.
    self.rank = 0
    self.seed = self.args.seed
    self.world_size = 1
    if self.args.fix_cudnn:
        # Fully deterministic mode (deterministic cuDNN, no autotuning).
        random.seed(self.seed)
        torch.backends.cudnn.deterministic = True
        np.random.seed(self.seed)
        cudnn.benchmark = False
        torch.manual_seed(self.seed)
        cudnn.enabled = True
        torch.cuda.manual_seed(self.seed)
        torch.cuda.manual_seed_all(self.seed)
    else:
        # Seeded but non-deterministic: benchmark autotuning enabled.
        np.random.seed(self.seed)
        cudnn.benchmark = True
        torch.manual_seed(self.seed)
        cudnn.enabled = True
        torch.cuda.manual_seed(self.seed)
        torch.cuda.manual_seed_all(self.seed)
    # Experiment dir / logging / tensorboard (rank is always 0 here).
    # `generate_date` and `log_format` are module-level globals.
    self.path = os.path.join(generate_date, self.args.save)
    if self.rank == 0:
        utils.create_exp_dir(generate_date, self.path,
                             scripts_to_save=glob.glob('*.py'))
        logging.basicConfig(stream=sys.stdout, level=logging.INFO,
                            format=log_format, datefmt='%m/%d %I:%M:%S %p')
        fh = logging.FileHandler(os.path.join(self.path, 'log.txt'))
        fh.setFormatter(logging.Formatter(log_format))
        logging.getLogger().addHandler(fh)
        logging.info("self.args = %s", self.args)
        self.logger = tensorboardX.SummaryWriter('./runs/' + generate_date +
                                                 '/' + self.args.save_log)
    else:
        self.logger = None
    #initialize loss function
    self.criterion = nn.CrossEntropyLoss().to(self.device)
    #initialize model
    self.init_model()
    if self.args.resume:
        self.reload_model()
    #calculate model param size
    if self.rank == 0:
        logging.info("param size = %fMB",
                     utils.count_parameters_in_MB(self.model))
    # Attach logging handles to the model.
    self.model._logger = self.logger
    self.model._logging = logging
    #initialize optimizer
    self.init_optimizer()
    #iniatilize dataset loader
    self.init_loaddata()
    self.update_theta = True
    self.update_alpha = True
u = "c_{k-1}" else: u = str(j - 2) v = str(i) g.edge(u, v, label=op, fillcolor="gray") g.node("c_{k}", fillcolor='palegoldenrod') for i in range(steps): g.edge(str(i), "c_{k}", fillcolor="gray") g.render(filename, view=True) import os from utils import create_exp_dir if __name__ == '__main__': if len(sys.argv) < 2: print("usage:\n python {} ARCH_NAME".format(sys.argv[0])) sys.exit(1) genotype_name = sys.argv[1] file_path = './vis/' + genotype_name create_exp_dir(file_path) try: genotype = eval('genotypes.{}'.format(genotype_name)) except AttributeError: print("{} is not specified in genotypes.py".format(genotype_name)) sys.exit(1) plot1(genotype.normal, os.path.join(file_path, "normal")) plot1(genotype.reduce, os.path.join(file_path, "reduction"))
default=0.9, help='learning rate for arch encoding') parser.add_argument('--arch_weight_decay', type=float, default=1e-3, help='weight decay for arch encoding') parser.add_argument('--resume', default='', type=str, metavar='PATH', help='path to latest checkpoint (default: none)') args = parser.parse_args() args.save = './logs/search/search-{}-{}'.format(args.save, time.strftime("%Y%m%d-%H%M%S")) utils.create_exp_dir(args.save, scripts_to_save=None) log_format = '%(asctime)s %(message)s' logging.basicConfig(stream=sys.stdout, level=logging.INFO, format=log_format, datefmt='%m/%d %I:%M:%S %p') fh = logging.FileHandler(os.path.join(args.save, 'log.txt')) fh.setFormatter(logging.Formatter(log_format)) logging.getLogger().addHandler(fh) CIFAR_CLASSES = 10 def main(): if not torch.cuda.is_available():
######################## assert args.training_split_num >= args.valid_per_epoch if args.small_batch_size < 0: args.small_batch_size = args.batch_size assert args.batch_size % args.small_batch_size == 0, 'batch_size must be divisible by small_batch_size' if args.coeff_opt == 'maxlc': current_coeff_opt = 'max' else: current_coeff_opt = args.coeff_opt if not args.continue_train: args.save = '{}-{}'.format(args.save, time.strftime("%Y%m%d-%H%M%S")) create_exp_dir(args.save, scripts_to_save=['./src/main_train_topics.py', './src/model.py', './src/nsd_loss.py']) def logging(s, print_=True, log_=True): if print_: print(s) sys.stdout.flush() if log_: with open(os.path.join(args.save, 'log.txt'), 'a+') as f_log: f_log.write(s + '\n') # Set the random seed manually for reproducibility. seed_all_randomness(args.seed,args.cuda) logging('Args: {}'.format(args))
def main():
    """Train a discovered genotype (DARTS-style eval phase) on CIFAR-10:
    set up the run directory and logging, seed RNGs, build the network
    from `args.arch`, and train with cosine-annealed SGD, saving weights
    every epoch.
    """
    # Timestamped eval directory with a snapshot of all source files.
    args.save = 'eval-{}-{}'.format(args.save, time.strftime("%Y%m%d-%H%M%S"))
    utils.create_exp_dir(args.save, scripts_to_save=glob.glob('*.py'))

    log_format = '%(asctime)s %(message)s'
    logging.basicConfig(stream=sys.stdout, level=logging.INFO,
                        format=log_format, datefmt='%m/%d %I:%M:%S %p')
    fh = logging.FileHandler(os.path.join(args.save, 'log.txt'))
    fh.setFormatter(logging.Formatter(log_format))
    logging.getLogger().addHandler(fh)

    CIFAR_CLASSES = 10
    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)

    np.random.seed(args.seed)
    torch.cuda.set_device(args.gpu)
    cudnn.benchmark = True
    torch.manual_seed(args.seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(args.seed)
    logging.info('gpu device = %d' % args.gpu)
    logging.info("args = %s", args)

    # Look up the named genotype from genotypes.py (trusted input only --
    # eval on a CLI string).
    genotype = eval("genotypes.%s" % args.arch)
    model = Network(args.init_channels, CIFAR_CLASSES, args.layers,
                    args.auxiliary, genotype)
    model = model.cuda()

    logging.info("param size = %fMB", utils.count_parameters_in_MB(model))

    criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()
    optimizer = torch.optim.SGD(
        model.parameters(),
        args.learning_rate,
        momentum=args.momentum,
        weight_decay=args.weight_decay
    )

    train_transform, valid_transform = utils._data_transforms_cifar10(args)
    train_data = dset.CIFAR10(root=args.data, train=True, download=True,
                              transform=train_transform)
    valid_data = dset.CIFAR10(root=args.data, train=False, download=True,
                              transform=valid_transform)

    train_queue = torch.utils.data.DataLoader(
        train_data, batch_size=args.batch_size, shuffle=True,
        pin_memory=True, num_workers=2)
    valid_queue = torch.utils.data.DataLoader(
        valid_data, batch_size=args.batch_size, shuffle=False,
        pin_memory=True, num_workers=2)

    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                           args.epochs)

    for epoch in range(args.epochs):
        lr = scheduler.get_last_lr()[0]
        logging.info('epoch %d lr %e', epoch, lr)
        # Linearly ramp drop-path probability over the schedule.
        model.drop_path_prob = args.drop_path_prob * epoch / args.epochs

        # training
        train_acc, train_obj = train(train_queue, model, criterion, optimizer)
        logging.info('train_acc %f', train_acc)

        # validation
        valid_acc, valid_obj = infer(valid_queue, model, criterion)
        logging.info('valid_acc %f', valid_acc)

        # Step the learning-rate scheduler (correctly placed after training).
        scheduler.step()

        utils.save(model, os.path.join(args.save, 'weights.pt'))
help='Dataset to use if you are using without warmstarting') args, unknowns = cmdline_parser.parse_known_args() log_lvl = logging.INFO if args.verbose == 'INFO' else logging.DEBUG logging.basicConfig(level=log_lvl, stream=sys.stdout) if unknowns: logging.warning('Found unknown arguments!') logging.warning(str(unknowns)) logging.warning('These will be ignored') exp_dir = 'experiment-{}-{}'.format( args.methods, datetime.now().strftime("%Y%m%d-%H%M%S%f")) utils.create_exp_dir(exp_dir) genotype = config = None if (args.methods == 'DARTS' or args.methods == 'BOTH'): logging.info('\n###### NAS w/ DARTS ######\n') start = time.time() darts.main(exp_dir) architecture_res = exp_dir + '/arch' with open(architecture_res, 'rb') as f: genotype = pickle.load(f) end = time.time() logging.info('\nTime elapsed for DARTS: %.0f sec\n', (end - start)) else: genotype = eval(str("genotypes." + args.genotype))
def main():
    """One-shot ImageNet architecture search driver: merge YAML config into
    CLI args, seed per-rank RNGs, build the single-path supernet with
    separate weight/arch optimizers, set up logging + data loaders, then
    run the train/validate loop with checkpointing on rank 0.
    """
    global args, best_prec1
    args = parser.parse_args()
    # Flatten the two-level YAML config into args attributes.
    # NOTE(review): yaml.load without an explicit Loader is deprecated and
    # unsafe on untrusted files -- consider yaml.safe_load.
    with open(args.config) as f:
        config = yaml.load(f)
    for key in config:
        for k, v in config[key].items():
            setattr(args, k, v)

    print('Enabled distributed training.')
    # Distributed init is disabled; rank/world_size are hard-coded.
    # rank, world_size = init_dist(backend='nccl', port=args.port)
    # args.rank = rank
    # args.world_size = world_size
    args.rank = 0
    args.world_size = 8

    # Per-rank seeds (rank 0 always yields seed 0 here).
    np.random.seed(args.seed*args.rank)
    torch.manual_seed(args.seed*args.rank)
    torch.cuda.manual_seed(args.seed*args.rank)
    torch.cuda.manual_seed_all(args.seed*args.rank)
    print('random seed: ', args.seed*args.rank)

    # create model
    # NOTE(review): `model` is only defined when args.SinglePath is set;
    # the code below assumes SinglePath mode.
    print("=> creating model '{}'".format(args.model))
    if args.SinglePath:
        architecture = 20*[0]
        channels_scales = 20*[1.0]
        model = ShuffleNetV2_OneShot(args=args, architecture=architecture,
                                     channels_scales=channels_scales)
        model.cuda()
        #broadcast_params(model)
        # Pre-allocate zero gradients so later in-place grad updates work.
        for v in model.parameters():
            if v.requires_grad:
                if v.grad is None:
                    v.grad = torch.zeros_like(v)
        model.log_alpha.grad = torch.zeros_like(model.log_alpha)

    # Label-smoothed cross entropy over the 1000 ImageNet classes.
    criterion = CrossEntropyLoss(smooth_eps=0.1,
                                 smooth_dist=(torch.ones(1000)*0.001).cuda()).cuda()

    # Split parameters: BatchNorm params get no weight decay; log_alpha is
    # excluded (it is trained by the separate arch optimizer).
    wo_wd_params = []
    wo_wd_param_names = []
    network_params = []
    network_param_names = []
    for name, mod in model.named_modules():
        if isinstance(mod, nn.BatchNorm2d):
            for key, value in mod.named_parameters():
                wo_wd_param_names.append(name+'.'+key)
    for key, value in model.named_parameters():
        if key != 'log_alpha':
            if value.requires_grad:
                if key in wo_wd_param_names:
                    wo_wd_params.append(value)
                else:
                    network_params.append(value)
                    network_param_names.append(key)
    params = [
        {'params': network_params,
         'lr': args.base_lr,
         'weight_decay': args.weight_decay},
        {'params': wo_wd_params,
         'lr': args.base_lr,
         'weight_decay': 0.},
    ]
    param_names = [network_param_names, wo_wd_param_names]
    if args.rank == 0:
        print('>>> params w/o weight decay: ', wo_wd_param_names)
    optimizer = torch.optim.SGD(params, momentum=args.momentum)
    if args.SinglePath:
        # Architecture parameters (log_alpha) get their own Adam optimizer.
        arch_optimizer = torch.optim.Adam(
            [param for name, param in model.named_parameters()
             if name == 'log_alpha'],
            lr=args.arch_learning_rate,
            betas=(0.5, 0.999),
            weight_decay=args.arch_weight_decay
        )

    # auto resume from a checkpoint
    # Build a descriptive run remark encoding the hyper-parameters.
    remark = 'imagenet_'
    remark += 'epo_' + str(args.epochs) + '_layer_' + str(args.layers) + '_batch_' + str(args.batch_size) + '_lr_' + str(args.base_lr) + '_seed_' + str(args.seed)
    if args.early_fix_arch:
        remark += '_early_fix_arch'
    if args.flops_loss:
        remark += '_flops_loss_' + str(args.flops_loss_coef)
    if args.remark != 'none':
        remark += '_'+args.remark
    args.save = 'search-{}-{}-{}'.format(args.save,
                                         time.strftime("%Y%m%d-%H%M%S"),
                                         remark)
    args.save_log = 'nas-{}-{}'.format(time.strftime("%Y%m%d-%H%M%S"), remark)
    generate_date = str(datetime.now().date())
    path = os.path.join(generate_date, args.save)
    if args.rank == 0:
        log_format = '%(asctime)s %(message)s'
        utils.create_exp_dir(generate_date, path,
                             scripts_to_save=glob.glob('*.py'))
        logging.basicConfig(stream=sys.stdout, level=logging.INFO,
                            format=log_format, datefmt='%m/%d %I:%M:%S %p')
        fh = logging.FileHandler(os.path.join(path, 'log.txt'))
        fh.setFormatter(logging.Formatter(log_format))
        logging.getLogger().addHandler(fh)
        logging.info("args = %s", args)
        writer = SummaryWriter('./runs/' + generate_date + '/' + args.save_log)
    else:
        writer = None

    model_dir = path
    start_epoch = 0
    if args.evaluate:
        load_state_ckpt(args.checkpoint_path, model)
    else:
        best_prec1, start_epoch = load_state(model_dir, model,
                                             optimizer=optimizer)
    cudnn.benchmark = True
    cudnn.enabled = True

    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    # Train-time augmentation with multi-scale crops.
    transform = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize])
    train_dataset = datasets.ImageNet(split='train', transform=transform)
    # Train set without multi-scale crops (used for the last epochs).
    transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize])
    train_dataset_wo_ms = datasets.ImageNet(split='train',
                                            transform=transform)
    transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        normalize])
    val_dataset = datasets.ImageNet(split='val', transform=transform)

    # Distributed samplers/loaders are disabled in this single-process
    # version; the plain DataLoaders below replace them.
    # train_sampler = DistributedSampler(train_dataset)
    # val_sampler = DistributedSampler(val_dataset)
    #
    # train_loader = DataLoader(
    #     train_dataset, batch_size=args.batch_size//args.world_size, shuffle=False,
    #     num_workers=args.workers, pin_memory=False, sampler=train_sampler)
    #
    # train_loader_wo_ms = DataLoader(
    #     train_dataset_wo_ms, batch_size=args.batch_size//args.world_size, shuffle=False,
    #     num_workers=args.workers, pin_memory=False, sampler=train_sampler)
    #
    # val_loader = DataLoader(
    #     val_dataset, batch_size=50, shuffle=False,
    #     num_workers=args.workers, pin_memory=False, sampler=val_sampler)

    train_loader = DataLoader(
        train_dataset, batch_size=args.batch_size//args.world_size,
        shuffle=False,
        num_workers=args.workers, pin_memory=False)

    train_loader_wo_ms = DataLoader(
        train_dataset_wo_ms, batch_size=args.batch_size//args.world_size,
        shuffle=False,
        num_workers=args.workers, pin_memory=False)

    val_loader = DataLoader(
        val_dataset, batch_size=50, shuffle=False,
        num_workers=args.workers, pin_memory=False)

    if args.evaluate:
        validate(val_loader, model, criterion, 0, writer, logging)
        return

    niters = len(train_loader)
    lr_scheduler = LRScheduler(optimizer, niters, args)

    for epoch in range(start_epoch, args.epochs):
        #train_sampler.set_epoch(epoch)
        if args.early_fix_arch:
            # Freeze log_alpha rows already decided, then fix any edge whose
            # top-2 softmax gap exceeds 0.3.
            if len(model.fix_arch_index.keys()) > 0:
                for key, value_lst in model.fix_arch_index.items():
                    model.log_alpha.data[key, :] = value_lst[1]
            sort_log_alpha = torch.topk(F.softmax(model.log_alpha.data,
                                                  dim=-1), 2)
            argmax_index = (sort_log_alpha[0][:,0] -
                            sort_log_alpha[0][:,1] >= 0.3)
            for id in range(argmax_index.size(0)):
                if argmax_index[id] == 1 and id not in model.fix_arch_index.keys():
                    model.fix_arch_index[id] = [
                        sort_log_alpha[1][id,0].item(),
                        model.log_alpha.detach().clone()[id, :]]
        if args.rank == 0 and args.SinglePath:
            logging.info('epoch %d', epoch)
            logging.info(model.log_alpha)
            logging.info(F.softmax(model.log_alpha, dim=-1))
            logging.info('flops %fM', model.cal_flops())

        # train for one epoch (last 5 epochs may drop multi-scale crops)
        if epoch >= args.epochs - 5 and args.lr_mode == 'step' and args.off_ms:
            train(train_loader_wo_ms, model, criterion, optimizer,
                  arch_optimizer, lr_scheduler, epoch, writer, logging)
        else:
            train(train_loader, model, criterion, optimizer,
                  arch_optimizer, lr_scheduler, epoch, writer, logging)

        # evaluate on validation set
        prec1 = validate(val_loader, model, criterion, epoch, writer, logging)

        if args.gen_max_child:
            # Extra validation pass with the argmax (max-child) architecture.
            args.gen_max_child_flag = True
            prec1 = validate(val_loader, model, criterion, epoch, writer,
                             logging)
            args.gen_max_child_flag = False

        if args.rank == 0:
            # remember best prec@1 and save checkpoint
            is_best = prec1 > best_prec1
            best_prec1 = max(prec1, best_prec1)
            save_checkpoint(model_dir, {
                'epoch': epoch + 1,
                'model': args.model,
                'state_dict': model.state_dict(),
                'best_prec1': best_prec1,
                'optimizer': optimizer.state_dict(),
            }, is_best)
def train_model(args):
    """Train a fixed (already-searched) NAS genotype on CIFAR-10/100.

    Builds the evaluation network from `args.arch`, trains it for
    `args.epochs` epochs with SGD + cosine annealing, logs per-epoch
    train/valid accuracy, and saves the final weights under a
    timestamped experiment directory.

    Args:
        args: parsed command-line namespace (expects save, note, cifar100,
            seed, arch, init_channels, layers, auxiliary, learning_rate,
            momentum, weight_decay, tmp_data_dir, batch_size, workers,
            epochs, drop_path_prob — inferred from usage below).
    """
    # Create the experiment directory and snapshot all *.py scripts into it.
    if os.path.isdir(args.save) == False:
        os.makedirs(args.save)
    save_dir = '{}eval-{}-{}'.format(args.save, args.note, time.strftime("%Y%m%d-%H%M%S"))
    utils.create_exp_dir(save_dir, scripts_to_save=glob.glob('*.py'))
    # Log to both stdout and <save_dir>/log.txt.
    log_format = '%(asctime)s %(message)s'
    logging.basicConfig(stream=sys.stdout, level=logging.INFO,
                        format=log_format, datefmt='%m/%d %I:%M:%S %p')
    fh = logging.FileHandler(os.path.join(save_dir, 'log.txt'))
    fh.setFormatter(logging.Formatter(log_format))
    logging.getLogger().addHandler(fh)
    # Dataset selection; data_folder is set but not used in this function.
    if args.cifar100:
        CIFAR_CLASSES = 100
        data_folder = 'cifar-100-python'
    else:
        CIFAR_CLASSES = 10
        data_folder = 'cifar-10-batches-py'
    if not torch.cuda.is_available():
        logging.info('No GPU device available')
        sys.exit(1)
    # Seed numpy and torch (CPU + CUDA) for reproducibility.
    np.random.seed(args.seed)
    cudnn.benchmark = True
    torch.manual_seed(args.seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(args.seed)
    logging.info("args = %s", args)
    # NOTE(review): `unparsed` is a module-level global from parse_known_args
    # elsewhere in this file — confirm it is defined before this call.
    logging.info("unparsed args = %s", unparsed)
    num_gpus = torch.cuda.device_count()
    # Resolve the genotype either by name from genotypes.py or by eval'ing
    # the literal string (eval on CLI input — trusted-operator use only).
    if args.arch in genotypes.__dict__.keys():
        genotype = eval("genotypes.%s" % args.arch)
    else:
        genotype = eval(args.arch)
    print('---------Genotype---------')
    logging.info(genotype)
    print('--------------------------')
    model = Network(args.init_channels, CIFAR_CLASSES, args.layers,
                    args.auxiliary, genotype)
    model = torch.nn.DataParallel(model)
    model = model.cuda()
    logging.info("param size = %fMB", utils.count_parameters_in_MB(model))
    criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()
    optimizer = torch.optim.SGD(model.parameters(), args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    if args.cifar100:
        train_transform, valid_transform = utils._data_transforms_cifar100(
            args)
    else:
        train_transform, valid_transform = utils._data_transforms_cifar10(args)
    if args.cifar100:
        train_data = dset.CIFAR100(root=args.tmp_data_dir, train=True,
                                   download=True, transform=train_transform)
        valid_data = dset.CIFAR100(root=args.tmp_data_dir, train=False,
                                   download=True, transform=valid_transform)
    else:
        train_data = dset.CIFAR10(root=args.tmp_data_dir, train=True,
                                  download=True, transform=train_transform)
        valid_data = dset.CIFAR10(root=args.tmp_data_dir, train=False,
                                  download=True, transform=valid_transform)
    train_queue = torch.utils.data.DataLoader(train_data,
                                              batch_size=args.batch_size,
                                              shuffle=True, pin_memory=True,
                                              num_workers=args.workers)
    valid_queue = torch.utils.data.DataLoader(valid_data,
                                              batch_size=args.batch_size,
                                              shuffle=False, pin_memory=True,
                                              num_workers=args.workers)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, float(args.epochs))
    best_acc = 0.0
    for epoch in range(args.epochs):
        # NOTE(review): scheduler.step() before the optimizer step is the
        # legacy (pre-1.1) PyTorch ordering; modern PyTorch warns about it.
        scheduler.step()
        logging.info('Epoch: %d lr %e', epoch, scheduler.get_lr()[0])
        # Linearly ramp drop-path probability over training; set on both the
        # DataParallel wrapper and the wrapped module.
        model.module.drop_path_prob = args.drop_path_prob * epoch / args.epochs
        model.drop_path_prob = args.drop_path_prob * epoch / args.epochs
        start_time = time.time()
        train_acc, train_obj = train(train_queue, model, criterion, optimizer)
        logging.info('Train_acc: %f', train_acc)
        valid_acc, valid_obj = infer(valid_queue, model, criterion)
        if valid_acc > best_acc:
            best_acc = valid_acc
        logging.info('Valid_acc: %f', valid_acc)
        logging.info('Best_acc: %f', best_acc)
        end_time = time.time()
        duration = end_time - start_time
        print('Epoch time: %ds.' % duration)
    # Save the unwrapped module (not the DataParallel wrapper).
    utils.save(model.module, os.path.join(save_dir, 'weights.pt'))
def main():
    """Adversarial training / evaluation entry point driven by an mmcv Config.

    Builds the network from an architecture code, optionally resumes from a
    checkpoint, wraps it in a PGD attacker, and either evaluates
    (``--eval_only``) or runs the full adversarial training loop, saving
    checkpoints on rank 0.
    """
    global cfg, rank, world_size
    cfg = Config.fromfile(args.config)
    # Set seed
    np.random.seed(cfg.seed)
    cudnn.benchmark = True
    torch.manual_seed(cfg.seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(cfg.seed)
    # Model — eval resolves the named architecture code (trusted config only).
    print('==> Building model..')
    arch_code = eval('architecture_code.{}'.format(cfg.model))
    net = models.model_entry(cfg, arch_code)
    rank = 0  # for non-distributed
    world_size = 1  # for non-distributed
    if args.distributed:
        print('==> Initializing distributed training..')
        init_dist(
            launcher='slurm', backend='nccl'
        )  # Only support slurm for now, if you would like to personalize your launcher, please refer to https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/dist_utils.py
        rank, world_size = get_dist_info()
    net = net.cuda()
    cfg.netpara = sum(p.numel() for p in net.parameters()) / 1e6  # params in M
    start_epoch = 0
    best_acc = 0
    # Load checkpoint. "origin" checkpoints were saved before DDP wrapping,
    # so the wrap order differs between the two branches.
    if cfg.get('resume_path', False):
        print('==> Resuming from {}checkpoint {}..'.format(
            ('original ' if cfg.resume_path.origin_ckpt else ''),
            cfg.resume_path.path))
        if cfg.resume_path.origin_ckpt:
            utils.load_state(cfg.resume_path.path, net, rank=rank)
        else:
            if args.distributed:
                net = torch.nn.parallel.DistributedDataParallel(
                    net,
                    device_ids=[torch.cuda.current_device()],
                    output_device=torch.cuda.current_device())
            utils.load_state(cfg.resume_path.path, net, rank=rank)
    # Data
    print('==> Preparing data..')
    trainloader, testloader, train_sampler, test_sampler = dataset_entry(
        cfg, args.distributed)
    criterion = nn.CrossEntropyLoss()
    # Training uses a weaker 7-step PGD; eval keeps the config's step count.
    if not args.eval_only:
        cfg.attack_param.num_steps = 7
    net_adv = AttackPGD(net, cfg.attack_param)
    # Train params
    print('==> Setting train parameters..')
    train_param = cfg.train_param
    epochs = train_param.epochs
    init_lr = train_param.learning_rate
    if train_param.get('warm_up_param', False):
        warm_up_param = train_param.warm_up_param
        init_lr = warm_up_param.warm_up_base_lr
        epochs += warm_up_param.warm_up_epochs
    if train_param.get('no_wd', False):
        # Split params so norm/bias groups can skip weight decay.
        param_group, type2num, _, _ = utils.param_group_no_wd(net)
        cfg.param_group_no_wd = type2num
        optimizer = torch.optim.SGD(param_group,
                                    lr=init_lr,
                                    momentum=train_param.momentum,
                                    weight_decay=train_param.weight_decay)
    else:
        optimizer = torch.optim.SGD(net.parameters(),
                                    lr=init_lr,
                                    momentum=train_param.momentum,
                                    weight_decay=train_param.weight_decay)
    scheduler = lr_scheduler.CosineLRScheduler(
        optimizer, epochs, train_param.learning_rate_min, init_lr,
        train_param.learning_rate,
        (warm_up_param.warm_up_epochs if train_param.get(
            'warm_up_param', False) else 0))
    # Log — only rank 0 creates the experiment dir and logger.
    print('==> Writing log..')
    if rank == 0:
        cfg.save = '{}/{}-{}-{}'.format(cfg.save_path, cfg.model, cfg.dataset,
                                        time.strftime("%Y%m%d-%H%M%S"))
        utils.create_exp_dir(cfg.save)
        logger = utils.create_logger('global_logger', cfg.save + '/log.txt')
        logger.info('config: {}'.format(pprint.pformat(cfg)))
    # Evaluation only
    if args.eval_only:
        assert cfg.get(
            'resume_path',
            False), 'Should set the resume path for the eval_only mode'
        print('==> Testing on Clean Data..')
        test(net, testloader, criterion)
        print('==> Testing on Adversarial Data..')
        test(net_adv, testloader, criterion, adv=True)
        return
    # Training process
    for epoch in range(start_epoch, epochs):
        train_sampler.set_epoch(epoch)
        test_sampler.set_epoch(epoch)
        scheduler.step()
        if rank == 0:
            logger.info('Epoch %d learning rate %e', epoch,
                        scheduler.get_lr()[0])
        # Train for one epoch
        train(net_adv, trainloader, criterion, optimizer)
        # Validate for one epoch (adversarial accuracy drives "best")
        valid_acc = test(net_adv, testloader, criterion, adv=True)
        if rank == 0:
            logger.info('Validation Accuracy: {}'.format(valid_acc))
            is_best = valid_acc > best_acc
            best_acc = max(valid_acc, best_acc)
            print('==> Saving')
            # NOTE(review): the scheduler object itself is stored (not its
            # state_dict) — confirm utils.save_checkpoint handles that.
            state = {
                'epoch': epoch,
                'best_acc': best_acc,
                'optimizer': optimizer.state_dict(),
                'state_dict': net.state_dict(),
                'scheduler': scheduler
            }
            utils.save_checkpoint(state, is_best, os.path.join(cfg.save))
parser.add_argument('--channels_last', type=str, default='False') # others parser.add_argument('--seed', type=int, default=2, help='random seed') parser.add_argument('--note', type=str, default='try', help='note for this run') args, unparsed = parser.parse_known_args() args.channels_last = eval(args.channels_last) args.save = os.path.join( args.save, '{}-{}'.format(time.strftime("%Y%m%d-%H%M%S"), args.note)) if args.local_rank == 0: create_exp_dir(args.save, scripts_to_save=glob.glob('*.py') + glob.glob('*.sh')) log_format = '%(asctime)s %(message)s' logging.basicConfig(stream=sys.stdout, level=logging.INFO, format=log_format, datefmt='%m/%d %I:%M:%S %p') fh = logging.FileHandler(os.path.join(args.save, 'log.txt')) fh.setFormatter(logging.Formatter(log_format)) logging.getLogger().addHandler(fh) if hasattr(torch, 'channels_last') and hasattr(torch, 'contiguous_format'): if args.channels_last: memory_format = torch.channels_last else: memory_format = torch.contiguous_format
action='store_true', default=False, help='use one-step unrolled validation loss') parser.add_argument('--arch_learning_rate', type=float, default=3e-4, help='learning rate for arch encoding') parser.add_argument('--arch_weight_decay', type=float, default=1e-3, help='weight decay for arch encoding') args = parser.parse_args() args.save = 'search-{}-{}'.format(args.save, time.strftime("%Y%m%d-%H%M%S")) # 生成search目录 utils.create_exp_dir( args.save, scripts_to_save=glob.glob('*.py')) # 把cnn内所有py脚本拷到search目录里 # glob.glob()查找符合特定规则的文件路径名 ''' log ''' log_format = '%(asctime)s %(message)s' # %(asctime)s 当前时间,%(message)s 用户输出的消息 logging.basicConfig(stream=sys.stdout, level=logging.INFO, format=log_format, datefmt='%m/%d %I:%M:%S %p') fh = logging.FileHandler(os.path.join(args.save, 'log.txt')) fh.setFormatter(logging.Formatter(log_format)) logging.getLogger().addHandler(fh) CIFAR_CLASSES = 10
def main():
    """Single-path (SPOS) ImageNet training/eval entry point.

    Loads extra options from a YAML config into ``args``, initializes
    distributed training, derives the child network from a searched
    ``log_alpha``, then trains with SGD (BatchNorm params excluded from
    weight decay) and validates each epoch; rank 0 writes logs,
    TensorBoard summaries, and checkpoints.
    """
    global args, best_prec1
    args = parser.parse_args()
    with open(args.config) as f:
        # NOTE(review): yaml.load without an explicit Loader is deprecated
        # and unsafe on untrusted files (use yaml.safe_load / FullLoader).
        config = yaml.load(f)
    # Flatten the two-level YAML config onto the args namespace.
    for key in config:
        for k, v in config[key].items():
            setattr(args, k, v)
    print('Enabled distributed training.')
    rank, world_size = init_dist(backend='nccl', port=args.port)
    args.rank = rank
    args.world_size = world_size
    # Per-rank seeding (seed * rank). NOTE(review): rank 0 always gets seed 0.
    np.random.seed(args.seed * args.rank)
    torch.manual_seed(args.seed * args.rank)
    torch.cuda.manual_seed(args.seed * args.rank)
    torch.cuda.manual_seed_all(args.seed * args.rank)
    # create model
    print("=> creating model '{}'".format(args.model))
    if args.SinglePath:
        architecture = 20 * [0]
        channels_scales = 20 * [1.0]
        # load derived child network: one-hot op weights from the argmax of
        # the searched log_alpha.
        log_alpha = torch.load(
            args.checkpoint_path,
            map_location='cuda:{}'.format(
                torch.cuda.current_device()))['state_dict']['log_alpha']
        weights = torch.zeros_like(log_alpha).scatter_(
            1, torch.argmax(log_alpha, dim=-1).view(-1, 1), 1)
        model = ShuffleNetV2_OneShot(args=args, architecture=architecture,
                                     channels_scales=channels_scales,
                                     weights=weights)
        model.cuda()
        broadcast_params(model)
        # Pre-allocate zero gradients so the first broadcast/all-reduce works.
        for v in model.parameters():
            if v.requires_grad:
                if v.grad is None:
                    v.grad = torch.zeros_like(v)
        model.log_alpha.grad = torch.zeros_like(model.log_alpha)
    if not args.retrain:
        # Resume supernet weights and reuse the checkpoint's last LR.
        load_state_ckpt(args.checkpoint_path, model)
        checkpoint = torch.load(
            args.checkpoint_path,
            map_location='cuda:{}'.format(torch.cuda.current_device()))
        args.base_lr = checkpoint['optimizer']['param_groups'][0]['lr']
    if args.reset_bn_stat:
        model._reset_bn_running_stats()
    # define loss function (criterion) and optimizer
    criterion = CrossEntropyLoss(
        smooth_eps=0.1,
        smooth_dist=(torch.ones(1000) * 0.001).cuda()).cuda()
    # Collect BatchNorm parameter names: they train without weight decay.
    wo_wd_params = []
    wo_wd_param_names = []
    network_params = []
    network_param_names = []
    for name, mod in model.named_modules():
        # if isinstance(mod, (nn.BatchNorm2d, SwitchNorm2d)):
        if isinstance(mod, nn.BatchNorm2d):
            for key, value in mod.named_parameters():
                wo_wd_param_names.append(name + '.' + key)
    for key, value in model.named_parameters():
        if key != 'log_alpha':  # architecture params are not SGD-trained here
            if value.requires_grad:
                if key in wo_wd_param_names:
                    wo_wd_params.append(value)
                else:
                    network_params.append(value)
                    network_param_names.append(key)
    params = [
        {'params': network_params,
         'lr': args.base_lr,
         'weight_decay': args.weight_decay},
        {'params': wo_wd_params,
         'lr': args.base_lr,
         'weight_decay': 0.},
    ]
    param_names = [network_param_names, wo_wd_param_names]
    if args.rank == 0:
        print('>>> params w/o weight decay: ', wo_wd_param_names)
    optimizer = torch.optim.SGD(params, momentum=args.momentum)
    arch_optimizer = None
    # auto resume from a checkpoint — build a run name from hyperparameters.
    remark = 'imagenet_'
    remark += 'epo_' + str(args.epochs) + '_layer_' + str(args.layers) \
        + '_batch_' + str(args.batch_size) + '_lr_' \
        + str(float("{0:.2f}".format(args.base_lr))) + '_seed_' + str(args.seed)
    if args.remark != 'none':
        remark += '_' + args.remark
    args.save = 'search-{}-{}-{}'.format(
        args.save, time.strftime("%Y%m%d-%H%M%S"), remark)
    args.save_log = 'nas-{}-{}'.format(time.strftime("%Y%m%d-%H%M%S"), remark)
    generate_date = str(datetime.now().date())
    path = os.path.join(generate_date, args.save)
    # Only rank 0 creates the experiment dir, file logger, and TensorBoard.
    if args.rank == 0:
        log_format = '%(asctime)s %(message)s'
        utils.create_exp_dir(generate_date, path,
                             scripts_to_save=glob.glob('*.py'))
        logging.basicConfig(stream=sys.stdout, level=logging.INFO,
                            format=log_format, datefmt='%m/%d %I:%M:%S %p')
        fh = logging.FileHandler(os.path.join(path, 'log.txt'))
        fh.setFormatter(logging.Formatter(log_format))
        logging.getLogger().addHandler(fh)
        logging.info("args = %s", args)
        writer = SummaryWriter('./runs/' + generate_date + '/' + args.save_log)
    else:
        writer = None
    # model_dir = args.model_dir
    model_dir = path
    start_epoch = 0
    if args.evaluate:
        load_state_ckpt(args.checkpoint_path, model)
    else:
        best_prec1, start_epoch = load_state(model_dir, model,
                                             optimizer=optimizer)
    cudnn.benchmark = True
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_dataset = ImagenetDataset(
        args.train_root, args.train_source,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))
    # Variant without multi-scale (random-resized) cropping, used for the
    # final epochs under step LR when --off_ms is set.
    train_dataset_wo_ms = ImagenetDataset(
        args.train_root, args.train_source,
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))
    val_dataset = ImagenetDataset(
        args.val_root, args.val_source,
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ]))
    train_sampler = DistributedSampler(train_dataset)
    val_sampler = DistributedSampler(val_dataset)
    # Per-rank batch size: global batch size divided across the world.
    train_loader = DataLoader(
        train_dataset, batch_size=args.batch_size // args.world_size,
        shuffle=False, num_workers=args.workers, pin_memory=False,
        sampler=train_sampler)
    train_loader_wo_ms = DataLoader(
        train_dataset_wo_ms, batch_size=args.batch_size // args.world_size,
        shuffle=False, num_workers=args.workers, pin_memory=False,
        sampler=train_sampler)
    val_loader = DataLoader(
        val_dataset, batch_size=50, shuffle=False,
        num_workers=args.workers, pin_memory=False, sampler=val_sampler)
    if args.evaluate:
        validate(val_loader, model, criterion, 0, writer, logging)
        return
    niters = len(train_loader)
    lr_scheduler = LRScheduler(optimizer, niters, args)
    for epoch in range(start_epoch, args.epochs):
        train_sampler.set_epoch(epoch)
        if args.rank == 0 and args.SinglePath:
            logging.info('epoch %d', epoch)
        # evaluate on validation set after loading the model
        if epoch == 0 and not args.reset_bn_stat:
            prec1 = validate(val_loader, model, criterion, epoch, writer,
                             logging)
        # train for one epoch
        if epoch >= args.epochs - 5 and args.lr_mode == 'step' \
                and args.off_ms and args.retrain:
            train(train_loader_wo_ms, model, criterion, optimizer,
                  arch_optimizer, lr_scheduler, epoch, writer, logging)
        else:
            train(train_loader, model, criterion, optimizer,
                  arch_optimizer, lr_scheduler, epoch, writer, logging)
        # evaluate on validation set
        prec1 = validate(val_loader, model, criterion, epoch, writer, logging)
        if rank == 0:
            # remember best prec@1 and save checkpoint
            is_best = prec1 > best_prec1
            best_prec1 = max(prec1, best_prec1)
            save_checkpoint(model_dir, {
                'epoch': epoch + 1,
                'model': args.model,
                'state_dict': model.state_dict(),
                'best_prec1': best_prec1,
                'optimizer': optimizer.state_dict(),
            }, is_best)
def save_table(self, path): create_exp_dir(os.path.dirname(path)) np.savez(path, sap_time=self.sap_time)
parser.add_argument('--grad_clip', type=float, default=5., help='gradient clipping') parser.add_argument('--label_smooth', type=float, default=0.1, help='label smoothing') parser.add_argument('--gamma', type=float, default=0.97, help='learning rate decay') parser.add_argument('--decay_period', type=int, default=1, help='epochs between two learning rate decays') parser.add_argument('--parallel', action='store_true', default=False, help='data parallelism') args = parser.parse_args() args.data = os.path.expanduser(args.data) os.makedirs(args.data, exist_ok=True) pt_output_dir = os.environ.get('PT_OUTPUT_DIR', '') if pt_output_dir: args.exp_path = pt_output_dir geno_path = os.path.join(os.path.expanduser(args.exp_path), 'darts_pytorch_imagenet_orig_search', 'genotype.txt') args.exp_path = os.path.join(os.path.expanduser(args.exp_path), 'darts_pytorch_imagenet_orig_eval') args.exp_path = utils.create_exp_dir(args.exp_path, scripts_to_save=glob.glob('*.py')) args.seed = int(args.seed) log_format = '%(asctime)s %(message)s' logging.basicConfig(stream=sys.stdout, level=logging.INFO, format=log_format, datefmt='%m/%d %I:%M:%S %p') fh = logging.FileHandler(os.path.join(args.exp_path, 'log.txt')) fh.setFormatter(logging.Formatter(log_format)) logging.getLogger().addHandler(fh) CLASSES = 1000 class CrossEntropyLabelSmooth(nn.Module):
parser.add_argument('--batch_size', type=int, default=128, help='batch_size') parser.add_argument('--lr', type=int, default=0.1, help='learning rate') parser.add_argument('--gpu', type=int, default=3, help='GPU device to use') args = parser.parse_args() # if torch.cuda.is_available(): torch.cuda.set_device(args.gpu) # cudnn.benchmark = True # cudnn.enabled=True # torch.cuda.manual_seed_all(args.seed) # device = torch.device("cuda:3" if torch.cuda.is_available() else "cpu") save_name = 'main-{}-{}'.format('EXP', time.strftime("%Y%m%d-%H%M%S")) utils.create_exp_dir(save_name, scripts_to_save=glob.glob('*.py')) log_format = '%(asctime)s %(message)s' logging.basicConfig(stream=sys.stdout, level=logging.INFO, format=log_format, datefmt='%m/%d %I:%M:%S %p') fh = logging.FileHandler(os.path.join('results', save_name, 'log.txt')) fh.setFormatter(logging.Formatter(log_format)) logging.getLogger().addHandler(fh) logging.info('Args: {}'.format(args)) # logging.info(f"Using computation device: {device}")
type=str, default='EvNASA', help='which architecture to use') parser.add_argument('--grad_clip', type=float, default=5, help='gradient clipping') args = parser.parse_args() args.save = 'eval-cifar100-{}-{}'.format(args.save, time.strftime("%Y%m%d-%H%M%S")) #utils.create_exp_dir(args.save, scripts_to_save=glob.glob('*.py')) if args.epochs == 100: args.save = 'eval-{}epochs'.format(args.epochs) if args.dir is not None: utils.create_exp_dir(os.path.join(args.dir, 'cifar100')) args.save = os.path.join(args.dir, 'cifar100', args.save) utils.create_exp_dir(args.save) log_format = '%(asctime)s %(message)s' logging.basicConfig(stream=sys.stdout, level=logging.INFO, format=log_format, datefmt='%m/%d %I:%M:%S %p') fh = logging.FileHandler(os.path.join(args.save, 'log.txt')) fh.setFormatter(logging.Formatter(log_format)) logging.getLogger().addHandler(fh) logging.info('[INFO] torch version: {}, torchvision version: {}'.format( torch.__version__, torchvision.__version__))
def model_compress(args):
    """Compress/search an architecture on CIFAR by alternating weight and
    architecture training, with rank-based early stopping.

    For the first ``args.eps_no_archs`` epochs only network weights are
    trained; afterwards the architecture parameters are trained too and the
    parsed genotype is monitored. When the operation ranking stays unchanged
    for ``args.stable_arch - 1`` consecutive epochs the search stops early.

    Returns:
        tuple: (result_geno, best_arch_stable) — the selected genotype and
        the longest observed stability streak.
    """
    # Experiment directory + script snapshot + dual (stdout/file) logging.
    if os.path.isdir(args.save) == False:
        os.makedirs(args.save)
    save_dir = '{}compress-{}-{}'.format(args.save, args.note,
                                         time.strftime("%Y%m%d-%H%M%S"))
    utils.create_exp_dir(save_dir, scripts_to_save=glob.glob('*.py'))
    log_format = '%(asctime)s %(message)s'
    logging.basicConfig(stream=sys.stdout, level=logging.INFO,
                        format=log_format, datefmt='%m/%d %I:%M:%S %p')
    fh = logging.FileHandler(os.path.join(save_dir, 'log.txt'))
    fh.setFormatter(logging.Formatter(log_format))
    logging.getLogger().addHandler(fh)
    # Dataset selection; data_folder is set but not used in this function.
    if args.cifar100:
        CIFAR_CLASSES = 100
        data_folder = 'cifar-100-python'
    else:
        CIFAR_CLASSES = 10
        data_folder = 'cifar-10-batches-py'
    if not torch.cuda.is_available():
        logging.info('No GPU device available')
        sys.exit(1)
    # Seed numpy and torch (CPU + CUDA) for reproducibility.
    np.random.seed(args.seed)
    cudnn.benchmark = True
    torch.manual_seed(args.seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(args.seed)
    logging.info("args = %s", args)
    # prepare dataset
    if args.cifar100:
        train_transform, valid_transform = utils._data_transforms_cifar100(
            args)
    else:
        train_transform, valid_transform = utils._data_transforms_cifar10(args)
    if args.cifar100:
        train_data = dset.CIFAR100(root=args.train_data_dir, train=True,
                                   download=True, transform=train_transform)
    else:
        train_data = dset.CIFAR10(root=args.train_data_dir, train=True,
                                  download=True, transform=train_transform)
    num_train = len(train_data)
    # Each epoch uses half the data for weights, half for arch params, hence
    # the division by 2 * batch_size. Tiny datasets are replicated so every
    # "epoch" has at least ~100 iterations.
    iter_per_one_epoch = num_train // (2 * args.batch_size)
    if iter_per_one_epoch >= 100:
        train_extend_rate = 1
    else:
        train_extend_rate = (100 // iter_per_one_epoch) + 1
    iter_per_one_epoch = iter_per_one_epoch * train_extend_rate
    logging.info('num original train data: %d', num_train)
    logging.info('iter per one epoch: %d', iter_per_one_epoch)
    # Random train/valid split by shuffled indices; train half is replicated
    # train_extend_rate times.
    indices = list(range(num_train))
    random.shuffle(indices)
    split = int(np.floor(args.train_portion * num_train))
    train_set = torch.utils.data.Subset(train_data, indices[:split])
    valid_set = torch.utils.data.Subset(train_data, indices[split:num_train])
    train_set = torch.utils.data.ConcatDataset([train_set] * train_extend_rate)
    # valid_set = torch.utils.data.ConcatDataset([valid_set]*train_extend_rate)
    train_queue = torch.utils.data.DataLoader(
        train_set, batch_size=args.batch_size,
        sampler=torch.utils.data.sampler.RandomSampler(train_set),
        pin_memory=True, num_workers=args.workers)
    valid_queue = torch.utils.data.DataLoader(
        valid_set, batch_size=args.batch_size,
        sampler=torch.utils.data.sampler.RandomSampler(valid_set),
        pin_memory=True, num_workers=args.workers)
    # build Network
    criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()
    eps_no_arch = args.eps_no_archs
    epochs = args.epochs
    # Resolve the genotype by name or by eval'ing the literal string
    # (eval on CLI input — trusted-operator use only).
    if args.arch in genotypes.__dict__.keys():
        genotype = eval("genotypes.%s" % args.arch)
    else:
        genotype = eval(args.arch)
    model = Network(genotype, args.init_channels, CIFAR_CLASSES, args.layers,
                    criterion, steps=args.inter_nodes,
                    multiplier=args.inter_nodes,
                    stem_multiplier=args.stem_multiplier,
                    residual_connection=args.residual_connection)
    model = nn.DataParallel(model)
    model = model.cuda()
    logging.info("param size = %fMB", utils.count_parameters_in_MB(model))
    # Network weights exclude the architecture parameters (alphas).
    network_params = []
    for k, v in model.named_parameters():
        if not (k.endswith('alphas_normal') or k.endswith('alphas_reduce')):
            network_params.append(v)
    optimizer = torch.optim.SGD(network_params, args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    optimizer_a = torch.optim.Adam(model.module.arch_parameters(),
                                   lr=args.arch_learning_rate,
                                   betas=(0.5, 0.999),
                                   weight_decay=args.arch_weight_decay)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, float(epochs), eta_min=args.learning_rate_min)
    scheduler_a = torch.optim.lr_scheduler.StepLR(optimizer_a, 30, gamma=0.2)
    # Early-stopping bookkeeping.
    train_epoch_record = -1
    arch_train_count = 0
    prev_geno = ''
    prev_rank = None
    rank_geno = None
    result_geno = None
    arch_stable = 0
    best_arch_stable = 0
    for epoch in range(epochs):
        lr = scheduler.get_lr()[0]
        logging.info('Epoch: %d lr: %e', epoch, lr)
        epoch_start = time.time()
        # training
        if epoch < eps_no_arch:
            # Warm-up phase: train weights only.
            train_acc, train_obj = train(train_queue, valid_queue, model,
                                         network_params, criterion, optimizer,
                                         optimizer_a, lr, train_arch=False)
        else:
            # Parse the current genotype and check ranking stability.
            ops, probs = compressing_parse(model)
            concat = range(2, 2 + model.module._steps)
            genotype = Genotype(
                normal=ops[0], normal_concat=concat,
                reduce=ops[1], reduce_concat=concat,
            )
            if str(prev_geno) != str(genotype):
                prev_geno = genotype
                logging.info(genotype)
            # early stopping
            stable_cond = True
            rank = []
            for i in range(len(probs)):
                rank_tmp = ranking(probs[i])
                rank.append(rank_tmp)
            if prev_rank != rank:
                # Ranking changed: reset the stability streak.
                stable_cond = False
                arch_stable = 0
                prev_rank = rank
                rank_geno = genotype
                logging.info('rank: %s', rank)
            if stable_cond:
                arch_stable += 1
                if arch_stable > best_arch_stable:
                    best_arch_stable = arch_stable
                    result_geno = rank_geno
                logging.info('arch_stable: %d', arch_stable)
                logging.info('best genotype: %s', rank_geno)
                if arch_stable >= args.stable_arch - 1:
                    logging.info('stable genotype: %s', rank_geno)
                    result_geno = rank_geno
                    break
            train_acc, train_obj = train(train_queue, valid_queue, model,
                                         network_params, criterion, optimizer,
                                         optimizer_a, lr, train_arch=True)
            arch_train_count += 1
            scheduler_a.step()
        scheduler.step()
        logging.info('Train_acc %f, Objs: %e', train_acc, train_obj)
        epoch_duration = time.time() - epoch_start
        logging.info('Epoch time: %ds', epoch_duration)
        # validation
        if epoch >= eps_no_arch:
            valid_acc, valid_obj = infer(valid_queue, model, criterion)
            logging.info('Valid_acc %f, Objs: %e', valid_acc, valid_obj)
        # # early arch training
        # if train_epoch_record == -1:
        #     if train_acc > 70:
        #         arch_train_num = args.epochs - args.eps_no_archs
        #         eps_no_arch = 0
        #         train_epoch_record = epoch
        # else:
        #     if epoch >= train_epoch_record + arch_train_num:
        #         break
        utils.save(model, os.path.join(save_dir, 'weights.pt'))
    # last geno parser — fall back to the final parse when no stable genotype
    # was recorded.
    ops, probs = compressing_parse(model)
    concat = range(2, 2 + model.module._steps)
    genotype = Genotype(
        normal=ops[0], normal_concat=concat,
        reduce=ops[1], reduce_concat=concat,
    )
    logging.info('Last geno: %s', genotype)
    if result_geno == None:
        result_geno = genotype
    return result_geno, best_arch_stable
def main():
    """DARTS architecture search on CIFAR-10 (single GPU).

    Splits the CIFAR-10 training set into weight-training and
    architecture-validation halves, then alternates weight and architecture
    optimization for ``args.epochs`` epochs, logging the genotype and alpha
    softmax each epoch and saving weights every epoch.
    """
    # Timestamped experiment directory with a snapshot of all *.py scripts.
    args.save = 'search-{}-{}'.format(args.save,
                                      time.strftime("%Y%m%d-%H%M%S"))
    utils.create_exp_dir(args.save, scripts_to_save=glob.glob('*.py'))
    # Log to stdout and <save>/log.txt.
    log_format = '%(asctime)s %(message)s'
    logging.basicConfig(stream=sys.stdout, level=logging.INFO,
                        format=log_format, datefmt='%m/%d %I:%M:%S %p')
    fh = logging.FileHandler(os.path.join(args.save, 'log.txt'))
    fh.setFormatter(logging.Formatter(log_format))
    logging.getLogger().addHandler(fh)
    CIFAR_CLASSES = 10
    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)
    # Seed numpy and torch (CPU + CUDA) for reproducibility.
    np.random.seed(args.seed)
    torch.cuda.set_device(args.gpu)
    cudnn.benchmark = True
    torch.manual_seed(args.seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(args.seed)
    logging.info('gpu device = %d' % args.gpu)
    logging.info("args = %s", args)
    criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()
    model = Network(args.init_channels, CIFAR_CLASSES, args.layers, criterion)
    model = model.cuda()
    logging.info("param size = %fMB", utils.count_parameters_in_MB(model))
    optimizer = torch.optim.SGD(model.parameters(), args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    train_transform, valid_transform = utils._data_transforms_cifar10(args)
    train_data = dset.CIFAR10(root=args.data, train=True, download=True,
                              transform=train_transform)
    # First `train_portion` of the training set trains weights; the rest
    # serves as the architecture-validation split.
    num_train = len(train_data)
    indices = list(range(num_train))
    split = int(np.floor(args.train_portion * num_train))
    train_queue = torch.utils.data.DataLoader(
        train_data, batch_size=args.batch_size,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[:split]),
        pin_memory=True, num_workers=2)
    valid_queue = torch.utils.data.DataLoader(
        train_data, batch_size=args.batch_size,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(
            indices[split:num_train]),
        pin_memory=True, num_workers=2)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, args.epochs, eta_min=args.learning_rate_min)
    architect = Architect(model, args)
    for epoch in range(args.epochs):
        lr = scheduler.get_last_lr()[0]
        logging.info('epoch %d lr %e', epoch, lr)
        # Log the genotype implied by the current alphas.
        genotype = model.genotype()
        logging.info('genotype = %s', genotype)
        print(F.softmax(model.alphas_normal, dim=-1))
        print(F.softmax(model.alphas_reduce, dim=-1))
        # training
        train_acc, train_obj = train(train_queue, valid_queue, model,
                                     architect, criterion, optimizer, lr)
        logging.info('train_acc %f', train_acc)
        # validation
        valid_acc, valid_obj = infer(valid_queue, model, criterion)
        logging.info('valid_acc %f', valid_acc)
        # update the scheduler
        scheduler.step()
        utils.save(model, os.path.join(args.save, 'weights.pt'))
# maxlen=args.maxlen, # fields=args.fields, # token_level=args.token_level, # vocab_size=args.vocab_size, # lowercase=args.lowercase, # cut_by_cnt=False, create_dict=False, # if_tokenize=False, if_vetorize=False) # save arguments ntokens = len(corpus.dictionary.word2idx) print("Vocabulary Size: {}".format(ntokens)) args.ntokens = ntokens # exp dir create_exp_dir(os.path.join(args.save), ['train_ae.py', 'models.py', 'utils.py'], dict=corpus.dictionary.word2idx, options=args) def logging(str, to_stdout=True): with open(os.path.join(args.save, 'log.txt'), 'a') as f: f.write(str + '\n') if to_stdout: print(str) logging(str(vars(args))) eval_batch_size = 32 train_batches_num = math.floor(corpus.train_num / args.batch_size) test_batches_num = math.floor(corpus.test_num / eval_batch_size)
def main():
    """Evolve flocking controllers with a genetic algorithm plus online
    network updates.

    Sets seeds, prepares a timestamped experiment directory (copying the
    pretrained weights and helper scripts into it), builds the surrogate
    network, and runs 29 generations of PyGMO SGA, retraining the network
    online after each generation and checkpointing its weights.
    """
    # Seed numpy and torch (CPU + CUDA) for reproducibility.
    seed = args.seed
    np.random.seed(seed)
    cudnn.benchmark = True
    torch.manual_seed(seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(seed)
    timestamp = str(utils.get_unix_timestamp())
    utils.makedirs(args.save)
    path = os.path.join(args.save, timestamp)
    utils.create_exp_dir(path, scripts_to_save=glob.glob('../*.py'))
    logger = utils.get_logger(args.save, timestamp, file_type='txt')
    utils.makedirs(os.path.join(path, 'logs'))
    logger.info("time = %s, args = %s", str(utils.get_unix_timestamp()), args)
    input_shape = [
        11, 9, 3
    ]  # MANUALLY SET NUMBER OF CHANNELS (11) ACCORDING TO PRETRAINING
    # Stage the pretrained weights and analysis scripts into the run dir.
    os.system('cp -f ../pretrain-weights.pt {}'.format(
        os.path.join(path, 'weights.pt')))
    utils.makedirs(os.path.join(path, 'scripts'))
    os.system('cp -f ./for-copy/parse-ga.py {}'.format(
        os.path.join(path, 'scripts', 'parse-ga.py')))
    # FIX: this previously copied parse-ga.py again (copy-paste error), so
    # scripts/parse-log.py was actually a duplicate of parse-ga.py.
    os.system('cp -f ./for-copy/parse-log.py {}'.format(
        os.path.join(path, 'scripts', 'parse-log.py')))
    os.system('cp -f ./for-copy/parse_data.py {}'.format(
        os.path.join(path, 'scripts', 'parse_data.py')))
    os.system('cp -f ./for-copy/optimization-plots.sh {}'.format(
        os.path.join(path, 'scripts', '1_optimization-plots.sh')))
    # PyTorch
    criterion = nn.CrossEntropyLoss()
    criterion = criterion.to(device)
    model = Network(input_shape, args.num_drones, criterion, path)
    model = model.to(device)
    utils.load(model, os.path.join(path, 'weights.pt'))
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    # PyGMO — single-objective GA over the flocking problem.
    prob = pg.problem(genetic_algo.Flocking(path, timestamp, model))
    pop = pg.population(prob, size=10, seed=24601)
    algo = pg.algorithm(
        pg.sga(gen=1, cr=.90, m=0.02, param_s=3, crossover="single",
               mutation="uniform", selection="truncated"))
    algo.set_verbosity(1)
    for i in range(29):
        # Log champion / population state before evolving this generation.
        logger.info(
            "time = %s gen = %d \n champ_f = %s \n champ_x = %s \n f_s = %s \n x_s = %s \n id_s = %s",
            str(utils.get_unix_timestamp()), i + 1,
            str(np.array(pop.champion_f).tolist()),
            str(np.array(pop.champion_x).tolist()),
            str(np.array(pop.get_f()).tolist()),
            str(np.array(pop.get_x()).tolist()),
            str(np.array(pop.get_ID()).tolist()))
        pop = algo.evolve(pop)
        # Online update from the most recent 100 recorded trajectories.
        model.online_update(path, genetic_algo.TS_LIST[-100:], input_shape,
                            criterion, optimizer, logger, i)
        utils.save(model, os.path.join(path, 'weights.pt'))
default=0.2, help='drop path probability') parser.add_argument('--save', type=str, default='EXP', help='experiment name') parser.add_argument('--seed', type=int, default=0, help='random seed') parser.add_argument('--arch', type=str, default='DARTS', help='which architecture to use') parser.add_argument('--grad_clip', type=float, default=5, help='gradient clipping') args = parser.parse_args() args.save = 'eval-{}-{}'.format(args.save, time.strftime("%Y%m%d-%H%M%S")) utils.create_exp_dir(args.save, scripts_to_save=glob.glob('*.py')) log_format = '%(asctime)s %(message)s' logging.basicConfig(stream=sys.stdout, level=logging.INFO, format=log_format, datefmt='%m/%d %I:%M:%S %p') fh = logging.FileHandler(os.path.join(args.save, 'log.txt')) fh.setFormatter(logging.Formatter(log_format)) logging.getLogger().addHandler(fh) CIFAR_CLASSES = 10 def main(): if not torch.cuda.is_available():
parser.add_argument('--seed', type=int, default=0, help='random seed') parser.add_argument('--arch', type=str, default='SNAS_edge_all', help='which architecture to use') parser.add_argument('--grad_clip', type=float, default=5, help='gradient clipping') args = parser.parse_args() print(args.arch) args.save = 'eval-{}-{}-{}'.format(args.save, time.strftime("%Y%m%d-%H%M%S"), args.arch) generate_date = str(datetime.now().date()) utils.create_exp_dir(generate_date, args.save, scripts_to_save=glob.glob('*.py')) log_format = '%(asctime)s %(message)s' logging.basicConfig(stream=sys.stdout, level=logging.INFO, format=log_format, datefmt='%m/%d %I:%M:%S %p') fh = logging.FileHandler(os.path.join(args.save, 'log.txt')) fh.setFormatter(logging.Formatter(log_format)) logging.getLogger().addHandler(fh) CIFAR_CLASSES = 10 logger = tensorboardX.SummaryWriter('./runs/eval_{}'.format(args.arch))