def main(args):
    file_name = 'log_%s_%d' % ('gpus', args.gpu)
    logger = setup_logger(file_name,
                          args.save_dir,
                          args.gpu,
                          log_level='DEBUG',
                          filename='%s.txt' % file_name)
    logger.info(args)
    if args.search_space == 'darts':
        with open(args.darts_file_path, 'rb') as f:
            if args.darts_training_nums:
                all_data = pickle.load(f)[:args.darts_training_nums]
            else:
                all_data = pickle.load(f)
    else:
        nasbench_datas = data.build_datasets(args)
        all_data = data.dataset_all(args, nasbench_datas)
    for predictor in args.predictor_list:
        logger.info(
            f'================== predictor type: {predictor} ======================'
        )
        predictor_unsupervised(args,
                               predictor,
                               all_data,
                               train_epochs=args.epochs,
                               logger=logger)
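# --- Usage sketch (illustrative; not part of the original script) ---
# `main` only needs an object exposing the attributes read above.  The concrete
# values below (paths, predictor names, epoch count) are assumptions for
# illustration; the real entry point builds `args` with argparse.
def example_run_unsupervised_predictors():
    from argparse import Namespace
    example_args = Namespace(
        gpu=0,
        save_dir='./output',                    # where the log file is written
        search_space='darts',                   # other spaces go through data.build_datasets
        darts_file_path='./darts_archs.pkl',    # hypothetical pickle of DARTS architectures
        darts_training_nums=None,               # None -> use every architecture in the pickle
        predictor_list=['SS_CCL'],              # placeholder predictor type names
        epochs=100,
    )
    main(example_args)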
def data_consumers(args, q, save_dir, i, search_space):
    set_random_seed(int(str(time.time()).split('.')[0][::-1][:9]))
    file_name = 'log_%s_%d' % ('gpus', i)
    logger = setup_logger(file_name,
                          save_dir,
                          i,
                          log_level='DEBUG',
                          filename='%s.txt' % file_name)
    while True:
        msg = q.get()
        if msg == 'done':
            logger.info('thread %d end' % i)
            break
        iterations = msg['iterate']
        run_experiments_bananas_paradigm(args, save_dir, i, iterations,
                                         logger, search_space)
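# --- Producer-side sketch (illustrative; everything except data_consumers is an assumption) ---
# The loop above defines a simple protocol: dict messages carrying an 'iterate'
# payload, then one 'done' sentinel per worker.  A matching producer could spawn
# one consumer process per worker index and feed the queue like this.
def example_launch_data_consumers(args, save_dir, search_space,
                                  num_workers=2, total_iterations=4):
    import multiprocessing as mp
    q = mp.Queue()
    workers = [
        mp.Process(target=data_consumers,
                   args=(args, q, save_dir, i, search_space))
        for i in range(num_workers)
    ]
    for w in workers:
        w.start()
    for it in range(total_iterations):   # consumed as msg['iterate']
        q.put({'iterate': it})
    for _ in workers:                    # one sentinel per worker ends its loop
        q.put('done')
    for w in workers:
        w.join()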
def model_consumer(q, gpu, save_dir, total_data_dict, model_data, dataset):
    file_name = 'log_%s_%d' % ('gpus', gpu)
    logger = setup_logger(file_name,
                          save_dir,
                          gpu,
                          log_level='DEBUG',
                          filename='%s.txt' % file_name)
    while True:
        msg = q.get()
        if msg == 'done':
            logger.info('thread %d end' % gpu)
            break
        model_idx = msg['idx']
        model = model_data[model_idx]
        if dataset == 'cifar10':
            val_acc, test_acc, hash_key = model_trainer_cifar10(
                model, gpu, logger, save_dir)
            total_data_dict[model_idx] = [val_acc, test_acc, hash_key]
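# --- Dispatch sketch (illustrative; everything except model_consumer is an assumption) ---
# model_consumer fills total_data_dict in place, so a Manager dict is used when
# the consumers run as separate processes.  Each message carries the index of
# the architecture to train, followed by one 'done' sentinel per GPU worker.
def example_dispatch_model_training(model_data, save_dir, gpus=(0, 1),
                                    dataset='cifar10'):
    import multiprocessing as mp
    manager = mp.Manager()
    total_data_dict = manager.dict()     # shared across worker processes
    q = mp.Queue()
    workers = [
        mp.Process(target=model_consumer,
                   args=(q, gpu, save_dir, total_data_dict, model_data, dataset))
        for gpu in gpus
    ]
    for w in workers:
        w.start()
    for idx in range(len(model_data)):   # consumed as msg['idx']
        q.put({'idx': idx})
    for _ in workers:
        q.put('done')
    for w in workers:
        w.join()
    return dict(total_data_dict)         # {idx: [val_acc, test_acc, hash_key]}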
def main_worker(gpu, ngpus_per_node, args, distributed=True):
    args.gpu = gpu + args.gpu_base
    if args.multiprocessing_distributed:
        if args.gpu == args.gpu_base:
            file_name = 'log_%s_%d' % ('gpus', args.gpu)
            logger = setup_logger(file_name,
                                  args.save_dir,
                                  args.gpu,
                                  log_level='DEBUG',
                                  filename='%s.txt' % file_name)
        else:
            logger = DummyLogger()
    else:
        file_name = 'log_%s_%d' % ('gpus', args.gpu)
        logger = setup_logger(file_name,
                              args.save_dir,
                              args.gpu,
                              log_level='DEBUG',
                              filename='%s.txt' % file_name)

    # suppress printing if not master
    if args.multiprocessing_distributed and args.gpu != 0:

        def print_pass(*args):
            pass

        builtins.print = print_pass

    if args.gpu is not None:
        logger.info("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)

    # create model
    logger.info("=> creating model '{}'".format(args.arch))
    model = CCLNas(build_model(args.arch, args.with_g_func),
                   args.input_dim,
                   args.moco_dim_fc,
                   args.moco_dim,
                   distributed=distributed,
                   train_samples=args.train_samples,
                   t=args.moco_t,
                   min_negative_size=args.min_negative_size,
                   margin=args.margin)
    logger.info(model)

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(
                (args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
        # comment out the following line for debugging
        # raise NotImplementedError("Only DistributedDataParallel is supported.")
    else:
        # AllGather implementation (batch shuffle, queue update, etc.) in
        # this code only supports DistributedDataParallel.
        raise NotImplementedError("Only DistributedDataParallel is supported.")

    criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    optimizer = torch.optim.Adam(model.parameters(),
                                 args.lr,
                                 betas=(0.0, 0.9),
                                 weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            logger.info("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            logger.info("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            logger.info("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # build the self-supervised architecture dataset for the chosen search space
    if args.search_space == 'nasbench_101':
        train_dataset = NASBenche101Dataset(model_type='SS_CCL')
    elif args.search_space == 'nasbench_201':
        train_dataset = NASBenche201Dataset(model_type='SS_CCL')
    elif args.search_space == 'darts':
        train_dataset = DartsDataset(model_type='SS_CCL',
                                     arch_path=args.darts_arch_path)
    else:
        raise NotImplementedError(
            'This kind of nasbench has not been implemented.')

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
    else:
        train_sampler = None

    collator = BatchCollator()
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=False,
                                               sampler=train_sampler,
                                               drop_last=True,
                                               collate_fn=collator)

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        center_vec = train_nested(train_loader, model, criterion, optimizer,
                                  epoch, args, logger)

        # only the (per-node) master process writes checkpoints
        if not args.multiprocessing_distributed or (
                args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            save_path = os.path.join(args.save_dir,
                                     'checkpoint_{:04d}.pth.tar'.format(epoch))
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'centers': center_vec
                },
                is_best=False,
                filename=save_path)
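# --- Launch sketch (illustrative; not the repository's actual entry point) ---
# main_worker has the usual PyTorch DDP worker signature, so a MoCo-style
# launcher can spawn one process per visible GPU; each worker then derives its
# global rank from ngpus_per_node inside main_worker.  The world_size scaling
# assumes args.world_size initially holds the number of nodes.
def example_launch_main_worker(args):
    import torch
    import torch.multiprocessing as mp
    ngpus_per_node = torch.cuda.device_count()
    if args.multiprocessing_distributed:
        args.world_size = ngpus_per_node * args.world_size
        # spawn calls main_worker(local_gpu, ngpus_per_node, args) in each process
        mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
    else:
        # single-process fallback: local index 0, main_worker adds args.gpu_base
        main_worker(0, ngpus_per_node, args)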
def main(args):
    file_name = 'log_%s_%d' % ('gpus', args.gpu)
    logger = setup_logger(file_name,
                          args.save_dir,
                          args.gpu,
                          log_level='DEBUG',
                          filename='%s.txt' % file_name)
    logger.info(args)

    if args.search_space == 'nasbench_101':
        with open(nas_bench_101_all_data, 'rb') as fpkl:
            all_data = pickle.load(fpkl)
    else:
        raise NotImplementedError(
            f'The search space {args.search_space} is not supported yet!')

    for k in range(args.trails):
        seed = random_id_int(4)
        set_random_seed(seed)
        s_results_dict = defaultdict(list)
        k_results_dict = defaultdict(list)
        logger.info(
            f'====================== Trial {k} Begins, Setting Seed to {seed} ==========================='
        )
        for budget in args.search_budget:
            train_data, test_data = data.dataset_split_idx(all_data, budget)
            print(
                f'budget: {budget}, train data size: {len(train_data)}, test data size: {len(test_data)}'
            )
            for epochs in args.train_iterations:
                if args.compare_supervised == 'T':
                    logger.info(
                        f'==== predictor type: SUPERVISED, load pretrain model False, '
                        f'search budget is {budget}. Training epoch is {epochs} ===='
                    )
                    spearman_corr, kendalltau_corr, duration = predictor_retrain_compare(
                        args,
                        'SS_RL',
                        train_data,
                        test_data,
                        flag=False,
                        train_epochs=epochs,
                        logger=logger)
                    if math.isnan(spearman_corr):
                        spearman_corr = 0
                    if math.isnan(kendalltau_corr):
                        kendalltau_corr = 0
                    s_results_dict[f'supervised#{budget}#{epochs}'].append(
                        spearman_corr)
                    k_results_dict[f'supervised#{budget}#{epochs}'].append(
                        kendalltau_corr)
                for predictor_type, model_dir in zip(args.predictor_list,
                                                     args.load_dir):
                    logger.info(
                        f'==== predictor type: {predictor_type}, load pretrain model True. '
                        f'Search budget is {budget}. Training epoch is {epochs}. '
                        f'The model save dir is {model_dir.split("/")[-1][:-3]} ===='
                    )
                    spearman_corr, kendalltau_corr, duration = predictor_retrain_compare(
                        args,
                        predictor_type,
                        train_data,
                        test_data,
                        flag=True,
                        load_dir=model_dir,
                        train_epochs=epochs,
                        logger=logger)
                    if math.isnan(spearman_corr):
                        spearman_corr = 0
                    if math.isnan(kendalltau_corr):
                        kendalltau_corr = 0
                    s_results_dict[predictor_type + '#' + str(budget) + '#' +
                                   str(epochs)].append(spearman_corr)
                    k_results_dict[predictor_type + '#' + str(budget) + '#' +
                                   str(epochs)].append(kendalltau_corr)

        file_id = random_id(6)
        save_path = os.path.join(
            args.save_dir,
            f'{file_id}_{args.predictor_list[0]}_{args.search_space.split("_")[-1]}_{args.gpu}_{k}.pkl'
        )
        # both result dicts go into one pickle file: Spearman first, then Kendall
        with open(save_path, 'wb') as fp:
            pickle.dump(s_results_dict, fp)
            pickle.dump(k_results_dict, fp)
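# --- Read-back sketch (illustrative; the helper name is an assumption) ---
# The trial loop above writes two objects into one pickle file, so they must be
# loaded back in the same order: Spearman correlations first, then Kendall tau.
def example_load_results(save_path):
    import pickle
    with open(save_path, 'rb') as fp:
        s_results_dict = pickle.load(fp)   # {'<predictor>#<budget>#<epochs>': [spearman, ...]}
        k_results_dict = pickle.load(fp)   # same keys, Kendall tau values
    return s_results_dict, k_results_dict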
                        help='name of save directory')
    args = parser.parse_args()

    # make save directory
    save_dir = args.save_dir
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
    if not os.path.exists(os.path.join(save_dir, 'model_pkl')):
        os.mkdir(os.path.join(save_dir, 'model_pkl'))
    if not os.path.exists(os.path.join(save_dir, 'results')):
        os.mkdir(os.path.join(save_dir, 'results'))
    if not os.path.exists(os.path.join(save_dir, 'pre_train_models')):
        os.mkdir(os.path.join(save_dir, 'pre_train_models'))

    # 2. build architecture training dataset
    arch_dataset = build_open_search_space_dataset(args.search_space)
    logger = setup_logger("nasbench_open_%s_cifar10" % args.search_space,
                          args.save_dir,
                          0,
                          log_level=args.log_level)
    algo_info = algo_params_open_domain(args.algorithm)
    algo_info['total_queries'] = args.budget
    starttime = time.time()
    multiprocessing.set_start_method('spawn')
    temp_k = 10
    file_name = save_dir + '/results/%s_%d.pkl' % (algo_info['algo_name'],
                                                   algo_info['total_queries'])
    data = build_open_algos(algo_info['algo_name'])(search_space=arch_dataset,
                                                    algo_info=algo_info,
                                                    logger=logger,
                                                    gpus=args.gpus,
                                                    save_dir=save_dir,
                                                    seed=args.seed)
    if 'random' in algo_info['algo_name']:
        results, result_keys = compute_best_test_losses(
            data, temp_k, total_queries=algo_info['total_queries'])