def train():
    child_params = get_child_model_params()
    controller_params = get_controller_params()
    corpus = data.Corpus(child_params['data_dir'])
    eval_batch_size = child_params['eval_batch_size']
    train_data = batchify(corpus.train, child_params['batch_size'], child_params['cuda'])
    val_data = batchify(corpus.valid, eval_batch_size, child_params['cuda'])
    ntokens = len(corpus.dictionary)

    if os.path.exists(os.path.join(child_params['model_dir'], 'model.pt')):
        print("Found model.pt in {}, automatically continuing training.".format(
            child_params['model_dir']))
        continue_train_child = True
    else:
        continue_train_child = False

    if continue_train_child:
        child_model = torch.load(os.path.join(child_params['model_dir'], 'model.pt'))
    else:
        child_model = model_search.RNNModelSearch(
            ntokens, child_params['emsize'], child_params['nhid'],
            child_params['nhidlast'], child_params['dropout'],
            child_params['dropouth'], child_params['dropoutx'],
            child_params['dropouti'], child_params['dropoute'],
            child_params['drop_path'])

    if os.path.exists(os.path.join(controller_params['model_dir'], 'model.pt')):
        print("Found model.pt in {}, automatically continuing training.".format(
            controller_params['model_dir']))
        continue_train_controller = True
    else:
        continue_train_controller = False

    if continue_train_controller:
        controller_model = torch.load(
            os.path.join(controller_params['model_dir'], 'model.pt'))
    else:
        controller_model = controller.Controller(controller_params)

    size = 0
    for p in child_model.parameters():
        size += p.nelement()
    logging.info('child model param size: {}'.format(size))

    size = 0
    for p in controller_model.parameters():
        size += p.nelement()
    logging.info('controller model param size: {}'.format(size))

    if args.cuda:
        if args.single_gpu:
            parallel_child_model = child_model.cuda()
            parallel_controller_model = controller_model.cuda()
        else:
            parallel_child_model = nn.DataParallel(child_model, dim=1).cuda()
            parallel_controller_model = nn.DataParallel(controller_model, dim=1).cuda()
    else:
        parallel_child_model = child_model
        parallel_controller_model = controller_model

    logging.info('Args: {}'.format(args))
    total_params = sum(x.data.nelement() for x in child_model.parameters())
    logging.info('Child Model total parameters: {}'.format(total_params))
    total_params = sum(x.data.nelement() for x in controller_model.parameters())
    logging.info('Controller Model total parameters: {}'.format(total_params))

    # Loop over epochs.
    if continue_train_child:
        optimizer_state = torch.load(
            os.path.join(child_params['model_dir'], 'optimizer.pt'))
        if 't0' in optimizer_state['param_groups'][0]:
            child_optimizer = torch.optim.ASGD(
                child_model.parameters(), lr=child_params['lr'], t0=0, lambd=0.,
                weight_decay=child_params['wdecay'])
        else:
            child_optimizer = torch.optim.SGD(
                child_model.parameters(), lr=child_params['lr'],
                weight_decay=child_params['wdecay'])
        child_optimizer.load_state_dict(optimizer_state)
        child_epoch = torch.load(
            os.path.join(child_params['model_dir'], 'misc.pt'))['epoch'] - 1
    else:
        child_optimizer = torch.optim.SGD(
            child_model.parameters(), lr=child_params['lr'],
            weight_decay=child_params['wdecay'])
        child_epoch = 0

    if continue_train_controller:
        optimizer_state = torch.load(
            os.path.join(controller_params['model_dir'], 'optimizer.pt'))
        controller_optimizer = torch.optim.Adam(
            controller_model.parameters(), lr=controller_params['lr'],
            weight_decay=controller_params['weight_decay'])
        controller_optimizer.load_state_dict(optimizer_state)
        controller_epoch = torch.load(
            os.path.join(controller_params['model_dir'], 'misc.pt'))['epoch'] - 1
    else:
        controller_optimizer = torch.optim.Adam(
            controller_model.parameters(), lr=controller_params['lr'],
            weight_decay=controller_params['weight_decay'])
        controller_epoch = 0

    eval_every_epochs = child_params['eval_every_epochs']
    while True:
        # Train child model
        if child_params['arch_pool'] is None:
            arch_pool = generate_arch(controller_params['num_seed_arch'])  # [[arch]]
            child_params['arch_pool'] = arch_pool
        child_params['arch'] = None

        if isinstance(eval_every_epochs, int):
            child_params['eval_every_epochs'] = eval_every_epochs
        else:
            eval_every_epochs = list(map(int, eval_every_epochs))
            for index, e in enumerate(eval_every_epochs):
                if child_epoch < e:
                    child_params['eval_every_epochs'] = e
                    break

        for e in range(child_params['eval_every_epochs']):
            child_epoch += 1
            model_search.train(train_data, child_model, parallel_child_model,
                               child_optimizer, child_params, child_epoch)
            if child_epoch % child_params['eval_every_epochs'] == 0:
                save_checkpoint(child_model, child_optimizer, child_epoch,
                                child_params['model_dir'])
                logging.info('Saving Model!')
            if child_epoch >= child_params['train_epochs']:
                break

        # Evaluate seed archs
        valid_accuracy_list = model_search.evaluate(
            val_data, child_model, parallel_child_model, child_params, eval_batch_size)

        # Output archs and evaluated error rate
        old_archs = child_params['arch_pool']
        old_archs_perf = valid_accuracy_list
        old_archs_sorted_indices = np.argsort(old_archs_perf)
        old_archs = np.array(old_archs)[old_archs_sorted_indices].tolist()
        old_archs_perf = np.array(old_archs_perf)[old_archs_sorted_indices].tolist()

        with open(os.path.join(child_params['model_dir'],
                               'arch_pool.{}'.format(child_epoch)), 'w') as fa, \
                open(os.path.join(child_params['model_dir'],
                                  'arch_pool.perf.{}'.format(child_epoch)), 'w') as fp, \
                open(os.path.join(child_params['model_dir'], 'arch_pool'), 'w') as fa_latest, \
                open(os.path.join(child_params['model_dir'], 'arch_pool.perf'), 'w') as fp_latest:
            for arch, perf in zip(old_archs, old_archs_perf):
                arch = ' '.join(map(str, arch))
                fa.write('{}\n'.format(arch))
                fa_latest.write('{}\n'.format(arch))
                fp.write('{}\n'.format(perf))
                fp_latest.write('{}\n'.format(perf))

        if child_epoch >= child_params['train_epochs']:
            logging.info('Training finished!')
            break

        # Train Encoder-Predictor-Decoder
        encoder_input = list(map(lambda x: parse_arch_to_seq(x), old_archs))  # [[arch]]
        encoder_target = normalize_target(old_archs_perf)
        decoder_target = copy.copy(encoder_input)
        controller_params['batches_per_epoch'] = math.ceil(
            len(encoder_input) / controller_params['batch_size'])
        controller_epoch = controller.train(
            encoder_input, encoder_target, decoder_target, controller_model,
            parallel_controller_model, controller_optimizer, controller_params,
            controller_epoch)

        # Generate new archs
        new_archs = []
        controller_params['predict_lambda'] = 0
        top100_archs = list(map(lambda x: parse_arch_to_seq(x), old_archs[:100]))
        max_step_size = controller_params['max_step_size']
        while len(new_archs) < controller_params['max_new_archs']:
            controller_params['predict_lambda'] += 1
            new_arch = controller.infer(top100_archs, controller_model,
                                        parallel_controller_model, controller_params)
            for arch in new_arch:
                if arch not in encoder_input and arch not in new_archs:
                    new_archs.append(arch)
                if len(new_archs) >= controller_params['max_new_archs']:
                    break
            logging.info('{} new archs generated now'.format(len(new_archs)))
            if controller_params['predict_lambda'] >= max_step_size:
                break

        new_archs = list(map(lambda x: parse_seq_to_arch(x), new_archs))  # [[arch]]
        num_new_archs = len(new_archs)
        logging.info("Generated {} new archs".format(num_new_archs))
        random_new_archs = generate_arch(50)
        new_arch_pool = (old_archs[:len(old_archs) - num_new_archs - 50]
                         + new_archs + random_new_archs)
        logging.info("{} archs in total to train now".format(len(new_arch_pool)))
        child_params['arch_pool'] = new_arch_pool
        with open(os.path.join(child_params['model_dir'], 'arch_pool'), 'w') as f:
            for arch in new_arch_pool:
                arch = ' '.join(map(str, arch))
                f.write('{}\n'.format(arch))
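

# The loop above relies on a save_checkpoint() helper that is not shown in this
# section. The minimal sketch below is inferred only from what train() loads
# back on resume (model.pt, optimizer.pt, and misc.pt with an 'epoch' key);
# the repository's actual helper may differ in name, arguments, and extra fields.
def save_checkpoint_sketch(model, optimizer, epoch, model_dir):
    # Persist the three files the resume path above expects to find.
    torch.save(model, os.path.join(model_dir, 'model.pt'))
    torch.save(optimizer.state_dict(), os.path.join(model_dir, 'optimizer.pt'))
    # Stored as epoch + 1 so that the "['epoch'] - 1" on reload recovers the
    # last completed epoch (assumption, matching the load code above).
    torch.save({'epoch': epoch + 1}, os.path.join(model_dir, 'misc.pt'))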
def worker(gpu, ngpus_per_node, config_in):
    # init
    config = copy.deepcopy(config_in)
    args = config
    jobid = os.environ["SLURM_JOBID"]
    procid = int(os.environ["SLURM_PROCID"])
    config.gpu = gpu

    if config.gpu is not None:
        writer_name = "tb.{}-{:d}-{:d}".format(jobid, procid, gpu)
        logger_name = "{}.{}-{:d}-{:d}.search.log".format(config.name, jobid, procid, gpu)
        model_name = "{}-{:d}-{:d}-model.pt".format(jobid, procid, gpu)
        optimizer_name = "{}-{:d}-{:d}-optimizer.pt".format(jobid, procid, gpu)
        misc_name = "{}-{:d}-{:d}-misc.pt".format(jobid, procid, gpu)
        ck_name = "{}-{:d}-{:d}".format(jobid, procid, gpu)
    else:
        writer_name = "tb.{}-{:d}-all".format(jobid, procid)
        logger_name = "{}.{}-{:d}-all.search.log".format(config.name, jobid, procid)
        model_name = "{}-{:d}-all-model.pt".format(jobid, procid)
        optimizer_name = "{}-{:d}-all-optimizer.pt".format(jobid, procid)
        misc_name = "{}-{:d}-all-misc.pt".format(jobid, procid)
        ck_name = "{}-{:d}-all".format(jobid, procid)

    writer = SummaryWriter(log_dir=os.path.join(config.path, writer_name))
    # writer.add_text('config', config.as_markdown(), 0)
    logger = get_logger(os.path.join(config.path, logger_name))

    # get cuda device
    device = torch.device('cuda', gpu)

    # ============================== begin ==============================
    logger.info("Logger is set - training start")
    logger.info('Args: {}'.format(args))

    if config.dist_url == "env://" and config.rank == -1:
        config.rank = int(os.environ["RANK"])
    if config.mp_dist:
        # For multiprocessing distributed training, rank needs to be the
        # global rank among all the processes
        config.rank = config.rank * ngpus_per_node + gpu
    # print('back:{}, dist_url:{}, world_size:{}, rank:{}'.format(
    #     config.dist_backend, config.dist_url, config.world_size, config.rank))
    dist.init_process_group(backend=config.dist_backend, init_method=config.dist_url,
                            world_size=config.world_size, rank=config.rank)

    # get data
    corpus = data.Corpus(args.data)
    eval_batch_size = 10
    test_batch_size = 1
    train_data = batchify(corpus.train, args.batch_size, args)
    search_data = batchify(corpus.valid, args.batch_size, args)
    val_data = batchify(corpus.valid, eval_batch_size, args)
    test_data = batchify(corpus.test, test_batch_size, args)

    # split data (with respect to GPU id)
    def split_set(set_in):
        per_set_length = set_in.size(0) // config.world_size
        set_out = set_in[per_set_length * config.rank:
                         per_set_length * config.rank + per_set_length]
        return set_out

    train_data = split_set(train_data).to(device)
    search_data = split_set(search_data).to(device)
    val_data = split_set(val_data).to(device)
    test_data = split_set(test_data).to(device)

    if config.dist_privacy:
        logger.info("PRIVACY ENGINE ON")

    # build model
    ntokens = len(corpus.dictionary)
    if args.continue_train:
        model = torch.load(os.path.join(args.save, model_name))
    else:
        model = model_search.RNNModelSearch(ntokens, args.emsize, args.nhid,
                                            args.nhidlast, args.dropout,
                                            args.dropouth, args.dropoutx,
                                            args.dropouti, args.dropoute)

    # make model distributed
    if config.gpu is not None:
        torch.cuda.set_device(config.gpu)
        # model = model.to(device)
        model.cuda(config.gpu)
        # When using a single GPU per process and per DistributedDataParallel,
        # we need to divide the batch size ourselves based on the total number
        # of GPUs we have.
        # config.batch_size = int(config.batch_size / ngpus_per_node)
        config.workers = int((config.workers + ngpus_per_node - 1) / ngpus_per_node)
        # model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[config.rank])
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[config.gpu])
        # model = torch.nn.parallel.DistributedDataParallel(model, device_ids=None, output_device=None)
    else:
        model.cuda()
        # DistributedDataParallel will divide and allocate batch_size to all
        # available GPUs if device_ids are not set
        model = torch.nn.parallel.DistributedDataParallel(model)

    architect = Architect(model.module, args)

    total_params = sum(x.data.nelement() for x in model.module.parameters())
    logger.info('Model total parameters: {}'.format(total_params))

    # Loop over epochs.
    lr = args.lr
    best_val_loss = []
    stored_loss = 100000000

    if args.continue_train:
        optimizer_state = torch.load(os.path.join(args.save, optimizer_name))
        if 't0' in optimizer_state['param_groups'][0]:
            optimizer = torch.optim.ASGD(model.module.parameters(), lr=args.lr,
                                         t0=0, lambd=0., weight_decay=args.wdecay)
        else:
            optimizer = torch.optim.SGD(model.module.parameters(), lr=args.lr,
                                        weight_decay=args.wdecay)
        optimizer.load_state_dict(optimizer_state)
    else:
        optimizer = torch.optim.SGD(model.module.parameters(), lr=args.lr,
                                    weight_decay=args.wdecay)

    for epoch in range(1, args.epochs + 1):
        epoch_start_time = time.time()
        # train()
        train(model, architect, epoch, corpus, train_data, search_data,
              optimizer, device, logger, writer, args)

        val_loss = evaluate(model, corpus, args, val_data, eval_batch_size)
        logger.info('-' * 89)
        logger.info('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                    'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                               val_loss, math.exp(val_loss)))
        logger.info('-' * 89)
        writer.add_scalar('val/loss', val_loss, epoch)
        writer.add_scalar('val/ppl', math.exp(val_loss), epoch)

        if val_loss < stored_loss:
            save_checkpoint(model, optimizer, epoch, args.save, dist_name=ck_name)
            logger.info('Saving Normal!')
            stored_loss = val_loss

        best_val_loss.append(val_loss)

    test_loss = evaluate(model, corpus, args, test_data, test_batch_size)
    logger.info('=' * 89)
    logger.info('| End of training & Testing | test loss {:5.2f} | test ppl {:8.2f}'.format(
        test_loss, math.exp(test_loss)))
    logger.info('=' * 89)
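

# worker() is written to be spawned once per GPU. The sketch below shows one
# conventional launch path using torch.multiprocessing.spawn; main_sketch and
# the exact config fields it touches are assumptions for illustration, and the
# repository's real entry point (e.g. under SLURM) may set things up differently.
def main_sketch(config):
    ngpus_per_node = torch.cuda.device_count()
    if config.mp_dist:
        # One process per GPU; world_size becomes the global GPU count.
        config.world_size = ngpus_per_node * config.world_size
        torch.multiprocessing.spawn(worker, nprocs=ngpus_per_node,
                                    args=(ngpus_per_node, config))
    else:
        # Single process drives training directly.
        worker(config.gpu, ngpus_per_node, config)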
train_data = batchify(corpus.train, args.batch_size)
search_data = batchify(corpus.valid, args.batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, test_batch_size)

ntokens = len(corpus.dictionary)
if args.continue_train:
    model = torch.load(os.path.join(args.save, "model.pt"))
else:
    model = model.RNNModelSearch(
        ntokens,
        args.emsize,
        args.nhid,
        args.nhidlast,
        args.dropout,
        args.dropouth,
        args.dropoutx,
        args.dropouti,
        args.dropoute,
    )

size = 0
for p in model.parameters():
    size += p.nelement()
logging.info("param size: {}".format(size))
logging.info("initial genotype:")
logging.info(model.genotype())

if torch.cuda.is_available():
# val_data = batchify(corpus.valid, eval_batch_size, args)
# test_data = batchify(corpus.test, test_batch_size, args)
eval_batch_size = 10
test_batch_size = 1

# ntokens = len(corpus.dictionary)
ntokens = len(vocab.id2word)
if args.continue_train:
    model = torch.load(os.path.join(args.save, 'model.pt'))
else:
    model = model.RNNModelSearch(ntokens, args.emsize, args.nhid, args.nhidlast,
                                 args.dropout, args.dropouth, args.dropoutx,
                                 args.dropouti, args.dropoute,
                                 args.ner_dim, args.pos_dim, args.token_emb_path,
                                 len(constant.LABEL_TO_ID), args.pe_dim)

size = 0
for p in model.parameters():
    size += p.nelement()
logging.info('param size: {}'.format(size))

logging.info('initial genotype:')
logging.info(model.genotype())

if args.cuda:
    if args.single_gpu:
        parallel_model = model.cuda()
    else:
        parallel_model = nn.DataParallel(model, dim=1).cuda()
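

# batchify() is called throughout these snippets but not defined here. The
# sketch below follows the standard PyTorch word-language-model recipe (trim
# the token stream to a multiple of bsz, then fold it into bsz columns);
# batchify_sketch and the use_cuda flag are assumptions, and the project's own
# version may instead take the args object seen in the calls above.
def batchify_sketch(data, bsz, use_cuda=False):
    nbatch = data.size(0) // bsz
    data = data.narrow(0, 0, nbatch * bsz)      # drop the ragged tail
    data = data.view(bsz, -1).t().contiguous()  # shape: (nbatch, bsz)
    return data.cuda() if use_cuda else data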