Example #1
def train(args):
	local_export_root, remote_export_root, communicator = setup_train(args, MACHINE_IS_HOST)
	assert (communicator is None and MACHINE_IS_HOST) or (communicator is not None and not MACHINE_IS_HOST)
	if communicator:
		communicator.close()  # close the connection now; it could drop during a long training run
	train_loader, val_loader, test_loader = dataloader_factory(args)
	model = model_factory(args)
	trainer = trainer_factory(args, model, train_loader, val_loader, test_loader, local_export_root)
	status_file = os.path.join(local_export_root, 'status.txt')
	error_log_file = os.path.join(local_export_root, 'error_log.txt')
	open(status_file, 'w').write(STATUS_RUNNING)
	try:
		trainer.train()
		open(status_file, 'w').write(STATUS_FINISHED)
		if not MACHINE_IS_HOST and args.experiment_group != 'test':
			communicator = Communicator(HOST, PORT, USERNAME, PASSWORD)
			communicator.upload_dir(local_export_root, remote_export_root)
			communicator.close()
	except Exception as err:
		# recover
		if args.experiment_group == 'test':
			raise
		if not os.path.exists(os.path.join(local_export_root, 'tables', 'val_log.csv')):
			print('Removing empty local export root')
			shutil.rmtree(local_export_root)
			raise
		open(status_file, 'w').write(STATUS_RECOVERY)
		open(error_log_file, 'w').write(str(err))
		if not MACHINE_IS_HOST and args.experiment_group != 'test':
			print('Uploading recovery file')
			communicator = Communicator(HOST, PORT, USERNAME, PASSWORD)
			communicator.upload_dir(local_export_root, remote_export_root)
			communicator.close()
		raise
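The bare open(status_file, 'w').write(...) calls above leave the file handles to the garbage collector. A minimal sketch of a safer helper, assuming the STATUS_* constants are plain strings (illustrative only, not part of the original project):

def write_status(export_root, status):
    # the with-statement flushes and closes the handle deterministically
    with open(os.path.join(export_root, 'status.txt'), 'w') as f:
        f.write(status)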
Example #2
def train_test_net(run, user_options):
  """Train and save a network accoring to user options
  
  Args
  run (int): the current independent run (used in filenames)
  user_options (argparser) : user specified options
  """

  # get logger
  logger = logging.getLogger('train')

  #initialize model
  net = models.model_factory(user_options.arch, dataset=user_options.dataset, init=user_options.init)
  
  if torch.cuda.device_count() > 1:
    logger.info("Running on {} GPUs".format(torch.cuda.device_count()))
    net = NamedDataParallel(net)
  
  # move net to device
  net = net.to(device=device)
  
  # get data loader for the specified dataset
  train_loader, test_loader = data_loaders.load_dataset(
      user_options.dataset, user_options.dataset_path,
      user_options.noisy, user_options.batch_size)

  # define loss
  criterion = load_criterion(user_options)
  criterion = criterion.to(device)
  
  # resume training from snapshot if specified
  start_epoch = 0
  if os.path.isfile(user_options.resume_from):
    # resume training given state dictionary
    optimizer, scheduler = load_optimizer(user_options, net)
    net, optimizer, scheduler, start_epoch = snapshot.load_snapshot(net, optimizer, scheduler, user_options.resume_from, device)
    start_epoch = start_epoch + 1
  else:
    # define optimizer
    optimizer, scheduler = load_optimizer(user_options, net)

  # print model configuration
  logger.info("Running trial {} of {}".format(run+1, user_options.runs))
  utils.print_model_config(user_options, start_epoch)
  
  if start_epoch == 0: 
    filename = net.__name__ + '_' + str(start_epoch) + '_' + str(user_options.init) + '.pt'
    logger.info("Saving model initialization to {}".format(filename))
    snapshot.save_model(net, filename, snapshot_dirname)

  # train the model
  net, converged = train(net, user_options.epochs, train_loader, optimizer,
                         criterion, scheduler, device, start_epoch,
                         snapshot_every=user_options.snapshot_every,
                         test_loader=test_loader,
                         kill_plateaus=user_options.kill_plateaus,
                         init_scheme=user_options.init)
  
  if test_loader is not None:
    val_loss, accuracy = scores.test(net, test_loader, criterion, device)
    utils.print_val_loss(user_options.epochs, val_loss, accuracy)
    net = net.train()

  # save final model
  if converged:
    filename = net.__name__ + '_' + str(user_options.epochs) + '_' + user_options.init + '.pt'
    snapshot.save_model(net, filename, snapshot_dirname)
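load_optimizer is referenced above but never shown. A plausible sketch that matches the (optimizer, scheduler) return shape used in this example; the option names lr, momentum, decay_step, and gamma are assumptions, not the project's actual fields:

import torch.optim as optim

def load_optimizer(user_options, net):
    # hypothetical: SGD with a step decay schedule
    optimizer = optim.SGD(net.parameters(), lr=user_options.lr,
                          momentum=user_options.momentum)
    scheduler = optim.lr_scheduler.StepLR(optimizer,
                                          step_size=user_options.decay_step,
                                          gamma=user_options.gamma)
    return optimizer, scheduler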
Example #3
def train():
    from models import model_factory
    from dataloaders import dataloader_factory
    from trainers import trainer_factory
    from pruners import pruner_factory
    from utils import *  # star import already covers scatterplot

    from torch.utils.tensorboard import SummaryWriter
    from torchvision import datasets, transforms

    export_root = setup_train(args)
    test_result_root = 'experiments/testresults'
    test_result_title = export_root[12:]  # strip the leading 'experiments/' prefix
    test_result_title += '.txt'
    model = model_factory(args)
    train_loader, val_loader, test_loader = dataloader_factory(args)
    pruner = pruner_factory(args, model)
    trainer = trainer_factory(args, model, train_loader, val_loader,
                              test_loader, export_root, pruner)
    #load_pretrained_weights(model, './experiments/ml-1m.pth')
    trainer.train()
    trainer.test()

    if args.prune:
        trainer.prune()
    #pruner.print_mask(model)
    #pruner.print_percentage(model)
    test_result = trainer.test()

    save_test_result(export_root, test_result)
    save_test_result(test_result_root, test_result, test_result_title)
    print(test_result_root)
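save_test_result is not defined in this snippet; a minimal sketch consistent with the two call sites above (root, result, and an optional filename), assuming test_result is a JSON-serializable metrics dict:

import json
import os

def save_test_result(root, result, filename='test_result.txt'):
    # hypothetical helper: persist the metrics dict as text
    os.makedirs(root, exist_ok=True)
    with open(os.path.join(root, filename), 'w') as f:
        json.dump(result, f, indent=2)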
Example #4
def train():
    export_root = setup_train(args)
    train_loader, val_loader, test_loader = dataloader_factory(args)
    model = model_factory(args)
    trainer = trainer_factory(args, model, train_loader, val_loader,
                              test_loader, export_root)
    trainer.train()
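None of the examples show model_factory itself. A minimal dispatch-table sketch; the key args.model_code and the torchvision constructors are illustrative assumptions, since each project registers its own architectures:

from torchvision import models as tv_models

def model_factory(args):
    # hypothetical registry mapping a model code to a constructor
    registry = {
        'resnet18': tv_models.resnet18,
        'alexnet': tv_models.alexnet,
    }
    return registry[args.model_code](num_classes=args.num_classes)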
Example #5
def load_model(model_name, dataset, path, device):
  """Load a model from file for inference.
  
  Keyword arguments:
  model_name (str) -- name of the model architecture
  dataset (str) -- dataset name (used to infer input dimensionality)
  path (str) -- path to the saved model
  device (torch.device) -- where to move the model after loading
  """
  
  logger = logging.getLogger('train')
  
  net = models.model_factory(model_name, dataset)

  # load parameters
  logger.info('Loading model {} from {}'.format(net.__name__, path))
  net.load_state_dict(torch.load(path, map_location=device))

  # move to device
  net = net.to(device=device)

  # set model to inference mode
  net = net.eval()

  return net
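For context, a call site for the loader above might look like this (the architecture name, dataset key, and weights path are illustrative, not from the original project):

# illustrative usage of load_model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
net = load_model('resnet18', 'cifar10', './weights/resnet18.pt', device)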
Example #6
def __init__(self, *args, **kwargs):
    super().__init__()
    self.model, self.layers = model_factory(kwargs['model_name'])
    self.init_loss()
    for k, v in kwargs.items():  # promote every keyword option to an attribute
        setattr(self, k, v)
    self.hparams = kwargs
    self.is_ddp = self.distributed_backend in ('ddp', 'ddp2')
    # split workers across GPUs under DDP (the original `and/or` form
    # silently fell back to the full worker count when the division hit 0)
    self.num_workers = self.num_workers // self.gpus if self.is_ddp else self.num_workers
Example #7
def train(model_args):
    export_root = setup_train(model_args)
    train_loader, val_loader, test_loader = dataloader_factory(model_args)
    model = model_factory(model_args)
    trainer = trainer_factory(model_args, model, train_loader, val_loader,
                              test_loader, export_root)
    if model_args.mode == 'train':
        trainer.train()
    trainer.test()
Example #8
def test_with_factory(self):
    inputs = torch.randn(1, 3, 513, 513)
    model = model_factory(edict({
        'seg_model': 'deeplab_v3',
        'backbone': 'xception'
    }))
    model.eval()
    with torch.no_grad():
        output = model(inputs)
    self.assertTupleEqual((1, 25, 513, 513), output.size())
Example #9
def train():
    export_root = setup_train(args)
    train_loader, val_loader, test_loader = dataloader_factory(args)
    model = model_factory(args)
    trainer = trainer_factory(args, model, train_loader, val_loader, test_loader, export_root)
    trainer.train()

    test_model = (input('Test model with test dataset? y/[n]: ') == 'y')
    if test_model:
        trainer.test()
Example #10
def validate(args, mode='val'):
	local_export_root, remote_export_root, communicator = setup_train(args, MACHINE_IS_HOST)
	if communicator:
		communicator.close()
	train_loader, val_loader, test_loader = dataloader_factory(args)
	model = model_factory(args)
	if args.pretrained_weights is not None:
		model.load(args.pretrained_weights)
	trainer = trainer_factory(args, model, train_loader, val_loader, test_loader, local_export_root)
	trainer.just_validate(mode)
Example #11
def main(args):
    export_root, args = setup_experiments(args)
    device = args.device
    model_checkpoint_path = os.path.join(export_root, 'models')

    dataloaders = dataloaders_factory(args)
    model = model_factory(args)

    writer = SummaryWriter(os.path.join(export_root, 'logs'))

    train_loggers = [
        MetricGraphPrinter(writer,
                           key='ce_loss',
                           graph_name='ce_loss',
                           group_name='Train'),
        MetricGraphPrinter(writer,
                           key='epoch',
                           graph_name='Epoch',
                           group_name='Train')
    ]
    val_loggers = [
        MetricGraphPrinter(writer,
                           key='mean_iou',
                           graph_name='mIOU',
                           group_name='Validation'),
        MetricGraphPrinter(writer,
                           key='acc',
                           graph_name='Accuracy',
                           group_name='Validation'),
        RecentModelLogger(model_checkpoint_path),
        BestModelLogger(model_checkpoint_path, metric_key='mean_iou'),
    ]

    # criterion = nn.CrossEntropyLoss(ignore_index=IGNORE_LABEL, weight=torch.Tensor(CLASS_WEIGHT).to(device))
    criterion = nn.CrossEntropyLoss()
    optimizer = create_optimizer(model, args)
    scheduler = optim.lr_scheduler.StepLR(optimizer,
                                          step_size=args.decay_step,
                                          gamma=args.gamma)

    trainer = Trainer(model,
                      dataloaders,
                      optimizer,
                      criterion,
                      args.epoch,
                      args,
                      num_classes=42,
                      log_period_as_iter=args.log_period_as_iter,
                      train_loggers=train_loggers,
                      val_loggers=val_loggers,
                      lr_scheduler=scheduler,
                      device=device)
    trainer.train()
    writer.close()
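MetricGraphPrinter, RecentModelLogger, and BestModelLogger come from the project's own logging utilities. A minimal sketch of the interface MetricGraphPrinter appears to implement, assuming the trainer passes each logger a state dict via log(); the accum_iter key is an assumption:

class MetricGraphPrinter:
    # hypothetical reconstruction: forward one metric to TensorBoard
    def __init__(self, writer, key, graph_name, group_name):
        self.writer = writer
        self.key = key
        self.tag = '{}/{}'.format(group_name, graph_name)

    def log(self, state):
        if self.key in state:
            self.writer.add_scalar(self.tag, state[self.key],
                                   state.get('accum_iter', 0))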
Example #12
def main():
    args = parse_args()
    model = model_factory(args)
    word_id_lst, post_lsts, _, _, _, pos_lsts = read_data(args.input)
    freqs_lst = []

    odir = Path(args.output).parent
    odir.mkdir(parents=True, exist_ok=True)
    model.load()
    for post, pos_tags in zip(post_lsts, pos_lsts):
        freqs_lst.append(model.predict(post, pos_tags))
    write_results(word_id_lst, post_lsts, freqs_lst, args.output)
    print("Output file created successfully")
Example #13
File: run.py  Project: GeJulia/A2SNN
def test(args, device):
    print(args)
    model = model_factory(args['dataset'], args['training_type'],
                          args['var_type'], args['feature_dim'],
                          args['num_classes'])
    model.to(device)
    model.load(os.path.join(args['output_path']['models'], 'ckpt_best'))
    model.eval()
    test_loader = get_data_loader(args['dataset'],
                                  args['batch_size'],
                                  False,
                                  shuffle=False,
                                  drop_last=False)
    attack_names = ['FGSM', 'PGD']  # 'BIM', 'C&W', 'Few-Pixel'
    print('Adversarial testing.')
    for idx, attack in enumerate(attack_names):
        print('Attack: {}'.format(attack))
        if attack == 'Few-Pixel':
            if args['dataset'] == 'cifar10':
                preproc = {
                    'mean': [0.4914, 0.4822, 0.4465],
                    'std': [0.2023, 0.1994, 0.2010]
                }
            else:
                raise NotImplementedError(
                    'Only CIFAR-10 supported for the one-pixel attack.')
            one_pixel_attack(model,
                             test_loader,
                             preproc,
                             device,
                             pixels=1,
                             targeted=False,
                             maxiter=1000,
                             popsize=400,
                             verbose=False)
        else:
            eps_names = attack_to_dataset_config[attack][
                args['dataset']]['eps_names']
            eps_values = attack_to_dataset_config[attack][
                args['dataset']]['eps_values']
            robust_accuracy = test_attack(model, test_loader, attack,
                                          eps_values, args, device)
            for eps_name, eps_value, accuracy in zip(eps_names, eps_values,
                                                     robust_accuracy):
                print('Attack Strength: {}, Accuracy: {:.3f}'.format(
                    eps_name, accuracy.item()))
    print('Finished testing.')
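attack_to_dataset_config is indexed as [attack][dataset] with parallel eps_names/eps_values lists. A hedged sketch of its shape; the epsilon budgets below are conventional 2/255- and 8/255-style choices, not necessarily the values A2SNN uses:

attack_to_dataset_config = {
    'FGSM': {
        'cifar10': {
            'eps_names': ['2/255', '8/255'],
            'eps_values': [2 / 255, 8 / 255],
        },
    },
    'PGD': {
        'cifar10': {
            'eps_names': ['8/255'],
            'eps_values': [8 / 255],
        },
    },
}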
Example #14
def train():
    export_root = setup_train(args)
    model = model_factory(args)
    train_loader, val_loader, test_loader = dataloader_factory(args)
    pruner = pruner_factory(args, model)
    trainer = trainer_factory(args, model, train_loader, val_loader,
                              test_loader, export_root, pruner)
    #trainer.train()
    print("Model's state_dict:")
    for param_tensor in model.bert.state_dict():
        print(param_tensor, "\t", model.bert.state_dict()[param_tensor].size())

    # Print optimizer's state_dict
    print("Optimizer's state_dict:")
    for var_name in trainer.optimizer.state_dict():
        print(var_name, "\t", trainer.optimizer.state_dict()[var_name])
    torch.save(model, './initmodel.pth')
Example #15
def evaluate():
    export_root = setup_train(args)
    meta, train_loader, val_loader, test_loader = dataloader_factory(args)
    model = model_factory(args, meta)
    trainer = trainer_factory(args, model, train_loader, val_loader,
                              test_loader, export_root)

    path = args.eval_model_path
    load_pretrained_weights(model, path)

    average_meter_set = AverageMeterSet()
    for batch in test_loader:
        with torch.no_grad():
            batch = [x.to(trainer.device) for x in batch]
            metrics = trainer.calculate_metrics(batch)
            for k, v in metrics.items():
                average_meter_set.update(k, v)
    print(average_meter_set.averages())
Example #16
File: run.py  Project: GeJulia/A2SNN
def train(args, device):
    print(args)
    os.makedirs(args['output_path']['stats'], exist_ok=True)
    os.makedirs(args['output_path']['models'], exist_ok=True)
    train_loader = get_data_loader(args['dataset'],
                                   args['batch_size'],
                                   train=True,
                                   shuffle=True,
                                   drop_last=True)
    test_loader = get_data_loader(args['dataset'],
                                  args['batch_size'],
                                  train=False,
                                  shuffle=False,
                                  drop_last=False)
    model = model_factory(args['dataset'], args['training_type'],
                          args['var_type'], args['feature_dim'],
                          args['num_classes'])
    model.to(device)
    if args['pretrained'] is not None:
        if args['pretrained'] not in ('ckpt_best', 'ckpt_last', 'ckpt_robust'):
            raise ValueError(
                'Pre-trained model name must be: [ckpt_best|ckpt_last|ckpt_robust]'
            )
        model.load(
            os.path.join(args['output_path']['models'], args['pretrained']))
    if args['training_type'] == 'vanilla':
        print('Vanilla training.')
        train_vanilla(model, train_loader, test_loader, args, device=device)
    elif args['training_type'] == 'stochastic':
        print('Stochastic training.')
        train_stochastic(model, train_loader, test_loader, args, device=device)
    elif args['training_type'] == 'stochastic+adversarial':
        print('Adversarial stochastic training.')
        train_stochastic_adversarial(model,
                                     train_loader,
                                     test_loader,
                                     args,
                                     device=device)
    else:
        raise NotImplementedError(
            'Training "{}" not implemented. Supported: [vanilla|stochastic|stochastic+adversarial].'
            .format(args['training_type']))
    print('Finished training.')
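The if/elif chain over training_type could equally be written as a dispatch table, which keeps the supported names and the error message in one place. A sketch using the same three training functions:

# equivalent dispatch-table form of the branch above (sketch)
TRAINERS = {
    'vanilla': train_vanilla,
    'stochastic': train_stochastic,
    'stochastic+adversarial': train_stochastic_adversarial,
}
trainer_fn = TRAINERS.get(args['training_type'])
if trainer_fn is None:
    raise NotImplementedError('Training "{}" not implemented. Supported: {}.'
                              .format(args['training_type'], sorted(TRAINERS)))
trainer_fn(model, train_loader, test_loader, args, device=device)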
Example #17
def main(args):
    export_root, args = setup_experiments(args)
    device = args.device
    model_checkpoint_path = os.path.join(export_root, 'models')

    train_dataset = dataset_factory(args.train_transform_type, is_train=True)
    val_dataset = dataset_factory(args.val_transform_type, is_train=False)

    dataloaders = dataloaders_factory(train_dataset, val_dataset,
                                      args.batch_size, args.test)
    model = model_factory(args)

    writer = SummaryWriter(os.path.join(export_root, 'logs'))

    train_loggers = [
        MetricGraphPrinter(writer,
                           key='loss',
                           graph_name='loss',
                           group_name='Train'),
        MetricGraphPrinter(writer,
                           key='epoch',
                           graph_name='Epoch',
                           group_name='Train'),
    ]
    val_loggers = [
        MetricGraphPrinter(writer,
                           key='mean_iou',
                           graph_name='mIOU',
                           group_name='Validation'),
        MetricGraphPrinter(writer,
                           key='acc',
                           graph_name='Accuracy',
                           group_name='Validation'),
        RecentModelLogger(model_checkpoint_path),
        BestModelLogger(model_checkpoint_path, metric_key='mean_iou'),
        ImagePrinter(writer, train_dataset, log_prefix='train'),
        ImagePrinter(writer, val_dataset, log_prefix='val')
    ]

    criterion = create_criterion(args)
    optimizer = create_optimizer(model, args)

    if args.pretrained_weights:
        load_pretrained_weights(args, model)

    if args.resume_training:
        setup_to_resume(args, model, optimizer)

    scheduler = optim.lr_scheduler.StepLR(optimizer,
                                          step_size=args.decay_step,
                                          gamma=args.gamma)
    trainer = Trainer(model,
                      dataloaders,
                      optimizer,
                      criterion,
                      args.epoch,
                      args,
                      num_classes=args.classes,
                      log_period_as_iter=args.log_period_as_iter,
                      train_loggers=train_loggers,
                      val_loggers=val_loggers,
                      lr_scheduler=scheduler,
                      device=device)
    trainer.train()
    writer.close()
Example #18
# preprocessing steps
steps = ['encode', 'add', 'regu']
for k in steps:
    dataset.preprocess(type=k)
if log:
    dataset.y = dataset.y.apply(lambda x: np.log(x + 1))

# get data
x, y, test, test_index = dataset.get_data()

# set model parameters
# a little convoluted because it was adapted from another assignment:
# parameters are taken from the model params; if absent, fall back to grid search
model_names = ['mlp'] * 10  # ten MLPs
model_stack = models.model_factory()
for k in model_names:
    model_stack.add_model(k)
model_stack.set_parameters(x, y)

# model fusion part, use stacking

mods = model_stack.get_models()
sclf = StackingRegressor(regressors=mods,
                         use_features_in_secondary=True,
                         meta_regressor=mods[0],
                         verbose=0)
sclf.fit(x, y)
result = sclf.predict(test)

# map back the prediction
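The snippet stops at the mapping-back step. Given the np.log(x + 1) target transform applied earlier, the inverse is expm1; a one-line sketch:

# undo the log1p target transform (only if `log` was set above)
if log:
    result = np.expm1(result)  # equivalent to np.exp(result) - 1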
Example #19
    start = time.time()

    if not os.path.isfile('vocab_%s.pkl' % args.exp_name):
        print("Building vocabulary")
        text_field.build_vocab(train_dataset, val_dataset, min_freq=5)
        pickle.dump(text_field.vocab, open('vocab_%s.pkl' % args.exp_name,
                                           'wb'))
    else:
        text_field.vocab = pickle.load(
            open('vocab_%s.pkl' % args.exp_name, 'rb'))

    print('build vocab time:', time.time() - start)
    start = time.time()
    # Model and dataloaders
    Transformer, TransformerEncoder, TransformerDecoderLayer, ScaledDotProductAttention = model_factory(
        args)
    encoder = TransformerEncoder(args.n_layer,
                                 0,
                                 attention_module=ScaledDotProductAttention,
                                 d_in=args.dim_feats,
                                 d_k=args.d_k,
                                 d_v=args.d_v,
                                 h=args.head,
                                 d_model=args.d_model)
    decoder = TransformerDecoderLayer(len(text_field.vocab),
                                      54,
                                      args.n_layer,
                                      text_field.vocab.stoi['<pad>'],
                                      d_k=args.d_k,
                                      d_v=args.d_v,
                                      h=args.head,
Example #20
def student_train_test(user_options):
  """Train a student network by knowledge distillation.

  Args
  user_options (argparser): user specified options
  """
  # get logger
  logger = logging.getLogger('train')

  # load teacher model
  teacher = models.model_factory(user_options.arch, dataset=user_options.dataset, init=user_options.init)
  
  if torch.cuda.device_count() > 1:
    logger.info("Running teacher network on {} GPUs".format(torch.cuda.device_count()))
    teacher = NamedDataParallel(teacher)
    tdevice = device
  else:
    tdevice = torch.device('cpu')
  
  # move net to device
  teacher = teacher.to(device=tdevice)
  
  # load teacher network from file
  if os.path.isfile(user_options.resume_from):
    teacher, _, _, _ = snapshot.load_snapshot(teacher, None, None, user_options.resume_from, tdevice)
    teacher = teacher.eval()
  else:
    raise ValueError('Missing teacher model definition. Specify it with --resume-from [FILENAME]')
  
  # get data loader for the specified dataset
  train_loader, test_loader = data_loaders.load_dataset(
      user_options.dataset, user_options.dataset_path,
      user_options.noisy, user_options.batch_size)
  
  # load student
  student = models.student_factory(user_options.arch, user_options.dataset, init=user_options.init)
  
  if torch.cuda.device_count() > 1:
    logger.info("Running student network on {} GPUs".format(torch.cuda.device_count()))
    student = NamedDataParallel(student)
    
  student = student.to(device=device)

  # load optimizer, scheduler
  optimizer, scheduler = load_optimizer(user_options, student)

  # define loss
  criterion = load_criterion(user_options)

  # print model configuration
  start_epoch = 0
  utils.print_student_config(user_options)
  
  # save model at initialization
  teacher_name = os.path.basename(user_options.resume_from)
  teacher_name = os.path.splitext(teacher_name)[0] # remove file extension
  teacher_name = teacher_name.split('_')[0]
  filename = 'Student_' + teacher_name + '_' + str(start_epoch) + '.pt'
  snapshot.save_model(student, filename, snapshot_dirname)

  # train the model
  student, converged = distill(student, teacher, user_options.epochs,
                               train_loader, optimizer, criterion, scheduler,
                               tdevice, device, start_epoch,
                               snapshot_every=user_options.epochs,
                               kill_plateaus=user_options.kill_plateaus)
  
  if test_loader is not None:
    test_criterion = nn.CrossEntropyLoss()
    val_loss, accuracy = scores.test(student, test_loader, test_criterion, device)
    utils.print_val_loss(user_options.epochs, val_loss, accuracy)

  # save final model
  if converged:
    teacher_name = os.path.basename(user_options.resume_from)
    teacher_name = os.path.splitext(teacher_name)[0] # remove file extension
    filename = 'Student_' + teacher_name + '.pt'
    snapshot.save_model(student, filename, snapshot_dirname)
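distill depends on the criterion returned by load_criterion, which is not shown. A typical knowledge-distillation loss as a hedged sketch; the temperature T and blend weight alpha are illustrative hyper-parameters, and this is not necessarily what the project uses:

import torch.nn.functional as F

def kd_loss(student_logits, teacher_logits, targets, T=4.0, alpha=0.9):
    # soft part: KL between temperature-softened distributions, scaled by
    # T^2 as in Hinton et al. (2015); hard part: plain cross-entropy
    soft = F.kl_div(F.log_softmax(student_logits / T, dim=1),
                    F.softmax(teacher_logits / T, dim=1),
                    reduction='batchmean') * (T * T)
    hard = F.cross_entropy(student_logits, targets)
    return alpha * soft + (1.0 - alpha) * hard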
Example #21
        description='Test for Cifar10 w/ or w/o trt')
    parser.add_argument('--gpu',
                        '-p',
                        action='store_true',
                        default=True,
                        help='Trained on GPU')
    parser.add_argument('--model',
                        '-m',
                        default='alexnet',
                        type=str,
                        help='Name of Network')

    args = parser.parse_args()

    model_name = args.model
    model = model_factory(model_name)
    print("Testing model: %s" % model_name)

    if args.gpu and torch.cuda.is_available():
        # CuDNN must be enabled for FP16 training.
        torch.backends.cudnn.enabled = True
        torch.backends.cudnn.benchmark = True
        model = model.cuda()
    else:
        print("-p is a must when executing this script. Exiting...")
        exit()

    model.load_state_dict(torch.load('./weights/' + model_name + '.pt')['net'])

    accbefore = torch.load('./weights/' + model_name + '.pt')['acc']