def __init__(self):
    """Populate training hyper-parameters, data/model paths and the CUDA
    device selection on this configuration object."""
    self.seed = 1234
    self.EPOCH = 100
    self.LEARNING_RATE = 1e-3
    self.WEIGHT_DECAY = 1e-5
    self.BATCH_SIZE_TRAIN = 2
    self.BATCH_SIZE_TEST = 2
    self.HIDDEN_SIZE = 200
    # Paths differ between the macOS dev machine and the Linux server.
    if platform.system() == "Darwin":
        self.DATA_ROOT_PATH = "/Users/didi/Desktop/Coldog/Github/Spatial-RNN/MSCOCO/"
        self.MODEL_PATH = "/Users/didi/Desktop/Coldog/Github/Spatial-RNN/model/"
    elif platform.system() == "Linux":
        self.DATA_ROOT_PATH = "/home/amax/data/val2017/"
        # self.DATA_ROOT_PATH = "/home/amax/data/val2017_tiny/"
        # self.DATA_ROOT_PATH = "/data/data_tune/"
        self.MODEL_PATH = self.DATA_ROOT_PATH + "model/"
    self.MODEL_NAME = "model.pkl"
    # cuda option
    if torch.cuda.is_available():
        self.device = "cuda"
        try:
            # Devices restricted via the environment variable.
            self.device_index = os.environ["CUDA_VISIBLE_DEVICES"]
        except KeyError:
            # BUGFIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit; only a missing env var is
            # expected here.  No restriction set: take all free devices.
            self.device_index = ','.join(get_free_gpu())
    else:
        self.device = "cpu"
        self.device_index = "-1"  # -1 means there is no GPU device
def main():
    """Entry point: load X2CT data loaders and run DANN training on the GPU."""
    print('Start load data...')
    source_train_loader = x2ct_dataset.x_train_loader_train
    target_train_loader = x2ct_dataset.ct_train_loader_train

    # Guard clause: without CUDA there is nothing to train on.
    if not torch.cuda.is_available():
        print("There is no GPU -_-!")
        return

    get_free_gpu()
    print('Running GPU : {}'.format(torch.cuda.current_device()))

    encoder = x2ct_model.Extractor().cuda()
    classifier = x2ct_model.Classifier().cuda()
    discriminator = x2ct_model.Discriminator().cuda()
    # NOTE(review): `save_name` is not defined in this view — presumably a
    # module-level global; confirm against the full file.
    train.dann(encoder, classifier, discriminator,
               source_train_loader, target_train_loader, save_name)
def main():
    """Entry point: run source-only and DANN training on MNIST -> MNIST-M."""
    source_train_loader = mnist.mnist_train_loader
    target_train_loader = mnistm.mnistm_train_loader

    # Guard clause: without CUDA there is nothing to train on.
    if not torch.cuda.is_available():
        print("There is no GPU -_-!")
        return

    get_free_gpu()
    print('Running GPU : {}'.format(torch.cuda.current_device()))

    encoder = model.Extractor().cuda()
    classifier = model.Classifier().cuda()
    discriminator = model.Discriminator().cuda()
    # Baseline (source-only) training first, then domain-adversarial training.
    # NOTE(review): `save_name` is not defined in this view — presumably a
    # module-level global; confirm against the full file.
    train.source_only(encoder, classifier, discriminator,
                      source_train_loader, target_train_loader, save_name)
    train.dann(encoder, classifier, discriminator,
               source_train_loader, target_train_loader, save_name)
set_random_seed()

# Arguments.
# FIX: the original wrapped this whole section in a pointless `if True:`
# block; flattened to top level — the names are module globals either way.
BATCH_SIZE = args.bs
NUM_NEIGHBORS = args.n_degree
NUM_NEG = 1
NUM_EPOCH = args.n_epoch
NUM_HEADS = args.n_head
DROP_OUT = args.drop_out
# Use the explicitly requested GPU when given; otherwise pick a free one.
if args.gpu >= 0:
    GPU = args.gpu
else:
    GPU = get_free_gpu()
UNIFORM = args.uniform
USE_TIME = args.time
AGG_METHOD = args.agg_method
ATTN_MODE = args.attn_mode
SEQ_LEN = NUM_NEIGHBORS
DATA = args.data
NUM_LAYER = args.n_layer
LEARNING_RATE = args.lr
NODE_DIM = args.node_dim
TIME_DIM = args.time_dim
MODEL_SAVE_PATH = f'./saved_models/{args.prefix}-{args.agg_method}-{args.attn_mode}-{args.data}.pth'


def get_checkpoint_path(epoch):
    """Return the checkpoint file path for the given epoch number."""
    return f'./saved_checkpoints/{args.prefix}-{args.agg_method}-{args.attn_mode}-{args.data}-{epoch}.pth'
from echos import DataMaster
import pickle5 as pickle

# Directory containing this script; used to locate the default config file.
dir_path = os.path.dirname(os.path.realpath(__file__))

if __name__ == '__main__':
    # torch.autograd.set_detect_anomaly(True)

    # Timestamp used to tag output file names.
    date_time = time.strftime("%Y%m%d-%H%M%S")
    print("Time:", date_time)

    # Run on a free GPU when CUDA is present, otherwise on the CPU.
    device = 'cuda:' + str(get_free_gpu()) if torch.cuda.is_available() else 'cpu'
    print('INFO: Start on device %s' % device)
    print(os.path.join(dir_path, '../configuration/NNMF.ini'))

    # Command-line override for the configuration file location.
    parser = argparse.ArgumentParser()
    default_config = os.path.join(dir_path, 'configuration/NNMF.ini')
    parser.add_argument('-y', '--config', default=default_config)
    args = parser.parse_args()
    conf = args.config

    config = ConfigParserEcho()
    config.read(conf)
    fact_type = config['Parameters']['fact_type']
def train(args):
    """Train a Conv2dRNN sign-language classifier on the WLASL dataset.

    Full training driver: wandb logging, checkpoint resume, gradient
    accumulation across small physical batches, validation with top-k
    accuracy, best-model saving, early stopping and LR scheduling.

    args: argparse Namespace; its fields (save_dir, data_path, lr,
    batch_size, n_epochs, early_stop, ...) are consumed directly below.
    """
    # Init wandb; the run name strips the '../runs/' prefix from save_dir.
    run = wandb.init(name=args.save_dir[len('../runs/'):],
                     config=args,
                     project='sign-language-recognition')
    # Create directory for model checkpoints and log
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    # Save args alongside checkpoints for reproducibility.
    with open(os.path.join(args.save_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f, sort_keys=True, indent=2)
    # Logger
    logger = create_logger(args.save_dir)
    # Set gpu
    if torch.cuda.is_available():
        i = get_free_gpu()
        device = get_device(gpu=i)
    else:
        device = 'cpu'
    logger.info('using device: {}'.format(device))
    # Prepare early stop
    stopped = False  # NOTE(review): never read in this view — possibly dead.
    best_epoch = 0
    best_loss = torch.Tensor([float('Inf')])
    # Data: the physical batch size is bounded by GPU memory; the logical
    # batch size (args.batch_size) is reached via gradient accumulation below.
    if args.freeze_vgg:
        real_batch_size = 3
    else:
        real_batch_size = 2  # can't fit more into gpu memory
    json_file = os.path.join(args.data_path, 'WLASL_v0.3.json')
    videos_folder = os.path.join(args.data_path, 'videos')
    keypoints_folder = os.path.join(args.data_path, 'keypoints')
    train_transforms = transforms.Compose([videotransforms.RandomCrop(224)])
    val_transforms = train_transforms
    # Debug data: small fixed subset, and validation reuses the train loader.
    if args.debug_dataset:
        train_dataset = WLASL(json_file=json_file,
                              videos_folder=videos_folder,
                              keypoints_folder=keypoints_folder,
                              transforms=train_transforms,
                              split='train',
                              subset=args.subset)
        train_dl = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=real_batch_size,
                                               sampler=DebugSampler(
                                                   args.debug_dataset,
                                                   len(train_dataset)))
        val_dl = train_dl
    else:
        train_dataset = WLASL(json_file=json_file,
                              videos_folder=videos_folder,
                              keypoints_folder=keypoints_folder,
                              transforms=train_transforms,
                              split='train',
                              subset=args.subset)
        train_dl = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=real_batch_size,
                                               shuffle=True)
        val_dataset = WLASL(json_file=json_file,
                            videos_folder=videos_folder,
                            keypoints_folder=keypoints_folder,
                            transforms=val_transforms,
                            split='val',
                            subset=args.subset)
        val_dl = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=real_batch_size,
                                             shuffle=True)
    logger.info('data loaded')
    # Model, loss, optimizer
    m = Conv2dRNN(args).to(device)
    optimizer = torch.optim.Adam(m.parameters(), lr=args.lr)
    criterion = nn.CrossEntropyLoss()
    # Resume train: restore model/optimizer state from the last checkpoint.
    start_epoch = 0
    if args.resume_train:
        checkpoint = torch.load(os.path.join(args.save_dir,
                                             'checkpoint.pt.tar'),
                                map_location=torch.device('cpu'))
        best_epoch = checkpoint['epoch']
        m.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        m = m.to(device)
        best_loss = checkpoint['best_val_loss']
        start_epoch = best_epoch + 1
        # Change learning rate: the resumed optimizer keeps its saved LR,
        # so overwrite it with the (possibly new) command-line value.
        for g in optimizer.param_groups:
            g['lr'] = args.lr
        logger.info(
            'Resuming training from epoch {} with best loss {:.4f}'.format(
                start_epoch, best_loss))
    # learning rate scheduler
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        factor=args.lr_schedule_factor,
        patience=args.lr_schedule_patience,
        threshold=args.lr_schedule_threshold)
    # Watch model with wandb
    run.watch(m, log='all', log_freq=5)
    # Print args
    logger.info('using args: \n' +
                json.dumps(vars(args), sort_keys=True, indent=2))
    # Train loop
    for t in range(args.n_epochs):
        t += start_epoch
        # Train
        losses = AverageMeter()
        batch_time = AverageMeter()
        m.train()
        start_t = time.time()
        for i, batch in enumerate(train_dl):
            # Run the forward pass multiple times and accumulate gradient
            # (to be able to use large batch size)
            X = batch['X'].to(device)
            label = batch['label'].to(device)
            # [per frame logits, mean of all frames logits]
            logits = m(X)
            # Create label for each logit (one copy per time step).
            label = torch.cat([l.repeat(logits.shape[1], 1) for l in label],
                              dim=0)
            # Squeeze time sequence and batch into one dimension
            logits = logits.reshape(logits.shape[0] * logits.shape[1],
                                    logits.shape[2])
            loss = criterion(logits, label.squeeze())
            loss.backward()
            losses.update(loss.item())
            if (i % (args.batch_size // real_batch_size)) == 0:
                # Optimize with accumulated gradient
                optimizer.step()
                optimizer.zero_grad()
            # assumes per-batch timing, i.e. these two lines sit in the
            # batch loop, not inside the `if` — TODO confirm (indentation
            # was lost in the source).
            batch_time.update(time.time() - start_t)
            start_t = time.time()
        train_loss = losses.avg
        # Validate
        with torch.no_grad():
            top1 = AverageMeter()
            top5 = AverageMeter()
            top10 = AverageMeter()
            losses = AverageMeter()
            m.eval()
            for batch in val_dl:
                X = batch['X'].to(device)
                label = batch['label'].to(device)
                # [per frame logits, mean of all frames logits]
                logits = m(X)
                # Create label for each logit
                label = torch.cat(
                    [l.repeat(logits.shape[1], 1) for l in label], dim=0)
                # Squeeze time sequence and batch into one dimension
                logits = logits.reshape(logits.shape[0] * logits.shape[1],
                                        logits.shape[2])
                losses.update(criterion(logits, label.squeeze()).item())
                # Update metrics
                acc1, acc5, acc10 = topk_accuracy(logits, label,
                                                  topk=(1, 5, 10))
                top1.update(acc1.item())
                top5.update(acc5.item())
                top10.update(acc10.item())
            val_loss = losses.avg
        # Save best model
        if val_loss < best_loss:
            best_loss, best_epoch = val_loss, t
            save_best(args, t, m, optimizer, best_loss)
        # Check early stop
        if t >= best_epoch + args.early_stop:
            logger.info('EARLY STOP')
            break
        # Log info
        logger.info(
            'epoch: {} train loss: {:.4f} val loss: {:.4f} top1acc {:.4f} top5acc {:.4f} top10acc {:.4f} lr: {:.2e} time per batch {:.1f} s'
            .format(t + 1, train_loss, val_loss, top1.avg, top5.avg,
                    top10.avg, optimizer.param_groups[0]['lr'],
                    batch_time.avg))
        # Wandb log
        run.log({
            'train_loss': train_loss,
            'val_loss': val_loss,
            'top1_acc': top1.avg,
            'top5_acc': top5.avg,
            'top10_acc': top10.avg,
            'lr': optimizer.param_groups[0]['lr']
        })
        # Scheduler step
        if args.use_lr_scheduler:
            scheduler.step(val_loss)
# NOTE(review): the statements below (down to `model.train(data)`) reference
# `opts`, `seed` and `tag`, so they appear to be the tail of an enclosing
# function — presumably `main(tag, seed, dataset_name)` called at the bottom —
# whose `def` line is outside this view; confirm against the full file.
utils.create_dir(opts['work_dir'])
utils.create_dir(os.path.join(opts['work_dir'], 'checkpoints'))
if opts['e_noise'] == 'gaussian' and opts['pz'] != 'normal':
    assert False, 'Gaussian encoders compatible only with Gaussian prior'
# Dump every run parameter to params.txt for reproducibility.
with utils.o_gfile((opts['work_dir'], 'params.txt'), 'w') as text:
    text.write('Parameters:\n')
    for key in opts:
        text.write('%s : %s\n' % (key, opts[key]))
data = DataHandler(opts, seed)
model = DGC(opts, tag)
model.train(data)


if __name__ == '__main__':
    # Pin one free GPU (and the thread count) before anything touches CUDA.
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = utils.get_free_gpu(1)
    os.environ["OMP_NUM_THREADS"] = "8"
    parser = argparse.ArgumentParser()
    parser.add_argument("--exp", default='mnist', help='dataset [mnist/cifar10]')
    FLAGS = parser.parse_args()
    dataset_name = FLAGS.exp
    # One run per seed (currently a single seed, 0).
    for seed in range(0, 1):
        tag = '%s_seed%02d' % (dataset_name, seed)
        main(tag, seed, dataset_name)
# NOTE(review): this first section uses `output_dir`/`model` and ends in a
# bare `return`, so it must be the tail of an enclosing function (a
# save-model routine) whose `def` is outside this view; confirm against the
# full file before relying on the reconstructed structure.
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
print(f"Saving model to {output_dir}")
# Take care of distributed/parallel training: unwrap DataParallel/DDP.
model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(output_dir)
# torch.save(args, os.path.join(output_dir, 'training_args.bin'))
# model = model_class.from_pretrained(output_dir)
# tokenizer = tokenizer_class.from_pretrained(output_dir)
# model.to(device)
return


if torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    # presumably 5012 is a minimum-free-memory threshold (MiB) — verify
    # against get_free_gpu's signature.
    valid_gpu_idx = get_free_gpu(5012)
    num_gpu = len(valid_gpu_idx)
    if num_gpu == 0:
        print("No GPU available!")
        exit(-1)
    # Restrict CUDA to the selected devices for this process.
    os.environ["CUDA_VISIBLE_DEVICES"] = ','.join([str(gpu_id) for gpu_id in valid_gpu_idx])
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
# NOTE(review): `logs = ...; return logs` references `loss` and returns a
# value, so it is the tail of an enclosing method (a PyTorch-Lightning
# `validation_step`?) whose `def` — and whose class, shared with
# `validation_epoch_end` below — lies outside this view; confirm against
# the full file.
logs = {"val_loss": loss}
return logs

def validation_epoch_end(self, outputs):
    # Aggregate the per-batch 'val_loss' entries into one epoch-level mean
    # and expose it both directly and under the 'log' key (Lightning-style).
    val_loss_mean = torch.stack([x['val_loss'] for x in outputs]).mean()
    logs = {'val_loss': val_loss_mean}
    return {**logs, "log": logs}

if __name__ == '__main__':
    CUDA_DEVICES = os.environ.get("CUDA_VISIBLE_DEVICES")
    if CUDA_DEVICES:
        # NOTE(review): this wraps the split result in another list, yielding
        # e.g. [['0', '1']], while the else-branch yields [get_free_gpu()].
        # Check whether the consumer really expects a nested list here.
        gpus = [CUDA_DEVICES.split(",")]
    else:
        gpus = [get_free_gpu()]
    with open("tokens.json", mode="r") as f:
        conf = json.load(f)
    # search structure
    # in_channels, out_channels, kernel_size, stride, padding
    # (list continues past the visible end of this chunk — left open)
    conv_layers_grid = [
        [(1, 32, 8, 4, 2), (32, 64, 4, 2, 1), (64, 128, 4, 2, 1),
         (128, 128, 4, 2, 1)],
        # [
        #     (1, 15, 4, 2, 1),
        #     (15, 30, 4, 2, 1),
        #     (30, 50, 4, 2, 1),
        #     (50, 100, 4, 2, 1),
        #     (100, 100, 4, 2, 1),