def train(model, train_datasets, opts, wandb=None):
    """Fit a Keras `model` on `train_datasets()` with throughput/LR logging.

    Args:
        model: compiled Keras model.
        train_datasets: zero-arg callable returning the training dataset
            (called once and passed to `model.fit`).
        opts: dict with keys "batch_size", "steps_per_epoch", "epochs",
            "log_dir" (falsy to disable checkpointing) and "epochs_per_save".
        wandb: optional wandb module/run; when given, a WandbCallback is
            attached and per-epoch losses are logged after training.

    Returns:
        The Keras History object from `model.fit`.
    """
    samples_per_epoch = opts["batch_size"] * opts["steps_per_epoch"]
    callbacks = [
        ThroughputCallback(samples_per_epoch=samples_per_epoch),
        CompilationTimeCallback(),
        LearningRateLogger()
    ]
    if wandb is not None:
        callbacks.append(WandbCallback())
    if opts["log_dir"]:
        # Timestamped run subdirectory, e.g. "2301011230" (leading century dropped).
        prefix = datetime.now().strftime('%Y%m%d%H%M')[2:]
        ckpt_path = os.path.join(opts["log_dir"], prefix)
        # exist_ok=True already makes the isdir pre-check unnecessary.
        os.makedirs(ckpt_path, exist_ok=True)
        # "{epoch:02d}" is expanded per-epoch by ModelCheckpoint itself.
        ckptfile = os.path.join(ckpt_path, "epoch{epoch:02d}.h5")
        callbacks.append(
            ModelCheckpoint(epochs_per_save=opts["epochs_per_save"],
                            filepath=ckptfile,
                            save_best_only=False,
                            save_weights_only=True,
                            verbose=1))
    history = model.fit(train_datasets(),
                        epochs=opts["epochs"],
                        steps_per_epoch=opts["steps_per_epoch"],
                        callbacks=callbacks,
                        verbose=2)
    if wandb is not None:
        # Replay the whole loss history to wandb once training is done.
        for i in range(1, opts["epochs"] + 1):
            wandb.log({
                "epochs": i,
                "loss_train": history.history["loss"][i - 1],
            })
    return history
def main():
    """Train a WSDAN model on the RFW dataset, driven by optparse flags.

    Parses CLI options, sets up logging, builds train/val/test loaders,
    optionally resumes from a checkpoint, then runs the epoch loop calling
    the module-level train()/validate() helpers.
    """
    parser = OptionParser()
    parser.add_option('-j', '--workers', dest='workers', default=16, type='int', help='number of data loading workers (default: 16)')
    parser.add_option('-e', '--epochs', dest='epochs', default=80, type='int', help='number of epochs (default: 80)')
    parser.add_option('-b', '--batch-size', dest='batch_size', default=16, type='int', help='batch size (default: 16)')
    parser.add_option('-c', '--ckpt', dest='ckpt', default=False, help='load checkpoint model (default: False)')
    parser.add_option('-v', '--verbose', dest='verbose', default=100, type='int', help='show information for each <verbose> iterations (default: 100)')
    parser.add_option('--lr', '--learning-rate', dest='lr', default=1e-3, type='float', help='learning rate (default: 1e-3)')
    parser.add_option('--sf', '--save-freq', dest='save_freq', default=1, type='int', help='saving frequency of .ckpt models (default: 1)')
    parser.add_option('--sd', '--save-dir', dest='save_dir', default='./models/wsdan/', help='saving directory of .ckpt models (default: ./models/wsdan)')
    parser.add_option('--ln', '--log-name', dest='log_name', default='train.log', help='log name (default: train.log)')
    parser.add_option('--mn', '--model-name', dest='model_name', default='model.ckpt', help='model name (default:model.ckpt)')
    parser.add_option('--init', '--initial-training', dest='initial_training', default=1, type='int', help='train from 1-beginning or 0-resume training (default: 1)')
    (options, args) = parser.parse_args()

    ##################################
    # Initialize saving directory
    ##################################
    if not os.path.exists(options.save_dir):
        os.makedirs(options.save_dir)

    ##################################
    # Logging setting
    ##################################
    logging.basicConfig(
        filename=os.path.join(options.save_dir, options.log_name),
        filemode='w',
        format='%(asctime)s: %(levelname)s: [%(filename)s:%(lineno)d]: %(message)s',
        level=logging.INFO)
    warnings.filterwarnings("ignore")

    ##################################
    # Load dataset
    ##################################
    image_size = (256, 256)
    num_classes = 4
    # ImageNet normalization stats, matching the pretrained backbone.
    transform = transforms.Compose([transforms.Resize(size=image_size),
                                    transforms.ToTensor(),
                                    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                         std=[0.229, 0.224, 0.225])])
    train_dataset = CustomDataset(data_root='/mnt/HDD/RFW/train/data/', csv_file='data/RFW_Train40k_Images_Metada.csv', transform=transform)
    val_dataset = CustomDataset(data_root='/mnt/HDD/RFW/train/data/', csv_file='data/RFW_Val4k_Images_Metadata.csv', transform=transform)
    test_dataset = CustomDataset(data_root='/mnt/HDD/RFW/test/data/', csv_file='data/RFW_Test_Images_Metadata.csv', transform=transform)
    train_loader = DataLoader(train_dataset, batch_size=options.batch_size, shuffle=True, num_workers=options.workers, pin_memory=True)
    # Eval loaders use 4x batch size: no gradients are stored during eval.
    validate_loader = DataLoader(val_dataset, batch_size=options.batch_size * 4, shuffle=False, num_workers=options.workers, pin_memory=True)
    test_loader = DataLoader(test_dataset, batch_size=options.batch_size * 4, shuffle=False, num_workers=options.workers, pin_memory=True)

    ##################################
    # Initialize model
    ##################################
    logs = {}
    start_epoch = 0
    num_attentions = 32
    # NOTE(review): feature_net is created (twice, below) but never used in
    # this function — WSDAN builds its own backbone; candidate for removal.
    feature_net = inception_v3(pretrained=True)
    net = WSDAN(num_classes=num_classes, M=num_attentions, net='inception_mixed_6e', pretrained=True)

    # feature_center: size of (#classes, #attention_maps * #channel_features)
    # `device` is assumed to be a module-level torch.device — defined outside this view.
    feature_center = torch.zeros(num_classes, num_attentions * net.num_features).to(device)

    if options.ckpt:
        # Load ckpt and get state_dict
        checkpoint = torch.load(options.ckpt)
        # Get epoch and some logs
        logs = checkpoint['logs']
        start_epoch = int(logs['epoch'])
        # Load weights
        state_dict = checkpoint['state_dict']
        net.load_state_dict(state_dict)
        logging.info('Network loaded from {}'.format(options.ckpt))
        # load feature center
        if 'feature_center' in checkpoint:
            feature_center = checkpoint['feature_center'].to(device)
            logging.info('feature_center loaded from {}'.format(options.ckpt))

    logging.info('Network weights save to {}'.format(options.save_dir))

    # NOTE(review): everything from here to the next section duplicates the
    # checkpoint-loading block above (second inception_v3, second torch.load of
    # the same file) — it re-derives start_epoch from the filename instead of
    # logs['epoch'] and moves feature_center to cuda explicitly. Likely a
    # merge leftover; confirm which resume path is intended before removing.
    feature_net = inception_v3(pretrained=True)
    if options.ckpt:
        ckpt = options.ckpt
        if options.initial_training == 0:
            # Get Name (epoch): e.g. ".../12.ckpt" -> start_epoch 12.
            epoch_name = (ckpt.split('/')[-1]).split('.')[0]
            start_epoch = int(epoch_name)
        # Load ckpt and get state_dict
        checkpoint = torch.load(ckpt)
        state_dict = checkpoint['state_dict']
        # Load weights
        net.load_state_dict(state_dict)
        logging.info('Network loaded from {}'.format(options.ckpt))
        # load feature center
        if 'feature_center' in checkpoint:
            feature_center = checkpoint['feature_center'].to(torch.device("cuda"))
            logging.info('feature_center loaded from {}'.format(options.ckpt))

    ##################################
    # Use cuda
    ##################################
    net.to(device)
    if torch.cuda.device_count() > 1:
        net = nn.DataParallel(net)

    ##################################
    # Optimizer, LR Scheduler
    ##################################
    # Resume the last logged LR if present in the checkpoint logs.
    learning_rate = logs['lr'] if 'lr' in logs else options.lr
    optimizer = torch.optim.SGD(net.parameters(), lr=learning_rate, momentum=0.9, weight_decay=1e-5)
    # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.9, patience=2)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.9)

    ##################################
    # ModelCheckpoint
    ##################################
    # `raw_metric` is assumed to be a module-level metric object — defined outside this view.
    callback_monitor = 'val_{}'.format(raw_metric.name)
    callback = ModelCheckpoint(savepath=os.path.join(options.save_dir, options.model_name),
                               monitor=callback_monitor,
                               mode='max')
    if callback_monitor in logs:
        # Carry the best score over from the resumed run.
        callback.set_best_score(logs[callback_monitor])
    else:
        callback.reset()

    ##################################
    # TRAINING
    ##################################
    logging.info('')
    logging.info('Start training: Total epochs: {}, Batch size: {}, Training size: {}, Validation size: {}'.
                 format(options.epochs, options.batch_size, len(train_dataset), len(val_dataset)))
    for epoch in range(start_epoch, options.epochs):
        callback.on_epoch_begin()
        logs['epoch'] = epoch + 1
        logs['lr'] = optimizer.param_groups[0]['lr']
        logging.info('Epoch {:03d}, Learning Rate {:g}'.format(epoch + 1, optimizer.param_groups[0]['lr']))
        pbar = tqdm(total=len(train_loader), unit=' batches')
        pbar.set_description('Epoch {}/{}'.format(epoch + 1, options.epochs))
        # train()/validate() mutate `logs` in place (e.g. add 'val_loss').
        train(logs=logs, data_loader=train_loader, net=net, feature_center=feature_center, optimizer=optimizer, pbar=pbar)
        validate(logs=logs, data_loader=validate_loader, net=net, pbar=pbar)
        # ReduceLROnPlateau needs the monitored value; StepLR does not.
        if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
            scheduler.step(logs['val_loss'])
        else:
            scheduler.step()
        callback.on_epoch_end(logs, net, feature_center=feature_center)
        pbar.close()
# Build, compile and train the U-Net for this experiment; the best model
# (by val_loss) is checkpointed and training stops early on a plateau.
# Class weights: third class (index 2) is ignored by the weighted loss.
weights = [0.5, 0.5, 0]
loss = weighted_categorical_crossentropy(weights)
model = unet((rows, cols, channels))
model.compile(optimizer=adam, loss=loss, metrics=['accuracy'])
# print model information
model.summary()
filepath = 'models/'
# define early stopping callback: stop after 10 epochs without val_loss improvement
earlystop = EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=10,
                          verbose=1, mode='min')
checkpoint = ModelCheckpoint(filepath + 'unet_exp_' + str(exp) + '.h5',
                             monitor='val_loss', verbose=1,
                             save_best_only=True, mode='min')
callbacks_list = [earlystop, checkpoint]
# train the model
start_training = time.time()
model_info = model.fit(patches_tr_aug, patches_tr_ref_aug_h,
                       batch_size=batch_size, epochs=100,
                       callbacks=callbacks_list, verbose=2,
                       validation_data=(patches_val_aug, patches_val_ref_aug_h))
# BUG FIX: the timer variable is `start_training`, not `start_time`
# (the original referenced an undefined name, raising NameError here).
end_training = time.time() - start_training
#%% Test model
def run_train(args):
    """Train a CNN-LSTM-CRF tagger with BERT embeddings.

    Builds tokenizer/label maps, loads train/dev data (optionally filtered by
    args.type), initializes the model from a pretrained BERT checkpoint's
    embedding weights, then delegates the training loop to Trainer.
    """
    processor = BertProcessor(vocab_path=os.path.join(
        args.pretrained_model,
        'vocab.txt',
    ), test_mode=args.test_mode, do_lower_case=args.do_lower_case)
    label_list = processor.get_labels()
    label2id = {label: i for i, label in enumerate(label_list)}
    # Default cache paths; overridden below when a type filter is active.
    train_cache_sample = config['data_dir'] / f"cached_train_seq_examples"
    train_cache_feature = config['data_dir'] / f"cached_train_seq_features"
    if args.type:
        train_data = processor.read_type_data(os.path.join(
            config['data_dir'], "train.jsonl"), type=args.type)
        valid_data = processor.read_type_data(os.path.join(
            config['data_dir'], "dev.jsonl"), type=args.type)
        train_cache_sample = config[
            'data_dir'] / f"cached_train_seq_examples_{args.type}"
        train_cache_feature = config[
            'data_dir'] / f"cached_train_seq_features_{args.type}"
    else:
        train_data = processor.read_data(
            os.path.join(config['data_dir'], "train.jsonl"))
        valid_data = processor.read_data(
            os.path.join(config['data_dir'], "dev.jsonl"))
    if args.early_stop:
        # Stop when dev F1 fails to improve for 3 evaluations.
        early_stopping = EarlyStopping(patience=3,
                                       monitor="f1",
                                       baseline=0,
                                       mode='max')
    else:
        early_stopping = None
    train_dataset = convert_data_to_tensor(
        processor=processor,
        args=args,
        data=train_data,
        type="train",
        cache_sample_path=train_cache_sample,
        cache_feature_path=train_cache_feature,
        save_cache=False)
    # Sequential sampling keeps the (length-)sorted order; otherwise shuffle.
    if args.sorted:
        train_sampler = SequentialSampler(train_dataset)
    else:
        train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)
    valid_dataset = convert_data_to_tensor(
        processor=processor,
        args=args,
        data=valid_data,
        type="dev",
        cache_sample_path=config['data_dir'] / f"cached_dev_seq_examples",
        cache_feature_path=config['data_dir'] / f"cached_dev_seq_features",
        save_cache=False)
    valid_sampler = SequentialSampler(valid_dataset)
    valid_dataloader = DataLoader(valid_dataset,
                                  sampler=valid_sampler,
                                  batch_size=args.eval_batch_size)
    bert_config = BertConfig.from_json_file(
        os.path.join(args.pretrained_model, "config.json"))
    model = CNNLSTMCRF(config=bert_config,
                       label2id=label2id,
                       device=args.device)
    # Seed the model's embedding layer with the pretrained BERT embeddings.
    ckpt = torch.load(os.path.join(args.pretrained_model, "pytorch_model.bin"))
    if "state_dict" in ckpt:
        state_dict = ckpt["state_dict"]
    else:
        state_dict = ckpt
    for key in list(state_dict.keys()):
        if 'embedding' in key:
            # strip the "bert.embeddings." prefix so keys match model.BERTEm
            new_key = key.replace("bert.embeddings.", "")
            state_dict[new_key] = state_dict.pop(key)
    try:
        model.BERTEm.load_state_dict(state_dict, strict=True)
    except Exception as e:
        # NOTE(review): load failure is only printed, never fatal — training
        # would silently continue with random embeddings. Confirm intended.
        print(e)
    model = model.to(args.device)
    # Total optimizer steps over the whole run (accounts for grad accumulation).
    t_total = int(
        len(train_dataloader) / args.gradient_accumulation_steps * args.epochs)
    optimizer = RMSprop(model.parameters(), lr=args.learning_rate)
    lr_scheduler = BERTReduceLROnPlateau(optimizer,
                                         lr=args.learning_rate,
                                         mode=args.mode,
                                         factor=0.5,
                                         patience=1,
                                         verbose=1,
                                         epsilon=1e-8,
                                         cooldown=0,
                                         min_lr=0,
                                         eps=1e-8)
    model_checkpoint = ModelCheckpoint(checkpoint_dir=args.model_path,
                                       mode=args.mode,
                                       monitor=args.monitor,
                                       arch=args.arch,
                                       save_best_only=args.save_best)
    # **************************** training model ***********************
    logger.info("***** Running training *****")
    logger.info(" Num Epochs = %d", args.epochs)
    logger.info(
        " Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info(" Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info(" Total optimization steps = %d", t_total)
    tb_logger = Tensorboard_Logger(
        log_dir=os.path.join(args.model_path, config['output']))
    trainer = Trainer(
        n_gpu=args.n_gpu,
        model=model,
        logger=logger,
        tb_logger=tb_logger,
        optimizer=optimizer,
        lr_scheduler=lr_scheduler,
        label2id=label2id,
        grad_clip=args.grad_clip,
        model_checkpoint=model_checkpoint,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        early_stopping=early_stopping,
        partial=args.partial,
        trigger=args.trigger)
    trainer.train(train_data=train_dataloader,
                  valid_data=valid_dataloader,
                  epochs=args.epochs,
                  seed=args.seed)
        # --- tail of a loss method whose `def` is outside this view ---
        # Label-smoothing combination: weighted NLL of the target class plus
        # the mean log-prob over all classes, averaged over the batch.
        nll_loss = nll_loss.squeeze(1)
        smooth_loss = -logprobs.mean(dim=-1)
        loss = confidence * nll_loss + self.smoothing * smooth_loss
        return loss.mean()

# --- script: loss/optimizer/scheduler setup and epoch loop ---
if use_lsmooth:
    train_loss = LabelSmoothingCrossEntropy(smoothing=0.1)
else:
    train_loss = nn.CrossEntropyLoss()
# Plain CE is always used for the reported 'CE' metric, regardless of
# which loss drives the gradient updates.
loss = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=base_lrate, momentum=0.9)
metrics = {'CE': loss, 'accuracy': accuracy}
# Halve the LR every 50 epochs.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.5)
model_checkpoint = ModelCheckpoint(model, 'best_model_weights.pt')
train_metrics_history = {'times': [], 'loss': [], 'acc': []}
val_metrics_history = {'times': [], 'loss': [], 'acc': []}
max_epochs = 150
for epoch in range(max_epochs):  # loop over the dataset multiple times
    train(model, train_loader, train_loss, optimizer, device, metrics)
    scheduler.step()
    # Re-evaluate on the training set to record post-epoch metrics.
    train_metrics = test(model, train_loader, device, metrics)
    train_metrics_history['times'].append(epoch + 1)
    train_metrics_history['acc'].append(train_metrics['accuracy'])
    train_metrics_history['loss'].append(train_metrics['CE'])
# --- tail of a DataLoader(...) call whose opening is outside this view ---
                    batch_size=180, num_workers=2, pin_memory=True)

# Self-supervised pretraining setup: ResNet-18 encoder + noise-contrastive
# estimation against a memory bank of per-sample representations.
encoder = Resnet18_Encoder().to(device)
optimizer = optim.SGD(encoder.parameters(), lr=lr, momentum=0.9)
if resume_from_previous_state:
    # Restore both model and optimizer state to continue a previous run.
    previous = torch.load(previous_state_path)
    encoder.load_state_dict(previous['model'])
    optimizer.load_state_dict(previous['optimizer'])
# One memory slot per dataset sample; initialized from a full forward pass.
memory = Memory(size=len(dataset), weight=0.5, device=device)
memory.initialize(encoder, train_loader)
checkpoint = ModelCheckpoint(mode='min', directory=checkpoint_dir)
noise_contrastive_estimator = NoiseContrastiveEstimator(device)
logger = Logger(log_filename)
loss_weight = 0.5

###########################
######## TRAINING #########
###########################
for epoch in range(starting_epoch + 1, 1000):
    print('\nEpoch: {}'.format(epoch))
    memory.update_weighted_count()
    train_loss = AverageMeter('train_loss')
    # NOTE: loop body continues beyond this view.
    bar = Progbar(len(train_loader), stateful_metrics=['train_loss', 'valid_loss'])
def main():
    """Train a binary-weight CNN on CIFAR-10 with Theano (Python 2 script).

    Gradients are taken w.r.t. the binarized parameters but applied (with
    +-1 clipping) to the full-precision shadow weights; the best model by
    validation accuracy is reloaded for the final test-set report.
    """
    train_x, train_y, valid_x, valid_y, test_x, test_y = get_cifar10('./cifar-10-batches-py/')
    labels = unpickle('./cifar-10-batches-py/batches.meta')['label_names']
    # Scale pixel values to [0, 1].
    train_x = train_x.astype(np.float32) / 255.0
    valid_x = valid_x.astype(np.float32) / 255.0
    test_x = test_x.astype(np.float32) / 255.0
    num_epochs = args.epochs
    eta = args.lr
    batch_size = args.batch_size

    # symbolic inputs: image batch and integer label vector
    x = T.tensor4("x")
    y = T.ivector("y")

    # network definition: 3x (binary conv -> relu -> pool), then 2 binary FC layers
    conv1 = BinaryConv2D(input=x, num_filters=50, input_channels=3, size=3, strides=(1,1), padding=1, name="conv1")
    act1 = Activation(input=conv1.output, activation="relu", name="act1")
    pool1 = Pool2D(input=act1.output, stride=(2,2), name="pool1")
    conv2 = BinaryConv2D(input=pool1.output, num_filters=100, input_channels=50, size=3, strides=(1,1), padding=1, name="conv2")
    act2 = Activation(input=conv2.output, activation="relu", name="act2")
    pool2 = Pool2D(input=act2.output, stride=(2,2), name="pool2")
    conv3 = BinaryConv2D(input=pool2.output, num_filters=200, input_channels=100, size=3, strides=(1,1), padding=1, name="conv3")
    act3 = Activation(input=conv3.output, activation="relu", name="act3")
    pool3 = Pool2D(input=act3.output, stride=(2,2), name="pool3")
    flat = Flatten(input=pool3.output)
    # 200 channels at 4x4 spatial resolution after three 2x2 pools on 32x32 input
    fc1 = BinaryDense(input=flat.output, n_in=200*4*4, n_out=500, name="fc1")
    act4 = Activation(input=fc1.output, activation="relu", name="act4")
    fc2 = BinaryDense(input=act4.output, n_in=500, n_out=10, name="fc2")
    softmax = Activation(input=fc2.output, activation="softmax", name="softmax")

    # loss
    xent = T.nnet.nnet.categorical_crossentropy(softmax.output, y)
    cost = xent.mean()

    # errors (mean misclassification rate)
    y_pred = T.argmax(softmax.output, axis=1)
    errors = T.mean(T.neq(y, y_pred))

    # updates + clipping (+-1)
    params = conv1.params + conv2.params + conv3.params + fc1.params + fc2.params
    params_bin = conv1.params_bin + conv2.params_bin + conv3.params_bin + fc1.params_bin + fc2.params_bin
    grads = [T.grad(cost, param) for param in params_bin]  # calculate grad w.r.t binary parameters
    updates = []
    for p,g in zip(params, grads):
        updates.append(
            (p, clip_weights(p - eta*g))  # sgd + clipping update on the full-precision weights
        )

    # compiling train, predict and test fxns
    train = theano.function(
        inputs = [x,y],
        outputs = cost,
        updates = updates
    )
    predict = theano.function(
        inputs = [x],
        outputs = y_pred
    )
    test = theano.function(
        inputs = [x,y],
        outputs = errors
    )

    # train
    checkpoint = ModelCheckpoint(folder="snapshots")
    logger = Logger("logs/{}".format(time()))
    for epoch in range(num_epochs):
        print "Epoch: ", epoch
        print "LR: ", eta
        epoch_hist = {"loss": []}
        t = tqdm(range(0, len(train_x), batch_size))
        for lower in t:
            upper = min(len(train_x), lower + batch_size)
            loss = train(train_x[lower:upper], train_y[lower:upper].astype(np.int32))
            t.set_postfix(loss="{:.2f}".format(float(loss)))
            epoch_hist["loss"].append(loss.astype(np.float32))
        # epoch loss
        average_loss = sum(epoch_hist["loss"])/len(epoch_hist["loss"])
        t.set_postfix(loss="{:.2f}".format(float(average_loss)))
        logger.log_scalar(
            tag="Training Loss",
            value= average_loss,
            step=epoch
        )
        # validation accuracy (1 - error rate); checkpoint keeps the best
        val_acc = 1.0 - test(valid_x, valid_y.astype(np.int32))
        print "Validation Accuracy: ", val_acc
        logger.log_scalar(
            tag="Validation Accuracy",
            value= val_acc,
            step=epoch
        )
        checkpoint.check(val_acc, params)

    # Report Results on test set (w/ best val acc file)
    best_val_acc_filename = checkpoint.best_val_acc_filename
    print "Using ", best_val_acc_filename, " to calculate best test acc."
    load_model(path=best_val_acc_filename, params=params)
    test_acc = 1.0 - test(test_x, test_y.astype(np.int32))
    print "Test accuracy: ",test_acc
def main():
    """Train a WSDAN model on CarDataset, driven by a module-level `config`.

    Sets up save dir and logging, builds loaders, optionally resumes from a
    checkpoint, then runs the epoch loop calling train()/validate().
    """
    ##################################
    # Initialize saving directory
    ##################################
    if not os.path.exists(config.save_dir):
        os.makedirs(config.save_dir)

    ##################################
    # Logging setting
    ##################################
    logging.basicConfig(
        filename=os.path.join(config.save_dir, config.log_name),
        filemode='w',
        format=
        '%(asctime)s: %(levelname)s: [%(filename)s:%(lineno)d]: %(message)s',
        level=logging.INFO)
    warnings.filterwarnings("ignore")

    ##################################
    # Load dataset
    ##################################
    # train_dataset, validate_dataset = get_trainval_datasets(config.tag, config.image_size)
    full_train_dataset = CarDataset('train')
    n = len(full_train_dataset)
    # train_dataset, validate_dataset = torch.utils.data.random_split(full_train_dataset, [int(n*0.8), n-int(n*0.8)])
    # NOTE(review): train and validation are the SAME dataset here (the
    # random_split above is commented out) — validation metrics will not
    # measure generalization. Confirm this is intentional.
    train_dataset = full_train_dataset
    validate_dataset = full_train_dataset
    train_loader, validate_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True, num_workers=config.workers, pin_memory=True), \
                                    DataLoader(validate_dataset, batch_size=config.batch_size * 4, shuffle=False, num_workers=config.workers, pin_memory=True)
    num_classes = full_train_dataset.num_classes

    ##################################
    # Initialize model
    ##################################
    logs = {}
    start_epoch = 0
    net = WSDAN(num_classes=num_classes, M=config.num_attentions, net=config.net, pretrained=True)

    # feature_center: size of (#classes, #attention_maps * #channel_features)
    # `device` is assumed to be a module-level torch.device — defined outside this view.
    feature_center = torch.zeros(num_classes, config.num_attentions * net.num_features).to(device)

    if config.ckpt:
        # Load ckpt and get state_dict
        checkpoint = torch.load(config.ckpt)
        # Get epoch and some logs
        logs = checkpoint['logs']
        start_epoch = int(logs['epoch'])
        # Load weights
        state_dict = checkpoint['state_dict']
        net.load_state_dict(state_dict)
        logging.info('Network loaded from {}'.format(config.ckpt))
        # load feature center
        if 'feature_center' in checkpoint:
            feature_center = checkpoint['feature_center'].to(device)
            logging.info('feature_center loaded from {}'.format(config.ckpt))

    logging.info('Network weights save to {}'.format(config.save_dir))

    ##################################
    # Use cuda
    ##################################
    net.to(device)
    if torch.cuda.device_count() > 1:
        net = nn.DataParallel(net)

    ##################################
    # Optimizer, LR Scheduler
    ##################################
    # Resume the last logged LR if present in the checkpoint logs.
    learning_rate = logs['lr'] if 'lr' in logs else config.learning_rate
    optimizer = torch.optim.SGD(net.parameters(), lr=learning_rate, momentum=0.9, weight_decay=1e-5)
    # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.9, patience=2)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.9)

    ##################################
    # ModelCheckpoint
    ##################################
    # `raw_metric` is assumed to be a module-level metric object — defined outside this view.
    callback_monitor = 'val_{}'.format(raw_metric.name)
    callback = ModelCheckpoint(savepath=os.path.join(config.save_dir, config.model_name),
                               monitor=callback_monitor,
                               mode='max')
    if callback_monitor in logs:
        # Carry the best score over from the resumed run.
        callback.set_best_score(logs[callback_monitor])
    else:
        callback.reset()

    ##################################
    # TRAINING
    ##################################
    logging.info(
        'Start training: Total epochs: {}, Batch size: {}, Training size: {}, Validation size: {}'
        .format(config.epochs, config.batch_size, len(train_dataset), len(validate_dataset)))
    logging.info('')
    for epoch in range(start_epoch, config.epochs):
        callback.on_epoch_begin()
        logs['epoch'] = epoch + 1
        logs['lr'] = optimizer.param_groups[0]['lr']
        logging.info('Epoch {:03d}, Learning Rate {:g}'.format(
            epoch + 1, optimizer.param_groups[0]['lr']))
        pbar = tqdm(total=len(train_loader), unit=' batches')
        pbar.set_description('Epoch {}/{}'.format(epoch + 1, config.epochs))
        # train()/validate() mutate `logs` in place (e.g. add 'val_loss').
        train(logs=logs, data_loader=train_loader, net=net, feature_center=feature_center, optimizer=optimizer, pbar=pbar)
        validate(logs=logs, data_loader=validate_loader, net=net, pbar=pbar)
        # ReduceLROnPlateau needs the monitored value; StepLR does not.
        if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
            scheduler.step(logs['val_loss'])
        else:
            scheduler.step()
        callback.on_epoch_end(logs, net, feature_center=feature_center)
        pbar.close()
def run_train(args):
    """Train a BERT-CRF tagger with per-component optimizer groups.

    Builds tokenizer/label maps, loads train/dev data (optionally filtered by
    args.type), instantiates BERTCRF from a pretrained checkpoint (or a resume
    path), groups parameters for decoupled weight decay / learning rates, then
    delegates the training loop to Trainer.
    """
    processor = BertProcessor(vocab_path=os.path.join(
        args.pretrained_model,
        'vocab.txt',
    ), test_mode=args.test_mode, do_lower_case=args.do_lower_case)
    label_list = processor.get_labels()
    label2id = {label: i for i, label in enumerate(label_list)}
    # Default cache paths; overridden below when a type filter is active.
    train_cache_sample = config['data_dir'] / f"cached_train_seq_examples"
    train_cache_feature = config['data_dir'] / f"cached_train_seq_features"
    if args.type:
        train_data = processor.read_type_data(os.path.join(
            config['data_dir'], "train.jsonl"), type=args.type)
        valid_data = processor.read_type_data(os.path.join(
            config['data_dir'], "dev.jsonl"), type=args.type)
        train_cache_sample = config[
            'data_dir'] / f"cached_train_seq_examples_{args.type}"
        train_cache_feature = config[
            'data_dir'] / f"cached_train_seq_features_{args.type}"
    else:
        train_data = processor.read_data(
            os.path.join(config['data_dir'], "train.jsonl"))
        valid_data = processor.read_data(
            os.path.join(config['data_dir'], "dev.jsonl"))
    if args.early_stop:
        # Stop when dev F1 fails to improve for 3 evaluations.
        early_stopping = EarlyStopping(patience=3,
                                       monitor="f1",
                                       baseline=0,
                                       mode='max')
    else:
        early_stopping = None
    train_dataset = convert_data_to_tensor(
        processor=processor,
        args=args,
        data=train_data,
        type="train",
        cache_sample_path=train_cache_sample,
        cache_feature_path=train_cache_feature,
        save_cache=False)
    # Sequential sampling keeps the (length-)sorted order; otherwise shuffle.
    if args.sorted:
        train_sampler = SequentialSampler(train_dataset)
    else:
        train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)
    valid_dataset = convert_data_to_tensor(
        processor=processor,
        args=args,
        data=valid_data,
        type="dev",
        cache_sample_path=config['data_dir'] / f"cached_dev_seq_examples",
        cache_feature_path=config['data_dir'] / f"cached_dev_seq_features",
        save_cache=False)
    valid_sampler = SequentialSampler(valid_dataset)
    valid_dataloader = DataLoader(valid_dataset,
                                  sampler=valid_sampler,
                                  batch_size=args.eval_batch_size)
    # `model` holds the class here; instantiated via from_pretrained below.
    model = BERTCRF
    bert_config = BertConfig.from_json_file(
        os.path.join(args.pretrained_model, "config.json"))
    # Truncate the encoder to args.depth transformer layers.
    bert_config.num_hidden_layers = args.depth
    if args.resume_path:
        args.resume_path = Path(args.resume_path)
        model = model.from_pretrained(args.resume_path,
                                      label2id=label2id,
                                      device=args.device,
                                      config=bert_config)
    else:
        model = model.from_pretrained(args.pretrained_model,
                                      label2id=label2id,
                                      device=args.device,
                                      config=bert_config)
    model = model.to(args.device)
    # Total optimizer steps over the whole run (accounts for grad accumulation).
    t_total = int(
        len(train_dataloader) / args.gradient_accumulation_steps * args.epochs)
    bert_param_optimizer = list(model.bert.named_parameters())
    crf_param_optimizer = list(model.crf.named_parameters())
    linear_param_optimizer = list(model.classifier.named_parameters())
    # Standard BERT recipe: no weight decay on biases and LayerNorm params.
    # BERT params use args.learning_rate; CRF and classifier use a fixed 1e-3.
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in bert_param_optimizer
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.01,
        'lr': args.learning_rate
    }, {
        'params': [
            p for n, p in bert_param_optimizer
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.0,
        'lr': args.learning_rate
    }, {
        'params': [
            p for n, p in crf_param_optimizer
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.01,
        'lr': 0.001
    }, {
        'params':
        [p for n, p in crf_param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0,
        'lr': 0.001
    }, {
        'params': [
            p for n, p in linear_param_optimizer
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.01,
        'lr': 0.001
    }, {
        'params': [
            p for n, p in linear_param_optimizer
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.0,
        'lr': 0.001
    }]
    if args.optimizer == 'adam':
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=t_total)
    else:
        raise ValueError("unknown optimizer")
    lr_scheduler = BERTReduceLROnPlateau(optimizer,
                                         lr=args.learning_rate,
                                         mode=args.mode,
                                         factor=0.5,
                                         patience=1,
                                         verbose=1,
                                         epsilon=1e-8,
                                         cooldown=0,
                                         min_lr=0,
                                         eps=1e-8)
    model_checkpoint = ModelCheckpoint(checkpoint_dir=args.model_path,
                                       mode=args.mode,
                                       monitor=args.monitor,
                                       arch=args.arch,
                                       save_best_only=args.save_best)
    # **************************** training model ***********************
    logger.info("***** Running training *****")
    logger.info(" Num Epochs = %d", args.epochs)
    logger.info(
        " Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info(" Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info(" Total optimization steps = %d", t_total)
    tb_logger = Tensorboard_Logger(
        log_dir=os.path.join(args.model_path, config['output']))
    trainer = Trainer(
        n_gpu=args.n_gpu,
        model=model,
        logger=logger,
        tb_logger=tb_logger,
        optimizer=optimizer,
        lr_scheduler=lr_scheduler,
        label2id=label2id,
        grad_clip=args.grad_clip,
        model_checkpoint=model_checkpoint,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        early_stopping=early_stopping,
        partial=args.partial,
        trigger=args.trigger)
    trainer.train(train_data=train_dataloader,
                  valid_data=valid_dataloader,
                  epochs=args.epochs,
                  seed=args.seed)
    # --- tail of a forward() method whose `def` (and docstring opening) is
    # outside this view; dispatches on `mode` to two feature extractors ---
    mode 0: get 128 feature for image, mode 1: get 128 feature for image and patches '''
    if mode == 0:
        return self.return_reduced_image_features(images)
    if mode == 1:
        return self.return_reduced_image_patches_features(images, patches)


# --- script: NCE pretraining setup with a per-sample memory bank ---
net = Network().to(device)
optimizer = optim.SGD(net.parameters(), lr=lr, momentum=0.9)
# One memory slot per dataset sample; initialized from a full forward pass.
memory = Memory(size=len(dataset), weight=0.5, device=device)
memory.initialize(net, train_loader)
checkpoint = ModelCheckpoint(mode='min', directory=checkpoint_dir)
noise_contrastive_estimator = NoiseContrastiveEstimator(device)
logger = Logger(log_filename)
loss_weight = 0.5

for epoch in range(1000):
    print('\nEpoch: {}'.format(epoch))
    memory.update_weighted_count()
    train_loss = AverageMeter('train_loss')
    bar = Progbar(len(train_loader), stateful_metrics=['train_loss', 'valid_loss'])
    # NOTE: loop body continues beyond this view.
    for step, batch in enumerate(train_loader):
        # prepare batch
# Compile and fit the model with periodic checkpointing. The checkpoint
# cadence is either per-epoch or per-iteration, selected by
# args["checkpoint_type"]; the frequency must fit inside the run.
model.compile(optimizer=optimizer, loss=loss.compute)
num_iterations_per_epoch = num_training_samples // args["batch_size"]
if args["checkpoint_type"] == "epoch":
    assert args["checkpoint_frequency"] < args[
        "epochs"], "checkpoint_frequency must be smaller than epochs."
elif args["checkpoint_type"] == "iteration":
    assert args["checkpoint_frequency"] < num_iterations_per_epoch * args[
        "epochs"], "checkpoint_frequency must be smaller than num_iterations_per_epoch * args_epochs"
if args["checkpoint"] is not None:
    # Warm-start from an existing checkpoint; by_name matches layers by name.
    assert os.path.exists(args["checkpoint"]), "checkpoint does not exist"
    model.load_weights(args["checkpoint"], by_name=True)
# Exactly one of epoch_frequency / iteration_frequency is set; the other is None.
model.fit(x=training_data_generator,
          validation_data=validation_data_generator,
          batch_size=args["batch_size"],
          validation_batch_size=args["batch_size"],
          epochs=args["epochs"],
          initial_epoch=args["initial_epoch"],
          callbacks=[
              ModelCheckpoint(
                  initial_epoch=args["initial_epoch"],
                  output_dir=args["output_dir"],
                  epoch_frequency=args["checkpoint_frequency"]
                  if args["checkpoint_type"] == "epoch" else None,
                  iteration_frequency=args["checkpoint_frequency"]
                  if args["checkpoint_type"] == "iteration" else None,
              )
          ])
def main():
    """Train a 3-hidden-layer binarized MLP (784-2048-2048-2048-10) on MNIST.

    Builds a Theano computation graph with BinaryDense layers, trains with
    plain SGD on the full-precision weights (gradients taken w.r.t. the
    binary weights), tracks validation accuracy per epoch, checkpoints the
    best model, and finally reports test accuracy of the best checkpoint.

    NOTE(review): this is Python 2 code (statement-form `print`).
    Hyper-parameters come from a module-level `args` object.
    """
    train_x, train_y, valid_x, valid_y, test_x, test_y = get_mnist()
    num_epochs = args.epochs
    eta = args.lr          # SGD learning rate
    batch_size = args.batch_size

    # input
    # Symbolic inputs: x is a float matrix of flattened images, y int labels.
    x = T.matrix("x")
    y = T.ivector("y")
    #x.tag.test_value = np.random.randn(3, 784).astype("float32")
    #y.tag.test_value = np.array([1,2,3])
    #drop_switch.tag.test_value = 0
    #import ipdb; ipdb.set_trace()

    # Network: three binary-weight hidden layers with ReLU, softmax output.
    hidden_1 = BinaryDense(input=x, n_in=784, n_out=2048, name="hidden_1")
    act_1 = Activation(input=hidden_1.output, activation="relu", name="act_1")
    hidden_2 = BinaryDense(input=act_1.output, n_in=2048, n_out=2048, name="hidden_2")
    act_2 = Activation(input=hidden_2.output, activation="relu", name="act_2")
    hidden_3 = BinaryDense(input=act_2.output, n_in=2048, n_out=2048, name="hidden_3")
    act_3 = Activation(input=hidden_3.output, activation="relu", name="act_3")
    output = BinaryDense(input=act_3.output, n_in=2048, n_out=10, name="output")
    softmax = Activation(input=output.output, activation="softmax", name="softmax")

    # loss
    xent = T.nnet.nnet.categorical_crossentropy(softmax.output, y)
    cost = xent.mean()

    # errors (mean misclassification rate)
    y_pred = T.argmax(softmax.output, axis=1)
    errors = T.mean(T.neq(y, y_pred))

    # updates + clipping (+-1)
    # NOTE: the output layer's params are deliberately absent from both lists
    # here, so only the three hidden layers are updated by SGD below.
    params_bin = hidden_1.params_bin + hidden_2.params_bin + hidden_3.params_bin
    params = hidden_1.params + hidden_2.params + hidden_3.params
    grads = [T.grad(cost, param) for param in params_bin]  # calculate grad w.r.t binary parameters
    updates = []
    for p, g in zip(
            params, grads
    ):  # gradient update on full precision weights (NOT binarized wts)
        updates.append(
            (p, clip_weights(p - eta * g))  #sgd + clipping update
        )

    # compiling train, predict and test fxns
    train = theano.function(inputs=[x, y], outputs=cost, updates=updates)
    predict = theano.function(inputs=[x], outputs=y_pred)
    test = theano.function(inputs=[x, y], outputs=errors)

    # train
    checkpoint = ModelCheckpoint(folder="snapshots")
    logger = Logger("logs/{}".format(time()))
    for epoch in range(num_epochs):
        print "Epoch: ", epoch
        print "LR: ", eta
        epoch_hist = {"loss": []}
        t = tqdm(range(0, len(train_x), batch_size))
        for lower in t:
            # Clamp the final mini-batch to the dataset size.
            upper = min(len(train_x), lower + batch_size)
            loss = train(train_x[lower:upper],
                         train_y[lower:upper].astype(np.int32))
            t.set_postfix(loss="{:.2f}".format(float(loss)))
            epoch_hist["loss"].append(loss.astype(np.float32))

        # epoch loss
        average_loss = sum(epoch_hist["loss"]) / len(epoch_hist["loss"])
        t.set_postfix(loss="{:.2f}".format(float(average_loss)))
        logger.log_scalar(tag="Training Loss", value=average_loss, step=epoch)

        # validation accuracy (1 - error rate on the full validation set)
        val_acc = 1.0 - test(valid_x, valid_y.astype(np.int32))
        print "Validation Accuracy: ", val_acc
        logger.log_scalar(tag="Validation Accuracy", value=val_acc, step=epoch)
        # Checkpoint keeps the snapshot with the best validation accuracy.
        checkpoint.check(val_acc, params)

    # Report Results on test set
    best_val_acc_filename = checkpoint.best_val_acc_filename
    print "Using ", best_val_acc_filename, " to calculate best test acc."
    load_model(path=best_val_acc_filename, params=params)
    test_acc = 1.0 - test(test_x, test_y.astype(np.int32))
    print "Test accuracy: ", test_acc
def main():
    """Train a res2net50 classifier on the CervicalDataset.

    All hyper-parameters are read from the module-level ``config`` object.
    The routine: sets up logging, builds reproducible train/val splits,
    optionally resumes from ``config.ckpt``, then runs train/validate epochs
    with a StepLR schedule while a ModelCheckpoint callback keeps the
    best-scoring weights and a separate rolling "latest" snapshot is written
    each epoch.
    """
    ##################################
    # Initialize saving directory
    ##################################
    if not os.path.exists(config.save_dir):
        os.makedirs(config.save_dir)

    ##################################
    # Logging setting
    ##################################
    logging.basicConfig(
        filename=os.path.join(config.save_dir, config.log_name),
        filemode='w',
        format=
        '%(asctime)s: %(levelname)s: [%(filename)s:%(lineno)d]: %(message)s',
        level=logging.INFO)
    warnings.filterwarnings("ignore")

    ##################################
    # Load dataset
    ##################################
    # Fixed seed (777) makes the train/val index permutations reproducible.
    indices_train = np.random.RandomState(777).permutation(500)
    indices_test = np.random.RandomState(777).permutation(140)
    print(indices_test)
    print('tran_data_size', len(indices_train), 'val_data_size',
          len(indices_test))

    train_dataset = CervicalDataset(phase='train',
                                    resize=config.image_size,
                                    indices=indices_train)
    validate_dataset = CervicalDataset(phase='val',
                                       resize=config.image_size,
                                       indices=indices_test)

    # Validation batches are 4x larger (no gradient state is kept there).
    train_loader = DataLoader(train_dataset,
                              batch_size=config.batch_size,
                              shuffle=True,
                              num_workers=config.workers,
                              pin_memory=True)
    validate_loader = DataLoader(validate_dataset,
                                 batch_size=config.batch_size * 4,
                                 shuffle=False,
                                 num_workers=config.workers,
                                 pin_memory=True)
    num_classes = train_dataset.num_classes

    ##################################
    # Initialize model
    ##################################
    logs = {}
    start_epoch = 0
    net = res2net50(pretrained=True, num_classes=2)
    print(net)
    net.aux_logits = False

    if config.ckpt:
        # Load ckpt and get state_dict
        checkpoint = torch.load(config.ckpt)

        # Resume bookkeeping (epoch counter, monitored scores) from the logs.
        logs = checkpoint['logs']
        start_epoch = int(logs['epoch'])

        # Load weights
        net.load_state_dict(checkpoint['state_dict'])
        logging.info('Network loaded from {}'.format(config.ckpt))

        # load feature center (only present in some checkpoints)
        if 'feature_center' in checkpoint:
            feature_center = checkpoint['feature_center'].to(device)
            logging.info('feature_center loaded from {}'.format(config.ckpt))

    logging.info('Network weights save to {}'.format(config.save_dir))

    ##################################
    # Use cuda
    ##################################
    net.to(device)
    if torch.cuda.device_count() > 1:
        net = nn.DataParallel(net)

    ##################################
    # Optimizer, LR Scheduler
    ##################################
    learning_rate = config.learning_rate
    # Only parameters with requires_grad are optimized (frozen layers skipped).
    optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad,
                                       net.parameters()),
                                lr=learning_rate,
                                momentum=0.9,
                                weight_decay=1e-5)
    # Multiply LR by 0.9 every 2 epochs.
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=2,
                                                gamma=0.9)

    ##################################
    # ModelCheckpoint
    ##################################
    # Track the validation metric and keep the highest-scoring weights at
    # save_dir/model_name.
    callback_monitor = 'val_{}'.format(raw_metric.name)
    callback = ModelCheckpoint(savepath=os.path.join(config.save_dir,
                                                     config.model_name),
                               monitor=callback_monitor,
                               mode='max')
    if callback_monitor in logs:
        # Resuming: restore the best score so worse epochs don't overwrite it.
        callback.set_best_score(logs[callback_monitor])
    else:
        callback.reset()

    ##################################
    # TRAINING
    ##################################
    logging.info(
        'Start training: Total epochs: {}, Batch size: {}, Training size: {}, Validation size: {}'
        .format(config.epochs, config.batch_size, len(train_dataset),
                len(validate_dataset)))
    logging.info('')

    for epoch in range(start_epoch, config.epochs):
        callback.on_epoch_begin()
        logs['epoch'] = epoch + 1
        logs['lr'] = optimizer.param_groups[0]['lr']

        logging.info('Epoch {:03d}, Learning Rate {:g}'.format(
            epoch + 1, optimizer.param_groups[0]['lr']))

        pbar = tqdm(total=len(train_loader), unit=' batches')
        pbar.set_description('{}/{}'.format(epoch + 1, config.epochs))

        train(logs=logs,
              data_loader=train_loader,
              net=net,
              optimizer=optimizer,
              pbar=pbar)
        validate(logs=logs, data_loader=validate_loader, net=net, pbar=pbar)

        # StepLR steps unconditionally; the Plateau branch is kept so the
        # scheduler can be swapped without touching the loop.
        if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
            scheduler.step(logs['val_loss'])
        else:
            scheduler.step()

        # Lets the callback persist the best model if the monitor improved.
        callback.on_epoch_end(logs, net)

        # BUG FIX: the original saved the rolling "latest" weights to the SAME
        # path the ModelCheckpoint callback uses for its best model, clobbering
        # the best checkpoint every epoch. Save under a distinct name so both
        # the best and the latest snapshots survive.
        torch.save(
            net.state_dict(),
            os.path.join(config.save_dir, 'latest_' + config.model_name))
        pbar.close()
"Val loss: {:.3f}..".format(val_loss_epoch), "Val Accuracy: {:.3f}".format(val_accuracy_epoch)) tensorboard_writer.add_scalar('metrics/train_loss', train_loss_epoch, e) tensorboard_writer.add_scalar('metrics/val_loss', val_loss_epoch, e) tensorboard_writer.add_scalar('metrics/val_acc', val_accuracy_epoch, e) # saving best model state = { 'epoch': e + 1, 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict() } filepath = os.path.join(config.logdir, "best_model.pt") model_checkpoint = ModelCheckpoint(filepath, state) model_checkpoint.update(val_loss_epoch) end_epoch = time.time() print('time for epoch: ', (end_epoch - start_epoch)) print('\n') # saving model after every 5 iterations if (e % 5 == 0): file_name = "model_" + str(e+1) + '-' + "{:.3f}".format(val_loss_epoch) \ + '-' + "{:.3f}".format(val_accuracy_epoch) + '.pt' filepath = os.path.join(config.logdir, file_name) torch.save(state, filepath) plt.plot(train_losses, label="Training loss") plt.plot(val_losses, label="Validation loss")
# configure loss and optimizers criterion = nn.BCEWithLogitsLoss() opt_gen = torch.optim.Adam(gen.parameters(), lr=LR, betas=(beta1, beta2)) opt_disc = torch.optim.Adam(critic.parameters(), lr=LR, betas=(beta1, beta2)) # configure tensorboard writer repo = git.Repo(search_parent_directories=True) sha = repo.head.object.hexsha[:6] logdir = f"/home/bishwarup/GAN_experiments/dcgan/{sha}" writer = SummaryWriter(log_dir=logdir) # make a fixed noise to see the generator evolve over time on it fixed_noise = gen_noise(32, NOISE_DIM, device=device) # train loop checkpointer = ModelCheckpoint(logdir, freq=CKPT_FREQ, keep_n=KEEP_LAST_N_CKPT) best_fid = np.inf for epoch in range(EPOCHS): torch.cuda.empty_cache() gen.train() critic.train() lossD = AverageMeter("LossD") lossG = AverageMeter("LossG") global_step = 0 pbar = tqdm(enumerate(loader)) for n_iter, (real, _) in pbar: