def main():
    # Set the random seed for reproducible experiments
    torch.manual_seed(230)

    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', help="Directory containing the dataset")
    parser.add_argument('--model_dir', help="Directory containing params.json")
    parser.add_argument('--params', help='Path to params.json')
    parser.add_argument('--restore_file', default='best',
                        help="name of the file in --model_dir containing weights to load")
    args = parser.parse_args()

    params = utils.Params(args.params)

    # Get the logger
    utils.set_logger(os.path.join(params.model_dir, 'evaluate.log'))

    # Create the input data pipeline
    logging.info("Creating the dataset...")
    test_dataset = dataset(file_path=params.metadata_file, split="Test",
                           classes=params.classes)
    test_loader = DataLoader(dataset=test_dataset,
                             batch_size=params.batch_size,
                             shuffle=True, num_workers=8)
    logging.info("- done.")

    # Define the model
    if params.model != "Inception":
        net = importlib.import_module("features.models.{}".format(params.model))
        model = net.Net()
        inception = False
    else:
        model = models.inception_v3(pretrained=False)
        model.fc = nn.Linear(2048, len(params.classes))
        model.AuxLogits.fc = nn.Linear(768, 1)
        inception = True
    model.cuda()

    metrics_save = metrics_code.metrics_save

    logging.info("Starting evaluation")

    # Reload weights from the saved file
    utils.load_checkpoint(
        os.path.join(args.model_dir, args.restore_file + '.pth.tar'), model)

    # Evaluate (`experiment` is assumed to be a comet_ml Experiment created
    # elsewhere, as in the corresponding training script)
    test_metrics = evaluate(model, test_loader, metrics_save, experiment, inception)
    save_path = os.path.join(args.model_dir,
                             "metrics_test_{}.json".format(args.restore_file))
    utils.save_dict_to_json(test_metrics, save_path)
def train_single_model(params):
    # use GPU if available
    params.cuda = torch.cuda.is_available()

    # log into the appropriate directory
    utils.set_logger(os.path.join('train.log'))

    # Set the random seed for reproducible experiments
    torch.manual_seed(1337)
    if params.cuda:
        torch.cuda.manual_seed(1337)

    # dynamic import of net
    net = import_module('model.{}'.format(params.model_name))
    model = net.Net(params).cuda() if params.cuda else net.Net(params)
    if params.cuda and params.multi_gpu == 1 and torch.cuda.device_count() > 1:
        print('Using', torch.cuda.device_count(), 'GPUs.')
        model = nn.DataParallel(model)

    # Create the input data pipeline
    logging.info("Loading the datasets...")
    dataloaders = data_loader.fetch_dataloader(['train', 'test'], data_dir, params)
    train_dl = dataloaders['train']
    val_dl = dataloaders['test']

    optimizer = optim.Adam(model.parameters(), lr=params.learning_rate)

    # fetch loss function and metrics
    loss_fn = net.loss_fn
    metrics = net.metrics

    # Train the model
    logging.info("Starting training for {} epoch(s)".format(params.num_epochs))
    train_and_evaluate(model, train_dl, val_dl, optimizer, loss_fn, metrics, params,
                       'experiments/' + params.exp_name, restore_file)
def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args = parser.parse_args()

    set_logger(log_file=args.log_file, debug_mode=args.debug_mode)

    torch.manual_seed(args.random_seed)
    torch.cuda.manual_seed(args.random_seed)
    cudnn.benchmark = True

    train_loader = VideoIter(dataset_path=args.dataset_path,
                             annotation_path=args.annotation_path,
                             clip_length=args.clip_length,
                             frame_stride=args.frame_interval,
                             video_transform=build_transforms(),
                             name='Features extraction')

    train_iter = torch.utils.data.DataLoader(train_loader,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.num_workers,  # change this accordingly
                                             pin_memory=True)

    # Loading network
    if args.feature_extractor == 'c3d':
        network = C3D(pretrained=args.pretrained_c3d)
    elif args.feature_extractor == 'resnet':
        network = resnet(200)
        network.load_state_dict(
            torch.load('network/r3d200_K_200ep.pth')['state_dict'])
    network = network.to(device)

    if not path.exists(args.save_dir):
        mkdir(args.save_dir)

    features_writer = FeaturesWriter()

    for i_batch, (data, target, sampled_idx, dirs, vid_names) in tqdm(enumerate(train_iter)):
        with torch.no_grad():
            outputs = network(data.to(device)).detach().cpu().numpy()

            for i, (dir, vid_name, start_frame) in enumerate(
                    zip(dirs, vid_names, sampled_idx.cpu().numpy())):
                dir = path.join(args.save_dir, dir)
                features_writer.write(feature=outputs[i],
                                      video_name=vid_name,
                                      idx=start_frame,
                                      dir=dir)

    features_writer.dump()
def main(problem, model, hparams, experiment_dir, restore_dir, test_dir,
         overwrite_results, skip_generate_data):
    set_random_seed(230)

    tf.gfile.MakeDirs(experiment_dir)

    # Check that we are not overwriting some previous experiment
    if not overwrite_results:
        model_dir_has_best_weights = os.path.isdir(
            os.path.join(experiment_dir, "best_weights"))
        overwriting = model_dir_has_best_weights and restore_dir is None
        assert not overwriting, "Weights found in model_dir, aborting to avoid overwrite"

    utils.set_logger(os.path.join(experiment_dir, 'train.log'))

    # initialize the GoTrainer
    my_trainer = trainer.GoTrainer(problem, model, hparams, experiment_dir,
                                   skip_generate_data)

    # train and evaluate the network on the dev split
    my_trainer.train_and_evaluate(restore_from=restore_dir)

    utils.set_logger(os.path.join(experiment_dir, 'test.log'))

    # evaluate the network on the test split
    my_trainer.test(test_dir)
params.mode = args.mode
if params.gpu_id >= -1:
    params.cuda = True

# Set the random seed for reproducible experiments
torch.manual_seed(params.seed)
np.random.seed(params.seed)
random.seed(params.seed)
if params.gpu_id >= -1:
    torch.cuda.manual_seed(params.seed)
# must be True if you want fully reproducible results, but it slows training down
torch.backends.cudnn.deterministic = False
# https://discuss.pytorch.org/t/what-does-torch-backends-cudnn-benchmark-do/5936
cudnn.benchmark = True
torch.cuda.empty_cache()  # release cached GPU memory

# Set the logger
utils.set_logger(os.path.join(
    experiment_path,
    'certify_sigma_' + str(args.sigma).replace('.', '') + '.log'))
logger = logging.getLogger()

port, env = 8097, params.model_version
columnnames = list(range(1, params.model_args["num_class"] + 1))
rownames = list(range(1, params.model_args["num_class"] + 1))

# log all params
d_args = vars(args)
for k in d_args.keys():
    logging.info('{0}: {1}'.format(k, d_args[k]))
d_params = vars(params)
for k in d_params.keys():
    logger.info('{0}: {1}'.format(k, d_params[k]))
                        help='async/not-async requests')
    parser.add_argument('--config', '-c', type=str, default=None,
                        help='full path to conf.json')
    parser.add_argument('--datapath', '-d', type=str, default='./data/bible.txt')
    FLAGS = parser.parse_args()

    if not os.path.exists(FLAGS.folder):
        make_directory(FLAGS.folder)
    set_logger(os.path.join(FLAGS.folder, 'train.log'))

    if FLAGS.config is None:
        FLAGS.config = os.path.join(FLAGS.folder, 'config.json')
        if not os.path.isfile(FLAGS.config):
            raise FileNotFoundError('config.json is not found!')

    params = Params(jsonpath=FLAGS.config)

    logging.info('Start word2vec training pipeline! Params:')
    logging.info(json.dumps(params.__dict__, indent=True))

    if params.model not in ['hier_softmax', 'neg_sampling']:
        raise NotImplementedError(f"{params.model} model is not supported!")

    # load data:
    logging.info('Loading data:')
DIR_DATA_ALL = DIR_CURRENT / 'data' / 'processed' / args.data_compressed
DIR_DATA_EACH = DIR_CURRENT / 'data' / 'processed' / args.data_compressed / 'each_trial'
DIR_LOG = DIR_CURRENT / 'results' / 'logs'
folder_name = str(args.exp + "_" + args.decode_type + "_" + str(args.tap_size))
# TODO: Fix below checkpoint path when open sourcing the code
DIR_CHECK = DIR_CURRENT / 'results' / 'checkpoints' / folder_name
# DIR_CHECK = Path("/media/snakagom/UUI/Dropbox/UH/phd_aim1/results/checkpoints") / folder_name
DIR_FIMPORTANCE = DIR_CURRENT / 'results' / 'feature_importance' / folder_name
DIR_RESULTS_SUMMARY_EACH = DIR_CURRENT / 'results' / 'summary' / folder_name / 'each_trial'

# Define log file
log_fname = "fi_" + args.exp + "_" + args.decode_type + "_" + str(args.tap_size) + args.name_log
log = set_logger(str(DIR_LOG), log_fname)

# Check the directories to save files and create them if they don't exist
dir_maker(DIR_DATA_ALL)
dir_maker(DIR_DATA_EACH)
dir_maker(DIR_LOG)
dir_maker(DIR_CHECK)
dir_maker(DIR_FIMPORTANCE)
dir_maker(DIR_RESULTS_SUMMARY_EACH)

CHAN_INFO = Path("chan46.csv")
assert CHAN_INFO.is_file(), "Channel location file doesn't exist."
with open(str(CHAN_INFO)) as csvfile:
    chan_info = list(csv.reader(csvfile))[0]
chan_info.append('Baseline')  # We will run without any permutation at the end

# use GPU if available
if params.gpu_id >= -1:
    params.cuda = True

# Set the random seed for reproducible experiments
torch.manual_seed(params.seed)
np.random.seed(params.seed)
random.seed(params.seed)
if params.gpu_id >= -1:
    torch.cuda.manual_seed(params.seed)
# must be True if you want fully reproducible results, but it slows training down
torch.backends.cudnn.deterministic = False
# https://discuss.pytorch.org/t/what-does-torch-backends-cudnn-benchmark-do/5936
cudnn.benchmark = True
torch.cuda.empty_cache()  # release cached GPU memory

# Set the logger
if params.mode == 'train':
    utils.set_logger(os.path.join(experiment_path, 'train.log'))
elif params.mode == 'test':
    utils.set_logger(os.path.join(experiment_path, 'test.log'))
elif params.mode == 'load_train':
    utils.set_logger(os.path.join(experiment_path, 'load_train.log'))
logger = logging.getLogger()

port, env = 8098, args.visdom_env
columnnames = list(range(1, params.model_args["num_class"] + 1))
rownames = list(range(1, params.model_args["num_class"] + 1))

loss_logger = VisdomPlotLogger('line', port=port,
                               opts={
                                   'title':
# Set the random seed for reproducible experiments
seed = 42
torch.manual_seed(seed)
random.seed(seed)
torch.backends.cudnn.deterministic = True
np.random.seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)

# Set the logger
log_dir = os.path.join(args.model_dir_transfer, "logs")
if not os.path.exists(log_dir):
    print("Making log directory {}".format(log_dir))
    os.mkdir(log_dir)
utils.set_logger(os.path.join(log_dir, "train.log"))

# Create the input data pipeline
logging.info("Loading the datasets...")

# fetch dataloaders
params_transfer.encoding = params_transfer.encoding_source
train_dl = dataloader.fetch_dataloader(
    args.data_dir, args.txt_train, 'train', params_transfer)
val_dl_source = dataloader.fetch_dataloader(
    args.data_dir, args.txt_val_source, 'val', params_transfer)

params_transfer.encoding = params_transfer.encoding_target
val_dl_target = dataloader.fetch_dataloader(
    args.data_dir, args.txt_val_target, 'val', params_transfer)
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Chinese NER Task')
    parser.add_argument('--model', type=str, required=True,
                        help='choose a model: bilstm, bilstm_crf, hmm, cnn')
    args = parser.parse_args()

    model_name = args.model
    import_model = import_module('model.' + model_name)
    config = import_model.Config()
    random_seed(config.seed)
    set_logger(config.logging_dir)

    # load data
    if args.model == 'hmm':
        processor = HMMDataProcessor(config.data_dir, config.do_lower_case)
    else:
        processor = DataProcessor(config.data_dir, config.do_lower_case)

    train_examples = processor.get_train_examples()
    config.train_num_examples = len(train_examples)
    dev_examples = processor.get_dev_examples()
    config.dev_num_examples = len(dev_examples)
    test_examples = processor.get_test_examples()
    config.test_num_examples = len(test_examples)

    config.label_list = processor.get_tagging()
    config.num_label = len(config.label_list)
def main(): device = torch.device("cuda" if torch.cuda.is_available() else "cpu") args = parser.parse_args() set_logger(log_file=args.log_file, debug_mode=args.debug_mode) torch.manual_seed(args.random_seed) torch.cuda.manual_seed(args.random_seed) cudnn.benchmark = True train_loader = VideoIterTrain( dataset_path=args.dataset_path, annotation_path=args.annotation_path, clip_length=args.clip_length, frame_stride=args.train_frame_interval, video_transform=build_transforms(), name='train', return_item_subpath=False, ) train_iter = torch.utils.data.DataLoader( train_loader, batch_size=args.batch_size, shuffle=False, num_workers=32, # 4, # change this part accordingly pin_memory=True) val_loader = VideoIterTrain( dataset_path=args.dataset_path, annotation_path=args.annotation_path_test, clip_length=args.clip_length, frame_stride=args.val_frame_interval, video_transform=build_transforms(), name='val', return_item_subpath=False, ) val_iter = torch.utils.data.DataLoader( val_loader, batch_size=args.batch_size, shuffle=False, num_workers=32, # 4, # change this part accordingly pin_memory=True) network = C3D(pretrained=args.pretrained_3d) network.to(device) if not path.exists(features_dir): mkdir(features_dir) features_writer = FeaturesWriter() for i_batch, (data, target, sampled_idx, dirs, vid_names) in tqdm(enumerate(train_iter)): with torch.no_grad(): outputs = network(data.cuda()) for i, (dir, vid_name, start_frame) in enumerate( zip(dirs, vid_names, sampled_idx.cpu().numpy())): dir = path.join(features_dir, dir) features_writer.write(feature=outputs[i], video_name=vid_name, start_frame=start_frame, dir=dir) features_writer.dump() features_writer = FeaturesWriter() for i_batch, (data, target, sampled_idx, dirs, vid_names) in tqdm(enumerate(val_iter)): with torch.no_grad(): outputs = network(data.cuda()) for i, (dir, vid_name, start_frame) in enumerate( zip(dirs, vid_names, sampled_idx.cpu().numpy())): dir = path.join(features_dir, dir) features_writer.write(feature=outputs[i], video_name=vid_name, start_frame=start_frame, dir=dir) features_writer.dump()
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', default='data/',
                        help="Directory containing the dataset")
    parser.add_argument('--experiment_dir', default='experiments/base_model',
                        help="Directory containing the experiment")
    parser.add_argument('--checkpoint_dir',
                        help="Reload weights from .pth.tar file ('best' or 'last')")
    args = parser.parse_args()

    # Set the logger
    utils.set_logger(os.path.join(args.experiment_dir, 'train.log'))

    # Import configs
    config_dir = args.experiment_dir + '/config.yaml'
    cfg.merge_from_file(config_dir)
    cfg.freeze()

    # Fix seed
    tf.random.set_seed(0)

    # Fetch dataloaders
    logging.info("Loading datasets...\n")
    train_dl, test_dl = dataloaders.load_mnist()

    # Load model, optimizer, loss
def main(args): model_prefix = '{}_{}'.format(args.model_type, args.train_id) log_path = args.LOG_DIR + model_prefix + '/' checkpoint_path = args.CHK_DIR + model_prefix + '/' result_path = args.RESULT_DIR + model_prefix + '/' cp_file = checkpoint_path + "best_model.pth.tar" init_epoch = 0 if not os.path.exists(log_path): os.makedirs(log_path) if not os.path.exists(checkpoint_path): os.makedirs(checkpoint_path) ## set up the logger set_logger(os.path.join(log_path, 'train.log')) ## save argparse parameters with open(log_path + 'args.yaml', 'w') as f: for k, v in args.__dict__.items(): f.write('{}: {}\n'.format(k, v)) logging.info('Training model: {}'.format(model_prefix)) ## set up vocab txt setup(args, clear=True) print(args.__dict__) # indicate src and tgt language en, zh = 'en', 'zh' maps = {'en': args.TRAIN_VOCAB_EN, 'zh': args.TRAIN_VOCAB_ZH} vocab_en = read_vocab(maps[en]) tok_en = Tokenizer(language=en, vocab=vocab_en, encoding_length=args.MAX_INPUT_LENGTH) vocab_zh = read_vocab(maps[zh]) tok_zh = Tokenizer(language=zh, vocab=vocab_zh, encoding_length=args.MAX_INPUT_LENGTH) logging.info('Vocab size en/zh:{}/{}'.format(len(vocab_en), len(vocab_zh))) ## Setup the training, validation, and testing dataloaders train_loader, val_loader, test_loader = create_split_loaders( args.DATA_DIR, (tok_en, tok_zh), args.batch_size, args.MAX_VID_LENGTH, (en, zh), num_workers=4, pin_memory=True) logging.info('train/val/test size: {}/{}/{}'.format( len(train_loader), len(val_loader), len(test_loader))) ## init model encoder = Encoder(embed_size=args.wordembed_dim, hidden_size=args.enc_hid_size).cuda() decoder = Decoder(embed_size=args.wordembed_dim, hidden_size=args.dec_hid_size, vocab_size_en=len(vocab_en), vocab_size_zh=len(vocab_zh)).cuda() encoder.train() decoder.train() ## define loss criterion = nn.CrossEntropyLoss(ignore_index=padding_idx).cuda() ## init optimizer dec_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, decoder.parameters()), lr=args.decoder_lr, weight_decay=args.weight_decay) enc_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, encoder.parameters()), lr=args.encoder_lr, weight_decay=args.weight_decay) count_paras(encoder, decoder, logging) ## track loss during training total_train_loss, total_val_loss = [], [] best_val_bleu_en, best_val_bleu_zh, best_epoch = 0, 0, 0 ## init time zero_time = time.time() # Begin training procedure earlystop_flag = False rising_count = 0 for epoch in range(init_epoch, args.epochs): ## train for one epoch start_time = time.time() train_loss = train(train_loader, encoder, decoder, criterion, enc_optimizer, dec_optimizer, epoch) val_loss, sentbleu_en, corpbleu_en = validate(val_loader, encoder, decoder, criterion, tok_en, tok_zh) end_time = time.time() epoch_time = end_time - start_time total_time = end_time - zero_time logging.info( 'Total time used: %s Epoch %d time uesd: %s train loss: %.4f val loss: %.4f corpbleu-en: %.4f' % (str(datetime.timedelta(seconds=int(total_time))), epoch, str(datetime.timedelta(seconds=int(epoch_time))), train_loss, val_loss, corpbleu_en)) if corpbleu_en > best_val_bleu_en: best_val_bleu_en = corpbleu_en save_checkpoint( { 'epoch': epoch, 'enc_state_dict': encoder.state_dict(), 'dec_state_dict': decoder.state_dict(), 'enc_optimizer': enc_optimizer.state_dict(), 'dec_optimizer': dec_optimizer.state_dict(), }, cp_file) best_epoch = epoch logging.info("Finished {0} epochs of training".format(epoch + 1)) total_train_loss.append(train_loss) total_val_loss.append(val_loss) 
    logging.info('Best corpus bleu score en-{:.4f} at epoch {}'.format(
        best_val_bleu_en, best_epoch))

    ### the best model is the last model saved in our implementation
    logging.info('************ Start eval... ************')
    eval(test_loader, encoder, decoder, cp_file, tok_en, tok_zh, result_path)
args = parser.parse_args()
json_path = os.path.join(args.model_dir, 'params.json')
assert os.path.isfile(
    json_path), "No json configuration file found at {}".format(json_path)
params = utils.Params(json_path)

# use GPU if available
params.cuda = torch.cuda.is_available()

# Set the random seed for reproducible experiments
torch.manual_seed(230)
if params.cuda:
    torch.cuda.manual_seed(230)

# Get the logger
utils.set_logger(os.path.join(args.model_dir, 'test.log'))

# Create the input data pipeline
logging.info("Creating the dataset...")

# fetch dataloaders
dataloaders = data_loader.fetch_data_loader(['test'], args.data_dir, params)
test_dl = dataloaders['test']
logging.info("getting the test dataloader - done.")

# Define the model
model = Net().cuda() if params.cuda else Net()

loss_fn = loss_fn
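# Note: several snippets above load hyperparameters with `utils.Params(json_path)` and
# then read them as attributes (params.batch_size, params.cuda = ...). The class itself
# is not shown anywhere in this section; the sketch below is an assumed minimal version
# of such a JSON-backed parameter container, not the projects' actual implementation.
import json


class Params:
    """Load hyperparameters from a JSON file and expose them as attributes."""

    def __init__(self, json_path):
        with open(json_path) as f:
            self.__dict__.update(json.load(f))

    def save(self, json_path):
        """Write the current parameters back to a JSON file."""
        with open(json_path, 'w') as f:
            json.dump(self.__dict__, f, indent=4)

    def update(self, json_path):
        """Refresh parameters from another JSON file."""
        with open(json_path) as f:
            self.__dict__.update(json.load(f))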
def run_training(train_dl, val_dl, multi_gpu=[0, 1]): set_logger(LOG_PATH) logging.info('\n\n') #--- if MODEL == 'UNetResNet34': net = UNetResNet34(debug=False).cuda(device=device) # for param in net.named_parameters(): # if param[0][:8] in ['decoder5']:#'decoder5', 'decoder4', 'decoder3', 'decoder2' # param[1].requires_grad = False train_params = filter(lambda p: p.requires_grad, net.parameters()) optimizer = torch.optim.SGD(train_params, momentum=0.9, weight_decay=0.0001, lr=LearningRate) scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=4, verbose=False, threshold=0.0001, threshold_mode='rel', cooldown=0, min_lr=0, eps=1e-08) if warm_start: logging.info('warm_start: '+last_checkpoint_path) net, _ = load_checkpoint(last_checkpoint_path, net) # using multi GPU if multi_gpu is not None: net = nn.DataParallel(net, device_ids=multi_gpu) diff = 0 best_val_metric = -0.1 optimizer.zero_grad() for i_epoch in range(NUM_EPOCHS): ## adjust learning rate #scheduler.step(epoch=i_epoch) #print('lr: %f'%scheduler.get_lr()[0]) t0 = time.time() # iterate through trainset if multi_gpu is not None: net.module.set_mode('train') else: net.set_mode('train') train_loss_list, train_metric_list = [], [] for i, (image, masks) in enumerate(train_dl): input_data = image.to(device=device, dtype=torch.float) truth = masks.to(device=device, dtype=torch.float) #set_trace() logit = net(input_data) if multi_gpu is not None: _train_loss = net.module.criterion(logit, truth) _train_metric = net.module.metric(logit, truth) else: _train_loss = net.criterion(logit, truth) _train_metric = net.metric(logit, truth) train_loss_list.append(_train_loss.item()) train_metric_list.append(_train_metric.item()) #grandient accumulation step=2 acc_step = GradientAccStep _train_loss = _train_loss / acc_step _train_loss.backward() if (i+1)%acc_step==0: optimizer.step() optimizer.zero_grad() train_loss = np.mean(train_loss_list) train_metric = np.mean(train_metric_list) # compute valid loss & metrics (concatenate valid set in cpu, then compute loss, metrics on full valid set) net.module.set_mode('valid') with torch.no_grad(): val_loss_list, val_metric_list = [], [] for i, (image, masks) in enumerate(val_dl): input_data = image.to(device=device, dtype=torch.float) truth = masks.to(device=device, dtype=torch.float) logit = net(input_data) if multi_gpu is not None: _val_loss = net.module.criterion(logit, truth) _val_metric = net.module.metric(logit, truth) else: _val_loss = net.criterion(logit, truth) _val_metric = net.metric(logit, truth) val_loss_list.append(_val_loss.item()) val_metric_list.append(_val_metric.item()) val_loss = np.mean(val_loss_list) val_metric = np.mean(val_metric_list) # Adjust learning_rate scheduler.step(val_metric) #force to at least train N epochs if i_epoch>=-1: if val_metric > best_val_metric: best_val_metric = val_metric is_best = True diff = 0 else: is_best = False diff += 1 if diff > early_stopping_round: logging.info('Early Stopping: val_metric does not increase %d rounds'%early_stopping_round) break else: is_best = False #save checkpoint checkpoint_dict = \ { 'epoch': i, 'state_dict': net.module.state_dict() if multi_gpu is not None else net.state_dict(), 'optim_dict' : optimizer.state_dict(), 'metrics': {'train_loss': train_loss, 'val_loss': val_loss, 'train_metric': train_metric, 'val_metric': val_metric} } save_checkpoint(checkpoint_dict, is_best=is_best, checkpoint=checkpoint_path) #if i_epoch%20==0: if i_epoch>-1: logging.info('[EPOCH %05d]train_loss, 
train_metric: %0.5f, %0.5f; val_loss, val_metric: %0.5f, %0.5f; time elapsed: %0.1f min'%(i_epoch, train_loss.item(), train_metric.item(), val_loss.item(), val_metric.item(), (time.time()-t0)/60))
# for reproducibility
tf.compat.v1.set_random_seed(123)

args = parser.parse_args()
params_path = os.path.join(args.model_dir, 'params.json')
assert os.path.isfile(
    params_path), "params.json does not exist at {}".format(params_path)
params = Params(params_path)
params.load(params.update)

# TODO: check and load if there are best weights saved so far
# model_dir_has_best_weights = os.path.isdir(os.path.join(args.model_dir, "best_weights"))

# set logger
set_logger(os.path.join(args.model_dir, 'train.log'))

# train/test split
train_fpaths, test_fpaths, train_targets, test_targets = \
    get_train_test_split(args.json_path, args.data_dir, train_size=args.train_size)
params.train_size = len(train_fpaths)
params.test_size = len(test_fpaths)

logging.info("Creating the dataset...")
train_inputs = input_fn(True, train_fpaths, train_targets, params)
test_inputs = input_fn(False, test_fpaths, test_targets, params)

logging.info("Creating the model...")
train_model_spec = model_fn(True, train_inputs, params)
test_model_spec = model_fn(False, test_inputs, params, reuse=True)
def predict(inp, target, params, restore_from, config=None,
            model_dir='./ie590_project/experiments/ex1',
            model_save_dir='./ie590_project/experiments/ex1/model_save/1'):
    """Predict target values given input file paths.

    Args:
        inp: (list) a string list of image file paths; 2D -> [sample_size, number_of_channels]
        target: (list) a float list of target values
        params: (Params or str) Params object or params.json path
        restore_from: (str) ckpt or directory name where ckpts are located for restoring
        ...
    Return:
        out: (list) a list of predicted target values; has exactly the same dimension as target
    """
    assert len(inp) == len(target)

    if type(params) is str:
        assert os.path.isfile(
            params), "params.json does not exist at {}".format(params)
        params = Params(params)
        params.load(params.update)  # load parameters
    params.inp_size = len(inp)

    set_logger(os.path.join(model_dir, 'train.log'))

    logging.info("Creating the dataset...")
    inputs = input_fn(False, inp, target, params)

    logging.info("Creating the model...")
    model_spec = model_fn(False, inputs, params)

    # the model Ops can only be pulled out once the graph has been built
    iterator_init_op = model_spec['iterator_init_op']
    update_metrics_op = model_spec['update_metrics_op']
    metrics = model_spec['metrics']
    metrics_init_op = model_spec['metrics_init_op']
    predictions = model_spec['predictions']
    saver = tf.compat.v1.train.Saver()

    logging.info("Calculating predictions...")
    with tf.compat.v1.Session(config=config) as sess:
        sess.run(model_spec['variable_init_op'])

        save_path = os.path.join(model_save_dir, restore_from)
        if os.path.isdir(save_path):
            # If restore_from is a directory, get the latest ckpt
            save_path = tf.train.latest_checkpoint(save_path)
        saver.restore(sess, save_path)

        num_steps = (params.inp_size + params.batch_size - 1) // params.batch_size
        sess.run([iterator_init_op, metrics_init_op])

        if len(np.shape(target)) == 1:
            out = np.empty(np.shape(target))[:, np.newaxis]
        else:
            out = np.empty(np.shape(target))

        for i in tqdm(range(num_steps)):
            _, predictions_eval = sess.run([update_metrics_op, predictions])
            if i < num_steps - 1:
                out[i * params.batch_size:(i + 1) * params.batch_size, :] = predictions_eval
            else:
                out[i * params.batch_size:, :] = predictions_eval

    return out
    def set_logger(self):
        self.log = utils.set_logger(out_folder=self.config['out_folder'],
                                    name="sub_log")
        self.log.info(f"{self.config}")
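# Note: every snippet in this section relies on a `set_logger` utility that is not
# shown here, and the call signatures differ between projects (a log path, a folder
# plus name, or a log_file with a debug_mode flag). The sketch below is a minimal,
# assumed implementation in the spirit of the common template these scripts follow:
# it attaches a file handler and a console handler to the root logger. Treat it as
# an illustration, not any one project's actual helper.
import logging


def set_logger(log_path):
    """Configure the root logger to write to `log_path` and to the console."""
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    if not logger.handlers:
        # Logging to a file
        file_handler = logging.FileHandler(log_path)
        file_handler.setFormatter(
            logging.Formatter('%(asctime)s:%(levelname)s: %(message)s'))
        logger.addHandler(file_handler)

        # Logging to the console
        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(logging.Formatter('%(message)s'))
        logger.addHandler(stream_handler)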
def train(sess, args): config_path = 'model_{}/config.json'.format(args.model_num) with open(config_path, 'r') as f: config = json.load(f) patch_size = config['patch_size'] batch_size = config['batch_size'] num_steps = config['num_steps'] quan_scale = config['quan_scale'] overload_log = args.overload_log load_ckpt = args.load_ckpt if load_ckpt == 'on' or overload_log == 'off': utils.set_logger(log_path) logging.info('Not overload_log') else: utils.set_logger(log_path, mode='w') logging.info('Overload_log') global global_var additional_param = args.additional_param if additional_param == '0': pass elif additional_param == '1': pass elif additional_param == '2': pass elif additional_param == '3': pass data_batch, handle_placeholder, train_handle, valid_handle, valid_iterator = data_loader.get_train_and_valid_data_batch(sess, train_data_list, valid_data_list, batch_size, flip_ud=False, flip_lr=False, rot_90=False) # print(sess.run(data_batch)) # return # Avoid summary info logging.getLogger().setLevel(logging.WARNING) output_train = encoder(data_batch, patch_size, quan_scale, is_training=True) output_train = decoder(output_train, quan_scale, is_training=True) loss_op_train = get_loss(data_batch, output_train) train_op, global_step_op, learning_rate_op = optimize(loss_op_train, config) # ----- output_eval = encoder(data_batch, patch_size, quan_scale, is_training=False, reuse=True) output_eval = decoder(output_eval, quan_scale, is_training=False, reuse=True) loss_op_eval = get_loss(data_batch, output_eval) #----- saver = tf.train.Saver() if load_ckpt == 'on': saver.restore(sess, model_param_path) logging.info('Load previous params') else: variable_init = tf.global_variables_initializer() sess.run(variable_init) # saver.save(sess, model_param_path) # logging.info('Model paremeters saved to: {}'.format(model_param_path)) # return utils.add_trainable_variables_to_summary() if args.summary_save == 'on': summary_writer = tf.summary.FileWriter(summary_path, sess.graph) merged_summaries = tf.summary.merge_all() options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() many_runs_timeline = utils.TimeLiner() # Avoid summary info logging.getLogger().setLevel(logging.INFO) logging.info('-----') logging.info(args) logging.info(config) logging.info('-----') # debug mode if args.debug_mode == 'on': logging.info('-----') logging.info('Debug mode') logging.info('-----') return train_loss_display_step = 200 valid_loss_display_step = 20000 global_step = sess.run(global_step_op) # normal train for step in range(global_step + 1, num_steps + 1): if step % train_loss_display_step == 0: if args.summary_save == 'on': _, loss, global_step, learning_rate_value, summary_value = sess.run([train_op, loss_op_train, global_step_op, learning_rate_op, merged_summaries], feed_dict={handle_placeholder: train_handle}, options=options, run_metadata=run_metadata) else: _, loss, global_step, learning_rate_value = sess.run([train_op, loss_op_train, global_step_op, learning_rate_op], feed_dict={handle_placeholder: train_handle}, options=options, run_metadata=run_metadata) logging.info('Step: {:d}, loss: {:.8f}, lr: {:.8f}'.format(global_step, loss, learning_rate_value)) if step % valid_loss_display_step == 0: sess.run(valid_iterator.initializer) [valid_loss] = sess.run([loss_op_eval], feed_dict={handle_placeholder: valid_handle}, options=options, run_metadata=run_metadata) logging.info('Valid loss: {:.8f}'.format(valid_loss)) if args.param_save == 'on': saver.save(sess, model_param_path) # 
logging.info('Model paremeters saved to: {}'.format(model_param_path)) if args.summary_save == 'on': summary_writer.add_summary(summary_value, global_step=global_step) # logging.info('Summaries saved to: {}'.format(summary_path)) else: _, loss, global_step = sess.run([train_op, loss_op_train, global_step_op], feed_dict={handle_placeholder: train_handle}, options=options, run_metadata=run_metadata) if args.timeline_save == 'on': many_runs_timeline.update_timeline(run_metadata.step_stats) # logging.info('{}_{}'.format(step, global_step)) if args.timeline_save == 'on': many_runs_timeline.save(timeline_path) logging.info('Timeline saved to: {}'.format(timeline_path))
if params.gpu_id >= -1:
    params.cuda = True

# Set the random seed for reproducible experiments
torch.manual_seed(params.seed)
np.random.seed(params.seed)
random.seed(params.seed)
if params.gpu_id >= -1:
    torch.cuda.manual_seed(params.seed)
# must be True if you want fully reproducible results, but it slows training down
torch.backends.cudnn.deterministic = False
# https://discuss.pytorch.org/t/what-does-torch-backends-cudnn-benchmark-do/5936
cudnn.benchmark = True
torch.cuda.empty_cache()  # release cached GPU memory

# Set the logger
if args.targeted:
    utils.set_logger(os.path.join(
        experiment_path,
        'beta_' + str(int(args.beta)) + str(int(args.beta * 10)) +
        '_target_' + str(args.target_label) + '_attack.log'))
else:
    utils.set_logger(os.path.join(
        experiment_path,
        'beta_' + str(int(args.beta)) + str(int(args.beta * 10)) +
        '_untarget_' + 'attack.log'))
logger = logging.getLogger()

port, env = 8097, params.model_version
columnnames = list(range(1, params.model_args["num_class"] + 1))
rownames = list(range(1, params.model_args["num_class"] + 1))

# log all params
d_args = vars(args)
for k in d_args.keys():
    logging.info('{0}: {1}'.format(k, d_args[k]))
d_params = vars(params)
for k in d_params.keys():
def run_check_net(train_dl, val_dl, multi_gpu=[0, 1]): set_logger(LOG_PATH) logging.info('\n\n') #--- if MODEL == 'UNetResNet34': net = UNetResNet34(debug=False).cuda(device=device) #elif MODEL == 'RESNET18': # net = AtlasResNet18(debug=False).cuda(device=device) # for param in net.named_parameters(): # if param[0][:8] in ['decoder5']:#'decoder5', 'decoder4', 'decoder3', 'decoder2' # param[1].requires_grad = False # dummy sgd to see if it can converge ... #optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, net.parameters()), # lr=LearningRate, momentum=0.9, weight_decay=0.0001) #optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=0.045)#LearningRate #scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', # factor=0.5, patience=4,#4 resnet34 # verbose=False, threshold=0.0001, # threshold_mode='rel', cooldown=0, # min_lr=0, eps=1e-08) #scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size, gamma=0.9, last_epoch=-1) train_params = filter(lambda p: p.requires_grad, net.parameters()) optimizer = torch.optim.SGD(train_params, momentum=0.9, weight_decay=0.0001, lr=LearningRate) scheduler = LR_Scheduler( 'poly', LearningRate, NUM_EPOCHS, len(train_dl)) #lr_scheduler=['poly', 'step', 'cos'] if warm_start: logging.info('warm_start: ' + last_checkpoint_path) net, _ = load_checkpoint(last_checkpoint_path, net) # using multi GPU if multi_gpu is not None: net = nn.DataParallel(net, device_ids=multi_gpu) #use sync_batchnorm #net = convert_model(net) diff = 0 best_val_metric = -0.1 optimizer.zero_grad() #seed = get_seed() #seed = SEED #logging.info('aug seed: '+str(seed)) #ia.imgaug.seed(seed) #np.random.seed(seed) for i_epoch in range(NUM_EPOCHS): t0 = time.time() # iterate through trainset if multi_gpu is not None: net.module.set_mode('train') else: net.set_mode('train') train_loss_list = [] #train_metric_list #logit_list, truth_list = [], [] for i, (images, masks) in enumerate(train_dl): ## adjust learning rate scheduler(optimizer, i, i_epoch, best_val_metric) input_data = images.to(device=device, dtype=torch.float) #1 for non-zero-mask truth = (torch.sum(masks.reshape(masks.size()[0], masks.size()[1], -1), dim=2, keepdim=False) != 0).to(device=device, dtype=torch.float) logit = net(input_data) #logit_list.append(logit) #truth_list.append(truth) if multi_gpu is not None: _train_loss = net.module.criterion(logit, truth) #_train_metric = net.module.metric(logit, truth)#device='gpu' else: _train_loss = net.criterion(logit, truth) #_train_metric = net.metric(logit, truth)#device='gpu' train_loss_list.append(_train_loss.item()) #train_metric_list.append(_train_metric.item())#.detach() #grandient accumulation step=2 acc_step = 1 _train_loss = _train_loss / acc_step _train_loss.backward() if (i + 1) % acc_step == 0: optimizer.step() optimizer.zero_grad() train_loss = np.mean(train_loss_list) #train_metric = np.mean(train_metric_list) # if multi_gpu is not None: # train_metric, train_tn, train_fp, train_fn, train_tp, train_auc, train_pos_percent = net.module.metric(torch.cat(logit_list, dim=0), torch.cat(truth_list, dim=0)) # else: # train_metric, train_tn, train_fp, train_fn, train_tp, train_auc, train_pos_percent = net.metric(torch.cat(logit_list, dim=0), torch.cat(truth_list, dim=0)) # compute valid loss & metrics (concatenate valid set in cpu, then compute loss, metrics on full valid set) net.module.set_mode('valid') with torch.no_grad(): # val_loss_list, val_metric_list = [], [] # for i, (image, masks) in enumerate(val_dl): # 
input_data = image.to(device=device, dtype=torch.float) # truth = masks.to(device=device, dtype=torch.float) # logit = net(input_data) # if multi_gpu is not None: # _val_loss = net.module.criterion(logit, truth) # _val_metric = net.module.metric(logit, truth)#device='gpu' # else: # _val_loss = net.criterion(logit, truth) # _val_metric = net.metric(logit, truth)#device='gpu' # val_loss_list.append(_val_loss.item()) # val_metric_list.append(_val_metric.item())#.detach() # val_loss = np.mean(val_loss_list) # val_metric = np.mean(val_metric_list) logit_valid, truth_valid = None, None for j, (images, masks) in enumerate(val_dl): input_data = images.to(device=device, dtype=torch.float) #1 for non-zero-mask truth = (torch.sum(masks.reshape(masks.size()[0], masks.size()[1], -1), dim=2, keepdim=False) != 0).to(device=device, dtype=torch.float) logit = net(input_data) if logit_valid is None: logit_valid = logit truth_valid = truth else: logit_valid = torch.cat((logit_valid, logit), dim=0) truth_valid = torch.cat((truth_valid, truth), dim=0) if multi_gpu is not None: val_loss = net.module.criterion(logit_valid, truth_valid) _, val_metric, val_tn, val_fp, val_fn, val_tp, val_pos_percent = net.module.metric( logit_valid, truth_valid) else: val_loss = net.criterion(logit_valid, truth_valid) _, val_metric, val_tn, val_fp, val_fn, val_tp, val_pos_percent = net.metric( logit_valid, truth_valid) # Adjust learning_rate #scheduler.step(val_metric) # if i_epoch >= 30: if val_metric > best_val_metric: best_val_metric = val_metric is_best = True diff = 0 else: is_best = False diff += 1 if diff > early_stopping_round: logging.info( 'Early Stopping: val_metric does not increase %d rounds' % early_stopping_round) #print('Early Stopping: val_iou does not increase %d rounds'%early_stopping_round) break else: is_best = False #save checkpoint checkpoint_dict = \ { 'epoch': i_epoch, 'state_dict': net.module.state_dict() if multi_gpu is not None else net.state_dict(), 'optim_dict' : optimizer.state_dict(), 'metrics': {'train_loss': train_loss, 'val_loss': val_loss, 'val_metric': val_metric} } save_checkpoint(checkpoint_dict, is_best=is_best, checkpoint=checkpoint_path) #if i_epoch%20==0: if i_epoch > -1: logging.info( '[EPOCH %05d]train_loss: %0.5f; val_loss, val_metric: %0.5f, %0.5f' % (i_epoch, train_loss.item(), val_loss.item(), val_metric)) logging.info('val_pos_percent: %.3f' % (val_pos_percent)) logging.info('val (tn, fp, fn, tp): %d, %d, %d, %d' % (val_tn, val_fp, val_fn, val_tp)) logging.info('time elapsed: %0.1f min' % ((time.time() - t0) / 60))
def train(sess, args): load_ckpt = args.load_ckpt batch_size = 64 num_steps = 300000 boundaries = [200000, 300000] lr_values = [1e-4, 1e-5, 1e-6] log_path = str(cur_file_dir / 'train.log') model_param_dir = str(cur_file_dir / 'params') Path(model_param_dir).mkdir(parents=True, exist_ok=True) model_param_path = model_param_dir + '/params' if load_ckpt == 'on': utils.set_logger(log_path) logging.info('Not overload_log') else: utils.set_logger(log_path, mode='w') logging.info('Overload_log') train_data_list = 'data_info/recons_train_data_patch_list.txt' valid_data_list = 'data_info/recons_valid_data_patch_list.txt' # train_data_list = 'data_info/train_data_patch_list_128.txt' # valid_data_list = 'data_info/valid_data_patch_list_128.txt' data_batch, handle_placeholder, train_handle, valid_handle, valid_iterator = get_train_and_valid_data_batch( sess, train_data_list, valid_data_list, batch_size) # print(sess.run(data_batch)) # return # Avoid summary info logging.getLogger().setLevel(logging.WARNING) recons_data, ori_data = data_batch # # ----- # variable_init = tf.global_variables_initializer() # sess.run(variable_init) # ori_data_value = sess.run(data_batch, feed_dict={handle_placeholder: train_handle}) # # print(type(recons_data_value)) # print(type(ori_data_value)) # # print(recons_data_value.shape) # print(ori_data_value[0].shape, ori_data_value[1].shape) # return output = model(recons_data) loss_op = get_loss(ori_data, output) train_op, global_step_op, learning_rate_op = optimize( loss_op, boundaries, lr_values) saver = tf.train.Saver() if load_ckpt == 'on': saver.restore(sess, model_param_path) logging.info('Load previous params') else: variable_init = tf.global_variables_initializer() sess.run(variable_init) # saver.save(sess, model_param_path) # logging.info('Model paremeters saved to: {}'.format(model_param_path)) # return # Avoid summary info logging.getLogger().setLevel(logging.INFO) logging.info('-----') logging.info(args) logging.info('-----') train_loss_display_step = 200 valid_loss_display_step = 20000 global_step = sess.run(global_step_op) # normal train for step in range(global_step + 1, num_steps + 1): _, loss, global_step, learning_rate_value = sess.run( [train_op, loss_op, global_step_op, learning_rate_op], feed_dict={handle_placeholder: train_handle}) if step % train_loss_display_step == 0: logging.info('Step: {:d}, loss: {:.8f}, lr: {:.8f}'.format( global_step, loss, learning_rate_value)) if step % valid_loss_display_step == 0: sess.run(valid_iterator.initializer) [valid_loss ] = sess.run([loss_op], feed_dict={handle_placeholder: valid_handle}) logging.info('Valid loss: {:.8f}'.format(valid_loss)) saver.save(sess, model_param_path)
def train(sess, args): config_path = 'model_{}/config.json'.format(args.model_num) with open(config_path, 'r') as f: config = json.load(f) patch_size = config['patch_size'] batch_size = config['batch_size'] num_steps = config['num_steps'] quan_scale = config['quan_scale'] bitrate_reg_decay = config['bitrate_reg_decay'] overload_log = args.overload_log load_ckpt = args.load_ckpt reset_step = args.reset_step max_step = args.max_step lr_and_bound = args.lr_and_bound if max_step != 'None': num_steps = int(max_step) print('num_steps: ', num_steps) if (reset_step == 'off') and (load_ckpt == 'on' or overload_log == 'off'): utils.set_logger(log_path) logging.info('Not overload_log') else: utils.set_logger(log_path, mode='w') logging.info('Overload_log') global global_var additional_param = args.additional_param if additional_param == '0': pass elif additional_param == '1': pass elif additional_param == '2': pass elif additional_param == '3': pass train_data_list = train_data_list_tpl.format(patch_size) valid_data_list = valid_data_list_tpl.format(patch_size) data_batch, handle_placeholder, train_handle, valid_handle, valid_iterator = data_loader.get_train_and_valid_data_batch( sess, train_data_list, valid_data_list, batch_size, flip_ud=False, flip_lr=False, rot_90=False) # print(sess.run(data_batch)) # return # Avoid summary info logging.getLogger().setLevel(logging.WARNING) output = encoder(data_batch, patch_size, quan_scale) output = decoder(output, quan_scale) loss_op = get_loss(data_batch, output, bitrate_reg_decay) boundaries = config['boundaries'] lr_values = config['lr_values'] if lr_and_bound != 'None': start_lr = float(lr_and_bound.split(',')[0]) bound = lr_and_bound.split(',')[1:] boundaries = [int(item) for item in boundaries] lr_values = [start_lr, start_lr / 10, start_lr / 100] print('boundaries: {}, lr_values: {}'.format(boundaries, lr_values)) train_op, global_step_op, learning_rate_op = optimize( loss_op, boundaries, lr_values) saver = tf.train.Saver() if load_ckpt == 'on': saver.restore(sess, model_param_path) logging.info('Load previous params') else: variable_init = tf.global_variables_initializer() sess.run(variable_init) # saver.save(sess, model_param_path) # logging.info('Model paremeters saved to: {}'.format(model_param_path)) # return utils.add_trainable_variables_to_summary() if args.summary_save == 'on': summary_writer = tf.summary.FileWriter(summary_path, sess.graph) merged_summaries = tf.summary.merge_all() options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() many_runs_timeline = utils.TimeLiner() # Avoid summary info logging.getLogger().setLevel(logging.INFO) logging.info('-----') logging.info(args) logging.info(config) logging.info('-----') # debug mode if args.debug_mode == 'on': logging.info('-----') logging.info('Debug mode') logging.info('-----') return train_loss_display_step = 200 valid_loss_display_step = 20000 if reset_step == 'on': assign_op = tf.assign(global_step_op, 0) sess.run(assign_op) global_step = sess.run(global_step_op) # normal train for step in range(global_step + 1, num_steps + 1): if step % train_loss_display_step == 0: if args.summary_save == 'on': _, loss, global_step, learning_rate_value, summary_value = sess.run( [ train_op, loss_op, global_step_op, learning_rate_op, merged_summaries ], feed_dict={handle_placeholder: train_handle}, options=options, run_metadata=run_metadata) else: _, loss, global_step, learning_rate_value = sess.run( [train_op, loss_op, global_step_op, learning_rate_op], 
feed_dict={handle_placeholder: train_handle}, options=options, run_metadata=run_metadata) logging.info('Step: {:d}, loss: {:.8f}, lr: {:.8f}'.format( global_step, loss, learning_rate_value)) if step % valid_loss_display_step == 0: sess.run(valid_iterator.initializer) [valid_loss ] = sess.run([loss_op], feed_dict={handle_placeholder: valid_handle}, options=options, run_metadata=run_metadata) logging.info('Valid loss: {:.8f}'.format(valid_loss)) if args.param_save == 'on': saver.save(sess, model_param_path) # logging.info('Model paremeters saved to: {}'.format(model_param_path)) if args.summary_save == 'on': summary_writer.add_summary(summary_value, global_step=global_step) # logging.info('Summaries saved to: {}'.format(summary_path)) else: _, loss, global_step = sess.run( [train_op, loss_op, global_step_op], feed_dict={handle_placeholder: train_handle}, options=options, run_metadata=run_metadata) if args.timeline_save == 'on': many_runs_timeline.update_timeline(run_metadata.step_stats) # logging.info('{}_{}'.format(step, global_step)) if args.timeline_save == 'on': many_runs_timeline.save(timeline_path) logging.info('Timeline saved to: {}'.format(timeline_path))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--params', help='Directory containing params.json')
    args = parser.parse_args()
    params = utils.Params(args.params)

    # pull out lr and decay for easy access
    learning_rate = params.learning_rate
    decay = params.decay

    # Set the random seed for reproducible experiments
    torch.manual_seed(230)

    experiment = Experiment(api_key=params.comet_api,
                            project_name=params.comet_name,
                            workspace="ayeaton")

    # Set the logger
    utils.set_logger(os.path.join(params.implementation_dir, 'train.log'))
    logging.info(experiment)

    log_params = {
        "learning_rate": learning_rate,
        "decay": decay,
        "batch_size": params.batch_size,
        "dropout_rate": params.dropout_rate,
        "model": params.model,
        "optimizer": params.optimizer,
        "loss_func": params.loss_func,
        "classes": params.classes,
        "metadata_file": params.metadata_file,
        "model_dir": params.model_dir,
        "implementation_dir": params.implementation_dir
    }
    experiment.log_parameters(log_params)

    # Create the input data pipeline
    logging.info("Loading the datasets...")

    # get data
    train_dataset = dataset(file_path=params.metadata_file, split="Train",
                            classes=params.classes)
    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=params.batch_size,
                              shuffle=True, num_workers=8)
    val_dataset = dataset(file_path=params.metadata_file, split="Val",
                          classes=params.classes)
    val_loader = DataLoader(dataset=val_dataset,
                            batch_size=params.batch_size,
                            shuffle=True, num_workers=8)
    logging.info("- done.")

    # Define the model and optimizer
    if params.model != "Inception":
        net = importlib.import_module("models.{}".format(params.model))
        model = net.Net()
        inception = False
    else:
        model = models.inception_v3(pretrained=False)
        model.fc = nn.Linear(2048, len(params.classes))
        model.AuxLogits.fc = nn.Linear(768, 1)
        inception = True
    logging.info("Model -- {}".format(repr(model)))
    model.cuda()

    # fetch loss function and metrics
    metrics_save = metrics_code.metrics_save

    # Train the model
    logging.info("Starting training for {} epoch(s)".format(params.num_epochs))
    train_and_evaluate(model, train_loader, val_loader, metrics_save,
                       params.implementation_dir, params.num_epochs,
                       params.loss_func, params.optimizer, learning_rate,
                       decay, params.save_summary_steps, experiment, inception)
args = parser.parse_args()
json_path = os.path.join(args.model_dir, 'params.json')
assert os.path.isfile(
    json_path), "No json configuration file found at {}".format(json_path)
params = utils.Params(json_path)

# use GPU if available
params.cuda = torch.cuda.is_available()

# Set the random seed for reproducible experiments
torch.manual_seed(230)
if params.cuda:
    torch.cuda.manual_seed(230)

# Get the logger
utils.set_logger(os.path.join(args.model_dir, 'evaluate.log'))

# Create the input data pipeline
logging.info("Creating the dataset...")

# fetch dataloaders
dataloaders = data_loader.fetch_data_loader(['val'], args.data_dir, params)
test_dl = dataloaders['val']
logging.info("getting the test dataloader - done.")

# Define the model
model = PhdGifNet().cuda() if params.cuda else PhdGifNet()

loss_fn = loss_fn
metrics = metrics
from train.network import Network
from utils.parser import parser
from utils.utils import Config, set_logger, prepare_paths

log = logging.getLogger('main')

if __name__ == '__main__':
    # Parse arguments and set configs
    args = parser.parse_args()
    cfg = Config.init_from_parsed_args(args)

    # Set all the paths
    prepare_paths(cfg)

    # Logger
    set_logger(cfg)
    log = logging.getLogger('main')

    # Preprocessing & make dataset
    # if cfg.data_name == 'pos':
    #     vocab = process_main_corpus(cfg, 'split')
    #     vocab_pos = process_pos_corpus(cfg, 'split')
    #     corpus = CorpusPOSDataset(cfg.processed_train_path,
    #                               cfg.pos_data_path)
    if cfg.pos_tag:
        vocab, vocab_tag = process_corpus_tag(cfg)
        corpus_train = CorpusPOSDataset(cfg.processed_train_path,
                                        cfg.pos_data_path)
    else:
        vocab = process_main_corpus(cfg)
def run_check_net(train_dl, val_dl, multi_gpu=[0, 1]): set_logger(LOG_PATH) logging.info('\n\n') #--- net = EfficientNet.from_name( MODEL, debug=debug) #override_params={'num_classes': 1} net.load_state_dict(torch.load(glob.glob('../model/%s*' % MODEL)[0])) in_features = net._fc.in_features net._fc = nn.Linear(in_features, 4) #num_classes=1 net = net.cuda(device=device) print(glob.glob('../model/%s*' % MODEL)[0]) print('====Loading pretrained weights done====') if FREEZE: print('====FREEZE parameters====') #freeze up to last layer for the first 5 epochs for k, v in net.named_parameters(): #print(k) if k in ['_fc.weight', '_fc.bias']: v.requires_grad = True print('only train layer: ', k) else: v.requires_grad = False else: print('====not FREEZE parameters====') # dummy sgd to see if it can converge ... #optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, net.parameters()), # lr=LearningRate, momentum=0.9, weight_decay=0.0001) #optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=0.045)#LearningRate #scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', # factor=0.5, patience=4,#4 resnet34 # verbose=False, threshold=0.0001, # threshold_mode='rel', cooldown=0, # min_lr=0, eps=1e-08) #scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size, gamma=0.9, last_epoch=-1) train_params = filter(lambda p: p.requires_grad, net.parameters()) optimizer = torch.optim.SGD(train_params, momentum=0.9, weight_decay=0.0001, lr=LearningRate) #scheduler = LR_Scheduler('poly', LearningRate, NUM_EPOCHS, len(train_dl))#lr_scheduler=['poly', 'step', 'cos'] scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=NUM_EPOCHS, eta_min=MIN_LR, last_epoch=-1) if warm_start: logging.info('warm_start: ' + last_checkpoint_path) net, _ = load_checkpoint(last_checkpoint_path, net) # using multi GPU if multi_gpu is not None: net = nn.DataParallel(net, device_ids=multi_gpu) #use sync_batchnorm #net = convert_model(net) diff = 0 best_val_metric = -0.1 optimizer.zero_grad() #seed = get_seed() #seed = SEED #logging.info('aug seed: '+str(seed)) #ia.imgaug.seed(seed) #np.random.seed(seed) for i_epoch in range(NUM_EPOCHS): scheduler.step(epoch=i_epoch) print('lr: %f' % scheduler.get_lr()[0]) t0 = time.time() # iterate through trainset if multi_gpu is not None: net.module.set_mode('train') else: net.set_mode('train') train_loss_list = [] #train_metric_list #logit_list, truth_list = [], [] for i, (images, masks) in enumerate(train_dl): ## adjust learning rate #scheduler(optimizer, i, i_epoch, best_val_metric) input_data = images.to(device=device, dtype=torch.float) #1 for non-zero-mask truth = (torch.sum(masks.reshape(masks.size()[0], masks.size()[1], -1), dim=2, keepdim=False) != 0).to(device=device, dtype=torch.float) logit = net(input_data) #logit_list.append(logit) #truth_list.append(truth) if multi_gpu is not None: _train_loss = net.module.criterion(logit, truth) #_train_metric = net.module.metric(logit, truth)#device='gpu' else: _train_loss = net.criterion(logit, truth) #_train_metric = net.metric(logit, truth)#device='gpu' train_loss_list.append(_train_loss.item()) #train_metric_list.append(_train_metric.item())#.detach() #grandient accumulation step=2 acc_step = 2 _train_loss = _train_loss / acc_step _train_loss.backward() if (i + 1) % acc_step == 0: optimizer.step() optimizer.zero_grad() train_loss = np.mean(train_loss_list) #train_metric = np.mean(train_metric_list) # if multi_gpu is not None: # train_metric, train_tn, train_fp, 
train_fn, train_tp, train_auc, train_pos_percent = net.module.metric(torch.cat(logit_list, dim=0), torch.cat(truth_list, dim=0)) # else: # train_metric, train_tn, train_fp, train_fn, train_tp, train_auc, train_pos_percent = net.metric(torch.cat(logit_list, dim=0), torch.cat(truth_list, dim=0)) # compute valid loss & metrics (concatenate valid set in cpu, then compute loss, metrics on full valid set) net.module.set_mode('valid') with torch.no_grad(): # val_loss_list, val_metric_list = [], [] # for i, (image, masks) in enumerate(val_dl): # input_data = image.to(device=device, dtype=torch.float) # truth = masks.to(device=device, dtype=torch.float) # logit = net(input_data) # if multi_gpu is not None: # _val_loss = net.module.criterion(logit, truth) # _val_metric = net.module.metric(logit, truth)#device='gpu' # else: # _val_loss = net.criterion(logit, truth) # _val_metric = net.metric(logit, truth)#device='gpu' # val_loss_list.append(_val_loss.item()) # val_metric_list.append(_val_metric.item())#.detach() # val_loss = np.mean(val_loss_list) # val_metric = np.mean(val_metric_list) logit_valid, truth_valid = None, None for j, (images, masks) in enumerate(val_dl): input_data = images.to(device=device, dtype=torch.float) #1 for non-zero-mask truth = (torch.sum(masks.reshape(masks.size()[0], masks.size()[1], -1), dim=2, keepdim=False) != 0).to(device=device, dtype=torch.float) logit = net(input_data) if logit_valid is None: logit_valid = logit truth_valid = truth else: logit_valid = torch.cat((logit_valid, logit), dim=0) truth_valid = torch.cat((truth_valid, truth), dim=0) if multi_gpu is not None: val_loss = net.module.criterion(logit_valid, truth_valid) _, val_metric, val_tn, val_fp, val_fn, val_tp, val_pos_percent = net.module.metric( logit_valid, truth_valid) else: val_loss = net.criterion(logit_valid, truth_valid) _, val_metric, val_tn, val_fp, val_fn, val_tp, val_pos_percent = net.metric( logit_valid, truth_valid) # Adjust learning_rate #scheduler.step(val_metric) # if i_epoch >= -1: #30 if val_metric > best_val_metric: best_val_metric = val_metric is_best = True diff = 0 else: is_best = False diff += 1 if diff > early_stopping_round: logging.info( 'Early Stopping: val_metric does not increase %d rounds' % early_stopping_round) #print('Early Stopping: val_iou does not increase %d rounds'%early_stopping_round) break else: is_best = False #save checkpoint checkpoint_dict = \ { 'epoch': i_epoch, 'state_dict': net.module.state_dict() if multi_gpu is not None else net.state_dict(), 'optim_dict' : optimizer.state_dict(), 'metrics': {'train_loss': train_loss, 'val_loss': val_loss, 'val_metric': val_metric} } save_checkpoint(checkpoint_dict, is_best=is_best, checkpoint=checkpoint_path) #if i_epoch%20==0: if i_epoch > -1: logging.info( '[EPOCH %05d]train_loss: %0.5f; val_loss, val_metric: %0.5f, %0.5f' % (i_epoch, train_loss.item(), val_loss.item(), val_metric)) logging.info('val_pos_percent: %.3f' % (val_pos_percent)) logging.info('val (tn, fp, fn, tp): %d, %d, %d, %d' % (val_tn, val_fp, val_fn, val_tp)) logging.info('time elapsed: %0.1f min' % ((time.time() - t0) / 60))
def run_check_net(train_dl, val_dl, multi_gpu=[0, 1], nonempty_only_loss=False): set_logger(LOG_PATH) logging.info('\n\n') #--- enc, dec = MODEL.split('_')[0], MODEL.split('_')[1] net = SegmentationModule(net_enc=enc, net_dec=dec).cuda(device=device) # for param in net.named_parameters(): # if param[0][:8] in ['decoder5']:#'decoder5', 'decoder4', 'decoder3', 'decoder2' # param[1].requires_grad = False # train_params = [{'params': net.get_1x_lr_params(), 'lr': LearningRate}, # {'params': net.get_10x_lr_params(), 'lr': LearningRate * 10}]#for resnet backbone train_params = filter(lambda p: p.requires_grad, net.parameters()) # dummy sgd to see if it can converge ... #optimizer = torch.optim.SGD(train_params, # lr=LearningRate, momentum=0.9, weight_decay=0.0001) #optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=0.045)#LearningRate #scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', # factor=0.5, patience=4,#4 resnet34 # verbose=False, threshold=0.0001, # threshold_mode='rel', cooldown=0, # min_lr=0, eps=1e-08) #scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size, gamma=0.9, last_epoch=-1) #for deeplabv3plus customized optimizer = torch.optim.SGD(train_params, momentum=0.9, weight_decay=0.0001, lr=LearningRate) scheduler = LR_Scheduler( 'poly', LearningRate, NUM_EPOCHS, len(train_dl)) #lr_scheduler=['poly', 'step', 'cos'] if warm_start: logging.info('warm_start: ' + last_checkpoint_path) net, _ = load_checkpoint(last_checkpoint_path, net) # using multi GPU if multi_gpu is not None: net = nn.DataParallel(net, device_ids=multi_gpu) #use sync_batchnorm #net = convert_model(net) diff = 0 best_val_metric = -0.1 optimizer.zero_grad() #seed = get_seed() #seed = SEED #logging.info('aug seed: '+str(seed)) #ia.imgaug.seed(seed) #np.random.seed(seed) for i_epoch in range(NUM_EPOCHS): ### adjust learning rate #scheduler.step(epoch=i_epoch) #print('lr: %f'%scheduler.get_lr()[0]) t0 = time.time() # iterate through trainset if multi_gpu is not None: net.module.set_mode('train') else: net.set_mode('train') train_loss_list, train_metric_list = [], [] #for seed in [1]:#[1, SEED]:#augment raw data with a duplicate one (augmented) #seed = get_seed() #np.random.seed(seed) #ia.imgaug.seed(i//10) for i, (image, masks) in enumerate(train_dl): ## adjust learning rate scheduler(optimizer, i, i_epoch, best_val_metric) input_data = image.to(device=device, dtype=torch.float) truth = masks.to(device=device, dtype=torch.float) #set_trace() logit, logit_clf = net(input_data) #[:, :3, :, :] if multi_gpu is not None: _train_loss = net.module.criterion(logit, truth, nonempty_only_loss, logit_clf) _train_metric = net.module.metric(logit, truth, nonempty_only_loss, logit_clf) #device='gpu' else: _train_loss = net.criterion(logit, truth, nonempty_only_loss, logit_clf) _train_metric = net.metric(logit, truth, nonempty_only_loss, logit_clf) #device='gpu' train_loss_list.append(_train_loss.item()) train_metric_list.append(_train_metric.item()) #.detach() #grandient accumulation step=2 acc_step = 1 _train_loss = _train_loss / acc_step _train_loss.backward() if (i + 1) % acc_step == 0: optimizer.step() optimizer.zero_grad() train_loss = np.mean(train_loss_list) train_metric = np.mean(train_metric_list) # compute valid loss & metrics (concatenate valid set in cpu, then compute loss, metrics on full valid set) net.module.set_mode('valid') with torch.no_grad(): val_loss_list, val_metric_list = [], [] for i, (image, masks) in enumerate(val_dl): input_data = 
image.to(device=device, dtype=torch.float) truth = masks.to(device=device, dtype=torch.float) logit, logit_clf = net(input_data) if multi_gpu is not None: _val_loss = net.module.criterion(logit, truth, nonempty_only_loss, logit_clf) _val_metric = net.module.metric(logit, truth, nonempty_only_loss, logit_clf) #device='gpu' else: _val_loss = net.criterion(logit, truth, nonempty_only_loss, logit_clf) _val_metric = net.metric(logit, truth, nonempty_only_loss, logit_clf) #device='gpu' val_loss_list.append(_val_loss.item()) val_metric_list.append(_val_metric.item()) #.detach() val_loss = np.mean(val_loss_list) val_metric = np.mean(val_metric_list) # logit_valid, truth_valid = None, None # for j, (image, masks) in enumerate(val_dl): # input_data = image.to(device=device, dtype=torch.float) # logit = net(input_data).cpu().float() # truth = masks.cpu().float() # if logit_valid is None: # logit_valid = logit # truth_valid = truth # else: # logit_valid = torch.cat((logit_valid, logit), dim=0) # truth_valid = torch.cat((truth_valid, truth), dim=0) # if multi_gpu is not None: # val_loss = net.module.criterion(logit_valid, truth_valid) # val_metric = net.module.metric(logit_valid, truth_valid) # else: # val_loss = net.criterion(logit_valid, truth_valid) # val_metric = net.metric(logit_valid, truth_valid) # Adjust learning_rate #scheduler.step(val_metric) #for 1024 trainging is harder, sometimes too early stop, force to at least train 40 epochs if i_epoch >= 10: #-1 if val_metric > best_val_metric: best_val_metric = val_metric is_best = True diff = 0 else: is_best = False diff += 1 if diff > early_stopping_round: logging.info( 'Early Stopping: val_metric does not increase %d rounds' % early_stopping_round) #print('Early Stopping: val_iou does not increase %d rounds'%early_stopping_round) break else: is_best = False #save checkpoint checkpoint_dict = \ { 'epoch': i, 'state_dict': net.module.state_dict() if multi_gpu is not None else net.state_dict(), 'optim_dict' : optimizer.state_dict(), 'metrics': {'train_loss': train_loss, 'val_loss': val_loss, 'train_metric': train_metric, 'val_metric': val_metric} } save_checkpoint(checkpoint_dict, is_best=is_best, checkpoint=checkpoint_path) #if i_epoch%20==0: if i_epoch > -1: logging.info( '[EPOCH %05d]train_loss, train_metric: %0.5f, %0.5f; val_loss, val_metric: %0.5f, %0.5f; time elapsed: %0.1f min' % (i_epoch, train_loss.item(), train_metric.item(), val_loss.item(), val_metric.item(), (time.time() - t0) / 60))
help="save once after N epochs") parser.add_argument('--end-epoch', type=int, default=20000, help="maximum number of training epoch") parser.add_argument('--random-seed', type=int, default=1, help='random seed (default: 1)') if not path.exists('models32'): os.mkdir('models32') if __name__ == "__main__": args = parser.parse_args() set_logger(log_file=args.log_file, debug_mode=args.debug_mode) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") torch.manual_seed(args.random_seed) train_loader = FeaturesLoader(features_path=args.features_path, annotation_path=args.annotation_path) """ val_loader = FeaturesLoader(features_path=args.features_path, annotation_path=args.annotation_path_test) """ train_iter = torch.utils.data.DataLoader(train_loader, batch_size=args.batch_size, shuffle=True, num_workers=8,
        predict_label = combined_result(predict_label, pattern='average')
    else:
        dev_acc = metrics_result[0]
        dev_f1 = metrics_result[1]
    logger.info("dev evaluate average Acc: {}, F1: {}".format(dev_acc, dev_f1))
    file_name = '{}_{}_{:>.6f}.csv'.format(
        strftime("%m%d-%H%M%S", localtime()), config.language, dev_f1)
    predict_to_save(predict_label, path=config.result_save_path,
                    file=file_name, prob_threshold=config.prob_threshold)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Chinese NER Task')
    parser.add_argument('--config', type=str, required=True,
                        help='choose a config file')
    args = parser.parse_args()

    config_name = args.config
    import_config = import_module('configs.' + config_name)
    config = import_config.Config()
    random_seed(config.seed)
    set_logger(config.logging_dir, to_file=config.is_logging2file)
    Task(config)