def main():
    # Check whether GPU is available and can be used
    # if CUDA is found then device is set accordingly
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    save_path = './runs/' + strftime("%Y-%m-%d_%H-%M-%S", gmtime())
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    log_file = os.path.join(save_path, "stdout")
    log = open(log_file, "a")

    # TODO: gives interrupted sys call error
    # log_file = os.path.join(save_path, "stdout")
    # sys.stdout = Logger(log_file)

    # Command line options
    args = parser.parse_args()
    print("Command line options:")
    for arg in vars(args):
        print(arg, getattr(args, arg))
        log.write(arg + ':' + str(getattr(args, arg)) + '\n')
    log.close()

    # Initialize the weights of the model
    print("Initializing network with: " + args.weight_init)
    WeightInitializer = WeightInit(args.weight_init)

    # Dataset loading
    # TODO: hard-coded file paths
    patch_size = args.patch_size
    data_init_method = getattr(datasets, args.dataset)
    dataset = data_init_method(torch.cuda.is_available(), args)

    gen = QLearner(state_space_parameters, 1, WeightInitializer, device, args, save_path,
                   qstore=args.qstore_path, replaydict=args.replay_dict_path)

    if args.continue_epsilon not in np.array(state_space_parameters.epsilon_schedule)[:, 0]:
        raise ValueError('continue-epsilon {} not in epsilon schedule!'.format(args.continue_epsilon))

    for episode in state_space_parameters.epsilon_schedule:
        epsilon = episode[0]
        M = episode[1]

        for ite in range(1, M + 1):
            if epsilon == args.continue_epsilon and args.continue_ite > M:
                raise ValueError('continue-ite {} not within range of continue-epsilon {} in epsilon schedule!'
                                 .format(args.continue_ite, epsilon))
            if (epsilon == args.continue_epsilon and ite >= args.continue_ite) or \
                    (epsilon < args.continue_epsilon):
                print('ite:{}, epsilon:{}'.format(ite, epsilon))
                gen.generate_net(epsilon, dataset)

    gen.replay_dictionary.to_csv(os.path.join(save_path, 'replayDictFinal.csv'))
    gen.qstore.save_to_csv(os.path.join(save_path, 'qValFinal.csv'))
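
# The commented-out `sys.stdout = Logger(log_file)` above refers to a Logger class that is not
# shown in this snippet. A minimal sketch of such a stdout tee, duplicating console output into
# the log file, could look like the following; this is an assumption, not the original helper.

import sys


class LoggerSketch(object):
    """Duplicates everything written to stdout into a log file (hypothetical sketch)."""

    def __init__(self, log_file):
        self.terminal = sys.stdout
        self.log = open(log_file, "a")

    def write(self, message):
        # write to the console and to the file
        self.terminal.write(message)
        self.log.write(message)

    def flush(self):
        # keep both streams in sync when print() flushes
        self.terminal.flush()
        self.log.flush()
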
def main():
    # Command line options
    args = parser.parse_args()
    print("Command line options:")
    for arg in vars(args):
        print(arg, getattr(args, arg))

    if args.cross_dataset and not args.incremental_data:
        raise ValueError('cross-dataset training possible only if incremental-data flag set')

    # Check whether GPU is available and can be used
    # if CUDA is found then device is set accordingly
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Launch a writer for the tensorboard summary writer instance
    save_path = 'runs/' + strftime("%Y-%m-%d_%H-%M-%S", gmtime()) + '_' + args.dataset + '_' + args.architecture + \
                '_variational_samples_' + str(args.var_samples) + '_latent_dim_' + str(args.var_latent_dim)

    # add option specific naming to separate tensorboard log files later
    if args.autoregression:
        save_path += '_pixelcnn'
    if args.incremental_data:
        save_path += '_incremental'
        if args.train_incremental_upper_bound:
            save_path += '_upper_bound'
        if args.generative_replay:
            save_path += '_genreplay'
        if args.openset_generative_replay:
            save_path += '_opensetreplay'
        if args.cross_dataset:
            save_path += '_cross_dataset_' + args.dataset_order

    # if we are resuming a previous training, note it in the name
    if args.resume:
        save_path = save_path + '_resumed'
    writer = SummaryWriter(save_path)

    # saving the parsed args to file
    log_file = os.path.join(save_path, "stdout")
    log = open(log_file, "a")
    for arg in vars(args):
        log.write(arg + ':' + str(getattr(args, arg)) + '\n')

    # Dataset loading
    data_init_method = getattr(datasets, args.dataset)
    dataset = data_init_method(torch.cuda.is_available(), args)
    # get the number of classes from the class dictionary
    num_classes = dataset.num_classes

    # we set an epoch multiplier to 1 for isolated training and increase it proportional to amount of tasks in CL
    epoch_multiplier = 1

    if args.incremental_data:
        from lib.Datasets.incremental_dataset import get_incremental_dataset

        # get the method to create the incremental dataset (inherits from the chosen data loader)
        inc_dataset_init_method = get_incremental_dataset(data_init_method, args)

        # different options for class incremental vs. cross-dataset experiments
        if args.cross_dataset:
            # if a task order file is specified, load the task order from it
            if args.load_task_order:
                # check if file exists and if file ends with extension '.txt'
                if os.path.isfile(args.load_task_order) and len(args.load_task_order) >= 4 \
                        and args.load_task_order[-4:] == '.txt':
                    print("=> loading task order from '{}'".format(args.load_task_order))
                    with open(args.load_task_order, 'rb') as fp:
                        task_order = pickle.load(fp)
                # if no file is found default to cmd line task order
                else:
                    # parse and split string at commas
                    task_order = args.dataset_order.split(',')
                    for i in range(len(task_order)):
                        # remove blank spaces in dataset names
                        task_order[i] = task_order[i].replace(" ", "")
            # use task order as specified in command line
            else:
                # parse and split string at commas
                task_order = args.dataset_order.split(',')
                for i in range(len(task_order)):
                    # remove blank spaces in dataset names
                    task_order[i] = task_order[i].replace(" ", "")

            # just for getting the number of classes in the first dataset
            num_classes = 0
            for i in range(args.num_base_tasks):
                temp_dataset_init_method = getattr(datasets, task_order[i])
                temp_dataset = temp_dataset_init_method(torch.cuda.is_available(), args)
                num_classes += temp_dataset.num_classes
                del temp_dataset

            # multiply epochs by number of tasks
            if args.num_increment_tasks:
                epoch_multiplier = ((len(task_order) - args.num_base_tasks) / args.num_increment_tasks) + 1
            else:
                # this branch will get active if num_increment_tasks is set to zero. This is useful when training
                # any isolated upper bound with all datasets present from the start.
                epoch_multiplier = 1.0
        else:  # class incremental
            # if specified load task order from file
            if args.load_task_order:
                if os.path.isfile(args.load_task_order):
                    print("=> loading task order from '{}'".format(args.load_task_order))
                    task_order = np.load(args.load_task_order).tolist()
                else:
                    # if no file is found a random task order is created
                    print("=> no task order found. Creating randomized task order")
                    task_order = np.random.permutation(num_classes).tolist()
            else:
                # if randomize task order is specified create a random task order, else task order is sequential
                task_order = []
                for i in range(dataset.num_classes):
                    task_order.append(i)

                if args.randomize_task_order:
                    task_order = np.random.permutation(num_classes).tolist()

            # save the task order
            np.save(os.path.join(save_path, 'task_order.npy'), task_order)
            # set the number of classes to base tasks + 1 because base tasks is always one less.
            # E.g. if you have 2 classes it's one task. This is a little inconsistent from the naming point of view,
            # but we wanted a single variable to work for both class incremental and cross-dataset experiments.
            num_classes = args.num_base_tasks + 1
            # multiply epochs by number of tasks
            epoch_multiplier = ((len(task_order) - (args.num_base_tasks + 1)) / args.num_increment_tasks) + 1

        print("Task order: ", task_order)
        # log the task order into the text file
        log.write('task_order:' + str(task_order) + '\n')
        args.task_order = task_order

        # this is a little weird, but it needs to be here because the below method pops items from task_order
        args_to_tensorboard(writer, args)

        assert epoch_multiplier.is_integer(), "uneven task division, make sure number of tasks are integers."

        # Get the incremental dataset
        dataset = inc_dataset_init_method(torch.cuda.is_available(), device, task_order, args)
    else:
        # add command line options to TensorBoard
        args_to_tensorboard(writer, args)

    log.close()

    # Get a sample input from the data loader to infer color channels/size
    net_input, _ = next(iter(dataset.train_loader))
    # get the amount of color channels in the input images
    num_colors = net_input.size(1)

    # import model from architectures class
    net_init_method = getattr(architectures, args.architecture)

    # if we are not building an autoregressive model the number of output channels of the model is equivalent to
    # the amount of input channels. For an autoregressive model we set the number of output channels of the
    # non-autoregressive decoder portion according to the command line option below
    if not args.autoregression:
        args.out_channels = num_colors

    # build the model
    model = net_init_method(device, num_classes, num_colors, args)

    # optionally add the autoregressive decoder
    if args.autoregression:
        model.pixelcnn = PixelCNN(device, num_colors, args.out_channels, args.pixel_cnn_channels,
                                  num_layers=args.pixel_cnn_layers, k=args.pixel_cnn_kernel_size,
                                  padding=args.pixel_cnn_kernel_size // 2)

    # Parallel container for multi GPU use and cast to available device
    model = torch.nn.DataParallel(model).to(device)
    print(model)

    # Initialize the weights of the model, by default according to He et al.
    print("Initializing network with: " + args.weight_init)
    WeightInitializer = WeightInit(args.weight_init)
    WeightInitializer.init_model(model)

    # Define optimizer and loss function (criterion)
    optimizer = torch.optim.Adam(model.parameters(), args.learning_rate)

    epoch = 0
    best_prec = 0
    best_loss = random.getrandbits(128)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            epoch = checkpoint['epoch']
            best_prec = checkpoint['best_prec']
            best_loss = checkpoint['best_loss']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # optimize until the final amount of epochs is reached. The final amount of epochs is determined through the
    # epoch multiplier, i.e. the number of epochs per task times the number of tasks.
    while epoch < (args.epochs * epoch_multiplier):
        # visualize the latent space before each task increment and at the end of training if it is 2-D
        if (epoch % args.epochs == 0 and epoch > 0) or (epoch + 1) % (args.epochs * epoch_multiplier) == 0:
            if model.module.latent_dim == 2:
                print("Calculating and visualizing dataset embedding")
                # infer the number of current tasks to plot the different classes in the embedding
                if args.incremental_data:
                    if args.cross_dataset:
                        num_tasks = sum(dataset.num_classes_per_task[:len(dataset.seen_tasks)])
                    else:
                        num_tasks = len(dataset.seen_tasks)
                else:
                    num_tasks = num_classes

                zs = get_latent_embedding(model, dataset.train_loader, num_tasks, device)
                visualize_dataset_in_2d_embedding(writer, zs, args.dataset, save_path, task=num_tasks)

        # continual learning specific part
        if args.incremental_data:
            # at the end of each task increment
            if epoch % args.epochs == 0 and epoch > 0:
                print('Saving the last checkpoint from the previous task ...')
                save_task_checkpoint(save_path, epoch // args.epochs)

                print("Incrementing dataset ...")
                dataset.increment_tasks(model, args.batch_size, args.workers, writer, save_path,
                                        is_gpu=torch.cuda.is_available(),
                                        upper_bound_baseline=args.train_incremental_upper_bound,
                                        generative_replay=args.generative_replay,
                                        openset_generative_replay=args.openset_generative_replay,
                                        openset_threshold=args.openset_generative_replay_threshold,
                                        openset_tailsize=args.openset_weibull_tailsize,
                                        autoregression=args.autoregression)

                # grow the classifier and increment the variable for number of overall classes so we can use it later
                if args.cross_dataset:
                    grow_classifier(model.module.classifier,
                                    sum(dataset.num_classes_per_task[:len(dataset.seen_tasks)])
                                    - model.module.num_classes, WeightInitializer)
                    model.module.num_classes = sum(dataset.num_classes_per_task[:len(dataset.seen_tasks)])
                else:
                    model.module.num_classes += args.num_increment_tasks
                    grow_classifier(model.module.classifier, args.num_increment_tasks, WeightInitializer)

                # reset moving averages etc. of the optimizer
                optimizer = torch.optim.Adam(model.parameters(), args.learning_rate)

            # change the number of seen classes
            if epoch % args.epochs == 0:
                model.module.seen_tasks = dataset.seen_tasks

        # train
        train(dataset, model, criterion, epoch, optimizer, writer, device, args)

        # evaluate on validation set
        prec, loss = validate(dataset, model, criterion, epoch, writer, device, save_path, args)

        # remember best prec@1 and save checkpoint
        is_best = loss < best_loss
        best_loss = min(loss, best_loss)
        best_prec = max(prec, best_prec)
        save_checkpoint({'epoch': epoch,
                         'arch': args.architecture,
                         'state_dict': model.state_dict(),
                         'best_prec': best_prec,
                         'best_loss': best_loss,
                         'optimizer': optimizer.state_dict()},
                        is_best, save_path)

        # increment epoch counters
        epoch += 1

        # if a new task begins reset the best prec so that a new best model can be stored.
        if args.incremental_data and epoch % args.epochs == 0:
            best_prec = 0
            best_loss = random.getrandbits(128)

    writer.close()
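
# `grow_classifier` is imported from elsewhere in the repository and is not shown here. A rough
# sketch of what such a helper typically does (widen the final linear layer by `class_increment`
# outputs while keeping the already trained weights) follows. The function name, the assumption
# that the classifier is an nn.Sequential ending in an nn.Linear, and the He initialization of the
# new rows are assumptions for illustration; the original delegates initialization to WeightInit.

import torch


def grow_classifier_sketch(classifier, class_increment):
    # assumes the last module of `classifier` (an nn.Sequential) is the nn.Linear output layer
    old_layer = classifier[-1]
    new_layer = torch.nn.Linear(old_layer.in_features,
                                old_layer.out_features + class_increment,
                                bias=old_layer.bias is not None).to(old_layer.weight.device)
    # initialize the enlarged layer, then copy the previously learned weights back in
    torch.nn.init.kaiming_normal_(new_layer.weight, nonlinearity='relu')
    with torch.no_grad():
        new_layer.weight[:old_layer.out_features] = old_layer.weight
        if old_layer.bias is not None:
            new_layer.bias.zero_()
            new_layer.bias[:old_layer.out_features] = old_layer.bias
    classifier[-1] = new_layer
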
def main():
    # Command line options
    args = parser.parse_args()
    print("Command line options:")
    for arg in vars(args):
        print(arg, getattr(args, arg))

    if args.debug:
        pdb.set_trace()

    # Check whether GPU is available and can be used
    # if CUDA is found then device is set accordingly
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Launch a writer for the tensorboard summary writer instance
    save_path = 'runs/' + strftime("%Y-%m-%d_%H-%M-%S", gmtime()) + '_' + args.dataset + '_' + args.architecture

    # if we are resuming a previous training, note it in the name
    if args.resume:
        save_path = save_path + '_resumed'
    writer = SummaryWriter(save_path)

    # saving the parsed args to file
    log_file = os.path.join(save_path, "stdout")
    log = open(log_file, "a")
    for arg in vars(args):
        log.write(arg + ':' + str(getattr(args, arg)) + '\n')

    # Dataset loading
    data_init_method = getattr(datasets, args.dataset)
    dataset = data_init_method(torch.cuda.is_available(), args)
    # get the number of classes from the class dictionary
    num_classes = dataset.num_classes

    # we set an epoch multiplier to 1 for isolated training and increase it proportional to amount of tasks in CL
    epoch_multiplier = 1

    # add command line options to TensorBoard
    args_to_tensorboard(writer, args)

    log.close()

    # build the model
    model = architectures.Inos_model(args.num_class, args)

    # Parallel container for multi GPU use and cast to available device
    model = torch.nn.DataParallel(model).to(device)
    print(model)

    if not args.pretrained:
        # Initialize the weights of the model, by default according to He et al.
        print("Initializing network with: " + args.weight_init)
        WeightInitializer = WeightInit(args.weight_init)
        WeightInitializer.init_model(model)

    # Define optimizer and loss function (criterion)
    optimizer = torch.optim.SGD(model.parameters(), args.learning_rate, momentum=0.9, weight_decay=2e-4)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[30, 60, 80, 100], gamma=0.5)

    epoch = 0
    best_prec = 0
    best_loss = random.getrandbits(128)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            epoch = checkpoint['epoch']
            best_prec = checkpoint['best_prec']
            best_loss = checkpoint['best_loss']
            model.load_state_dict(checkpoint['state_dict'])
            # optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # optimize until the final amount of epochs is reached. The final amount of epochs is determined through the
    # epoch multiplier.
    while epoch < (args.epochs * epoch_multiplier):
        # train
        train(dataset, model, criterion, epoch, optimizer, writer, device, args)

        # evaluate on validation set
        prec, loss = validate(dataset, model, criterion, epoch, writer, device, save_path, args)

        # evaluate on test set
        prec_t, loss_t = test(dataset, model, criterion, epoch, writer, device, save_path, args)

        # remember best prec@1 and save checkpoint
        is_best = loss < best_loss
        best_loss = min(loss, best_loss)
        best_prec = max(prec, best_prec)
        save_checkpoint({'epoch': epoch,
                         'arch': args.architecture,
                         'state_dict': model.state_dict(),
                         'best_prec': best_prec,
                         'best_loss': best_loss,
                         'optimizer': optimizer.state_dict()},
                        is_best, save_path)

        # increment epoch counters
        epoch += 1
        scheduler.step()

    writer.close()
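
# `save_checkpoint` is used throughout these scripts but defined elsewhere. It presumably follows
# the common PyTorch ImageNet-example pattern; a minimal sketch under that assumption (the file
# names and the helper's signature are illustrative, not taken from the repository):

import os
import shutil

import torch


def save_checkpoint_sketch(state, is_best, save_path, filename='checkpoint.pth.tar'):
    # always store the latest state and additionally copy it if it is the best seen so far
    checkpoint_path = os.path.join(save_path, filename)
    torch.save(state, checkpoint_path)
    if is_best:
        shutil.copyfile(checkpoint_path, os.path.join(save_path, 'model_best.pth.tar'))
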
def main():
    # Command line options
    args = parser.parse_args()
    print("Command line options:")
    for arg in vars(args):
        print(arg, getattr(args, arg))

    # Check whether GPU is available and can be used
    # if CUDA is found then device is set accordingly
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    cudnn.benchmark = True
    num_GPUs = torch.cuda.device_count()

    # If save directory for runs doesn't exist then create it
    if not os.path.exists('runs'):
        os.mkdir('runs')

    # Create a time-stamped save path for individual experiment
    save_path = 'runs/' + strftime("%Y-%m-%d_%H-%M-%S", gmtime()) + \
                ';' + args.dataset + ';' + args.architecture
    os.mkdir(save_path)

    # List of values to log to csv
    columns_list = ['Filters', 'Parameters', 'Mean', 'Variance', 'Skew', 'BestVal', 'BestValsTrain',
                    'BestEpoch', 'LastValPrec', 'LastTrainPrec', 'AllTrain', 'AllVal']
    df = pd.DataFrame(columns=columns_list)

    # Dataset loading
    data_init_method = getattr(datasets, args.dataset)
    dataset = data_init_method(torch.cuda.is_available(), args)

    # get the amount of color channels in the input images
    net_input, _ = next(iter(dataset.train_loader))
    num_colors = net_input.size(1)

    # import model from architectures class
    net_init_method = getattr(architectures, args.architecture)

    # Get the parameters for all valid skewed models
    SNModels = SkewNormalModels(depth=args.vgg_depth, num_classes=dataset.num_classes,
                                patch_size=args.patch_size)
    skew_model_params = SNModels.get_valid_models()
    print("Total number of models: ", len(skew_model_params["filters"]))

    # Weight-init method
    WeightInitializer = WeightInit(args.weight_init)

    # Optionally resume a previous experiment
    current_id = args.resume_model_id
    for i in range(len(skew_model_params["filters"]) - current_id):
        print("Model filters: ", skew_model_params["filters"][i + current_id])
        print("Model parameters: ", skew_model_params["total_params"][i + current_id],
              " mean: ", skew_model_params["means"][i + current_id],
              " var: ", skew_model_params["vars"][i + current_id],
              " skew: ", skew_model_params["skews"][i + current_id])

        model = net_init_method(device, dataset.num_classes, num_colors, args,
                                skew_model_params["filters"][i + current_id], custom_filters=True)

        # Parallel container for multi GPU use and cast to available device
        model = torch.nn.DataParallel(model).to(device)
        print(model)

        # Initialize the weights of the model
        print("Initializing networks with: " + args.weight_init)
        WeightInitializer.init_model(model)

        # Define criterion and optimizer
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.SGD(model.parameters(), args.learning_rate,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay,
                                    nesterov=args.nesterov)

        # Initialize SGDWR learning rate scheduler
        lr_scheduler = LearningRateScheduler(args.lr_wr_epochs, len(dataset.train_loader.dataset),
                                             args.batch_size, args.learning_rate,
                                             args.lr_wr_mul, args.lr_wr_min)

        # Get estimated GPU memory usage of the model and split batch if too little memory is available
        if torch.cuda.is_available():
            GPUMemory = GPUMem(torch.cuda.is_available())
            print('available:{}'.format(
                (GPUMemory.total_mem - GPUMemory.total_mem * GPUMemory.get_mem_util()) / 1024.))
            print('required per gpu with buffer: {}'.format(
                (4. / float(num_GPUs) * model.module.gpu_usage) + 1.))

            # calculate smaller chunk size to split batch into sequential computations
            mem_scale_factor = 4.0  # TODO: WEIRD factor... why is this necessary and where does it come from?
            # TODO: the + 1 Gb should be taken from the cache allocator
            if ((GPUMemory.total_mem - GPUMemory.total_mem * GPUMemory.get_mem_util()) / 1024.) < \
                    ((mem_scale_factor / float(num_GPUs) * model.module.gpu_usage) + 1.):
                # code for variable batch size implementation as per gpu constraint; remove for old code
                approx_small_batch_size = \
                    (((GPUMemory.total_mem - GPUMemory.total_mem * GPUMemory.get_mem_util()) / 1024. - 1.)
                     * float(num_GPUs) / mem_scale_factor) // \
                    (model.module.gpu_usage / float(args.batch_size))

                diff = float('inf')
                temp_small_batch_size = approx_small_batch_size
                for j in range(1, (args.batch_size // 2) + 1):
                    if args.batch_size % j == 0 and abs(j - approx_small_batch_size) < diff:
                        diff = abs(j - approx_small_batch_size)
                        temp_small_batch_size = j
                batch_seq_split_size = temp_small_batch_size
            else:
                batch_seq_split_size = args.batch_size
        else:
            batch_seq_split_size = args.batch_size

        # Get training and validation dataset loaders
        dataset.train_loader, dataset.val_loader = dataset.get_dataset_loader(batch_seq_split_size,
                                                                              args.workers, device)
        print('sequential batch size split size:{}'.format(batch_seq_split_size))

        epoch = 0
        best_epoch = 0
        best_prec = 0
        best_val_train_prec = 0
        all_train = []
        all_val = []

        while epoch < args.epochs:
            # train for one epoch
            train_prec = train(dataset.train_loader, model, criterion, epoch, optimizer,
                               lr_scheduler, device, batch_seq_split_size, args)

            # evaluate on validation set
            prec = validate(dataset.val_loader, model, criterion, epoch, device, args)

            all_train.append(train_prec)
            all_val.append(prec)

            # remember best prec@1 and save checkpoint
            is_best = prec > best_prec
            if is_best:
                best_epoch = epoch
                best_val_train_prec = train_prec
                best_prec = prec

            # if architecture doesn't train at all skip it
            if epoch == args.lr_wr_epochs - 1 and train_prec < (2 * 100.0 / dataset.num_classes):
                break

            # increment epoch counters
            epoch += 1
            lr_scheduler.scheduler_epoch += 1

        # append architecture results to csv
        df = df.append(pd.DataFrame([[skew_model_params["filters"][i + current_id],
                                      skew_model_params["total_params"][i + current_id],
                                      skew_model_params["means"][i + current_id],
                                      skew_model_params["vars"][i + current_id],
                                      skew_model_params["skews"][i + current_id],
                                      best_prec, best_val_train_prec, best_epoch, prec, train_prec,
                                      all_train, all_val]], columns=columns_list),
                       ignore_index=True)
        df.to_csv(save_path + '/model_%03d' % (i + 1 + current_id) + '.csv')

        del model
        del optimizer
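
# The `LearningRateScheduler` above is the repository's SGD warm-restart scheduler and is not shown
# in this snippet. Assuming it follows Loshchilov & Hutter's SGDR cosine annealing, the per-step
# learning rate it computes would look roughly like the sketch below; the function name and
# parameters (lr_max, lr_min, t_cur, t_restart) are illustrative, not the original interface.

import math


def sgdr_learning_rate(lr_max, lr_min, t_cur, t_restart):
    # cosine anneal from lr_max down to lr_min over one restart cycle of length t_restart,
    # where t_cur is the (possibly fractional) number of epochs since the last warm restart
    return lr_min + 0.5 * (lr_max - lr_min) * (1. + math.cos(math.pi * t_cur / t_restart))
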
def main():
    # set device for torch computations
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    save_path = './runs/' + strftime("%Y-%m-%d_%H-%M-%S", gmtime())
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    # parse command line arguments
    args = parser.parse_args()
    print("Command line options:")
    for arg in vars(args):
        print(arg, getattr(args, arg))

    # create log file
    log_file = os.path.join(save_path, "stdout")

    # write parsed args to log file
    log = open(log_file, "a")
    for arg in vars(args):
        print(arg, getattr(args, arg))
        log.write(arg + ':' + str(getattr(args, arg)) + '\n')
    log.close()

    # instantiate the weight initializer
    print("Initializing network with: " + args.weight_init)
    weight_initializer = WeightInit(args.weight_init)

    # instantiate dataset object
    data_init_method = getattr(datasets, args.dataset)
    dataset = data_init_method(torch.cuda.is_available(), args)

    # instantiate a tabular Q-learner
    q_learner = QLearner(args, dataset.num_classes, save_path)

    # start new architecture search
    if int(args.task) == 1:
        if args.continue_search is True:
            # raise exceptions if the requirements to continue a search are not met
            if args.continue_epsilon not in np.array(state_space_parameters.epsilon_schedule)[:, 0]:
                raise ValueError('continue-epsilon {} not in epsilon schedule!'.format(args.continue_epsilon))
            if (args.replay_buffer_csv_path is None) or (not os.path.exists(args.replay_buffer_csv_path)):
                raise ValueError('specify correct path to replay buffer to continue search')
            if (args.q_values_csv_path is None) or (not os.path.exists(args.q_values_csv_path)):
                raise ValueError('wrong path is specified for Q-values')

        # iterate as per the epsilon-greedy schedule
        for episode in state_space_parameters.epsilon_schedule:
            epsilon = episode[0]
            m = episode[1]

            # raise exception if the net number to continue from is greater than the number of nets for the
            # continue_epsilon
            if epsilon == args.continue_epsilon and args.continue_ite > m:
                raise ValueError('continue-ite {} not within range of continue-epsilon {} in epsilon schedule!'
                                 .format(args.continue_ite, epsilon))

            # iterate through the number of nets for an epsilon
            for ite in range(1, m + 1):
                # check conditions to generate and train an architecture
                if (epsilon == args.continue_epsilon and ite >= args.continue_ite) or \
                        (epsilon < args.continue_epsilon):
                    print('ite:{}, epsilon:{}'.format(ite, epsilon))

                    # generate net states for search
                    q_learner.generate_search_net_states(epsilon)

                    # check if net has already been trained before
                    search_net_in_replay_dict = q_learner.check_search_net_in_replay_buffer()

                    # add to the end of the replay buffer if net has already been trained before
                    if search_net_in_replay_dict:
                        q_learner.add_search_net_to_replay_buffer(search_net_in_replay_dict, verbose=True)
                    # train net if net has not been trained before
                    else:
                        # train/val search net
                        mem_fit, spp_size, hard_best_val, hard_val_all_epochs, soft_best_val, soft_val_all_epochs, \
                            train_flag, hard_best_background, hard_best_crack, hard_best_spallation, \
                            hard_best_exposed_bars, hard_best_efflorescence, hard_best_corrosion_stain = \
                            train_val_net(q_learner.state_list, dataset, weight_initializer, device, args, save_path)

                        # check if net fits memory
                        while mem_fit is False:
                            print("net failed mem check even with batch splitting, sampling again!")
                            q_learner.generate_search_net_states(epsilon)
                            net_in_replay_dict = q_learner.check_search_net_in_replay_buffer()

                            if net_in_replay_dict:
                                q_learner.add_search_net_to_replay_buffer(net_in_replay_dict)
                                break
                            else:
                                mem_fit, spp_size, hard_best_val, hard_val_all_epochs, soft_best_val, \
                                    soft_val_all_epochs, train_flag, hard_best_background, hard_best_crack, \
                                    hard_best_spallation, hard_best_exposed_bars, hard_best_efflorescence, \
                                    hard_best_corrosion_stain = \
                                    train_val_net(q_learner.state_list, dataset, weight_initializer, device,
                                                  args, save_path)

                        # add new net and performance measures to replay buffer if it fits in memory after
                        # splitting the batch
                        if mem_fit:
                            reward = q_learner.accuracies_to_reward(hard_val_all_epochs)
                            q_learner.add_search_net_to_replay_buffer(
                                search_net_in_replay_dict, spp_size=spp_size, reward=reward,
                                hard_best_val=hard_best_val, hard_val_all_epochs=hard_val_all_epochs,
                                soft_best_val=soft_best_val, soft_val_all_epochs=soft_val_all_epochs,
                                train_flag=train_flag, hard_best_background=hard_best_background,
                                hard_best_crack=hard_best_crack, hard_best_spallation=hard_best_spallation,
                                hard_best_exposed_bars=hard_best_exposed_bars,
                                hard_best_efflorescence=hard_best_efflorescence,
                                hard_best_corrosion_stain=hard_best_corrosion_stain, verbose=True)

                    # sample nets from replay buffer, update Q-values and save partially filled replay buffer and
                    # Q-values
                    q_learner.update_q_values_and_save_partial()

        # save fully filled replay buffer and final Q-values
        q_learner.save_final()

    # load single architecture config from replay buffer and train till convergence
    elif int(args.task) == 2:
        # raise exceptions if the requirements to train a fixed net are not met
        if (args.replay_buffer_csv_path is None) or (not os.path.exists(args.replay_buffer_csv_path)):
            raise ValueError('wrong path specified for replay buffer')
        if int(args.fixed_net_index_no) < 0:
            raise ValueError('specify a non negative integer for fixed net index')

        # generate states for fixed net from a complete search
        q_learner.generate_fixed_net_states()

        # train/val fixed net exhaustively
        mem_fit, spp_size, hard_best_val, hard_val_all_epochs, soft_best_val, soft_val_all_epochs, train_flag, \
            hard_best_background, hard_best_crack, hard_best_spallation, hard_best_exposed_bars, \
            hard_best_efflorescence, hard_best_corrosion_stain = \
            train_val_net(q_learner.state_list, dataset, weight_initializer, device, args, save_path)

        # add fixed net and performance measures to a data frame and save it
        q_learner.add_fixed_net_to_fixed_net_buffer(
            spp_size=spp_size, hard_best_val=hard_best_val, hard_val_all_epochs=hard_val_all_epochs,
            soft_best_val=soft_best_val, soft_val_all_epochs=soft_val_all_epochs,
            hard_best_background=hard_best_background, hard_best_crack=hard_best_crack,
            hard_best_spallation=hard_best_spallation, hard_best_exposed_bars=hard_best_exposed_bars,
            hard_best_efflorescence=hard_best_efflorescence, hard_best_corrosion_stain=hard_best_corrosion_stain)

        # save fixed net buffer
        q_learner.save_final()

    # raise exception if no matching task
    else:
        raise NotImplementedError('Given task no. not implemented.')
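
# As used above, `state_space_parameters.epsilon_schedule` is a list of [epsilon, number_of_nets]
# pairs that defines the epsilon-greedy exploration schedule: episode[0] is the exploration rate
# and episode[1] the number of architectures sampled at that rate. The concrete values below are
# purely illustrative (MetaQNN-style schedules look roughly like this), not the ones used here.

epsilon_schedule_example = [
    [1.0, 100],  # sample architectures fully at random
    [0.7, 10],   # then mix greedy choices from the Q-table with random exploration
    [0.5, 10],
    [0.3, 10],
    [0.1, 10],   # finally, mostly exploit the learned Q-values
]
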
def main():
    # Command line options
    args = parser.parse_args()
    print("Command line options:")
    for arg in vars(args):
        print(arg, getattr(args, arg))

    # import the correct loss and training functions depending on which model to optimize
    # TODO: these could easily be refactored into one function, but we kept it this way for modularity
    if args.train_var:
        if args.joint:
            from lib.Training.train import train_var_joint as train
            from lib.Training.validate import validate_var_joint as validate
            from lib.Training.loss_functions import var_loss_function_joint as criterion
        else:
            from lib.Training.train import train_var as train
            from lib.Training.validate import validate_var as validate
            from lib.Training.loss_functions import var_loss_function as criterion
    else:
        if args.joint:
            from lib.Training.train import train_joint as train
            from lib.Training.validate import validate_joint as validate
            from lib.Training.loss_functions import loss_function_joint as criterion
        else:
            from lib.Training.train import train as train
            from lib.Training.validate import validate as validate
            from lib.Training.loss_functions import loss_function as criterion

    # Check whether GPU is available and can be used
    # if CUDA is found then device is set accordingly
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Launch a writer for the tensorboard summary writer instance
    save_path = 'runs/' + strftime("%Y-%m-%d_%H-%M-%S", gmtime()) + '_' + args.dataset + '_' + args.architecture + \
                '_dropout_' + str(args.dropout)
    if args.train_var:
        save_path += '_variational_samples_' + str(args.var_samples) + '_latent_dim_' + str(args.var_latent_dim)
    if args.joint:
        save_path += '_joint'

    # if we are resuming a previous training, note it in the name
    if args.resume:
        save_path = save_path + '_resumed'
    writer = SummaryWriter(save_path)

    # saving the parsed args to file
    log_file = os.path.join(save_path, "stdout")
    log = open(log_file, "a")
    for arg in vars(args):
        log.write(arg + ':' + str(getattr(args, arg)) + '\n')

    # Dataset loading
    data_init_method = getattr(datasets, args.dataset)
    dataset = data_init_method(torch.cuda.is_available(), args)

    # get the number of classes from the class dictionary
    num_classes = dataset.num_classes

    # add command line options to TensorBoard
    args_to_tensorboard(writer, args)

    log.close()

    # Get a sample input from the data loader to infer color channels/size
    net_input, _ = next(iter(dataset.train_loader))
    # get the amount of color channels in the input images
    num_colors = net_input.size(1)

    # import model from architectures class
    net_init_method = getattr(architectures, args.architecture)

    # build the model
    model = net_init_method(device, num_classes, num_colors, args)

    # Parallel container for multi GPU use and cast to available device
    model = torch.nn.DataParallel(model).to(device)
    print(model)

    # Initialize the weights of the model, by default according to He et al.
    print("Initializing network with: " + args.weight_init)
    WeightInitializer = WeightInit(args.weight_init)
    WeightInitializer.init_model(model)

    # Define optimizer and loss function (criterion)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay)

    epoch = 0
    best_prec = 0
    best_loss = random.getrandbits(128)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            epoch = checkpoint['epoch']
            best_prec = checkpoint['best_prec']
            best_loss = checkpoint['best_loss']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # optimize until the final amount of epochs is reached.
    while epoch < args.epochs:
        # train
        train(dataset, model, criterion, epoch, optimizer, writer, device, args)

        # evaluate on validation set
        prec, loss = validate(dataset, model, criterion, epoch, writer, device, args)

        # remember best prec@1 and save checkpoint
        is_best = loss < best_loss
        best_loss = min(loss, best_loss)
        best_prec = max(prec, best_prec)
        save_checkpoint({'epoch': epoch,
                         'arch': args.architecture,
                         'state_dict': model.state_dict(),
                         'best_prec': best_prec,
                         'best_loss': best_loss,
                         'optimizer': optimizer.state_dict()},
                        is_best, save_path)

        # increment epoch counters
        epoch += 1

    writer.close()