def local_test_train(i, data_path):
    '''
    Generates a debugging data point for CIFAR-10.

    For use on a local CPU to check that the pipeline works end to end.
    A try/except structure is used so that a failed architecture is simply retried.
    '''
    ##
    success_nb = 0
    while success_nb < 1:
        try:
            gene = Gene_data()
            number_parameters = 0
            while number_parameters < gene.para_min or number_parameters > gene.para_max:
                gene.create_draft_order()
                gene.create_architecture_order()
                mdl, init_algorithm_list, init_hyerparam_list = gene.create_mdl()
                number_parameters = count_nb_params(mdl)
            print(init_algorithm_list)
            print(init_hyerparam_list)
            print(number_parameters)
            ##
            trainloader, valloader, testloader = get_cifar10_for_data_point_mdl_gen()
            print(mdl)
            ##
            start = time.time()
            mdl = mdl.to(device)
            epochs = 1
            optimizer = torch.optim.Adam(mdl.parameters())
            scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[], gamma=1.0)
            criterion = nn.CrossEntropyLoss()
            error_criterion = metrics.error_criterion
            init_params = list(mdl.parameters())  # NOTE: live references, these change as the model trains
            stats_collector = StatsCollector()
            iterations, train_iterations = 1, 1  # CAREFUL: it might be returning 1; for real production we want it to be greater than 1
            trainer = Trainer(trainloader, valloader, testloader, optimizer, scheduler, criterion,
                              error_criterion, stats_collector, device)
            train_loss, train_error, val_loss, val_error, test_loss, test_error = trainer.train_and_track_stats(
                mdl, epochs, iterations=iterations, train_iterations=train_iterations)
            final_params = list(mdl.parameters())
            ## save data point
            how_long, hours = timeSince(start)
            print(f'hours = {hours}')
            print(f'{how_long}')
            mdl_name = f'debug_{i}'
            other_data = trainer.stats_collector.get_stats_dict({'error_criterion': error_criterion.__name__})
            batch_size_train = trainloader.batch_size
            batch_size_test = testloader.batch_size
            batch_size_val = valloader.batch_size
            save_model_info(data_path, mdl, init_params, final_params,
                            train_loss, train_error, val_loss, val_error, test_loss, test_error,
                            optimizer, epochs, criterion, error_criterion, hours, mdl_name,
                            init_algorithm_list, init_hyerparam_list,
                            batch_size_train, batch_size_val, batch_size_test,
                            number_parameters, scheduler, other_data)
            success_nb = success_nb + 1
            print('Success')
        except Exception as e:
            print('FAIL')
            print(e)
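# A minimal, hypothetical usage sketch for local_test_train above: generate a couple of
# debug data points on a local CPU. The scratch path and the number of runs are
# illustrative assumptions, not part of the original pipeline.
def _run_local_debug_example(nb_points=2, debug_data_path='./data/automl_dataset_debug_local'):
    for i in range(nb_points):
        local_test_train(i, debug_data_path)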
def generate_debug_dataset(self):
    '''
    Generates a debugging dataset for CIFAR-10.
    '''
    self.path.mkdir(exist_ok=True)
    iterations, train_iterations = 1, 1  # CAREFUL: it might be returning 1; for real production we want it to be greater than 1
    mdls = get_debug_models()
    print()
    for i in range(len(mdls)):
        print(f'---> mdl_{i}')
        start = time.time()
        ## generate mdl data point
        mdl = mdls[i].to(device)
        epochs = random.randint(self.min_train_epochs, self.max_train_epochs)  # CAREFUL: it might be returning 1; for real production we want it to be greater than 1
        optimizer = torch.optim.Adam(mdl.parameters())
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[], gamma=1.0)
        criterion = nn.CrossEntropyLoss()
        error_criterion = metrics.error_criterion
        init_params = list(mdl.parameters())  # NOTE: live references, these change as the model trains
        stats_collector = StatsCollector()
        trainer = Trainer(self.trainloader, self.valloader, self.testloader, optimizer, scheduler,
                          criterion, error_criterion, stats_collector, device)
        train_loss, train_error, val_loss, val_error, test_loss, test_error = trainer.train_and_track_stats(
            mdl, epochs, iterations=iterations, train_iterations=train_iterations)
        final_params = list(mdl.parameters())
        ## save data point
        how_long, hours = timeSince(start)
        print(f'hours = {hours}')
        print(f'{how_long}')
        mdl_name = f'debug_{i}'
        other_data = trainer.stats_collector.get_stats_dict({'error_criterion': error_criterion.__name__})
        # TODO: fix later
        init_algorithm_list = 'default'
        init_hyerparam_list = torch.__version__
        number_parameters = count_nb_params(mdl)
        ##
        batch_size_train, batch_size_val, batch_size_test = self.trainloader.batch_size, self.valloader.batch_size, self.testloader.batch_size
        data_path = str(self.path)
        save_model_info(data_path, mdl, init_params, final_params,
                        train_loss, train_error, val_loss, val_error, test_loss, test_error,
                        optimizer, epochs, criterion, error_criterion, hours, mdl_name,
                        init_algorithm_list, init_hyerparam_list,
                        batch_size_train, batch_size_val, batch_size_test,
                        number_parameters, scheduler=scheduler, other_data=other_data)
        print(f'--> mdl_{i} data point saved!\n')
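# A hypothetical usage sketch for generate_debug_dataset above. The enclosing generator
# class is not shown here; 'DebugDatasetGenerator' and its constructor arguments below are
# illustrative assumptions. The method itself only relies on self.path,
# self.min_train_epochs, self.max_train_epochs and the three CIFAR-10 dataloaders.
def _run_generate_debug_dataset_example():
    trainloader, valloader, testloader = get_cifar10_for_data_point_mdl_gen()
    generator = DebugDatasetGenerator(path=Path('./data/automl_dataset_debug'),
                                      min_train_epochs=1, max_train_epochs=2,
                                      trainloader=trainloader, valloader=valloader,
                                      testloader=testloader)
    generator.generate_debug_dataset()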
def main(i, gene, data_path, epochs, mdl_name):
    '''
    The main train function, to be used on GPU. Uses a try/except structure so that a
    failed architecture is simply retried.

    i: index of the data point being generated; one data point is saved per successful call.
    gene: a Gene_data object. Its default parameters can be changed; the defaults are:
        (min_conv_n=1, max_conv_n=7, min_fc_n=1, max_fc_n=7, para_min=40000,
         min_filter=26, min_fc=32, max_filter=32, max_fc=256, max_para_times=50,
         flag=True, default_init_w_algor=False)
    data_path: the root directory where the training results are saved.
    epochs: number of epochs to train.
    mdl_name: the name under which the model is saved.
    '''
    # get model type
    success_nb = 0
    while success_nb < 1:
        try:
            number_parameters = 0
            while number_parameters < gene.para_min or number_parameters > gene.para_max:
                gene.generate_random_inputs()
                gene.create_architecture_order()
                mdl, init_algorithm_list, init_hyerparam_list = gene.create_mdl()
                number_parameters = count_nb_params(mdl)
            print(number_parameters)
            trainloader, valloader, testloader = get_cifar10_for_data_point_mdl_gen()
            ## create directory to save models
            make_and_check_dir(data_path)
            ## start creating models and its variations
            start = time.time()
            ## generate mdl data point
            mdl = mdl.to(device)
            optimizer = torch.optim.Adam(mdl.parameters())
            criterion = nn.CrossEntropyLoss()
            error_criterion = metrics.error_criterion
            scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[], gamma=1.0)
            stats_collector = StatsCollector()
            trainer = Trainer(trainloader, valloader, testloader, optimizer, scheduler, criterion,
                              error_criterion, stats_collector, device)
            init_params = list(mdl.parameters())  # NOTE: live references, these change as the model trains
            train_loss, train_error, val_loss, val_error, test_loss, test_error = trainer.train_and_track_stats(mdl, epochs)
            final_params = list(mdl.parameters())
            ## save data point
            how_long, seconds, minutes, hours = report_times(start)
            print(f'hours = {hours}')
            print(how_long)
            # mdl_name = f'tower_mdl_{i}'
            other_data = trainer.stats_collector.get_stats_dict({'error_criterion': error_criterion.__name__})
            batch_size_train = trainloader.batch_size
            batch_size_test = testloader.batch_size
            batch_size_val = valloader.batch_size
            save_model_info(data_path, mdl, init_params, final_params,
                            train_loss, train_error, val_loss, val_error, test_loss, test_error,
                            optimizer, epochs, criterion, error_criterion, hours, mdl_name,
                            init_algorithm_list, init_hyerparam_list,
                            batch_size_train, batch_size_val, batch_size_test,
                            number_parameters, scheduler, other_data)
            success_nb = success_nb + 1
            print('Success')
        except Exception as e:
            print('FAIL')
            print(e)
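# A minimal, hypothetical usage sketch for main(i, gene, data_path, epochs, mdl_name) above:
# generate a handful of data points on the GPU, one call per data point. The save path,
# number of data points and epoch count are illustrative assumptions; Gene_data() is used
# with the defaults listed in the docstring.
def _run_gpu_data_generation_example(nb_data_points=3, epochs=2,
                                     data_path='./data/automl_dataset_gpu'):
    for i in range(nb_data_points):
        gene = Gene_data()  # defaults; e.g. para_min or max_conv_n could be changed here
        main(i, gene, data_path, epochs, mdl_name=f'mdl_{i}')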
def main():
    '''
    Trains the meta-learner (ChainLSTM) on the meta-learning data set.
    '''
    USE_CUDA = torch.cuda.is_available()
    device = torch.device("cuda" if USE_CUDA else "cpu")
    ## paths to automl data set
    # data_path = '~/predicting_generalization/automl/data/automl_dataset_debug'
    data_path_save = '/home/xiaot6/cs446-project-fa2019/automl/data/set1'  # where results are stored
    data_path_test = '/home/xiaot6/split_set/test'
    data_path_train = '/home/xiaot6/split_set/train'
    data_path_val = '/home/xiaot6/split_set/val'
    path = Path(data_path_save).expanduser()
    ## Vocab
    vocab = Vocab()
    V_a, V_hp = len(vocab.architecture_vocab), len(vocab.hparms_vocab)
    ## create dataloader for meta learning data set
    batch_first = True
    # dataset = MetaLearningDataset(data_path, vocab)
    dataset_test = MetaLearningDataset(data_path_test, vocab)
    dataset_train = MetaLearningDataset(data_path_train, vocab)
    dataset_val = MetaLearningDataset(data_path_val, vocab)
    collate_fn = Collate_fn_onehot_general_features(device, batch_first, vocab)
    batch_size = 512
    # dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn)
    trainloader = torch.utils.data.DataLoader(dataset_train, batch_size=batch_size, collate_fn=collate_fn)
    testloader = torch.utils.data.DataLoader(dataset_test, batch_size=batch_size, collate_fn=collate_fn)
    valloader = torch.utils.data.DataLoader(dataset_val, batch_size=batch_size, collate_fn=collate_fn)
    ## instantiate Meta Learner
    # arch hps
    arch_input_size = V_a
    arch_hidden_size = 16
    arch_num_layers = 1
    # arch_hp hps
    arch_hp_input_size = V_hp
    arch_hp_hidden_size = arch_hidden_size
    arch_hp_num_layers = 1
    # opt hps
    # st()
    # input1 = dataset[0]['train_history'].view(batch_size, -1)
    # input2 = dataset[0]['val_history'].view(batch_size, -1)
    # st()
    # seq_len = len(dataset[0]['test_errors'])  # since they all have the same seq_len
    seq_len = len(dataset_test[0]['test_errors'])  # since they all have the same seq_len
    input_dim = 4  # 4 because we track CE loss and error for both train and val, so 2*2 = 4
    opt_input_size = input_dim  # so that it processes one time step of the history at a time: [train_err, train_loss, val_loss, val_err]
    opt_hidden_size = arch_hidden_size
    opt_num_layers = 1
    # weight stats
    weight_stats_input_size = 3  # 3 because we only process the init-param stats mu, std, l2; if we also processed all final-param stats this would be 6
    weight_stats_hidden_size = arch_hidden_size
    weight_stats_layers = 1
    ## train error hps
    train_err_input_size = 1
    train_err_hidden_size = arch_hidden_size
    num_layers_num_layers = 1
    # meta-learner chain lstm
    meta_learner = ChainLSTM(arch_input_size=arch_input_size, arch_hidden_size=arch_hidden_size, arch_num_layers=1,
                             arch_hp_input_size=arch_hp_input_size, arch_hp_hidden_size=arch_hp_hidden_size, arch_hp_num_layers=1,
                             weight_stats_input_size=weight_stats_input_size, weight_stats_hidden_size=weight_stats_hidden_size, weight_stats_layers=weight_stats_layers,
                             opt_input_size=opt_input_size, opt_hidden_size=opt_hidden_size, opt_num_layers=opt_num_layers,
                             train_err_input_size=train_err_input_size, train_err_hidden_size=train_err_hidden_size,
                             num_layers_num_layers=num_layers_num_layers, device=device)
    ##
    # trainloader, valloader, testloader = dataloader, dataloader, dataloader  # TODO this is just for the sake of an example!
    optimizer = torch.optim.Adam(meta_learner.parameters())
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[], gamma=1.0)
    criterion = torch.nn.MSELoss()
    # error_criterion = criterion  # TODO: implement epsilon classification loss
    init_params = list(meta_learner.parameters())
    error_criterion = metrics.error_criterion
    stats_collector = StatsCollector()
    trainer = Trainer(trainloader, valloader, testloader, optimizer, scheduler, criterion,
                      error_criterion, stats_collector, device)
    ##
    final_params = list(meta_learner.parameters())
    batch_size_train = trainloader.batch_size
    batch_size_test = testloader.batch_size
    batch_size_val = valloader.batch_size
    nb_epochs = 3  # 500 # 50
    train_iterations = inf  # TODO: CHANGE for model to be fully trained!!!
    train_loss, train_error, val_loss, val_error, test_loss, test_error = trainer.train_and_track_stats(
        meta_learner, nb_epochs, iterations=4, train_iterations=train_iterations)
    other_data = trainer.stats_collector.get_stats_dict({'error_criterion': error_criterion.__name__})
    # other_data = trainer.stats_collector.get_stats_dict({'error_criterion': error_criterion})
    save_model_info_lstm(data_path_save, train_loss, train_error, val_loss, val_error, test_loss, test_error,
                         nb_epochs, optimizer, batch_size_train, batch_size_val, batch_size_test,
                         scheduler, other_data)
    print('done')
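# A minimal, hypothetical entry point for the meta-learner training main() above, assuming
# this function lives in its own script; the hard-coded /home/xiaot6/... paths inside it
# must exist for the run to succeed.
if __name__ == '__main__':
    main()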