def __init__(self, json_path, data_dir, validate, ckpt_dir, log_dir, restore):
    """Set up the model, optimizer, checkpointing, loss and datasets.

    Args:
        json_path: path to a JSON hyperparameter file, loaded via Params.
        data_dir: root directory of the dataset.
        validate: '1' to also build a validation dataset; anything else skips it.
        ckpt_dir: directory for checkpoints (CheckpointManager keeps at most 3).
        log_dir: base directory for TensorBoard summaries; a timestamped
            'train' subdirectory is appended.
        restore: '1' to restore from the latest checkpoint; anything else
            initializes from scratch.

    Raises:
        ValueError: if params.triplet_strategy is not a recognized strategy.
    """
    self.params = Params(json_path)
    self.valid = 1 if validate == '1' else 0
    self.model = face_model(self.params)

    # Exponentially decaying LR: multiplied by 0.96 every 10k steps (staircase).
    self.lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        self.params.learning_rate,
        decay_steps=10000,
        decay_rate=0.96,
        staircase=True)
    self.optimizer = tf.keras.optimizers.Adam(
        learning_rate=self.lr_schedule, beta_1=0.9, beta_2=0.999, epsilon=0.1)

    # Step/epoch counters live inside the checkpoint so they survive restarts.
    self.checkpoint = tf.train.Checkpoint(
        model=self.model,
        optimizer=self.optimizer,
        train_steps=tf.Variable(0, dtype=tf.int64),
        valid_steps=tf.Variable(0, dtype=tf.int64),
        epoch=tf.Variable(0, dtype=tf.int64))
    self.ckptmanager = tf.train.CheckpointManager(self.checkpoint, ckpt_dir, 3)

    # Select the triplet-loss variant. Fail fast on an unknown strategy
    # instead of leaving self.loss undefined (the original fell through
    # silently and would raise AttributeError much later, at train time).
    if self.params.triplet_strategy == "batch_all":
        self.loss = batch_all_triplet_loss
    elif self.params.triplet_strategy == "batch_hard":
        self.loss = batch_hard_triplet_loss
    elif self.params.triplet_strategy == "batch_adaptive":
        self.loss = adapted_triplet_loss
    else:
        raise ValueError(
            f"Unknown triplet_strategy: {self.params.triplet_strategy!r}")

    # One summary directory per run, keyed by start time.
    current_time = datetime.datetime.now().strftime("%d-%m-%Y_%H%M%S")
    log_dir += current_time + '/train/'
    self.train_summary_writer = tf.summary.create_file_writer(log_dir)

    if restore == '1':
        self.checkpoint.restore(self.ckptmanager.latest_checkpoint)
        print(
            f'\nRestored from Checkpoint : {self.ckptmanager.latest_checkpoint}\n'
        )
    else:
        # Fixed typo in the original message ("Intializing").
        print('\nInitializing from scratch\n')

    self.train_dataset, self.train_samples = get_dataset(
        data_dir, self.params, 'train')
    if self.valid:
        self.valid_dataset, self.valid_samples = get_dataset(
            data_dir, self.params, 'val')
def evaluate(save_path, checkpoint_name="weights.ckpt"):
    """Evaluate a saved checkpoint on the test set and dump metrics to JSON.

    Args:
        save_path: run directory containing config.gin and the checkpoint;
            eval_results_<checkpoint_name>.json is written there.
        checkpoint_name: file name of the checkpoint inside save_path.
    """
    # Load config. (The original wrapped the path in a redundant nested
    # os.path.join; a single join is equivalent.)
    config_path = os.path.join(save_path, "config.gin")
    config = parse_gin_config(config_path)
    gin.parse_config_files_and_bindings([config_path], bindings=[""])

    # Create dynamically dataset generators, seeded as during training.
    train, valid, test, meta_data = get_dataset(
        batch_size=config['train.batch_size'], seed=config['train.seed'])

    # Load model (a bit hacky, but necessary because load_from_checkpoint
    # seems to fail).
    ckpt_path = os.path.join(save_path, checkpoint_name)
    ckpt = torch.load(ckpt_path)
    model = models.__dict__[config['train.model']]()
    summary(model)
    pl_module = SupervisedLearning(model, lr=0.0)  # lr is unused at eval time
    pl_module.load_state_dict(ckpt['state_dict'])
    # NOTE: This fails, probably due to a bug in Pytorch Lightning. The above
    # is manually doing something similar.
    # pl_module = SupervisedLearning.load_from_checkpoint(ckpt_path)

    trainer = pl.Trainer()
    # trainer.test returns a list with one metrics dict per dataloader;
    # we pass a single loader, so unpack exactly one result.
    results, = trainer.test(model=pl_module,
                            test_dataloaders=test,
                            ckpt_path=ckpt_path)
    logger.info(results)

    with open(os.path.join(save_path,
                           "eval_results_{}.json".format(checkpoint_name)),
              "w") as f:
        json.dump(results, f)
def model_search(dataset, backbone, val_split, imgsize, batch_size,
                 output_path, gpu_cnt, debug_mode=False):
    """Run hyperparameter search for a model built on the given backbone.

    :param dataset: dataset path
    :param backbone: one of the 'Load_Base_Model' file, please refer to
        ./src/load_base_model.py file
    :param output_path: model .h5 file output
    :param gpu_cnt: the gpu number to use
    """
    # Prepare output locations for the model file and the tuning log.
    model_file, log_file = make_file(output_path, backbone)

    # Build train/validation generators plus the dataset parameters the
    # tuner needs.
    train_gen, valid_gen, data_params = get_dataset(dataset_path=dataset,
                                                    model_path=model_file,
                                                    batch_size=batch_size,
                                                    imgsize=imgsize,
                                                    val_split=val_split,
                                                    debug=debug_mode)

    tuner = HyperTuner(data_params=data_params,
                       imgsize=imgsize,
                       backbone=backbone,
                       gen_train=train_gen,
                       gen_valid=valid_gen,
                       model_path=model_file,
                       log_path=log_file,
                       gpu_cnt=gpu_cnt)

    # Kick off the hyperparameter optimization.
    tuner.optimize()
def train(save_path, model, lr=0.1, batch_size=128, callbacks=None):
    """Train a model with SGD + cross-entropy and write artifacts to save_path.

    Args:
        save_path: directory where the training loop stores outputs.
        model: key into models.__dict__ naming the architecture to build.
        lr: SGD learning rate.
        batch_size: minibatch size, also used to derive steps_per_epoch.
        callbacks: optional list of callback names resolved via get_callback.
            (Default changed from a shared mutable [] to None — behaviorally
            identical for all callers.)
    """
    if callbacks is None:
        callbacks = []

    # Create dynamically dataset generators
    train, valid, test, meta_data = get_dataset(batch_size=batch_size)

    # Create dynamically model
    model = models.__dict__[model]()
    summary(model)
    loss_function = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    # Create dynamically callbacks; unknown names are silently skipped.
    callbacks_constructed = []
    for name in callbacks:
        clbk = get_callback(name, verbose=0)
        if clbk is not None:
            callbacks_constructed.append(clbk)

    # Pass everything to the training loop. Ceiling division over the
    # training-set size gives the number of batches per epoch.
    steps_per_epoch = (len(meta_data['x_train']) - 1) // batch_size + 1
    # NOTE(review): the original passes the *test* split as `valid=` here,
    # which evaluates on test data during training — confirm this is intended.
    training_loop(model=model,
                  optimizer=optimizer,
                  loss_function=loss_function,
                  metrics=[acc],
                  train=train,
                  valid=test,
                  meta_data=meta_data,
                  steps_per_epoch=steps_per_epoch,
                  save_path=save_path,
                  config=_CONFIG,
                  use_tb=True,
                  custom_callbacks=callbacks_constructed)
def train(save_path,
          model,
          batch_size=128,
          seed=777,
          callbacks=None,
          resume=True,
          evaluate=True):
    """Train a model with PyTorch Lightning and optionally evaluate on test.

    Args:
        save_path: run directory for checkpoints and eval results.
        model: key into models.__dict__ naming the architecture to build.
        batch_size: minibatch size forwarded to the dataset generators.
        seed: data-generation seed.
        callbacks: optional list of callback names resolved via get_callback.
            (Default changed from a shared mutable [] to None — behaviorally
            identical for all callers.)
        resume: if False and a last.ckpt already exists, refuse to overwrite.
        evaluate: if True, run trainer.test afterwards and write
            eval_results.json.

    Raises:
        IOError: when resume is False but save_path already holds last.ckpt.
    """
    if callbacks is None:
        callbacks = []

    # Create dynamically dataset generators
    train, valid, test, meta_data = get_dataset(batch_size=batch_size,
                                                seed=seed)

    # Create dynamically model
    model = models.__dict__[model]()
    summary(model)

    # Create dynamically callbacks; unknown names are silently skipped.
    callbacks_constructed = []
    for name in callbacks:
        clbk = get_callback(name, verbose=0)
        if clbk is not None:
            callbacks_constructed.append(clbk)

    # Guard against accidentally clobbering a previous run.
    if not resume and os.path.exists(os.path.join(save_path, "last.ckpt")):
        raise IOError(
            "Please clear folder before running or pass train.resume=True")

    # Create module and pass to training.
    checkpoint_callback = ModelCheckpoint(
        filepath=os.path.join(save_path, "weights"),
        verbose=True,
        save_last=True,  # For resumability
        monitor='valid_acc',
        mode='max')
    pl_module = supervised_training.SupervisedLearning(model,
                                                       meta_data=meta_data)
    trainer = training_loop(train,
                            valid,
                            pl_module=pl_module,
                            checkpoint_callback=checkpoint_callback,
                            callbacks=callbacks_constructed,
                            save_path=save_path)

    # Evaluate: trainer.test returns a single-element list of metric dicts.
    if evaluate:
        results, = trainer.test(test_dataloaders=test)
        logger.info(results)
        with open(os.path.join(save_path, "eval_results.json"), "w") as f:
            json.dump(results, f)
def combine_weights(weights_init, weights_final, config, step=1):
    """Linearly interpolate between two weight dicts and evaluate each mixture.

    Walks freq from 0 to 1 in increments of `step`, building the convex
    combination (1 - freq) * weights_init + freq * weights_final and measuring
    accuracy on the train/valid/test splits at each point.

    Args:
        weights_init: ordered dict of starting weights.
        weights_final: dict of final weights with the same keys.
        config: dict providing 'train.batch_size' and 'train.seed'.
        step: interpolation step size; num_steps = int(1 / step).

    Returns:
        Dict with lists 'freq', 'train', 'valid', 'test', one entry per
        interpolation point.
    """
    train, valid, test, meta_data = get_dataset(
        batch_size=config['train.batch_size'], seed=config['train.seed'])
    results = {'freq': [], 'train': [], 'valid': [], 'test': []}
    num_steps = int(1 / step)
    # BUG FIX: the original string was missing the f prefix, so the literal
    # text "{step} | {num_steps}" was printed instead of the values.
    print(f"step_size: {step} | num_of_steps: {num_steps}")
    for i in range(num_steps + 1):
        freq = i * step
        weights_temp = collections.OrderedDict()
        for k in weights_init:
            # it should add at the end, but i am not sure of that so i am
            # calling it manually
            weights_temp[k] = (
                1 - freq) * weights_init[k] + freq * weights_final[k]
            weights_temp.move_to_end(k)
        print("freq: {}".format(freq))
        results_step = calculate_acc(weights_temp, meta_data, config, train,
                                     valid, test)
        results['freq'].append(freq)
        for k, v in results_step.items():
            results[k].append(v)
    return results
def main():
    """Evaluate a WideResNet checkpoint on a test set, both standard and adversarial.

    Parses CLI args, builds the model/trainer from config files, loads the
    checkpoint, runs std_evaluate/adv_evaluate over the test loader, and logs
    the results. Side effects: sets CUDA_VISIBLE_DEVICES, writes to `output`
    (a log file or stdout) and to a summary `writer`.
    """
    args = vars(parser.parse_args())
    check_args(args)
    set_seeds(2020)  # fixed seed for reproducible evaluation
    model_cfg = config.ModelConfig(args["model_config"])
    run_cfg = config.RunConfig(args["run_config"],
                               eval=True,
                               sanity_check=args["sanity_check"])
    output, writer, save_prefix = set_output(args, "eval_wrn_log")
    # Restrict visible GPUs before torch queries them; empty string hides all.
    os.environ['CUDA_VISIBLE_DEVICES'] = args["device"] if args[
        "device"] is not None else ""
    device, data_parallel = torch.device("cuda" if torch.cuda.is_available(
    ) else "cpu"), torch.cuda.device_count() > 1
    config.print_configs(args, [model_cfg, run_cfg], device, output)

    ## Loading datasets
    # Print() appears to return a timestamp used for elapsed-time reporting
    # below — confirm against its definition.
    start = Print(" ".join(['start loading datasets:', args["dataset"]]),
                  output)
    dataset_test, dataset_info = get_dataset(args["dataset"],
                                             test=True,
                                             sanity_check=args["sanity_check"])
    iterator_test = torch.utils.data.DataLoader(dataset_test,
                                                run_cfg.batch_size_eval,
                                                shuffle=True,
                                                num_workers=2)
    end = Print(
        " ".join(['loaded', str(len(dataset_test)), 'dataset_test samples']),
        output)
    Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)

    ## initialize a model
    start = Print('start initializing a model', output)
    # Model input/output dimensions come from the dataset metadata.
    model_cfg.set_num_channels_classes(dataset_info["num_channels"],
                                       dataset_info["num_classes"])
    model_cfg.set_dropout_rate(run_cfg.dropout_rate)
    model = WideResNet(model_cfg)
    end = Print('end initializing a model', output)
    Print("".join(['elapsed time:', str(end - start)]), output, newline=True)

    ## setup trainer configurations
    start = Print('start setting trainer configurations', output)
    if not data_parallel:
        model = model.to(device)
    else:
        # Wrap for multi-GPU; model must be on device before wrapping.
        model = nn.DataParallel(model.to(device))
    # reduction="none" keeps per-sample losses (presumably for the adversarial
    # evaluation — verify against Trainer).
    criterion = nn.CrossEntropyLoss(reduction="none")
    run_cfg.set_adv(dataset_info, device)
    trainer = Trainer(model, criterion, run_cfg, std=True, adv=True, test=True)
    trainer.load(args["checkpoint"], save_prefix, device, output)
    end = Print('end setting trainer configurations', output)
    Print("".join(['elapsed time:', str(end - start)]), output, newline=True)

    ## train a model
    start = Print('start evaluating a model', output)
    Print(trainer.get_headline(), output)

    ### test
    for B, batch in enumerate(iterator_test):
        # Move only tensor elements of the batch to the device.
        batch = [t.to(device) if type(t) is torch.Tensor else t for t in batch]
        trainer.std_evaluate(batch)  # clean-input accuracy
        trainer.adv_evaluate(batch)  # adversarial accuracy
        if B % 2 == 0:
            # Progress indicator on stderr, overwritten in place via \r.
            print('# test {:.1%}'.format(B / len(iterator_test)),
                  end='\r',
                  file=sys.stderr)
    # Clear the progress line.
    print(' ' * 150, end='\r', file=sys.stderr)

    ### print log and save models
    trainer.log(output, writer)
    end = Print('end evaluating a model', output)
    Print("".join(['elapsed time:', str(end - start)]), output, newline=True)
    # Only close the log handle when it is a real file, not stdout.
    if not output == sys.stdout:
        output.close()
def train(save_path,
          model,
          datasets=None,
          optimizer="SGD",
          data_seed=777,
          seed=777,
          batch_size=128,
          lr=0.0,
          wd=0.0,
          nesterov=False,
          checkpoint_monitor='val_categorical_accuracy:0',
          loss='ce',
          steps_per_epoch=-1,
          momentum=0.9,
          testing=False,
          testing_reload_best_val=True,
          callbacks=None):
    """Train a Keras model on one or more datasets and optionally evaluate.

    Args:
        save_path: run directory for checkpoints and eval results.
        model: key into models.__dict__ naming the architecture to build.
        datasets: list of dataset names; defaults to ['cifar10'].
            (Default changed from mutable list literals to None — behaviorally
            identical for all callers.)
        optimizer: "SGD" or "Adam".
        data_seed / seed: seeds for data generation and training.
        batch_size, lr, wd, nesterov, momentum: usual optimizer knobs.
        checkpoint_monitor: metric name the checkpointer tracks.
        loss: only 'ce' (categorical cross-entropy) is implemented.
        steps_per_epoch: -1 derives it from the training-set size.
        testing: if True, evaluate on valid and test after training.
        testing_reload_best_val: reload the best-on-validation weights first.
        callbacks: optional list of callback names; unknown names raise.

    Raises:
        NotImplementedError: unknown loss, optimizer, or callback name.
    """
    if datasets is None:
        datasets = ['cifar10']
    if callbacks is None:
        callbacks = []

    np.random.seed(seed)

    # Create dataset generators (seeded). Each entry is (train, valid, test,
    # meta) — the last element holds dataset metadata used below.
    datasets = [
        get_dataset(d, seed=data_seed, batch_size=batch_size) for d in datasets
    ]

    # Create model sized from the first dataset's metadata.
    model = models.__dict__[model](input_shape=datasets[0][-1]['input_shape'],
                                   n_classes=datasets[0][-1]['num_classes'])
    logger.info("# of parameters " +
                str(sum([np.prod(p.shape) for p in model.trainable_weights])))
    model.summary()

    if loss == 'ce':
        loss_function = tf.keras.losses.categorical_crossentropy
    else:
        raise NotImplementedError()

    if optimizer == "SGD":
        optimizer = SGD(learning_rate=lr, momentum=momentum, nesterov=nesterov)
    elif optimizer == "Adam":
        optimizer = Adam(learning_rate=lr)
    else:
        raise NotImplementedError()

    # Create callbacks; unlike the sibling train(), unknown names raise here.
    callbacks_constructed = []
    for name in callbacks:
        clbk = get_callback(name, verbose=0)
        if clbk is not None:
            callbacks_constructed.append(clbk)
        else:
            raise NotImplementedError(f"Did not find callback {name}")

    # Pass everything to the training loop.
    metrics = [categorical_accuracy]
    if steps_per_epoch == -1:
        # Ceiling division: batches needed to cover the training set once.
        steps_per_epoch = (datasets[0][-1]['n_examples_train'] + batch_size -
                           1) // batch_size
    training_loop(model=model,
                  optimizer=optimizer,
                  loss_function=loss_function,
                  metrics=metrics,
                  datasets=datasets,
                  weight_decay=wd,
                  save_path=save_path,
                  config=_CONFIG,
                  steps_per_epoch=steps_per_epoch,
                  use_tb=True,
                  checkpoint_monitor=checkpoint_monitor,
                  custom_callbacks=callbacks_constructed,
                  seed=seed)

    if testing:
        if testing_reload_best_val:
            model = restore_model(model,
                                  os.path.join(save_path, "model_best_val.h5"))
        m_val = evaluate(model, [datasets[0][1]], loss_function, metrics)
        m_test = evaluate(model, [datasets[0][2]], loss_function, metrics)
        logger.info("Saving")
        eval_results = {}
        for k in m_test:
            eval_results['test_' + k] = float(m_test[k])
        for k in m_val:
            eval_results['val_' + k] = float(m_val[k])
        logger.info(eval_results)
        # Use a context manager so the file handle is closed (the original
        # passed a bare open() to json.dump and leaked the handle).
        with open(os.path.join(save_path, "eval_results.json"), "w") as f:
            json.dump(eval_results, f)
def main(_):
    """Builds and trains a sentiment classification RNN."""

    # prevent tf from accessing GPU — TF is only used for the data pipeline;
    # JAX owns the accelerator.
    tf.config.experimental.set_visible_devices([], "GPU")

    # Get and save config
    config = argparser.parse_args('main')
    logging.info(json.dumps(config, indent=2))

    # Everything runs inside one MLflow run so config, params, metrics and
    # checkpoints are grouped together.
    with uv.start_run(
            experiment_name=config['save']['mlflow_expname'],
            run_name=config['save']['mlflow_runname']), uv.active_reporter(
                MLFlowReporter()):

        reporters.save_config(config)
        uv.report_params(reporters.flatten(config))
        prng_key = random.PRNGKey(config['run']['seed'])

        # Load data.
        vocab_size, train_dset, test_dset = data.get_dataset(config['data'])

        # Build network.
        cell = model_utils.get_cell(config['model']['cell_type'],
                                    num_units=config['model']['num_units'])
        init_fun, apply_fun, _, _ = network.build_rnn(
            vocab_size, config['model']['emb_size'], cell,
            config['model']['num_outputs'])
        loss_fun, acc_fun = optim_utils.loss_and_accuracy(
            apply_fun, config['model'], config['optim'])

        # Initialize parameters from the input shape (batch, max_pad).
        _, initial_params = init_fun(
            prng_key, (config['data']['batch_size'], config['data']['max_pad']))
        initial_params = model_utils.initialize(initial_params, config['model'])

        # get optimizer
        opt, get_params, opt_state, step_fun = optim_utils.optimization_suite(
            initial_params, loss_fun, config['optim'])

        ## Scope setup
        # Reporter setup
        data_store = {}
        reporter = reporters.build_reporters(config['save'], data_store)

        # Static state for scope: everything the measurement callbacks need
        # that does not change between steps.
        static_state = {
            'acc_fun': acc_fun,
            'loss_fun': loss_fun,
            'param_extractor': get_params,
            'test_set': test_dset
        }
        oscilloscope = m.MetricCallback(static_state)

        def interval_trigger(interval):
            # Returns a predicate that fires every `interval` steps.
            def function_to_return(x):
                return x % interval == 0

            return function_to_return

        # Test-set metrics are measured on the (coarser) measure_test cadence;
        # batch metrics on the measure_train cadence.
        oscilloscope.add_measurement({
            'name': 'test_acc',
            'trigger': interval_trigger(config['save']['measure_test']),
            'function': measurements.measure_test_acc
        })
        oscilloscope.add_measurement({
            'name': 'shuffled_test_acc',
            'trigger': interval_trigger(config['save']['measure_test']),
            'function': measurements.measure_shuffled_acc
        })
        oscilloscope.add_measurement({
            'name': 'train_acc',
            'trigger': interval_trigger(config['save']['measure_train']),
            'function': measurements.measure_batch_acc
        })
        oscilloscope.add_measurement({
            'name': 'train_loss',
            'trigger': interval_trigger(config['save']['measure_train']),
            'function': measurements.measure_batch_loss
        })
        oscilloscope.add_measurement({
            'name': 'l2_norm',
            'trigger': interval_trigger(config['save']['measure_test']),
            'function': measurements.measure_l2_norm
        })

        # Train
        global_step = 0
        loss = np.nan  # no loss available before the first step
        for epoch in range(config['optim']['num_epochs']):
            for batch_num, batch in enumerate(tfds.as_numpy(train_dset)):
                # Dynamic state is rebuilt each step so measurements see the
                # current optimizer state, batch and last loss.
                dynamic_state = {
                    'opt_state': opt_state,
                    'batch_train_loss': loss,
                    'batch': batch
                }
                step_measurements = oscilloscope.measure(
                    int(global_step), dynamic_state)
                if step_measurements is not None:
                    reporter.report_all(int(global_step), step_measurements)
                global_step, opt_state, loss = step_fun(
                    global_step, opt_state, batch)
                if global_step % config['save']['checkpoint_interval'] == 0:
                    params = get_params(opt_state)
                    # dtype=object keeps the (ragged) pytree structure intact.
                    np_params = np.asarray(params, dtype=object)
                    reporters.save_dict(config, np_params,
                                        f'checkpoint_{global_step}')

        # Final test-set measurements on the last batch's dynamic_state.
        final_measurements = oscilloscope.measure(
            int(global_step),
            dynamic_state,
            measurement_list=['test_acc', 'shuffled_test_acc'])
        reporter.report_all(int(global_step), final_measurements)

        final_params = {
            'params': np.asarray(get_params(opt_state), dtype=object)
        }
        reporters.save_dict(config, final_params, 'final_params')
def main(_):
    """Load a trained RNN run from disk and (partially) evaluate test accuracy.

    NOTE(review): the accuracy-computing code inside the loop is commented
    out below, so test_acc is never updated and the final print reports the
    average of an empty meter — this looks like work-in-progress debugging
    code left behind.
    """
    # Locate the single run directory matching these flag values.
    BASE_FOLDER = f'results/yelp/jointsweep/{FLAGS.epochs}Epochs/{FLAGS.arch}_eta_{FLAGS.eta}_L2_{FLAGS.l2}_*'
    data_folder = glob.glob(BASE_FOLDER)
    assert len(data_folder) == 1
    data_folder = data_folder[0]

    with open(os.path.join(data_folder, 'config.json')) as f:
        config = json.load(f)

    # Report the last recorded test accuracy from training logs.
    with open(os.path.join(data_folder, 'test_acc.jsonl')) as f:
        x = json_lines.reader(f)
        print("Non shuffled acc (recorded):")
        print(list(x)[-1]['value'])

    vocab_size, train_dset, test_dset = data.get_dataset(config['data'])
    cell = model_utils.get_cell(config['model']['cell_type'],
                                num_units=config['model']['num_units'])
    init_fun, apply_fun, emb_apply, readout_apply = network.build_rnn(
        vocab_size,
        config['model']['emb_size'],
        cell,
        num_outputs=config['model']['num_outputs'])
    # NOTE(review): this rebinds emb_apply, shadowing the one returned by
    # build_rnn above — confirm which embedding apply function is intended.
    emb_init, emb_apply = renn.embedding(vocab_size,
                                         config['model']['emb_size'])
    network_params = model_utils.load_params(
        os.path.join(data_folder, 'final_params'))
    emb_params, rnn_params, readout_params = network_params
    print("Loaded model and dataset")

    test_acc = measurements.AverageMeter()
    for i, batch in enumerate(tfds.as_numpy(test_dset)):
        if FLAGS.shuffle:
            # Optionally shuffle word order to measure order sensitivity.
            batch = au.shuffle_words(batch)
        batch_final_states = au.rnn_end_states(cell, batch, rnn_params,
                                               emb_params, emb_apply)
        print(i)
        """
        logits = readout_apply(readout_params, np.vstack(batch_final_states))
        predictions = np.argmax(logits, axis=1)
        curr_acc = np.mean(predictions == batch['labels'])
        test_acc.update(curr_acc, len(batch['index']))
        print(i, len(batch['index']))
        del batch_final_states
        del logits
        del predictions
        del batch
        """
        #if i > 85:
        #    break

    if FLAGS.shuffle:
        print("Shuffled accuracy")
    else:
        print("Non-shuffled accuracy")
    # NOTE(review): test_acc was never updated (see disabled block above).
    print(test_acc.avg)