def eval_joint_model(model, model_params):
    """Evaluate a jointly trained model on each of the 20 tasks.

    Args:
        model: the model to evaluate.
        model_params: trained parameter values loaded into the graph
            via ``set_op`` before evaluation.

    Prints per-task accuracy; returns nothing.
    """
    # get test data for evaluation ('10k' variant, fold 0)
    data, metadata = gather('10k', 0)
    # field order expected by DataFeed
    dformat = ['contexts', 'questions', 'answers']
    accuracies = []
    with tf.Session() as sess:
        # reload trained model params into the graph
        sess.run(set_op(model_params))
        trainer = Trainer(sess, model, batch_size=128)
        # tasks are numbered 1..20 (presumably bAbI — the test data is
        # indexed by task id)
        for i in range(1, 21):
            testfeed = DataFeed(dformat, data=data['test'][i])
            loss, acc = trainer.evaluate(feed=testfeed)
            accuracies.append(acc)
    print('\n:: Evaluation on individual tasks\n:: Accuracy')
    # FIX: start enumeration at 1 so the printed labels match the task ids
    # 1..20 that were evaluated (previously printed [0]..[19]).
    for i, acc in enumerate(accuracies, 1):
        print(':: \t[{}] {}'.format(i, acc))
def main():
    """End-to-end training entry point: load the experiment config, build
    the network, load the data, and hand everything to the Trainer."""
    logger.info("Load Config")
    data_and_support = CortexEpfl()
    cfg = config.load_config(data_and_support.name)
    logger.info("Initialize Experiment")
    trial_path, trial_id, log_msg = init(cfg)
    # GPU_index is presumably a module-level global — TODO confirm.
    logger.info('Experiment ID: {}, Trial ID: {}, GPU: {}'.format(cfg.experiment_idx, trial_id, GPU_index))
    logger.info("Network config")
    model_config = NetworkConfig(cfg.step_count, cfg.first_layer_channels, cfg.num_classes, cfg.num_input_channel, True, cfg.ndims, 'same', trial_id, cfg.batch_size, cfg)
    logger.info("Create network")
    classifier = network(model_config)
    classifier.cuda()  # training assumes a CUDA device is available
    logger.info("Load data")
    # in_out_shape maps the hinted patch shape to a shape compatible with
    # the network's input/output geometry.
    cfg.patch_shape = model_config.in_out_shape(cfg.hint_patch_shape)
    data = data_and_support.load_data(cfg)
    loader = DataLoader(data[DataModes.TRAINING], batch_size=classifier.config.batch_size, shuffle=True)
    logger.info("Trainset length: {}".format(loader.__len__()))
    logger.info("Initialize optimizer")
    # only parameters with requires_grad are optimized
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, classifier.parameters()), lr=cfg.learning_rate)
    logger.info("Initialize evaluator")
    evaluator = Evaluator(classifier, optimizer, data, trial_path, cfg, data_and_support, cfg.train_mode)
    logger.info("Initialize trainer")
    trainer = Trainer(classifier, loader, optimizer, cfg.numb_of_epochs, cfg.eval_every, trial_path, evaluator, log_msg)
    trainer.train()
def main():
    """Entry point: load the parallel corpus, build the model, and train."""
    cuda_available = torch.cuda.is_available()
    args = parse()

    loader = DataLoader()
    train_iter, valid_iter = loader.load_translation(
        data_path=args.data_path,
        exts=('.' + args.src, '.' + args.tgt),  # e.g. ('.zh', '.en')
        batch_size=args.batch_size,
        dl_save_path=args.dl_path)

    # Expose vocabulary sizes and padding indices to downstream components.
    args.n_src_words = len(loader.SRC.vocab)
    args.n_tgt_words = len(loader.TGT.vocab)
    args.src_pdx = loader.src_padding_index
    args.tgt_pdx = loader.tgt_padding_index
    print(args)

    model = build_model(args, cuda_ok=cuda_available)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, betas=(0.9, 0.98), eps=1e-9)
    criterion = nn.CrossEntropyLoss(ignore_index=args.tgt_pdx, reduction='mean')
    trainer = Trainer(args,
                      model=model,
                      optimizer=optimizer,
                      criterion=criterion,
                      cuda_ok=cuda_available)
    trainer.train(train_iter, valid_iter, n_epochs=args.n_epochs, save_path=args.ckpt_path)
def main(model_name):
    """Load the YAML config for *model_name*, build the train set wrapper,
    and run the downstream Trainer.

    Args:
        model_name: name of the model; selects ./config/<model_name>.yaml.
    """
    # FIX: the original did yaml.load(open(...)) and leaked the file
    # handle; a context manager closes it deterministically.
    with open("./config/" + str(model_name) + ".yaml", "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    trainset = MyTrainSetWrapper(**config['train'])
    # Initialize the Trainer and run training.
    downstream = Trainer(trainset, model_name, config)
    downstream.train()
def main():
    """Train MobileNetV2 end-to-end: build model/criterion/optimizer, run
    the epoch loop with checkpointing and periodic validation, then test."""
    manager = Manager.init()
    # models is a list of [name, module] pairs, as expected by Manager.
    models = [["model", MobileNetV2(**manager.args.model)]]
    manager.init_model(models)
    args = manager.args
    criterion = Criterion()
    optimizer, scheduler = Optimizer(models, args.optim).init()
    # only use CUDA when both requested and actually available
    args.cuda = args.cuda and torch.cuda.is_available()
    if args.cuda:
        for item in models:
            item[1].cuda()
        criterion.cuda()
    dataloader = DataLoader(args.dataloader, args.cuda)
    summary = manager.init_summary()
    trainer = Trainer(models, criterion, optimizer, scheduler, dataloader, summary, args.cuda)
    for epoch in range(args.runtime.start_epoch, args.runtime.num_epochs + args.runtime.start_epoch):
        try:
            print("epoch {}...".format(epoch))
            trainer.train(epoch)
            # checkpoint after every epoch
            manager.save_checkpoint(models, epoch)
            # validate every test_every epochs
            if (epoch + 1) % args.runtime.test_every == 0:
                trainer.validate()
        except KeyboardInterrupt:
            # Ctrl+C stops training early but still falls through to test().
            print("Training had been Interrupted\n")
            break
    trainer.test()
def _prepare_conll_data(args, required_file):
    """Load the CoNLL config for args.dataset and, when `required_file` is
    missing from the data path, build the vocab and regenerate the
    train/test/dev split. Returns the processor class for the dataset."""
    data_vocab_class, processor_class, conll_config_path = dataset_name_to_class[args.dataset]
    conll_configs = config_loader(conll_config_path)
    if not os.path.exists(os.path.join(conll_configs['data_path'], required_file)):
        # NOTE(review): constructing the vocab class presumably has side
        # effects (preparing vocabulary files) — the unused binding is kept
        # from the original. TODO confirm.
        data_vocab = data_vocab_class(conll_configs)
        conll_to_train_test_dev(conll_configs['label_file'], conll_configs['data_path'])
    return processor_class


def _prepare_run_configs(args):
    """Load the run config and point the data/model/output directories at
    the dataset-specific sub-directory, creating them when absent."""
    configs = config_loader(args.config_path)
    for key in ('data_dir', 'finetune_model_dir', 'output_dir'):
        configs[key] = os.path.join(configs[key], args.dataset.lower())
        check_dir(configs[key])
    return configs


def main():
    """Dispatch to training or evaluation based on --task.

    The train and eval branches previously duplicated the whole
    config/data-preparation sequence; it now lives in two helpers.
    """
    args = args_parser()
    if args.task == 'train':
        processor_class = _prepare_conll_data(args, 'train.txt')
        configs = _prepare_run_configs(args)
        processor = processor_class()
        for model_class in configs['model_class']:
            print('Begin Training %s Model on corpus %s' % (model_class, args.dataset))
            trainer = Trainer(configs, model_class, processor)
            trainer.train()
    elif args.task == 'eval':  # tasks are mutually exclusive, so elif
        processor_class = _prepare_conll_data(args, 'test.txt')
        configs = _prepare_run_configs(args)
        processor = processor_class()
        for model_class in configs['model_class']:
            print('Begin Evaluate %s Model on corpus %s' % (model_class, args.dataset))
            predicter = Predictor(configs, model_class, processor)
            predicter.eval()
def _train(config): VALIDATION_SIZE = 5000 # Size of the validation set. BATCH_SIZE = 64 train_dataset = Dataset("mnist", 'train') test_dataset = Dataset("mnist", 'test') validation_dataset = train_dataset.split_validation(VALIDATION_SIZE) model = TutorialCNN(config) trainer = Trainer(config, model) trainer.set_trainer(optimizer='Momentum', training_size=train_dataset.size) # Create a local session to run the training. start_time = time.time() with tf.Session() as sess: # Run all the initializers to prepare the trainable parameters. tf.global_variables_initializer().run() print('Initialized!') for batch in train_dataset.get_batches(BATCH_SIZE, num_epoches=config.num_epochs): step = sess.run(trainer.global_step) feed_dict = model.get_feed_dict(batch, is_train=True, supervised=True) sess.run(trainer.train_op, feed_dict=feed_dict) if step % config.eval_period == 0: l, lr, acc = sess.run( [model.loss, trainer.learning_rate, model.accuracy], feed_dict=feed_dict) elapsed_time = time.time() - start_time start_time = time.time() print('Step %d, %.1f ms' % (step, 1000 * elapsed_time / config.eval_period)) print('Minibatch loss: %.3f, learning rate: %.6f' % (l, lr)) print('Minibatch error: %.1f%%' % (100. - 100. * acc, )) print('Validation error: %.1f%%' % eval_error_in_dataset( validation_dataset, model, sess, config)) sys.stdout.flush() test_error = eval_error_in_dataset(test_dataset, model, sess, config) print('Test error: %.1f%%' % test_error)
def run(model_cls, loss, predictor, acc_calc, train_dataset, valid_dataset, sps):
    """Configure logging and a Trainer for an age-estimation run and train.

    Args:
        model_cls: model class handed to the Trainer.
        loss: loss object.
        predictor: prediction function/object.
        acc_calc: accuracy calculator passed to the Trainer.
        train_dataset / valid_dataset: datasets for train/validation.
        sps: dict of hyper-parameters (epoch_num, momentum, ...).
    """
    # log setting
    alias = generate_alias(model_cls, task='Age')
    msg = generate_file_msg(sps, loss, predictor, acc_calc)
    tb_log_path = os.path.join('runs', alias)
    save_dir = os.path.join('models', alias)
    logger_alias = alias
    # NOTE(review): parameters_func and gpu_id are presumably module-level
    # globals — confirm they are defined in this module.
    config = Config(epoch_num=sps['epoch_num'], momentum=sps['momentum'],
                    weight_decay=sps['weight_decay'],
                    learning_rates=sps['learning_rates'],
                    decay_points=sps['decay_points'],
                    batch_size=sps['batch_size'],
                    parameters_func=parameters_func,
                    tb_log_path=tb_log_path, save_dir=save_dir,
                    pretrain=sps['pretrain'],
                    pretrained_model_dir=sps['pretrained_model_dir'],
                    load_function=sps['load_function'],
                    logger_alias=logger_alias, gpu_id=gpu_id)
    logger = Logger()
    logger.open_file(os.path.join('log'), alias=alias, file_name=alias + '.txt', file_msg=msg)
    # FIX: the original passed calculator=calculator, an undefined name
    # (NameError at call time); the acc_calc parameter was never used and is
    # clearly the intended value.
    trainer = Trainer(model_cls=model_cls, loss=loss, predictor=predictor,
                      calculator=acc_calc, train_dataset=train_dataset,
                      val_dataset=valid_dataset, config=config, logger=logger)
    trainer.train()
    logger.close_file(alias)
def train(data, config):
    """Build the classifier selected by config.classifier plus a matching
    Trainer, then run training.

    Args:
        data: dict with at least 'id2label' and 'label_size' (plus the
            corpus consumed by Trainer.train).
        config: experiment configuration. NOTE(review): it is indexed both
            as a mapping (config['num_labels']) and by attribute
            (config.optim), so it is presumably an attribute-dict — confirm.
    """
    id2label = data['id2label']
    label_size = data['label_size']
    config['num_labels'] = label_size
    model = Classifier(config)(num_labels=label_size)
    optimizer = Optim(config.optim, config.learning_rate, config.max_grad_norm,
                      lr_decay=config.learning_rate_decay,
                      start_decay_at=config.start_decay_at)
    optimizer.set_parameters(model.parameters())
    # Membership tests replace the original chain of '==' comparisons.
    if config.classifier in ('BertCNN', 'BertRCNN', 'BertDPCNN', 'BertFC'):
        trainer = Trainer(config, model=model, logger=logger,
                          criterion=BCEWithLogLoss(),
                          optimizer=optimizer, early_stopping=None,
                          epoch_metrics=[AUC(average='micro', task_type='binary'),
                                         MultiLabelReport(id2label=id2label),
                                         F1Score(average='micro')])
    elif config.classifier in ('BertSGM', 'SGM'):
        criterion = nn.CrossEntropyLoss(ignore_index=dict_helper.PAD, reduction='none')
        if config.n_gpu != '':
            criterion.cuda()
        trainer = Trainer(config, model=model, logger=logger,
                          criterion=criterion,
                          optimizer=optimizer, early_stopping=None,
                          epoch_metrics=[AUC(average='micro', task_type='binary'),
                                         F1Score(average='micro')])
    elif config.classifier == 'BertSeq2Set':
        trainer = Trainer(config, model=model, logger=logger,
                          criterion=None,
                          optimizer=optimizer, early_stopping=None,
                          epoch_metrics=[AUC(average='micro', task_type='binary'),
                                         F1Score(average='micro')])
    else:
        # FIX: an unknown classifier previously fell through and crashed
        # later with UnboundLocalError on `trainer`; fail fast instead.
        raise ValueError('Unsupported classifier: {}'.format(config.classifier))
    trainer.train(data=data, seed=config.seed)
def run(self):
    """Build the global UNREAL network plus per-worker trainers, restore
    checkpoints (and optional pretrained SegNet/ErfNet weights), then launch
    the parallel training threads.

    Blocks in signal.pause() until interrupted; SIGINT is routed to
    self.signal_handler.
    """
    device = "/cpu:0"
    if USE_GPU:
        device = "/gpu:0"
    self.print_flags_info()

    # segnet == -1: full SegNet configuration comes from a JSON file,
    # optionally with pretrained VGG weights; modes 0..3 need no extras.
    if flags.segnet == -1:
        with open(flags.segnet_config) as f:
            self.config = json.load(f)
        self.num_classes = self.config["NUM_CLASSES"]
        self.use_vgg = self.config["USE_VGG"]
        if self.use_vgg is False:
            self.vgg_param_dict = None
            print("No VGG path in config, so learning from scratch")
        else:
            self.vgg16_npy_path = self.config["VGG_FILE"]
            self.vgg_param_dict = np.load(self.vgg16_npy_path, encoding='latin1').item()
            print("VGG parameter loaded")
        self.bayes = self.config["BAYES"]
        segnet_param_dict = {'segnet_mode': flags.segnet,
                             'vgg_param_dict': self.vgg_param_dict,
                             'use_vgg': self.use_vgg,
                             'num_classes': self.num_classes,
                             'bayes': self.bayes}
    else:  # 0, 1, 2, 3
        segnet_param_dict = {'segnet_mode': flags.segnet}

    # environment geometry; defaults apply outside the 'indoor' simulator
    if flags.env_type != 'indoor':
        env_config = {}
    else:
        env_config = sim_config.get(flags.env_name)
    self.image_shape = [env_config.get('height', 84), env_config.get('width', 84)]
    self.map_file = env_config.get('objecttypes_file', '../../objectTypes_1x.csv')

    initial_learning_rate = log_uniform(flags.initial_alpha_low,
                                        flags.initial_alpha_high,
                                        flags.initial_alpha_log_rate)
    self.global_t = 0
    self.stop_requested = False
    self.terminate_requested = False
    action_size = Environment.get_action_size(flags.env_type, flags.env_name)
    objective_size = Environment.get_objective_size(flags.env_type, flags.env_name)
    is_training = tf.placeholder(tf.bool, name="training")
    self.random_state = np.random.RandomState(seed=env_config.get("seed", 0xA3C))

    print("Global network initializing!")
    self.global_network = UnrealModel(action_size, objective_size, -1,
                                      flags.use_lstm,
                                      flags.use_pixel_change,
                                      flags.use_value_replay,
                                      flags.use_reward_prediction,
                                      flags.pixel_change_lambda,
                                      flags.entropy_beta,
                                      device,
                                      segnet_param_dict=segnet_param_dict,
                                      image_shape=self.image_shape,
                                      is_training=is_training,
                                      n_classes=flags.n_classes,
                                      segnet_lambda=flags.segnet_lambda,
                                      dropout=flags.dropout)

    # one Trainer (worker) per parallel thread, all sharing the global net
    self.trainers = []
    learning_rate_input = tf.placeholder("float")
    grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                                  decay=flags.rmsp_alpha,
                                  momentum=0.0,
                                  epsilon=flags.rmsp_epsilon,
                                  clip_norm=flags.grad_norm_clip,
                                  device=device)
    for i in range(flags.parallel_size):
        trainer = Trainer(i, self.global_network, initial_learning_rate,
                          learning_rate_input, grad_applier,
                          flags.env_type, flags.env_name, flags.use_lstm,
                          flags.use_pixel_change, flags.use_value_replay,
                          flags.use_reward_prediction,
                          flags.pixel_change_lambda, flags.entropy_beta,
                          flags.local_t_max, flags.n_step_TD, flags.gamma,
                          flags.gamma_pc, flags.experience_history_size,
                          flags.max_time_step, device,
                          segnet_param_dict=segnet_param_dict,
                          image_shape=self.image_shape,
                          is_training=is_training,
                          n_classes=flags.n_classes,
                          random_state=self.random_state,
                          termination_time=flags.termination_time_sec,
                          segnet_lambda=flags.segnet_lambda,
                          dropout=flags.dropout)
        self.trainers.append(trainer)
    self.last_scores = []
    self.best_score = -1.0

    # prepare session
    config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    config.gpu_options.allow_growth = True
    self.sess = tf.Session(config=config)

    # Wrap sess.run for debugging messages. NOTE: `__run` undergoes
    # class-private name mangling, but both the write and the read happen
    # inside this class, so the round-trip is consistent.
    def run_(*args, **kwargs):
        return self.sess.__run(*args, **kwargs)
    self.sess.__run, self.sess.run = self.sess.run, run_

    self.sess.run(tf.global_variables_initializer())

    # summary placeholders for tensorboard
    self.score_input = tf.placeholder(tf.float32)
    self.sr_input = tf.placeholder(tf.float32)
    self.mIoU_input = tf.placeholder(tf.float32)
    self.term_global_t = tf.placeholder(tf.int32)
    self.losses_input = {}
    self.total_loss = tf.placeholder(tf.float32, name='total_loss')
    self.losses_input.update({'all/total_loss': self.total_loss})
    self.base_loss = tf.placeholder(tf.float32, name='base_loss')
    self.losses_input.update({'all/base_loss': self.base_loss})
    self.policy_loss = tf.placeholder(tf.float32, name='policy_loss')
    self.losses_input.update({'all/policy_loss': self.policy_loss})
    # FIX: this placeholder was named 'policy_loss' (copy-paste error),
    # which produced misleading graph names; it is the value-loss input.
    self.value_loss = tf.placeholder(tf.float32, name='value_loss')
    self.losses_input.update({'all/value_loss': self.value_loss})
    self.grad_norm = tf.placeholder(tf.float32, name='grad_norm')
    self.losses_input.update({'all/loss/grad_norm': self.grad_norm})
    self.entropy_input = tf.placeholder(tf.float32, shape=[None], name='entropy')
    if segnet_param_dict["segnet_mode"] >= 2:
        self.decoder_loss = tf.placeholder(tf.float32, name='decoder_loss')
        self.losses_input.update({'all/decoder_loss': self.decoder_loss})
        self.l2_weights_loss = tf.placeholder(tf.float32, name='regul_weights_loss')
        self.losses_input.update({'all/l2_weights_loss': self.l2_weights_loss})
    if flags.use_pixel_change:
        self.pc_loss = tf.placeholder(tf.float32, name='pc_loss')
        self.losses_input.update({'all/pc_loss': self.pc_loss})
    if flags.use_value_replay:
        self.vr_loss = tf.placeholder(tf.float32, name='vr_loss')
        self.losses_input.update({'all/vr_loss': self.vr_loss})
    if flags.use_reward_prediction:
        self.rp_loss = tf.placeholder(tf.float32, name='rp_loss')
        self.losses_input.update({'all/rp_loss': self.rp_loss})

    score_summary = tf.summary.scalar("all/eval/score", self.score_input)
    sr_summary = tf.summary.scalar("all/eval/success_rate", self.sr_input)
    term_summary = tf.summary.scalar("all/eval/term_global_t", self.term_global_t)
    eval_summary = tf.summary.scalar("all/eval/mIoU_all", self.mIoU_input)
    losses_summary_list = []
    for key, val in self.losses_input.items():
        losses_summary_list += [tf.summary.scalar(key, val)]
    self.summary_op_dict = {'score_input': score_summary,
                            'eval_input': eval_summary,
                            'sr_input': sr_summary,
                            'losses_input': tf.summary.merge(losses_summary_list),
                            'entropy': tf.summary.scalar('all/eval/entropy_stepTD',
                                                         tf.reduce_mean(self.entropy_input)),
                            'term_global_t': term_summary}

    flags.checkpoint_dir = os.path.join(base_dir, flags.checkpoint_dir)
    print("Checkpoint dir: {}, Log dir: {}".format(flags.checkpoint_dir, flags.log_dir))
    # one FileWriter per worker, each paired with a shared 'overall' writer
    overall_FW = tf.summary.FileWriter(os.path.join(flags.log_dir, 'overall'), self.sess.graph)
    self.summary_writer = [(tf.summary.FileWriter(os.path.join(flags.log_dir, 'worker_{}'.format(i)),
                                                  self.sess.graph), overall_FW)
                           for i in range(flags.parallel_size)]

    # init or load checkpoint with saver
    self.saver = tf.train.Saver(self.global_network.get_global_vars(), max_to_keep=20)
    checkpoint = tf.train.get_checkpoint_state(flags.checkpoint_dir)
    if checkpoint and checkpoint.model_checkpoint_path:
        if flags.segnet == -1:
            # Old checkpoints use different variable names: map each current
            # global variable to the checkpoint tensor whose name contains
            # its final path component.
            from tensorflow.python import pywrap_tensorflow
            reader = pywrap_tensorflow.NewCheckpointReader(checkpoint.model_checkpoint_path)
            big_var_to_shape_map = reader.get_variable_to_shape_map()
            s = []
            for key in big_var_to_shape_map:
                s += [key]
            glob_var_names = [v.name for v in tf.global_variables()]
            endings = [r.split('/')[-1][:-2] for r in glob_var_names]
            old_ckpt_to_new_ckpt = {[k for k in s if endings[i] in k][0]: v
                                    for i, v in enumerate(tf.global_variables())}
            saver1 = tf.train.Saver(var_list=old_ckpt_to_new_ckpt)
            saver1.restore(self.sess, checkpoint.model_checkpoint_path)
        else:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
        tokens = checkpoint.model_checkpoint_path.split("-")
        # set global step
        if 'best' in checkpoint.model_checkpoint_path:
            # Recover the step (and best score) from the snapshot file names.
            files = os.listdir(flags.checkpoint_dir)
            max_g_step = 0
            max_best_score = -10
            for file in files:
                # NOTE(review): requires BOTH substrings; confirm snapshot
                # files always contain 'checkpoint' in their names.
                if '.meta' not in file or 'checkpoint' not in file:
                    continue
                if len(tokens) == 2:
                    continue
                if len(tokens) > 3:
                    best_score = float('-0.' + file.split('-')[2]) if 'best' in file else float('-0.' + file.split('-')[1])
                    # NOTE(review): max_best_score is never updated inside
                    # this loop, so the comparison is against the initial -10.
                    if best_score > max_best_score:
                        # FIX: was int(file.split('-')[3]).split('.')[0] —
                        # calling .split on an int raises AttributeError;
                        # strip the extension BEFORE converting, matching the
                        # else arm of this conditional expression.
                        g_step = int(file.split('-')[3].split('.')[0]) if 'best' in file else int(file.split('-')[2].split('.')[0])
                        if max_g_step < g_step:
                            max_g_step = g_step
                else:
                    self.best_score = -1.0
                    g_step = int(file.split('-')[2]) if 'best' in file else int(file.split('-')[1])
                    if max_g_step < g_step:
                        max_g_step = g_step
            self.best_score = max_best_score
            self.global_t = max_g_step
            print("Chosen g_step >>", g_step)
        else:
            if len(tokens) == 3:
                self.global_t = int(tokens[2])
            else:
                self.global_t = int(tokens[1])
        print(">>> global step set: ", self.global_t)
        # restore wall-clock time so elapsed-time accounting continues
        wall_t_fname = flags.checkpoint_dir + '/' + 'wall_t.' + str(self.global_t)
        with open(wall_t_fname, 'r') as f:
            self.wall_t = float(f.read())
        # next multiple of save_interval_step after the restored step
        self.next_save_steps = (self.global_t + flags.save_interval_step) // flags.save_interval_step * flags.save_interval_step
        print_tensors_in_checkpoint_file(file_name=checkpoint.model_checkpoint_path,
                                         tensor_name='',
                                         all_tensors=False,
                                         all_tensor_names=True)
    else:
        print("Could not find old checkpoint")
        # set wall time
        self.wall_t = 0.0
        self.next_save_steps = flags.save_interval_step
    print("Global step {}, max best score {}".format(self.global_t, self.best_score))

    if flags.segnet_pretrain:
        # Restore encoder/decoder weights from a pretrained ErfNet snapshot.
        checkpoint_dir = "../erfnet_segmentation/models"
        checkpoint_dir = os.path.join(checkpoint_dir,
                                      "aug_erfnetC_0_{}x{}_{}x/snapshots_best".format(
                                          self.image_shape[1], self.image_shape[0],
                                          self.map_file.split('_')[1].split('x')[0]))
        checkpoint = tf.train.get_checkpoint_state(checkpoint_dir)
        big_weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='net_-1/base_encoder')
        big_weights += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='net_-1/base_decoder')
        # strip the 'net_-1/base_*' prefix to recover the ErfNet names
        erfnet_weights = [l.name.split(':')[0].rsplit('net_-1/base_encoder/')[-1]
                          for l in big_weights
                          if len(l.name.split(':')[0].rsplit('net_-1/base_encoder/')) == 2]
        erfnet_weights += [l.name.split(':')[0].rsplit('net_-1/base_decoder/')[-1]
                           for l in big_weights
                           if len(l.name.split(':')[0].rsplit('net_-1/base_decoder/')) == 2]
        if checkpoint and checkpoint.model_checkpoint_path:
            saver2 = tf.train.Saver(var_list=dict(zip(erfnet_weights, big_weights)))
            saver2.restore(self.sess, checkpoint.model_checkpoint_path)
            print("ErfNet pretrained weights restored from file ", checkpoint_dir)
        else:
            print("Can't load pretrained weights for ErfNet from file ", checkpoint_dir)

    # run training threads
    self.train_threads = []
    for i in range(flags.parallel_size):
        self.train_threads.append(threading.Thread(target=self.train_function, args=(i, True)))
    signal.signal(signal.SIGINT, self.signal_handler)
    # set start time, shifted back by the restored wall time
    self.start_time = time.time() - self.wall_t
    print("Ready to start")
    for t in self.train_threads:
        t.start()
    print('Press Ctrl+C to stop')
    signal.pause()
def main(_):
    """Entry point (tf.app.run style): fetch options and data, then train."""
    options = get_options_from_flags()
    maybe_download_data_files_from_s3(options)
    trainer = Trainer(options)
    trainer.train()
# Transformer training script: build the model from hyper-parameters,
# wrap it in a Noam-schedule optimizer, and train for 5 epochs.
from utils.dataloader import DataLoader
import torch
from model import model_utils
from optimizer.optimizer import NoamOpt
from train.trainer import Trainer

# model hyper-parameters
hidden_size = 256
num_encoder = 6
num_decoder = 6
n_head = 8
pf_dim = 1024
drop_out = 0.5

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# NOTE(review): this unconditionally overrides the CUDA check above and
# forces CPU execution — confirm it is intentional and not leftover debug.
device = 'cpu'

dataloader = DataLoader(device)
# batch size 64
train_iterator, valid_iterator, test_iterator = dataloader.load_data(64)
model = model_utils.create_model(dataloader.src_vocab_size(), dataloader.trg_vocab_size(), hidden_size, num_encoder, num_decoder, n_head, pf_dim, drop_out, dataloader.get_pad_idx(), device)
print(model_utils.count_parameters(model))
model_utils.init(model)
# Noam learning-rate schedule wrapping Adam (lr is driven by the schedule,
# hence lr=0 on the inner optimizer).
optimizer = NoamOpt(hidden_size , 1, 2000, torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))
trainer = Trainer(train_iterator, valid_iterator, model, optimizer, dataloader.get_pad_idx(), device)
trainer.train(5)
# for i, batch in enumerate(train_iterator):
#     src = batch.src.permute(1, 0).to(device)
#     trg = batch.trg.permute(1, 0).to(device)
def run(self):
    """Build the global UNREAL network and per-worker trainers, restore a
    checkpoint if one exists, then launch the parallel training threads.
    Blocks in signal.pause() until interrupted."""
    device = "/cpu:0"
    if USE_GPU:
        device = "/gpu:0"
    initial_learning_rate = log_uniform(flags.initial_alpha_low,
                                        flags.initial_alpha_high,
                                        flags.initial_alpha_log_rate)
    self.global_t = 0
    self.stop_requested = False
    # NOTE(review): attribute name has a typo ('reqested'); other methods of
    # this class presumably reference the same spelling — verify before
    # renaming.
    self.terminate_reqested = False
    action_size = Environment.get_action_size(flags.env_type, flags.env_name)
    # shared global network; id -1 marks it as the master copy
    self.global_network = UnrealModel(action_size, -1, flags.use_pixel_change,
                                      flags.use_value_replay,
                                      flags.use_reward_prediction,
                                      flags.pixel_change_lambda,
                                      flags.entropy_beta, device)
    self.trainers = []
    learning_rate_input = tf.placeholder("float")
    grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                                  decay=flags.rmsp_alpha,
                                  momentum=0.0,
                                  epsilon=flags.rmsp_epsilon,
                                  clip_norm=flags.grad_norm_clip,
                                  device=device)
    for i in range(flags.parallel_size):
        # Trainer creates a UnrealModel in init
        trainer = Trainer(i, self.global_network, initial_learning_rate,
                          learning_rate_input, grad_applier, flags.env_type,
                          flags.env_name, flags.use_pixel_change,
                          flags.use_value_replay, flags.use_reward_prediction,
                          flags.pixel_change_lambda, flags.entropy_beta,
                          flags.local_t_max, flags.gamma, flags.gamma_pc,
                          flags.experience_history_size, flags.max_time_step,
                          device)
        self.trainers.append(trainer)
    # prepare session
    config = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    self.sess = tf.Session(config=config)
    self.sess.run(tf.global_variables_initializer())
    # summary for tensorboard
    self.score_input = tf.placeholder(tf.int32)
    tf.summary.scalar("score", self.score_input)
    self.summary_op = tf.summary.merge_all()
    self.summary_writer = tf.summary.FileWriter(flags.log_file, self.sess.graph)
    # init or load checkpoint with saver
    self.saver = tf.train.Saver(self.global_network.get_vars())
    checkpoint = tf.train.get_checkpoint_state(flags.checkpoint_dir)
    if checkpoint and checkpoint.model_checkpoint_path:
        self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
        # checkpoint paths look like '<name>-<step>'
        tokens = checkpoint.model_checkpoint_path.split("-")
        # set global step
        self.global_t = int(tokens[1])
        print(">>> global step set: ", self.global_t)
        # set wall time
        wall_t_fname = flags.checkpoint_dir + '/' + 'wall_t.' + str(self.global_t)
        with open(wall_t_fname, 'r') as f:
            self.wall_t = float(f.read())
        # next multiple of save_interval_step after the restored step
        self.next_save_steps = (self.global_t + flags.save_interval_step) // flags.save_interval_step * flags.save_interval_step
    else:
        print("Could not find old checkpoint")
        # set wall time
        self.wall_t = 0.0
        self.next_save_steps = flags.save_interval_step
    # run training threads
    ## Each Env is Running Here Parallel
    self.train_threads = []
    for i in range(flags.parallel_size):
        self.train_threads.append(threading.Thread(target=self.train_function, args=(i, True)))
    signal.signal(signal.SIGINT, self.signal_handler)
    # set start time (shifted back by the restored wall time)
    self.start_time = time.time() - self.wall_t
    for t in self.train_threads:
        t.start()
    print('Press Ctrl+C to stop')
    signal.pause()
def setup_trainer(config, name, datasets, monitor="val", pretrained=None,
                  finetune=False, label_transformer=None, disable_cache=False):
    """
    Prepare everything needed for a train + validation typical pipeline.

    Args:
        config: dict, experiment parameters
        name: str, name of the experiment
        datasets: dict, data for every data partition (X, y)
        monitor: str, partition to watch on learning time
        pretrained: str, path to pretrained model and conf files
        finetune: bool, whether to finetune pretrained model
        label_transformer: Label transform function
        disable_cache: Whether to activate caching (TODO)

    Return:
        a Trainer object
    """
    pretrained_model = None
    pretrained_config = None
    if pretrained:
        pretrained_model, pretrained_config = get_pretrained(pretrained)

    # optional pretrained word embeddings
    word2idx = None
    embeddings = None
    if config["embeddings_file"]:
        word2idx, idx2word, embeddings = load_embeddings(config)

    # Resolve the preprocessor by name from the preproc module; a TypeError
    # (e.g. config value is None, not a string) falls back to the no-op
    # preprocessor.
    preprocessor = config["preprocessor"]
    try:
        preprocessor = getattr(preproc, preprocessor)
    except TypeError:
        preprocessor = preproc.dummy_preprocess

    loaders = get_dataloaders(datasets,
                              batch_size=config["batch_size"],
                              data_type=config["data_type"],
                              name=name,
                              preprocessor=preprocessor(),
                              label_transformer=label_transformer,
                              word2idx=word2idx,
                              config=config)

    # infer task type (classification/regression) and output dimensionality
    # from the training labels
    output_size, task = get_output_size(loaders["train"].dataset.labels)

    # class weights only make sense for classification tasks
    weights = None
    if task != "regression":
        weights = class_weights(loaders["train"].dataset.labels, to_pytorch=True).to(DEVICE)

    # Without pretrained embeddings the model builds its own embedding layer
    # of size embeddings_size; otherwise the loaded matrix (and its params)
    # are passed in.
    if embeddings is None:
        model = MonoModalModel(
            out_size=output_size,
            embeddings=embeddings,
            embed_dim=config["embeddings_size"],
            pretrained=pretrained_model,
            finetune=finetune,
            encoder_params=config["encoder_params"],
            attention_params=config["attention_params"]).to(DEVICE)
    else:
        model = MonoModalModel(
            out_size=output_size,
            embeddings=embeddings,
            pretrained=pretrained_model,
            finetune=finetune,
            embed_params=config["embeddings_params"],
            encoder_params=config["encoder_params"],
            attention_params=config["attention_params"]).to(DEVICE)

    criterion = get_criterion(task, weights)
    # only optimize parameters that require gradients
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = get_optimizer(parameters, lr=config["lr"], weight_decay=config["weight_decay"])
    pipeline = get_pipeline(task, criterion)
    metrics, monitor_metric, mode = get_metrics(task)

    # transfer-learning runs are checkpointed under a separate directory
    model_dir = None
    if pretrained:
        model_dir = os.path.join(TRAINED_PATH, "TL")
    checkpoint = Checkpoint(name=name, model=model, model_conf=config,
                            monitor=monitor, keep_best=True, timestamp=True,
                            scorestamp=True, metric=monitor_metric, mode=mode,
                            base=config["base"], model_dir=model_dir)
    early_stopping = EarlyStop(metric=monitor_metric, mode=mode,
                               monitor=monitor, patience=config["patience"],
                               min_change=config["min_change"])
    return Trainer(model=model, loaders=loaders, task=task, config=config,
                   optimizer=optimizer, pipeline=pipeline, metrics=metrics,
                   checkpoint=checkpoint, early_stopping=early_stopping)
from __future__ import print_function, division
from skimage import transform
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from torchvision import transforms
from dataset.screendataset import ScreenDataset as ScreenDataset
from dataset.transforms import Rescale
from dataset.transforms import ToTensor
from train.trainer import Trainer


def show_landmarks(screendataset):
    """Display the image of the first sample in *screendataset*."""
    # use indexing syntax instead of calling __getitem__ directly
    plt.imshow(screendataset[0]['image'])


def _str2bool(value):
    """Parse a command-line boolean.

    argparse's ``type=bool`` treats ANY non-empty string — including
    "False" — as True; this converter interprets the usual spellings.
    Raises ValueError on anything else (argparse reports it as a usage
    error).
    """
    if isinstance(value, bool):
        return value
    if value.lower() in ('true', '1', 'yes', 'y'):
        return True
    if value.lower() in ('false', '0', 'no', 'n'):
        return False
    raise ValueError('boolean value expected, got %r' % value)


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    # FIX: type=bool made '--load False' evaluate to True (bool('False') is
    # True); _str2bool keeps the '--load <value>' interface but parses it
    # correctly.
    parser.add_argument('--load', type=_str2bool, default=False)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--max', type=int, default=100000)
    parser.add_argument('--epochs', type=int, default=5)
    args = parser.parse_args()
    print("load=", args.load, "lr=", args.lr, "max=", args.max, "epochs=", args.epochs)
    trainer = Trainer(args.load, args.lr, args.epochs, args.max)
    trainer.train_model()
n=1, optimizer=tf.train.AdamOptimizer, lr=0.001, vocab_size=metadata['vocab_size'], max_candidates=metadata['max_candidates'], demb=384, dhdim=384, num_layers=1) config = tf.ConfigProto(allow_soft_placement=True) with tf.Session(config=config) as sess: # init session sess.run(tf.global_variables_initializer()) # create trainer trainer = Trainer(sess, model, trainfeed, testfeed, batch_size=batch_size) # train acc = trainer.fit(epochs=1000000, eval_interval=1, mode=Trainer.TRAIN, verbose=True, lr=0.001) print(':: \tAccuracy after training: ', acc)
"""Run script

Author: Alaaeldin Ali
"""
from train.trainer import Trainer
import argparse

parser = argparse.ArgumentParser()
# Numeric arguments get explicit types: without them argparse hands the
# Trainer raw strings whenever a value is supplied on the command line
# (the defaults were already float/int).
parser.add_argument('--lr', type=float, default=2.5e-4)
parser.add_argument('--vis_screen', default='Relnet')
parser.add_argument('--save_path', default=None)
parser.add_argument('-warmup', action='store_true')
parser.add_argument('--batch_size', type=int, default=64)
args = parser.parse_args()

# FIX: the parser defines --vis_screen, so the parsed namespace exposes
# `vis_screen`; the original `args.screen` raised AttributeError.
trainer = Trainer(lr=args.lr,
                  screen=args.vis_screen,
                  batch_size=args.batch_size,
                  save_path=args.save_path,
                  warmup=args.warmup)
trainer.train()
# make 'n' copies of model for data parallelism make_parallel(model, num_copies=4, num_gpus=4) # setup visualizer # by default, writes to ./log/ vis = Visualizer(interval=50) vis.attach_scalars(model) vis.attach_params() # histograms of trainable variables # create data source (SQuAD) datasrc = DataSource(batch_size, glove_file='../../../datasets/glove/glove.6B.300d.txt', random_x=0.2) # gpu config config = tf.ConfigProto() #config.gpu_options.allow_growth = True with tf.Session(config=config) as sess: # init session sess.run(tf.global_variables_initializer()) vis.attach_graph(sess.graph) # init trainer trainer = Trainer(sess, model, datasrc, batch_size, rand=True) # fit model trainer.fit(epochs=1000, visualizer=vis)
def train(args):
    """End-to-end fine-tuning entry point.

    Builds train/valid dataloaders from cached examples/features,
    instantiates the classifier (optionally resuming from a checkpoint),
    configures AdamW with linear warmup, optional fp16 via apex, the
    monitoring/checkpoint callbacks, and runs the Trainer.

    :param args: parsed command-line namespace (data, batch sizes,
        optimizer hyper-parameters, fp16 flags, resume path, ...)
    """
    ########### data ###########
    processor = Postprocessor(
        config["postprocessor"])(do_lower_case=args.do_lower_case)
    label_list = processor.get_labels(config['data_dir'] / "labels.txt")
    id2label = {i: label for i, label in enumerate(label_list)}

    train_data = processor.get_train(config['data_dir'] /
                                     "{}.train.pkl".format(args.data_name))
    train_examples = processor.create_examples(
        lines=train_data,
        example_type='train',
        cached_examples_file=config["data_dir"] /
        "cached_train_examples_{}".format(args.pretrain))
    train_features = processor.create_features(
        examples=train_examples,
        max_seq_len=args.train_max_seq_len,
        cached_features_file=config["data_dir"] /
        "cached_train_features_{}_{}".format(args.train_max_seq_len,
                                             args.pretrain))
    train_dataset = processor.create_dataset(train_features,
                                             is_sorted=args.sorted)
    # Length-sorted batches must keep their order; otherwise shuffle.
    if args.sorted:
        train_sampler = SequentialSampler(train_dataset)
    else:
        train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    valid_data = processor.get_dev(config["data_dir"] /
                                   "{}.valid.pkl".format(args.data_name))
    valid_examples = processor.create_examples(
        lines=valid_data,
        example_type='valid',
        cached_examples_file=config["data_dir"] /
        "cached_valid_examples_{}".format(args.pretrain))
    valid_features = processor.create_features(
        examples=valid_examples,
        max_seq_len=args.eval_max_seq_len,
        cached_features_file=config["data_dir"] /
        "cached_valid_features_{}_{}".format(args.eval_max_seq_len,
                                             args.pretrain))
    valid_dataset = processor.create_dataset(valid_features)
    valid_sampler = SequentialSampler(valid_dataset)
    valid_dataloader = DataLoader(valid_dataset,
                                  sampler=valid_sampler,
                                  batch_size=args.eval_batch_size)

    if config["pretrain"] == "Nopretrain":
        config["vocab_size"] = processor.vocab_size

    ########### model ###########
    logger.info("========= initializing model =========")
    if args.resume_path:
        resume_path = Path(args.resume_path)
        model = Classifier(config["classifier"], config["pretrain"],
                           resume_path)(num_labels=len(label_list))
    else:
        model = Classifier(config["classifier"], config["pretrain"],
                           "")(num_labels=len(label_list))

    # Total optimizer steps = batches / accumulation steps * epochs.
    t_total = int(
        len(train_dataloader) / args.gradient_accumulation_steps * args.epochs)

    # Standard BERT recipe: no weight decay on biases / LayerNorm weights.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer
         if not any(nd in n for nd in no_decay)],
        'weight_decay':
        args.weight_decay
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    warmup_steps = int(t_total * args.warmup_proportion)
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    lr_scheduler = WarmupLinearSchedule(optimizer,
                                        warmup_steps=warmup_steps,
                                        t_total=t_total)

    if args.fp16:
        try:
            from apex import amp
        except ImportError as e:
            # Chain the cause so the real import failure stays visible.
            raise ImportError(
                "Please install apex github.com/nvidia/apex to use fp16."
            ) from e
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    ########### callback ###########
    logger.info("========= initializing callbacks =========")
    train_monitor = TrainingMonitor(file_dir=config['figure_dir'],
                                    arch=args.pretrain)
    model_checkpoint = ModelCheckpoint(checkpoint_dir=config['checkpoint_dir'],
                                       mode=args.mode,
                                       monitor=args.monitor,
                                       arch=args.pretrain,
                                       save_best_only=args.save_best)

    ########### train ###########
    logger.info("========= Running training =========")
    logger.info(" Num examples = {}".format(len(train_examples)))
    logger.info(" Num Epochs = {}".format(args.epochs))
    logger.info(" Total train batch size \
(w. parallel, distributed & accumulation) = {}".format(
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1)))
    logger.info(" Gradient Accumulation steps = {}".format(
        args.gradient_accumulation_steps))
    logger.info(" Total optimization steps = {}".format(t_total))

    trainer = Trainer(
        n_gpu=args.n_gpu,
        model=model,
        epochs=args.epochs,
        logger=logger,
        criterion=BCEWithLogLoss(),
        optimizer=optimizer,
        lr_scheduler=lr_scheduler,
        early_stopping=None,
        training_monitor=train_monitor,
        fp16=args.fp16,
        resume_path=args.resume_path,
        grad_clip=args.grad_clip,
        model_checkpoint=model_checkpoint,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        batch_metrics=[AccuracyThresh(thresh=0.5)],
        epoch_metrics=[
            AUC(average='micro', task_type='binary'),
            MultiLabelReport(id2label=id2label)
        ])
    trainer.train(train_data=train_dataloader,
                  valid_data=valid_dataloader,
                  seed=args.seed)
# Inspect the raw training data, then one-hot encode the labels.
print(X_train.shape)
y_train = to_one_hot(y_train)

# Hold out the first 10000 samples as an evaluation split.
# test_mask = np.random.choice(X_train.shape[0], test_size)
test_size = 10000
X_test, y_test = X_train[:test_size], y_train[:test_size]
X_train, y_train = X_train[test_size:], y_train[test_size:]

# Train the CNN with AdaGrad.
network = DeepConvNet()
trainer = Trainer(network, X_train, y_train, X_test, y_test,
                  epochs=100, mini_batch_size=100,
                  optimizer='adagrad', optimizer_param={'lr': 0.01},
                  evaluate_sample_num_per_epoch=1000)
trainer.train()

# Predict on the submission set and write a 1-indexed CSV.
y_pred = network.predict(X_pred)
print(y_pred.shape)
y_pred = one_hot_to(y_pred)
submission = pd.DataFrame(y_pred, columns=["label"])
submission.index += 1
submission.to_csv("submission.csv", index_label='id')
from config import cifar_configs
from train.trainer import Trainer
from data.data_handler import DataHandler

if __name__ == "__main__":
    # Command-line config loading is currently disabled in favour of the
    # baked-in CIFAR configuration:
    # parser = argparse.ArgumentParser()
    # parser.add_argument("--config-file", default="./config/train_config.yaml", metavar="FILE", type=str)
    # args = parser.parse_args()
    # config_file = open(args.config_file, 'r')
    # configs = yaml.load(config_file)

    # Build the data pipeline and run training end to end.
    data_handler = DataHandler(cifar_configs)
    Trainer(cifar_configs, data_handler).train()
monitor="val_loss", save_best_only=False, best_model_name=BEST_MODEL_NAME, epoch_model_name=EPOCH_MODEL_NAME, arch=ARCH, logger=logger, ) trainer = Trainer( model=model, train_loader=train_loader, valid_loader=valid_loader, optimizer=optimizer, batch_size=BATCH_SIZE, num_epochs=NUM_EPOCHS, device=device, n_gpus=len(MULTI_GPUS), criterion=TripletLoss_op(), fts_flag=False, gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS, model_checkpoint=model_checkpoint, logger=logger, resume=RESUME, ) trainer.summary() trainer.train() logger.info(f'Took {time_to_str((timer() - start_time), "sec")}') print("---------- Bert Eval ... ----------") start_time = timer()
def train_separate_all(dataset='1k'):
    """Train a separate MemoryNet per bAbI task (1-20) and report accuracy.

    Each task is retrained from up to 5 random initialisations, stopping
    early once test accuracy reaches 0.95; the best accuracy per task is
    collected and printed at the end.

    :param dataset: bAbI dataset size to load, e.g. '1k' or '10k'
    """
    batch_size = 64
    task_max_acc = []
    for task in range(1, 21):
        # Bug fix: the dataset argument was ignored (hard-coded '1k').
        data, metadata = gather(dataset, task)
        # gather info from metadata
        num_candidates = metadata['candidates']['vocab_size']
        vocab_size = metadata['vocab_size']
        memsize = metadata['clen']
        sentence_size = metadata['slen']
        qlen = metadata['qlen']
        print(':: <task {}> memory size : {}'.format(task, memsize))
        # data layout shared by the train/test feeds
        dformat = ['contexts', 'questions', 'answers']
        trainfeed = DataFeed(dformat, data=data['train'])
        testfeed = DataFeed(dformat, data=data['test'])
        # instantiate model for this task's geometry
        model = MemoryNet(hdim=20, num_hops=3, memsize=memsize,
                          sentence_size=sentence_size,
                          qlen=qlen, vocab_size=vocab_size,
                          num_candidates=num_candidates)
        with tf.Session() as sess:
            # Retry with fresh random initialisations until the task is
            # solved (acc >= 0.95) or attempts run out.
            i, accuracy = 0, [0.]
            while accuracy[-1] < 0.95 and i < 5:
                sess.run(tf.global_variables_initializer())
                trainer = Trainer(sess, model, trainfeed, testfeed,
                                  batch_size=batch_size)
                print('\n:: <task {}> ({}) [1/2] Pretraining'.format(task, i))
                # pretrain
                acc = trainer.fit(epochs=100000, eval_interval=1,
                                  mode=Trainer.PRETRAIN, verbose=False,
                                  batch_size=64, lr=0.0005)
                print(':: \tAccuracy after pretraining: ', acc)
                print('\n:: <task {}> ({}) [2/2] Training'.format(task, i))
                # train
                acc = trainer.fit(epochs=1000000, eval_interval=10,
                                  mode=Trainer.TRAIN, verbose=False,
                                  batch_size=64, lr=0.0005)
                print(':: \tAccuracy after training: ', acc)
                # next iteration
                i += 1
                accuracy.append(acc)
                print(acc)
            print('Experiment Results : ')
            # skip the 0.0 sentinel at index 0
            for i, a in enumerate(accuracy[1:]):
                print(i, a)
        task_max_acc.append(max(accuracy))
    print('____________________________________________')
    for i, acc in enumerate(task_max_acc):
        print('Task ({}) : {}'.format(i + 1, acc))
    print('____________________________________________')
import argparse
from train.trainer import Trainer
from utils.base_utils import load_cfg

# Parse the config path, load it, and hand the run off to the Trainer.
parser = argparse.ArgumentParser()
parser.add_argument('--cfg', type=str,
                    default='configs/lmcnet/lmcnet_sift_outdoor_test.yaml')
flags = parser.parse_args()

cfg = load_cfg(flags.cfg)
trainer = Trainer(cfg)
trainer.run()
def train_separate(task, dataset='1k', iterations=1, batch_size=32):
    """Train a MemoryNet on a single bAbI task.

    Retries with fresh random initialisations (up to *iterations* times,
    stopping early at accuracy >= 0.95) and returns the model together
    with the parameters of the best-scoring run.

    :param task: bAbI task id (1-20); 0 selects joint-training settings
    :param dataset: dataset size to load ('1k' or '10k')
    :param iterations: max number of random restarts
    :param batch_size: overridden below based on *task* (see NOTE)
    :return: (model, best_model_params)
    """
    # get data for task
    data, metadata = gather(dataset, task)
    # data layout shared by the feeds
    dformat = ['contexts', 'questions', 'answers']
    trainfeed = DataFeed(dformat, data=data['train'])
    testfeed = DataFeed(dformat, data=data['test'])
    # Task 0 (joint) uses a bigger model/batch and sparser evaluation.
    hdim = 20 if task else 50
    eval_interval = 100 if task else 10
    # NOTE(review): this silently discards the caller's batch_size —
    # confirm whether the parameter should win instead.
    batch_size = 32 if task else 128
    # Bug fix: hdim was computed above but the model hard-coded hdim=20.
    model = MemoryNet(hdim=hdim, num_hops=3,
                      memsize=metadata['clen'],
                      sentence_size=metadata['slen'],
                      qlen=metadata['qlen'],
                      vocab_size=metadata['vocab_size'],
                      num_candidates=metadata['candidates']['vocab_size'])
    # info (bug fix: the task id was missing from the format call)
    print(':: <task {}> [0/2] Info'.format(task))
    print(':: \t memory size : {}, #candidates : {}'.format(
        metadata['clen'], metadata['candidates']['vocab_size']))
    with tf.Session() as sess:
        # run for multiple initializations
        i, accuracy, model_params = 0, [0.], [None]
        while accuracy[-1] < 0.95 and i < iterations:
            # init session
            sess.run(tf.global_variables_initializer())
            trainer = Trainer(sess, model, trainfeed, testfeed,
                              batch_size=batch_size)
            print('\n:: <task {}> ({}) [1/2] Pretraining'.format(task, i))
            # pretrain
            acc = trainer.fit(epochs=100000, eval_interval=1,
                              mode=Trainer.PRETRAIN, verbose=False,
                              lr=0.0005)
            print(':: \tAccuracy after pretraining: ', acc)
            print('\n:: <task {}> ({}) [2/2] Training'.format(task, i))
            # train
            acc = trainer.fit(epochs=1000000, eval_interval=eval_interval,
                              mode=Trainer.TRAIN, verbose=False, lr=0.0005)
            print(':: \tAccuracy after training: ', acc)
            # record this run's accuracy and parameters
            i += 1
            accuracy.append(acc)
            model_params.append(sess.run(tf.trainable_variables()))
            print(acc)
        print(':: [x/x] End of training')
        print(':: Max accuracy :', max(accuracy))
        # return model and best model params
        return model, model_params[accuracy.index(max(accuracy))]
def __init__(self, ms_config_p, dl_config_p, log_dir_root, log_config: LogConfig, num_workers, saver: Saver, restorer: TrainRestorer=None, sw_cls=vis.safe_summary_writer.SafeSummaryWriter): """ :param ms_config_p: Path to the multiscale config file, see README :param dl_config_p: Path to the dataloader config file, see README :param log_dir_root: All outputs (checkpoints, tensorboard) will be saved here. :param log_config: Instance of train.trainer.LogConfig, contains intervals. :param num_workers: Number of workers to use for DataLoading, see train.py :param saver: Saver instance to use. :param restorer: Instance of TrainRestorer, if we need to restore """ # Read configs # config_ms = config for the network (ms = multiscale) # config_dl = config for data loading (self.config_ms, self.config_dl), rel_paths = ft.unzip(map(config_parser.parse, [ms_config_p, dl_config_p])) # Update config_ms depending on global_config global_config.update_config(self.config_ms) # Create data loaders dl_train, dl_val = self._get_dataloaders(num_workers) # Create blueprint. A blueprint collects the network as well as the losses in one class, for easy reuse # during testing. self.blueprint = MultiscaleBlueprint(self.config_ms) print('Network:', self.blueprint.net) # Setup optimizer optim_cls = {'RMSprop': optim.RMSprop, 'Adam': optim.Adam, 'SGD': optim.SGD, }[self.config_ms.optim] net = self.blueprint.net self.optim = optim_cls(net.parameters(), self.config_ms.lr.initial, weight_decay=self.config_ms.weight_decay) # Calculate a rough estimate for time per batch (does not take into account that CUDA is async, # but good enought to get a feeling during training). 
self.time_accumulator = timer.TimeAccumulator() # Restore network if requested skip_to_itr = self.maybe_restore(restorer) if skip_to_itr is not None: # i.e., we have a restorer print('Skipping to {}...'.format(skip_to_itr)) # Create LR schedule to update parameters self.lr_schedule = lr_schedule.from_spec( self.config_ms.lr.schedule, self.config_ms.lr.initial, [self.optim], epoch_len=len(dl_train)) # --- All nn.Modules are setup --- print('-' * 80) # create log dir and summary writer self.log_dir = Trainer.get_log_dir(log_dir_root, rel_paths, restorer) self.log_date = logdir_helpers.log_date_from_log_dir(self.log_dir) self.ckpt_dir = os.path.join(self.log_dir, CKPTS_DIR_NAME) print(f'Checkpoints will be saved to {self.ckpt_dir}') saver.set_out_dir(self.ckpt_dir) # Create summary writer sw = sw_cls(self.log_dir) self.summarizer = vis.summarizable_module.Summarizer(sw) net.register_summarizer(self.summarizer) self.blueprint.register_summarizer(self.summarizer) # superclass setup super(MultiscaleTrainer, self).__init__(dl_train, dl_val, [self.optim], net, sw, max_epochs=self.config_dl.max_epochs, log_config=log_config, saver=saver, skip_to_itr=skip_to_itr)
# Shared learning-rate placeholder feeding one RMSProp applier that every
# parallel trainer uses to apply gradients to the global network.
learning_rate_input = tf.placeholder("float")
grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                              decay=RMSP_ALPHA,
                              momentum=0.0,
                              epsilon=RMSP_EPSILON,
                              clip_norm=GRAD_NORM_CLIP,
                              device=device)

# One trainer per parallel worker.
trainers = []
for i in range(PARALLEL_SIZE):
    trainer = Trainer(i, global_network, initial_learning_rate,
                      learning_rate_input, grad_applier, MAX_TIME_STEP,
                      device=device)
    trainers.append(trainer)

# prepare session
config = tf.ConfigProto(log_device_placement=False,
                        allow_soft_placement=True)
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)

init = tf.global_variables_initializer()
# init = tf.initialize_all_variables()
sess.run(init)

# summary for tensorboard
def train():
    """Build a Trainer from the global flags' config and run training."""
    Trainer(flags.cfg).train()
def train_separate(task, dataset='1k', iterations=1, batch_size=128):
    """Train a RelationNet on a single bAbI task.

    Runs up to *iterations* random restarts (stopping early once test
    accuracy reaches 0.95) and returns the model together with the
    parameters of the best run.

    :param task: bAbI task id
    :param dataset: dataset size to load ('1k' or '10k')
    :param iterations: max number of random restarts
    :param batch_size: training batch size
    :return: (model, best_model_params)
    """
    # get data for task
    data, metadata = gather(dataset, task)
    # data layout shared by the feeds
    dformat = ['contexts', 'questions', 'answers']
    trainfeed = DataFeed(dformat, data=data['train'])
    testfeed = DataFeed(dformat, data=data['test'])
    # instantiate model
    model = RelationNet(clen=metadata['clen'],
                        qlen=metadata['qlen'],
                        slen=metadata['slen'],
                        vocab_size=metadata['vocab_size'],
                        num_candidates=metadata['candidates']['vocab_size'])
    # info (bug fix: the task id was missing from the format call)
    print(':: <task {}> [0/2] Info'.format(task))
    print(':: \t memory size : {}, #candidates : {}'.format(
        metadata['clen'], metadata['candidates']['vocab_size']))
    # create visualizer
    vis = Visualizer()
    vis.attach_scalars(model)
    with tf.Session() as sess:
        # run for multiple initializations
        i, accuracy, model_params = 0, [0.], [None]
        while accuracy[-1] < 0.95 and i < iterations:
            # init session
            sess.run(tf.global_variables_initializer())
            # add graph to visualizer
            vis.attach_graph(sess.graph)
            trainer = Trainer(sess, model, trainfeed, testfeed,
                              batch_size=batch_size)
            print('\n:: <task {}> ({}) [1/1] Training'.format(task, i))
            # train
            acc = trainer.fit(epochs=1000000, eval_interval=1,
                              mode=Trainer.TRAIN, verbose=True, lr=0.0002)
            print(':: \tAccuracy after training: ', acc)
            # record this run's accuracy and parameters
            i += 1
            accuracy.append(acc)
            model_params.append(sess.run(tf.trainable_variables()))
            print(acc)
        print(':: [x/x] End of training')
        print(':: Max accuracy :', max(accuracy))
        # Bug fix: return the parameters of the best run — they were
        # collected per restart above, but the original returned the last
        # run's variables instead (and inconsistently with the sibling
        # MemoryNet train_separate, which returns the best).
        return model, model_params[accuracy.index(max(accuracy))]
vis.attach_scalars(model) #vis.attach_params() # histograms of trainable variables # gpu config config = tf.ConfigProto() #config.gpu_options.allow_growth = True with tf.Session(config=config) as sess: # init session sess.run(tf.global_variables_initializer()) # add graph to visualizer vis.attach_graph(sess.graph) # init trainer trainer = Trainer(sess, model, datasrc, batch_size) # fit model trainer.fit(epochs=600, mode=Trainer.TRAIN, verbose=True, visualizer=vis, eval_interval=1, early_stop=False) ''' print('****************************************************************** PRETRAINING OVER ') for task_id in reversed(range(21)): datasrc.task_id = task_id loss, acc = trainer.evaluate() print('evaluation loss for task_id = {}\t\tloss = {}\t\t accuracy = {}'.format(task_id, loss, acc))