Example #1
    def run_train(self, model, dataloaders, num_epochs=10, callbacks=None):
        if callbacks is None:
            callbacks = []
        model.to(self.device)
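        # stop training once the validation loss has gone 20 consecutive epochs without improving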
        earlystop = EarlyStopping(patience=20,
                                  verbose=True,
                                  checkpoint='checkpoint_effnet_large')
        tracker = MetricsTracker()
        batch_size = self.opt.batchSize

        for epoch in tqdm(range(num_epochs)):
            print('Epoch:', epoch)
            for phase in ['train', 'valid']:
                if phase == 'train':
                    model.train()
                else:
                    model.eval()
                running_loss = 0.0
                running_corrects = 0

                dataloader = iter(dataloaders[phase])
                for batch_idx, batch_data in tqdm(
                        enumerate(dataloader), leave=False,
                        total=len(dataloader)):

                    data, target = batch_data['data'], batch_data['class']
                    running_corrects_, running_loss_ = self.run_one_step(model,
                                                                         batch_data,
                                                                         phase)
                    running_corrects += running_corrects_
                    running_loss += running_loss_

                    # running metrics over all batches seen so far in this phase
                    accuracy = float(running_corrects) / ((batch_idx + 1) * batch_size)
                    metrics = dict(accuracy=accuracy,
                                   loss=running_loss_ / data.size(0))
                    if batch_idx % self.opt.display_count == 0:
                        for cb in callbacks:
                            cb.update_metrics(metrics, phase)
                epoch_acc = running_corrects.double() / (len(dataloaders[phase]) * batch_size)
                epoch_loss = running_loss / (
                            len(dataloaders[phase]) * batch_size)

                if phase == 'valid':
                    tracker.end_valid_iteration(epoch_loss, epoch_acc.item())
                    earlystop(epoch_loss, model)

                if phase == 'train':
                    tracker.end_train_iteration(epoch_loss, epoch_acc.item())

            if earlystop.early_stop:
                print("Early stopping")
                break
            print('{} Accuracy: '.format(phase), epoch_acc.item())
            tracker.end_epoch()
Example #2
    def train(self, training_data: TrainingData) -> None:
        x_train, y_train, x_val, y_val, vocab, class_to_i, i_to_class = preprocess_dataset(
            training_data,
            full_question=args.full_question,
            create_runs=args.create_runs)
        self.class_to_i = class_to_i
        self.i_to_class = i_to_class

        print('Batchifying data')
        train_batches = batchify(x_train, y_train, shuffle=True)
        val_batches = batchify(x_val, y_val, shuffle=False)
        self.model = ElmoModel(len(i_to_class), dropout=self.dropout)
        self.model = self.model.to(self.device)
        
        print(f'Model:\n{self.model}')
        parameters = list(self.model.classifier.parameters())
        for mix in self.model.elmo._scalar_mixes:
            parameters.extend(list(mix.parameters()))
        self.optimizer = Adam(parameters)
        self.criterion = nn.CrossEntropyLoss()
        self.scheduler = lr_scheduler.ReduceLROnPlateau(self.optimizer, patience=5, verbose=True, mode='max')


        temp_prefix = get_tmp_filename()
        self.model_file = f'{temp_prefix}.pt'

        print(f'Saving model to: {self.model_file}')
        log = get(__name__)
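        # callbacks: logging, NaN termination, early stopping / checkpointing on validation accuracy, and a 100-epoch cap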
        manager = TrainingManager([
            BaseLogger(log_func=log.info), TerminateOnNaN(), EarlyStopping(monitor='test_acc', patience=10, verbose=1),
            MaxEpochStopping(100), ModelCheckpoint(create_save_model(self.model), self.model_file, monitor='test_acc')
        ])

        log.info('Starting training')

        epoch = 0
        while True:
            self.model.train()
            train_acc, train_loss, train_time = self.run_epoch(train_batches)
            random.shuffle(train_batches)

            self.model.eval()
            test_acc, test_loss, test_time = self.run_epoch(val_batches, train=False)

            stop_training, reasons = manager.instruct(
                train_time, train_loss, train_acc,
                test_time, test_loss, test_acc
            )

            if stop_training:
                log.info(' '.join(reasons))
                break
            else:
                self.scheduler.step(test_acc)
            epoch += 1
Example #3
            'lr': args.lr,
            'dropout': args.dropout
        })

    save_dir = 'save_model'
    result_dir = save_dir + '/result/'
    ckpt_dir = save_dir + '/checkpoints/'
    makedirs(result_dir)
    makedirs(ckpt_dir)
    result_fn = result_dir + args.save + '.pkl'
    ckpt_fn = ckpt_dir + args.save + '.pt'

    # instantiate the early stopping object
    patience = 7
    early_stopping = EarlyStopping(ckpt_dir,
                                   args.save + '.pt',
                                   patience=patience,
                                   verbose=True)

    # Set the random seed manually for reproducibility.
    if args.cuda:
        if not torch.cuda.is_available():
            print("WARNING: No CUDA device detected, switch to cpu device!")
            device = torch.device('cpu')
            torch.manual_seed(args.seed)
        else:
            device = torch.device('cuda')
            torch.cuda.manual_seed(args.seed)
    else:
        if torch.cuda.is_available():
            print("WARNING: CUDA device detected, continue to use cpu device!")
        device = torch.device('cpu')
        torch.manual_seed(args.seed)
Example #4
class NNModel(object):
    def __init__(self, sess, args):
        self.x = tf.placeholder(tf.float32, [None, args.feature_dim], name="input")
        self.y = tf.placeholder(tf.float32, [None, args.output_dim], name="output")
        self.y_output = build_model(self.x, args.output_dim)
        self.y_prob = tf.nn.softmax(self.y_output, name='prob')
        self.y_pred = tf.argmax(self.y_prob, 1, name='pred')
        self.correct_predictions = tf.equal(self.y_pred, tf.argmax(self.y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(self.correct_predictions, "float"), name="accuracy")
        self.loss_f = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.y_output, labels=self.y))

        self.x_1 = tf.placeholder(tf.float32, [None, args.feature_dim], name="input")
        self.y_1_output = build_model(self.x_1, args.output_dim)
        self.loss_augment = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.y_1_output, labels=self.y))
        # note: self.loss_y_y1, self.optimizer_f and self.saver are referenced in this class
        # but defined in code omitted from this snippet
        self.final_reg = tf.add(args.reg_param_1 * self.loss_y_y1, args.reg_param_2 * self.loss_augment, name="loss_final")

        self.loss_final = tf.add(self.final_reg, self.loss_f, name="loss_final")
        self.optimizer_final = tf.train.AdamOptimizer(args.lr, name="opt2").minimize(self.loss_final)
        

    def restore(self, sess, output_dir, model_name):
        model_filename = "{}/models/{}.cpkt".format(output_dir, model_name)
        if os.path.isfile(model_filename):
            print('restoring model from %s...' % model_filename)
            self.saver.restore(sess, model_filename)

    def save(self, sess, output_dir, model_name, overwrite=True):
        model_filename = "{}/models/{}.cpkt".format(output_dir, model_name)
        if not os.path.isfile(model_filename) or overwrite:
            print('saving model to %s...' % model_filename)
            self.saver.save(sess, model_filename)

    # train step for the base model
    def train_basemodel(self, sess, batch_x, batch_y, update_num):
        feed_dict = {self.x: batch_x, self.y: batch_y}
        _ = sess.run(self.optimizer_f, feed_dict=feed_dict)

    def standard_augmentation(self, sess, batch_x, batch_x1, batch_y):
        batch_x1 = am_util.augment_data_y1_modified(batch_x, S_30)
        feed_dict = {self.x: batch_x, self.y: batch_y, self.x_1: batch_x1}
        _ = sess.run(self.optimizer_final, feed_dict=feed_dict)
            
       
    # evaluate on the validation set
    def val(self, sess, val_x, val_y):
        feed_dict = {self.x: val_x, self.y: val_y}
        loss = sess.run(self.loss_f, feed_dict=feed_dict)
        print("validation loss = ", "{:.4f} | ".format(loss))
        return loss
    
    

    # test and save the best result
    def test(self, sess, test_x, test_y, output_dir):
        feed_dict = {self.x: test_x, self.y: test_y}
        loss, accuracy, pred_test_y = sess.run([self.loss_f, self.accuracy, self.y_pred], feed_dict=feed_dict)
        print("test loss is =", "{:.4f} | ".format(loss), "test accuracy is =", "{:.4f} | ".format(accuracy))
        np.save(output_dir + '/pred_test_y.npy', pred_test_y)
        np.save(output_dir + '/test_y', test_y)
        
        with open(output_dir + '/Final_result.csv', 'a') as newFile:
            headers = ['epsilon', 'reg_param1', 'reg_param2', 'test_loss', 'test_accuracy']
            newFileWriter = csv.DictWriter(newFile, fieldnames=headers)
            newFileWriter.writeheader()
            newFileWriter.writerow({'epsilon': args.epsilon, 'reg_param1': args.reg_param1, 'reg_param2': args.reg_param2, 'test_loss': loss, 'test_accuracy': accuracy})
    
## 7. define the session
from util import EarlyStopping
config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)
sess = tf.Session(config = config)
# get the model
model = NNModel(sess, args)
# get the early stop
early_stop = EarlyStopping(model)
init = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
sess.run(init)

update_num = 0
is_early_stop = False
## base model training
for epoch in range(args.epochs):
    permutation = np.random.choice(train_num, train_num, replace=False)
    if is_early_stop:
        break
    for j in range(batch_num):
        update_num = update_num + 1
        batch_index = permutation[(j * args.batch_size): (j + 1) * args.batch_size]
        batch_x = train_x[batch_index, :]
        batch_y = train_y[batch_index, :]
        model.train_basemodel(sess, batch_x, batch_y, update_num)
        val_loss = model.val(sess, val_x, val_y)
        
        ## early stopping starts after args.start_early_stop updates
        if update_num > args.start_early_stop and update_num % args.display_step == 0:
            is_early_stop = early_stop(val_loss, args.stopping_criteria, args.model_name, update_num)
            print(is_early_stop)
            if is_early_stop:
                early_stop.restore(sess, args.model_name, update_num)
                #model.test(sess, test_x, test_y, output_dir)
                break
                      shuffle=True,
                      drop_last=True,
                      num_workers=4)
train_dl = DataLoader(dataset=train_ds,
                      batch_size=batch_size,
                      shuffle=True,
                      drop_last=True,
                      num_workers=4)

model = densenet161((1, 128, 128))
model.cuda()
criterion = BCEWithLogitsLoss().cuda()
optimizer = Adam(model.parameters(), lr=1e-3)
scheduler = MultiStepLR(optimizer, milestones=[30, 45, 60], gamma=0.1)
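# early stopping on a maximised score: halt after 15 evaluations without an improvement of at least 0.0001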
earlystopper = EarlyStopping(mode='max',
                             min_delta=0.0001,
                             patience=15,
                             percentage=False)
best_score = 0
model_state_path = os.path.join(
    Path.MODEL_DIR, 'densenet161_noisy_gpu_{}.tar'.format(args.gpu))

for epoch in range(MAX_ITERATIONS):
    model.train()
    for idx, ((x1, y1), (x2, y2)) in enumerate(zip(focus_dl, train_dl)):
        optimizer.zero_grad()
        x1, y1 = x1.float().cuda(), y1.float().cuda()
        x2, y2 = x2.float().cuda(), y2.float().cuda()
        x2, y2 = mixup_data(x2, y2, x2, y2, alpha=alpha)
        out2 = model(x2)
        out1 = model(x1)
        loss2 = criterion(out2, y2)
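
# S: pairwise Euclidean distances between the 784 coordinates of a 28x28 grid,
# with distances above 8 zeroed out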
f_1 = f_2 = np.arange(28)
f_1, f_2 = np.meshgrid(f_1, f_2)
f_1 = f_1.reshape(784, 1)
f_2 = f_2.reshape(784, 1)
fsd = np.concatenate((f_2, f_1), axis=1)
S = euclidean_distances(fsd)
S[S > 8] = 0


#1.load base model
config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)
sess = tf.Session(config = config)
# get the model
model = NNModel(sess,args)
# get the early stop
early_stop = EarlyStopping(model)
init = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
sess.run(init)

update_num = 0
is_early_stop = False
args.batch_size = train_num
batch_num = int(train_num / args.batch_size)
model.restore(sess, args.base_model_path , 'base_model')

##train the final model
model_name = 'final_model'
for epoch in range(args.epochs):
    permutation = np.random.choice(train_num,train_num, replace=False)
class Trainer():
    def __init__(self, conf):
        self.conf = conf
        self.device = torch.device(f"cuda:{conf.gpu_id}")
        self.log = get_logger()
        torch.set_printoptions(precision=8)
        if conf.runid:
            conf.rundir = mkdir(conf.outdir / conf.runid)
        if not conf.rundir:
            conf.rundir = next_rundir(conf.outdir, log=self.log)
        self.rundir = conf.rundir
        dump_args(conf, conf.rundir / "conf.json")
        set_random_seed(conf.random_seed)
        if self.conf.use_bert:
            assert self.conf.lang in Bert.supported_langs, self.conf.lang
            self.bert = Bert(self.conf.bert_model_name, device=self.device)
        else:
            self.bert = None
        self.data = load_dataset(conf, conf.lang, bert=self.bert)
        _data = [self.data]
        for d in _data:
            self.log.info(
                f"{len(d.train_loader)} batches | bs {conf.batch_size}")
        self.model = self.get_model()
        self.optimizer = get_optim(conf, self.model)
        optimum = "min"
        if conf.lr_scheduler == "plateau":
            self.lr_scheduler = ReduceLROnPlateau(self.optimizer,
                                                  factor=0.1,
                                                  patience=2,
                                                  mode=optimum,
                                                  verbose=True)
        elif conf.lr_scheduler:
            raise ValueError("Unknown lr_scheduler: " + conf.lr_scheduler)
        self.losses = LossTrackers.from_names("loss", log=self.log)
        if (self.main_lang_data.tag == "ner"
                or self.conf.dataset.startswith("sr3de")):
            if self.data.is_multilingual:
                self.sentence_texts = {
                    split_name: self.main_lang_data.token_texts(split_name)
                    for split_name in ["dev", "test"]
                }
                self.conll_score = {
                    lang: ConllScore(tag_enc=self.main_lang_data.tag_enc)
                    for lang in self.data.dev
                }
                self.score = {
                    lang: Score("f1",
                                save_model=False,
                                log=self.log,
                                score_func=self.conll_score[lang],
                                add_mode="append")
                    for lang in self.data.dev
                }
                self.avg_score = Score("avg_f1",
                                       log=self.log,
                                       score_func="dummy",
                                       add_mode="append")
            else:
                self.sentence_texts = {
                    split_name: self.main_lang_data.token_texts(split_name)
                    [:conf.max_eval_inst]
                    for split_name in ["dev", "test"]
                }
                self.conll_score = ConllScore(
                    tag_enc=self.main_lang_data.tag_enc)
                self.score = Score("f1",
                                   log=self.log,
                                   score_func=self.conll_score,
                                   add_mode="append")
        else:
            if self.data.is_multilingual:
                self.score = {
                    lang: Score("acc", log=self.log)
                    for lang in self.data.dev
                }
                self.avg_score = Score("avg_acc",
                                       log=self.log,
                                       score_func="dummy",
                                       add_mode="append")
            else:
                self.score = Score("acc", log=self.log)
        if conf.early_stop > 0:
            score_optimum = ("max" if
                             (self.conf.dataset.startswith("wikiannmulti")
                              or self.data.is_multilingual) else
                             self.score.optimum)
            self.early_stop = EarlyStopping(
                score_optimum,
                min_delta=conf.early_stop_min_delta,
                patience=conf.early_stop)
        else:
            self.early_stop = None
        self.epoch = 0

    def get_model(self):
        ntags = self.data.tag_enc.nlabels
        nshapes = self.data.shape_enc.nlabels
        nchars = self.data.char_enc.nlabels
        bpe_emb = emb_layer(self.data.bpemb.vectors,
                            trainable=not self.conf.emb_fixed,
                            use_weights=not self.conf.emb_random_init)
        if self.conf.use_fasttext:
            fasttext_file = self.conf.fasttext_emb_file.format(
                dataset=self.conf.dataset, lang=self.data.lang)
            fasttext_emb = emb_layer(load_word2vec_file(fasttext_file,
                                                        add_unk=True),
                                     trainable=not self.conf.emb_fixed,
                                     use_weights=not self.conf.emb_random_init)
        else:
            fasttext_emb = None
        model = SequenceTagger(
            bpe_emb,
            ntags,
            self.conf,
            nchars=nchars,
            nshapes=nshapes,
            fasttext_emb=fasttext_emb,
            bert=self.bert,
            tag_enc=self.main_lang_data.tag_enc,
        ).to(self.device)
        self.log.info(f'model repr dim: {model.repr_dim}')
        if self.conf.model_file:
            self.log.info(f"loading model {self.conf.model_file}")
            model.load_state_dict(torch.load(self.conf.model_file))
            self.log.info(f"loaded model {self.conf.model_file}")
        return model

    def train(self, train_epoch, do_eval, do_test=None, eval_ds_name=None):
        try:
            for epoch in range(1, self.conf.max_epochs + 1):
                self.epoch = epoch
                self.model.train()
                train_epoch(epoch=epoch)
                self.losses.interval_end_log(epoch, ds_name="train")
                burnin_done = epoch >= self.conf.first_eval_epoch
                if burnin_done and not epoch % self.conf.eval_every:
                    score = self.do_eval(do_eval,
                                         epoch=epoch,
                                         eval_ds_name=eval_ds_name)
                    if do_test:
                        self.do_eval(do_test, epoch=epoch, eval_ds_name="test")
                    if score is not None and self.early_stop:
                        if self.early_stop.step(score):
                            if epoch >= self.conf.min_epochs:
                                patience = self.early_stop.patience
                                self.log.info(
                                    f"Early stop after {patience} steps")
                                break
        except KeyboardInterrupt:
            self.log.info("Stopping training due to keyboard interrupt")

    def do_eval(self, eval_func, epoch=None, eval_ds_name=None):
        self.model.eval()
        eval_func(epoch=epoch)
        self.log_eval(ds_name=eval_ds_name, epoch=epoch)
        if self.data.is_multilingual:
            return self.avg_score.current
        return self.score.current

    def log_eval(self, ds_name=None, epoch=None):
        self.losses.interval_end(ds_name=ds_name)
        if self.data.is_multilingual:
            for lang in getattr(self.data, ds_name):
                if hasattr(self, "conll_score"):
                    self.conll_score[lang].sentences = \
                        self.sentence_texts[ds_name][lang]
                    fname = f"{epoch}.{ds_name}.{lang}.conll"
                    self.conll_score[lang].outfile = self.rundir / fname
                self.score[lang].update()
            avg_score = np.average(
                [score.current for score in self.score.values()])
            self.avg_score.update_log(model=self.model,
                                      rundir=self.rundir,
                                      epoch=epoch,
                                      score=avg_score)
        else:
            if hasattr(self, "conll_score"):
                self.conll_score.sentences = self.sentence_texts[ds_name]
                fname = f"{epoch}.{ds_name}.conll"
                self.conll_score.outfile = self.rundir / fname
            self.score.update_log(self.model, self.rundir, epoch)

    def save_model(self):
        model_file = self.rundir / f"model.e{self.epoch}.pt"
        save_model(self.model, model_file, self.log)

    @property
    def main_lang_data(self):
        return self.data[0] if isinstance(self.data, list) else self.data

    @property
    def batch_iter_train_multilang(self):
        main_lang_len = len(self.data[0].train_loader)
        max_sim_lang_len = int(self.conf.sim_lang_ratio * main_lang_len)

        def get_sim_lang_len(i):
            sim_lang_len = len(self.data[i].train_loader)
            return min(sim_lang_len, max_sim_lang_len)

        lang_idxs = [
            i for i, data in enumerate(self.data)
            for _ in range(main_lang_len if i == 0 else get_sim_lang_len(i))
        ]
        random.shuffle(lang_idxs)
        iters = [data.batch_iter_train for data in self.data]
        return ((i, next(iters[i])) for i in lang_idxs)
Example #9
def hp_search_optuna(trial: optuna.Trial):

    global gargs
    args = gargs
    # set config
    config = load_config(args)
    config['args'] = args
    logger.info("%s", config)

    # set path
    set_path(config)

    # create accelerator
    accelerator = Accelerator()
    config['accelerator'] = accelerator
    args.device = accelerator.device

    # set search spaces
    lr = trial.suggest_float('lr', 1e-5, 1e-3, log=True)
    bsz = trial.suggest_categorical('batch_size', [8, 16, 32, 64])
    seed = trial.suggest_int('seed', 17, 42)
    epochs = trial.suggest_int('epochs', 1, args.epoch)

    # prepare train, valid dataset
    train_loader, valid_loader = prepare_datasets(config, hp_search_bsz=bsz)

    with temp_seed(seed):
        # prepare model
        model = prepare_model(config)

        # create optimizer, scheduler, summary writer
        model, optimizer, scheduler, writer = prepare_others(config,
                                                             model,
                                                             train_loader,
                                                             lr=lr)
        # create secondary optimizer, scheduler
        _, optimizer_2nd, scheduler_2nd, _ = prepare_others(
            config, model, train_loader, lr=args.bert_lr_during_freezing)
        train_loader = accelerator.prepare(train_loader)
        valid_loader = accelerator.prepare(valid_loader)

        config['optimizer'] = optimizer
        config['scheduler'] = scheduler
        config['optimizer_2nd'] = optimizer_2nd
        config['scheduler_2nd'] = scheduler_2nd
        config['writer'] = writer

        total_batch_size = args.batch_size * accelerator.num_processes * args.gradient_accumulation_steps
        logger.info("***** Running training *****")
        logger.info(f"  Num examples = {len(train_loader)}")
        logger.info(f"  Num Epochs = {args.epoch}")
        logger.info(
            f"  Instantaneous batch size per device = {args.batch_size}")
        logger.info(
            f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
        )
        logger.info(
            f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}"
        )
        logger.info(f"  Total optimization steps = {args.max_train_steps}")

        early_stopping = EarlyStopping(logger,
                                       patience=args.patience,
                                       measure='f1',
                                       verbose=1)
        best_eval_f1 = -float('inf')
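        # each epoch's validation F1 drives early stopping and is reported to Optuna for pruning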
        for epoch in range(epochs):
            eval_loss, eval_f1, best_eval_f1 = train_epoch(
                model, config, train_loader, valid_loader, epoch, best_eval_f1)

            # early stopping
            if early_stopping.validate(eval_f1, measure='f1'): break
            if eval_f1 == best_eval_f1:
                early_stopping.reset(best_eval_f1)
            early_stopping.status()

            trial.report(eval_f1, epoch)
            if trial.should_prune():
                raise optuna.TrialPruned()
        return eval_f1
Example #10
def train(args):

    # set etc
    torch.autograd.set_detect_anomaly(False)

    # set config
    config = load_config(args)
    config['args'] = args
    logger.info("%s", config)

    # set path
    set_path(config)

    # create accelerator
    accelerator = Accelerator()
    config['accelerator'] = accelerator
    args.device = accelerator.device

    # prepare train, valid dataset
    train_loader, valid_loader = prepare_datasets(config)

    with temp_seed(args.seed):
        # prepare model
        model = prepare_model(config)

        # create optimizer, scheduler, summary writer
        model, optimizer, scheduler, writer = prepare_others(
            config, model, train_loader)
        # create secondary optimizer, scheduler
        _, optimizer_2nd, scheduler_2nd, _ = prepare_others(
            config, model, train_loader, lr=args.bert_lr_during_freezing)
        train_loader = accelerator.prepare(train_loader)
        valid_loader = accelerator.prepare(valid_loader)

        config['optimizer'] = optimizer
        config['scheduler'] = scheduler
        config['optimizer_2nd'] = optimizer_2nd
        config['scheduler_2nd'] = scheduler_2nd
        config['writer'] = writer

        total_batch_size = args.batch_size * accelerator.num_processes * args.gradient_accumulation_steps
        logger.info("***** Running training *****")
        logger.info(f"  Num examples = {len(train_loader)}")
        logger.info(f"  Num Epochs = {args.epoch}")
        logger.info(
            f"  Instantaneous batch size per device = {args.batch_size}")
        logger.info(
            f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
        )
        logger.info(
            f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}"
        )
        logger.info(f"  Total optimization steps = {args.max_train_steps}")

        # training
        early_stopping = EarlyStopping(logger,
                                       patience=args.patience,
                                       measure='f1',
                                       verbose=1)
        local_worse_epoch = 0
        best_eval_f1 = -float('inf')
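        # train until F1-based early stopping fires or args.epoch epochs complete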
        for epoch_i in range(args.epoch):
            epoch_st_time = time.time()
            eval_loss, eval_f1, best_eval_f1 = train_epoch(
                model, config, train_loader, valid_loader, epoch_i,
                best_eval_f1)
            # early stopping
            if early_stopping.validate(eval_f1, measure='f1'): break
            if eval_f1 == best_eval_f1:
                early_stopping.reset(best_eval_f1)
            early_stopping.status()
Example #11
)

eval_loader = data.DataLoader(
    dataset=DataFolder('dataset/eval_images_256/', 'dataset/eval_masks_256/', 'evaluate'),
    batch_size=args.eval_batch_size,
    shuffle=False,
    num_workers=2
)

model = UNet(1, shrink=1).cuda()
nets = [model]
params = [{'params': net.parameters()} for net in nets]
solver = optim.Adam(params, lr=args.lr)

criterion = nn.CrossEntropyLoss()
es = EarlyStopping(min_delta=args.min_delta, patience=args.patience)

for epoch in range(1, args.epochs+1):

    train_loss = []
    valid_loss = []

    for batch_idx, (img, mask, _) in enumerate(train_loader):

        solver.zero_grad()

        img = img.cuda()
        mask = mask.cuda()

        pred = model(img)
        loss = criterion(pred, mask)
Example #12
# list the trainable parameters
print("Trainable Parameter lists:")
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name)


loss_func = nn.BCELoss()
learning_rate = 2e-3
# learning_rate = 1e-5

optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
# optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.5)

# scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[20, 50], gamma=0.1)
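# early stopping: patience of 10 epochs, checkpoints presumably written to model_save_path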
early_stopping = EarlyStopping(patience=10, verbose=False, path=model_save_path)

num_epoches = 100
print_every = 40
k_value = 5


print("Starting Training...")
for epoch in range(num_epoches):
    # scheduler.step()
    total = 0
    count = 0
    avg_loss = 0
    train_count = 0
    train_total = 0
    step = 0
Example #13
                                         patience=5,
                                         verbose=1,
                                         epsilon=1e-8,
                                         cooldown=0,
                                         min_lr=0,
                                         eps=1e-8)
    train_monitor = TrainingMonitor(file_dir='pics', arch=param.model_name)

    if param.task == 'cue':
        model_checkpoint = ModelCheckpoint(
            checkpoint_dir=f'D:/Dev/Bert/{param.model_name}',
            monitor='val_cue_f1',
            mode='max',
            arch=param.model_name)
        early_stopping = EarlyStopping(patience=param.early_stop_thres,
                                       monitor='val_cue_f1',
                                       mode='max')
        trainer = CueTrainer(n_gpu=1,
                             model=model,
                             logger=global_logger,
                             optimizer=optimizer,
                             lr_scheduler=lr_scheduler,
                             label2id=cue_label2id,
                             criterion=criterion,
                             training_monitor=train_monitor,
                             model_checkpoint=model_checkpoint,
                             resume_path=None,
                             grad_clip=5.0,
                             gradient_accumulation_steps=1,
                             early_stopping=early_stopping)
    elif param.task == 'scope':
Example #14
eval_set = ElisaDataset('elisadata/standard', 'EVALUATE')
eval_loader = data.DataLoader(dataset=eval_set,
                              batch_size=args.eval_batch_size,
                              shuffle=False,
                              num_workers=0)

elisa_net = network.ElisaNet(args.c_feat).cuda()

params = [{'params': elisa_net.parameters()}]
solver = optim.Adam(params, lr=args.lr)

lmda = lambda x: 0.5  # TODO: can change this based on bad_epochs
scheduler = LS.MultiplicativeLR(solver, lr_lambda=lmda)

es = EarlyStopping(mode=args.es_mode,
                   min_delta=args.loss_delta,
                   patience=args.patience)

epoch = 0

if args.resume_epoch != 0:
    load_weights([elisa_net], solver, args.resume_epoch, args)
    epoch = args.resume_epoch
    solver = lr_resume(solver, args.lr_resume)
    print('Loaded weights from epoch {}'.format(args.resume_epoch))

while epoch < args.epochs and not args.eval:
    epoch += 1

    train_loss, _ = forward_pass(train_loader, elisa_net, solver, scheduler,
                                 'TRAIN', epoch, args)
Example #15
    def train(self, training_data: TrainingData) -> None:
        x_train, y_train, x_val, y_val, vocab, class_to_i, i_to_class = preprocess_dataset(
            training_data,
            full_question=args.full_question,
            create_runs=args.create_runs,
            map_pattern=args.map_pattern,
            wiki_links=args.wiki_links,
            use_es_highlight=args.use_es_highlight)
        self.class_to_i = class_to_i
        self.i_to_class = i_to_class

        self.map_pattern = args.map_pattern
        self.wiki_links = args.wiki_links
        self.use_es_highlight = args.use_es_highlight
        self.full_question = args.full_question
        self.use_wiki = args.use_wiki

        log = get(__name__, "dan.log")
        log.info('Batchifying data')
        vocab = ['<unk>', '<eos>'] + sorted(vocab)
        word_to_i = {x: i for i, x in enumerate(vocab)}
        self.word_to_i = word_to_i
        log.info('Vocab len: ' + str(len(self.word_to_i)))

        train_sampler = RandomSampler(list(zip(x_train, y_train)))
        dev_sampler = RandomSampler(list(zip(x_val, y_val)))
        dev_loader = DataLoader(list(zip(x_val, y_val)),
                                batch_size=args.batch_size,
                                sampler=dev_sampler,
                                num_workers=0,
                                collate_fn=self.batchify)
        train_loader = DataLoader(list(zip(x_train, y_train)),
                                  batch_size=args.batch_size,
                                  sampler=train_sampler,
                                  num_workers=0,
                                  collate_fn=self.batchify)

        self.model = DanModel(len(i_to_class), len(vocab))
        self.model = self.model.to(self.device)

        log.info(f'Loading GloVe')
        glove_word2idx, glove_vectors = load_glove("glove/glove.6B.300d.txt")
        for word, emb_index in word_to_i.items():
            if word.lower() in glove_word2idx:
                glove_index = glove_word2idx[word.lower()]
                glove_vec = torch.FloatTensor(glove_vectors[glove_index])
                glove_vec = glove_vec.cuda()
                self.model.text_embeddings.weight.data[emb_index, :].set_(
                    glove_vec)

        log.info(f'Model:\n{self.model}')
        self.optimizer = Adam(self.model.parameters())
        self.criterion = nn.CrossEntropyLoss()
        self.scheduler = lr_scheduler.ReduceLROnPlateau(self.optimizer,
                                                        patience=5,
                                                        verbose=True,
                                                        mode='max')

        temp_prefix = get_tmp_filename()
        self.model_file = f'{temp_prefix}.pt'

        print(f'Saving model to: {self.model_file}')
        log = get(__name__)
        manager = TrainingManager([
            BaseLogger(log_func=log.info),
            TerminateOnNaN(),
            EarlyStopping(monitor='test_acc', patience=10, verbose=1),
            MaxEpochStopping(100),
            ModelCheckpoint(create_save_model(self.model),
                            self.model_file,
                            monitor='test_acc')
        ])

        log.info('Starting training')

        epoch = 0
        while True:
            self.model.train()
            train_acc, train_loss, train_time = self.run_epoch(train_loader)

            self.model.eval()
            test_acc, test_loss, test_time = self.run_epoch(dev_loader,
                                                            train=False)

            stop_training, reasons = manager.instruct(train_time, train_loss,
                                                      train_acc, test_time,
                                                      test_loss, test_acc)

            if stop_training:
                log.info(' '.join(reasons))
                break
            else:
                self.scheduler.step(test_acc)
            epoch += 1