def a2c_callback(locals, globals):
    """Training callback: checkpoint via NSML whenever the A2C 100-episode
    mean reward reaches a new best.

    The ``(locals, globals)`` parameter names shadow builtins, but they are
    fixed by the callback contract of the training loop that invokes this.
    Reads/updates the module-level ``max_mean_reward``.
    """
    global max_mean_reward, last_filename
    reward_key = 'mean_100ep_reward_a2c'
    # Guard clauses: bail out unless we have a new best after warm-up.
    if reward_key not in locals:
        return
    if locals['num_episodes'] < 10:
        return
    reward = locals[reward_key]
    if reward <= max_mean_reward:
        return
    print("mean_100ep_reward_a2c : %s max_mean_reward : %s" %
          (reward, max_mean_reward))
    max_mean_reward = reward
    nsml.save(reward)
# NOTE(review): script fragment — builds a 3-model ensemble and hands control
# to NSML. `model1`/`model2`/`model3`, `bind_model`, `config`, `path_loader`,
# `label_loader` and `DATASET_PATH` are defined elsewhere in the file.
bind_model(model3)
nsml.load(checkpoint='62', session='KHD032/Breast_Pathology/223')
alpha = 1.2  # additive bias applied to model3's positive output below
input_ = tf.keras.Input(shape=(299, 299, 3))
m1 = model1(input_)
m2 = model2(input_)
m3 = model3(input_)
# model3 appears to emit a single score; expand it to a 2-class vector and
# bias the positive class by `alpha` before summing with m1/m2 — TODO confirm
# m3's output shape against model3's definition.
m3_out = tf.keras.layers.concatenate([1 - m3, m3 + alpha])
#m3_out = tf.keras.layers.concatenate([1 - m3, m3])
out = tf.keras.layers.add([m1, m2, m3_out])
#out = tf.keras.layers.add([m1, m2, m3])
model = tf.keras.models.Model(inputs=input_, outputs=out)
bind_model(model)
nsml.save('ensemble3')
if config.pause:  ## only reached in test (submit) mode
    print('Inferring Start...')
    nsml.paused(scope=locals())
#######################################
if config.mode == 'train':  ### only reached in training mode
    print('Training Start...')
    ############ DONOTCHANGE: Path loader ###############
    root_path = os.path.join(DATASET_PATH, 'train')
    image_keys, image_path = path_loader(root_path)
    labels = label_loader(root_path, image_keys)
    ##############################################
    #input_ = tf.keras.Input(shape=(32, 32, 3))
# NOTE(review): training-loop fragment — `batches`, `train_step`, `dev_step`,
# `sess`, `global_step`, `y_train`, `x_dev`/`y_dev`, `config` and `nsml`
# come from the enclosing scope (not visible here).
saveiter = 0        # NSML checkpoint counter
total_loss = 0      # running sum of per-batch train losses
iter = 0            # NOTE(review): shadows the builtin `iter`
epochPerbatchnum = len(y_train) // config.batch_size + 1  # batches per epoch
totalbatchnum = epochPerbatchnum * config.num_epochs
minloss = 10000
minloss_step = 0
for batch in batches:
    iter += 1
    x_batch, y_batch = zip(*batch)
    loss = train_step(x_batch, y_batch)
    current_step = tf.train.global_step(sess, global_step)
    total_loss += loss
    # DONOTCHANGE (You can decide how often you want to save the model)
    if current_step % config.checkpoint_every == 0:
        nsml.save(saveiter)
        devtime = time.time()
        dev_step(x_dev, y_dev, writer=None)
        print('{}'.format('#' * 30))
        print('epoch num = {} / {}'.format(iter // epochPerbatchnum,
                                           config.num_epochs))
        print('saveiter {}\tbatch = {}/{} train loss = {:.4f}'.format(
            saveiter, iter, totalbatchnum, total_loss / iter))
        # ('validattion' typo kept as-is: it is a runtime string)
        print('validattion time : {:.2f}\tval_data length : {}'.format(
            time.time() - devtime, len(y_dev)))
        print('{}'.format('#' * 30))
        print('saveiter %d: loss = %.4f' % (saveiter, total_loss / iter))
        saveiter += 1
# NOTE(review): fragment — the first line below closes a
# `parser.add_argument(...)` call whose opening is outside this chunk.
                    type=str, default="False,False")
parser.add_argument('--loss_types', type=str,
                    default="cross_entropy,cross_entropy,cross_entropy")
parser.add_argument('--nsml_checkpoints', type=str, default="4,5,6_1181")
parser.add_argument('--nsml_sessionss', type=str,
                    default="99,385,408")  # team_13/airush1/
parser.add_argument('--model_weights', type=str, default="0.34,0.33,0.33")
parser.add_argument('--models', type=str,
                    default="Resnet152,Resnet152,Resnet152"
                    )  # Resnet18, Resnet152, efficientnet-b7, baseline
args = parser.parse_args()
print(args)
torch.manual_seed(args.seed)
device = args.device
model = Resnet18(args.output_size)
model = model.to(device)
# DONOTCHANGE: They are reserved for nsml
bind_model(model, args)
if args.pause:
    nsml.paused(scope=locals())
if args.mode == "train":
    model.train()
    nsml.save("ensemble_session")
# Register the NSML save/load/infer hooks defined elsewhere in this file.
nsml.bind(save=save, load=load, infer=infer)

if __name__ == '__main__':
    args = argparse.ArgumentParser()
    # hyperparameters
    args.add_argument('--epochs', type=int, default=100)
    args.add_argument('--batch_size', type=int, default=15)
    # DONOTCHANGE: They are reserved for nsml
    # (help texts below are runtime strings and kept verbatim)
    args.add_argument('--mode', type=str, default='train',
                      help='submit일때 해당값이 test로 설정됩니다.')
    args.add_argument('--iteration', type=str, default='0', help='')
    args.add_argument('--pause', type=int, default=0,
                      help='model 을 load 할때 1로 설정됩니다.')
    config = args.parse_args()
    model = model_fn()
    bind_model(model)
    if config.pause:
        # Submission/test mode: hand control to NSML and wait.
        nsml.paused(scope=locals())
    if config.mode == 'train':
        nsml.save(0)
def load_and_cache_examples(model, args, tokenizer, evaluate=False, output_examples=False, val_or_test="val", is_pretrain=False, qa_style=False):
    """Load SQuAD/KorQuAD examples and features, either from an NSML session
    cache or by parsing the raw dataset, optionally applying QA-pair mixing.

    Args:
        model, tokenizer: bound into NSML so checkpoints stay loadable.
        evaluate: True for the dev/eval split, False for train.
        output_examples: when True, also return examples and features.
        val_or_test: "val" enables cache load/save; any other value forces
            the test-data path.
        is_pretrain / qa_style: select the pretraining cache/session and
            example reader variants.

    Returns:
        dataset, or (dataset, examples, features) if output_examples.
    """
    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training process the
        # dataset, and the others will use the cache.
        torch.distributed.barrier()
    cached_features_file = "cache_{}".format("dev" if evaluate else "train")
    # confirm mixing should be applied (train-time pretraining on "val" only)
    do_mix = (args.mix_qa and not evaluate) and (is_pretrain and val_or_test == "val")
    # load from cache if it is possible
    if val_or_test=="val" and args.load_cache:
        cached_session = args.cached_session_dev if evaluate else args.cached_session_train
        if is_pretrain:
            cached_session = args.cached_session_pretrain
            if qa_style:
                cached_session = args.cached_session_pretrain_qa
        logger.info("Loading features from cached file %s in %s", cached_features_file, cached_session)
        features_and_datasets = {}

        def load_data(dir_name):
            # NSML load hook: pull the cached dict into our local dict.
            tmp = torch.load(os.path.join(dir_name, '{}.pt'.format(cached_features_file)))
            print(tmp.keys())
            nsml.copy(tmp, features_and_datasets)

        nsml.bind(load=load_data)
        nsml.load(checkpoint=cached_features_file, session=cached_session)
        # Re-bind the normal model hooks after the temporary cache hook.
        bind_nsml(model, tokenizer, args)
        print(features_and_datasets.keys())
        features, dataset, examples = (
            features_and_datasets["features"],
            features_and_datasets["dataset"],
            features_and_datasets["examples"],
        )
    else:
        logger.info("Creating features from dataset file at %s", cached_features_file)
        if not args.data_dir and ((evaluate and not args.predict_file) or (not evaluate and not args.train_file)):
            try:
                import tensorflow_datasets as tfds
            except ImportError:
                raise ImportError("If not data_dir is specified, tensorflow_datasets needs to be installed.")
            if args.version_2_with_negative:
                # NOTE(review): logger.warn is deprecated — logger.warning
                # is the supported spelling; confirm before changing.
                logger.warn("tensorflow_datasets does not handle version 2 of SQuAD.")
            tfds_examples = tfds.load("squad")
            examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
        else:
            processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()
            if evaluate:
                filename = args.predict_file if val_or_test == "val" else "test_data/korquad_open_test.json"
                examples = processor.get_eval_examples(args.data_dir, filename=filename)
            else:
                if is_pretrain:
                    examples = processor.get_pretrain_examples(args.data_dir, filename=args.train_file, qa_style=qa_style)
                else:
                    examples = processor.get_train_examples(args.data_dir, filename=args.train_file)
    # apply mixing: swap contexts between random example pairs; pairs with
    # different answers become unanswerable.
    # NOTE(review): placement at function level (so cache-loaded examples are
    # mixed too) is inferred from the `do_mix or not load_cache` re-featurize
    # condition below — confirm against the original file's indentation.
    if do_mix:
        num_qa = len(examples)
        mix_batch_size = int(args.mix_portion * num_qa)
        if mix_batch_size % 2 == 1:
            mix_batch_size -= 1
        mix_batch = np.array(random.sample(range(num_qa), mix_batch_size)).reshape(-1, 2)
        for i, (k,v) in enumerate(mix_batch):
            example_k, example_v = examples[k], examples[v]
            ans_k, ans_v = example_k.answer_text, example_v.answer_text
            example_k.context_text, example_v.context_text = example_v.context_text, example_k.context_text
            assert not (example_k.is_impossible or example_v.is_impossible)
            if ans_k != ans_v:
                example_k.is_impossible, example_v.is_impossible = True, True
                example_k.start_position_character, example_v.start_position_character = None, None
            else:
                # NOTE(review): this swaps k.start with v.start but v.end with
                # k.end — looks asymmetric; verify the intended pairing.
                example_k.start_position, example_v.end_position = example_v.start_position, example_k.end_position
    # (Re-)build features unless they came straight from the cache unmixed.
    if do_mix or not (val_or_test=="val" and args.load_cache):
        print("Starting squad_convert_examples_to_features")
        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
            return_dataset="pt",
            threads=args.threads,
        )
        print("Complete squad_convert_examples_to_features")
        # make cache in the session if it is required
        if val_or_test=="val" and args.save_cache:
            features_and_datasets = {"dataset": dataset, "examples": examples, "features": features}

            def save_data(dir_name):
                # NSML save hook: persist the full feature bundle.
                os.makedirs(dir_name, exist_ok=True)
                torch.save(features_and_datasets, os.path.join(dir_name, '{}.pt'.format(cached_features_file)))
                logger.info("Save data at {}".format(dir_name))

            nsml.bind(save=save_data)
            nsml.save(cached_features_file)
            bind_nsml(model, tokenizer, args)
    if args.local_rank == 0 and not evaluate:
        # Make sure only the first process in distributed training process the
        # dataset, and the others will use the cache.
        torch.distributed.barrier()
    if output_examples:
        return dataset, examples, features
    return dataset
# ... # return the results return randint(0, 9) def evaluate(): pass def decode(input): print("decode:", input) # function in function is just used to divide the namespace. nsml.bind(save, load, infer, evaluate, decode) # Need to remember some state nsml.save(10) #################### # nsml leaderboard # ======================= # # nsml provides a handy leaderboard functionality. This example shows how to use # it. import os import numpy as np # 1. Push data # $ nsml dataset push diabetes data/ # 2. Run this # $ nsml run 07-data_read.py -i -d diabetes
def on_epoch_end(self, epoch, logs=None):
    """Keras callback hook: print training accuracy and checkpoint the epoch.

    `logs` is the metrics dictionary Keras supplies at the end of each epoch.
    """
    train_acc = logs['acc']
    print(f"epoch: {epoch}, train_acc: {train_acc}")
    nsml.save(str(epoch))
def mixed_train(model, criterion, optimizer, scheduler, train_loader, val_loader, device):
    """Run one fine-tune epoch followed by one full-tune epoch, validating
    and checkpointing ('finetune' / 'fulltune') after each phase.

    Args:
        model: classifier producing 4-class logits (normal/monotone/
            screenshot/unknown).
        criterion, optimizer, scheduler: training components; CosineAnnealingLR
            schedulers are stepped per batch.
        train_loader, val_loader: DataLoaders over (data, label) pairs.
        device: torch device for tensors.

    Side effects: reports metrics via nsml.report and saves two checkpoints.
    """
    print('Train: Fine Tune')
    model.train()
    # Fine Tuning Step - Train
    train_loss = 0
    val_loss = 0
    train_acc = 0
    val_acc = 0
    i = 0
    epoch = 0
    for data, label in train_loader:
        i += 1
        data, label = data.to(device), label.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, label)
        loss.sum().backward()
        optimizer.step()
        if isinstance(scheduler, torch.optim.lr_scheduler.CosineAnnealingLR):
            scheduler.step()
        # Accumulate loss/accuracy weighted by batch size for dataset means.
        train_loss += loss.item() * data.size(0)
        _, pred = torch.max(output, dim=1)
        correct_tensor = pred.eq(label.data.view_as(pred))
        accuracy = torch.mean(correct_tensor.type(torch.FloatTensor))
        train_acc += accuracy.item() * data.size(0)
        if i % 10 == 0:
            print('Epoch: {}\t{:.2f}% complete.'.format(
                epoch, 100 * (i + 1) / len(train_loader)))
    # Fine Tuning Step - Validation
    with torch.no_grad():
        model.eval()
        preds = []
        trues = []
        for data, label in val_loader:
            data, label = data.to(device), label.to(device)
            output = model(data)
            loss = criterion(output, label)
            val_loss += loss.item() * data.size(0)
            _, pred = torch.max(output, dim=1)
            correct_tensor = pred.eq(label.data.view_as(pred))
            accuracy = torch.mean(correct_tensor.type(torch.FloatTensor))
            val_acc += accuracy.item() * data.size(0)
            preds.append(pred.detach().cpu())
            trues.append(label.detach().cpu())
        preds = np.concatenate(preds)
        trues = np.concatenate(trues)
        cr = classification_report(
            trues,
            preds,
            labels=[0, 1, 2, 3],
            target_names=['normal', 'monotone', 'screenshot', 'unknown'],
            output_dict=True,
            zero_division=0)
        # Geometric mean of the three "abnormal" per-class F1 scores.
        val_abnormal_f1_score = (cr['monotone']['f1-score'] *
                                 cr['screenshot']['f1-score'] *
                                 cr['unknown']['f1-score'])**(1 / 3)
        train_loss = train_loss / len(train_loader.dataset)
        val_loss = val_loss / len(val_loader.dataset)
        train_acc = train_acc / len(train_loader.dataset)
        val_acc = val_acc / len(val_loader.dataset)
        nsml.report(summary=True,
                    scope=locals(),
                    step=epoch,
                    normal=cr['normal']['f1-score'],
                    monotone=cr['monotone']['f1-score'],
                    screenshot=cr['screenshot']['f1-score'],
                    unknown=cr['unknown']['f1-score'],
                    abnormal=val_abnormal_f1_score)
        print('\nEpoch: {}\tTraining Loss: {:.4f}\tValidation Loss: {:.4f}'.
              format(epoch, train_loss, val_loss))
        print('\t\tTraining Accuracy: {:.2f}%\tValidation Accuracy: {:.2f}%'.
              format(100 * train_acc, 100 * val_acc))
        print('\t\tClassification Report:{}'.format(cr))
        print('\t\tF1 Score for Abnormal Class: {}'.format(
            val_abnormal_f1_score))
        nsml.save(str('finetune'))
    print('Train: Full Tune')
    # Full Tuning Step - train
    # BUG FIX: original did `required_grad = True`, which only bound a
    # throwaway local and never unfroze anything; unfreeze each parameter.
    for p in model.parameters():
        p.requires_grad = True
    if isinstance(model, torchvision.models.Inception3):
        model.aux_logits = True
    model.train()
    train_loss = 0
    val_loss = 0
    train_acc = 0
    val_acc = 0
    i = 0
    epoch = 1
    for data, label in train_loader:
        i += 1
        data, label = data.to(device), label.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, label)
        loss.sum().backward()
        optimizer.step()
        if isinstance(scheduler, torch.optim.lr_scheduler.CosineAnnealingLR):
            scheduler.step()
        train_loss += loss.item() * data.size(0)
        _, pred = torch.max(output, dim=1)
        correct_tensor = pred.eq(label.data.view_as(pred))
        accuracy = torch.mean(correct_tensor.type(torch.FloatTensor))
        train_acc += accuracy.item() * data.size(0)
        if i % 10 == 0:
            print('Epoch: {}\t{:.2f}% complete.'.format(
                epoch, 100 * (i + 1) / len(train_loader)))
    # Full Tuning Step - Validation
    with torch.no_grad():
        model.eval()
        preds = []
        trues = []
        for data, label in val_loader:
            data, label = data.to(device), label.to(device)
            output = model(data)
            loss = criterion(output, label)
            val_loss += loss.item() * data.size(0)
            _, pred = torch.max(output, dim=1)
            correct_tensor = pred.eq(label.data.view_as(pred))
            accuracy = torch.mean(correct_tensor.type(torch.FloatTensor))
            val_acc += accuracy.item() * data.size(0)
            preds.append(pred.detach().cpu())
            trues.append(label.detach().cpu())
        preds = np.concatenate(preds)
        trues = np.concatenate(trues)
        cr = classification_report(
            trues,
            preds,
            labels=[0, 1, 2, 3],
            target_names=['normal', 'monotone', 'screenshot', 'unknown'],
            output_dict=True,
            zero_division=0)
        val_abnormal_f1_score = (cr['monotone']['f1-score'] *
                                 cr['screenshot']['f1-score'] *
                                 cr['unknown']['f1-score'])**(1 / 3)
        train_loss = train_loss / len(train_loader.dataset)
        val_loss = val_loss / len(val_loader.dataset)
        train_acc = train_acc / len(train_loader.dataset)
        val_acc = val_acc / len(val_loader.dataset)
        nsml.report(summary=True,
                    scope=locals(),
                    step=epoch,
                    normal=cr['normal']['f1-score'],
                    monotone=cr['monotone']['f1-score'],
                    screenshot=cr['screenshot']['f1-score'],
                    unknown=cr['unknown']['f1-score'],
                    abnormal=val_abnormal_f1_score)
        print('\nEpoch: {}\tTraining Loss: {:.4f}\tValidation Loss: {:.4f}'.
              format(epoch, train_loss, val_loss))
        print('\t\tTraining Accuracy: {:.2f}%\tValidation Accuracy: {:.2f}%'.
              format(100 * train_acc, 100 * val_acc))
        print('\t\tClassification Report:{}'.format(cr))
        print('\t\tF1 Score for Abnormal Class: {}'.format(
            val_abnormal_f1_score))
        nsml.save(str('fulltune'))
    return
def fit(self, epochs_finetune, epochs_full, batch_size, debug=False):
    """Two-stage training: fine-tune with the backbone frozen, then unfreeze
    and train the full network, checkpointing via NSML.

    Args:
        epochs_finetune: epochs for the head-only (frozen) stage.
        epochs_full: epochs for the fully-unfrozen stage.
        batch_size: generator batch size for both stages.
        debug: when True, run only 2 steps per epoch for a quick smoke test.
    """
    self.debug = debug
    self.data.prepare()
    #self.network = multi_gpu_model(self.network, gpus=2)
    self.network.compile(
        loss=self.loss(),
        optimizer=self.optimizer('finetune'),
        metrics=self.fit_metrics()
    )
    steps_per_epoch_train = int(self.data.len('train') / batch_size) if not self.debug else 2
    model_path_finetune = 'model_finetuned.h5'
    train_gen, val_gen = self.data.train_val_gen(batch_size)
    # Presumably saves an initial checkpoint so 'best' always exists — confirm.
    nsml.save(checkpoint='best')
    #class_weights = class_weight.compute_class_weight(
    #    'balanced',
    #    np.unique(train_gen.classes),
    #    train_gen.classes)
    #class_weights = [1, 95.4, 48.24, 13.46]
    # Stage 1: last-layer tuning with the backbone frozen.
    self.network.fit_generator(generator=train_gen,
                               steps_per_epoch=steps_per_epoch_train,
                               epochs=epochs_finetune,
                               callbacks=self.callbacks(
                                   model_path=model_path_finetune,
                                   model_prefix='last_layer_tuning',
                                   patience=5,
                                   val_gen=val_gen,
                                   classes=self.data.classes),
                               validation_data=val_gen,
                               use_multiprocessing=True,
                               workers=20,
                               #class_weight=class_weights
                               )  # TODO change to be dependent on n_cpus
    # Restore the best fine-tuned weights before unfreezing.
    self.network.load_weights(model_path_finetune)
    self.unfreeze()
    self.network.compile(
        loss=self.loss(),
        optimizer=self.optimizer('full'),
        metrics=self.fit_metrics()
    )
    model_path_full = 'model_full.h5'
    # Stage 2: train the whole network with the 'full' optimizer settings.
    self.network.fit_generator(generator=train_gen,
                               steps_per_epoch=steps_per_epoch_train,
                               epochs=epochs_full,
                               callbacks=self.callbacks(
                                   model_path=model_path_full,
                                   model_prefix='full_tuning',
                                   val_gen=val_gen,
                                   patience=10,
                                   classes=self.data.classes),
                               validation_data=val_gen,
                               use_multiprocessing=True,
                               workers=20,
                               #class_weight=class_weights
                               )
    self.network.load_weights(model_path_full)
    nsml.save(checkpoint='best')
    print('Done')
    self.metrics(gen=val_gen)
def main():
    """Entry point for the speech-recognition (Transformer) trainer.

    Parses CLI args, builds vocab maps (char- or word-level), constructs the
    encoder/decoder Transformer, binds it to NSML, then runs the training
    loop with periodic evaluation and checkpointing.
    """
    # Vocabulary state shared with the data-loading / decoding helpers.
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token
    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='batch size in training (default: 32)')
    parser.add_argument(
        '--workers', type=int, default=4,
        help='number of workers in dataset loader (default: 4)')
    parser.add_argument('--max_epochs', type=int, default=10,
                        help='number of max epochs in training (default: 10)')
    parser.add_argument('--lr', type=float, default=0.0001,
                        help='learning rate (default: 0.0001)')
    parser.add_argument('--teacher_forcing', type=float, default=0.5,
                        help='teacher forcing ratio in decoder (default: 0.5)')
    parser.add_argument('--max_len', type=int, default=WORD_MAXLEN,
                        help='maximum characters of sentence (default: 80)')
    parser.add_argument('--no_cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1,
                        help='random seed (default: 1)')
    parser.add_argument('--save_name', type=str, default='model',
                        help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument("--pause", type=int, default=0)
    parser.add_argument(
        '--word', action='store_true',
        help='Train/Predict model using word based label (default: False)')
    parser.add_argument('--gen_label_index', action='store_true',
                        help='Generate word label index map(default: False)')
    parser.add_argument('--iteration', type=str, help='Iteratiom')
    parser.add_argument('--premodel_session', type=str,
                        help='Session name of premodel')
    # transformer model parameter
    parser.add_argument('--d_model', type=int, default=128,
                        help='transformer_d_model')
    parser.add_argument('--n_head', type=int, default=8,
                        help='transformer_n_head')
    parser.add_argument('--num_encoder_layers', type=int, default=4,
                        help='num_encoder_layers')
    parser.add_argument('--num_decoder_layers', type=int, default=4,
                        help='transformer_num_decoder_layers')
    parser.add_argument('--dim_feedforward', type=int, default=2048,
                        help='transformer_d_model')
    parser.add_argument('--dropout', type=float, default=0.1,
                        help='transformer_dropout')
    # transformer warmup parameter
    parser.add_argument('--warmup_multiplier', type=int, default=3,
                        help='transformer_warmup_multiplier')
    parser.add_argument('--warmup_epoch', type=int, default=10,
                        help='transformer_warmup_epoch')
    args = parser.parse_args()
    char_loader = CharLabelLoader()
    char_loader.load_char2index('./hackathon.labels')
    label_loader = char_loader
    if args.word:
        if args.gen_label_index:
            generate_word_label_index_file(char_loader, TRAIN_LABEL_CHAR_PATH)
            from subprocess import call
            call(f'cat {TRAIN_LABEL_CHAR_PATH}', shell=True)
        # (original comment here was mojibake and unrecoverable)
        word_loader = CharLabelLoader()
        word_loader.load_char2index('./hackathon.pos.labels')
        label_loader = word_loader
        if os.path.exists(TRAIN_LABEL_CHAR_PATH):
            generate_word_label_file(char_loader, word_loader,
                                     TRAIN_LABEL_POS_PATH,
                                     TRAIN_LABEL_CHAR_PATH)
    char2index = label_loader.char2index
    index2char = label_loader.index2char
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')
    ############ model
    print("model: transformer")
    # model = Transformer(d_model= args.d_model, n_head= args.n_head, num_encoder_layers= args.num_encoder_layers, num_decoder_layers= args.num_decoder_layers,
    #                     dim_feedforward= args.dim_feedforward, dropout= args.dropout, vocab_size= len(char2index), sound_maxlen= SOUND_MAXLEN, word_maxlen= WORD_MAXLEN)
    # NOTE(review): encoder/decoder sizes are hard-coded here and ignore the
    # --d_model/--n_head/... CLI args above — confirm whether intentional.
    encoder = Encoder(d_input=128, n_layers=6, n_head=4, d_k=128, d_v=128,
                      d_model=128, d_inner=2048, dropout=0.1,
                      pe_maxlen=SOUND_MAXLEN)
    decoder = Decoder(sos_id=SOS_token, eos_id=EOS_token,
                      n_tgt_vocab=len(char2index), d_word_vec=128, n_layers=6,
                      n_head=4, d_k=128, d_v=128, d_model=128, d_inner=2048,
                      dropout=0.1, tgt_emb_prj_weight_sharing=True,
                      pe_maxlen=SOUND_MAXLEN)
    model = Transformer(encoder, decoder)
    optimizer = TransformerOptimizer(
        torch.optim.Adam(model.parameters(), lr=0.0004, betas=(0.9, 0.98),
                         eps=1e-09))
    ############/
    for param in model.parameters():
        param.data.uniform_(-0.08, 0.08)
    model = nn.DataParallel(model).to(device)
    """
    optimizer = optim.Adam(model.module.parameters(), lr=args.lr)
    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.max_epochs)
    scheduler_warmup = GradualWarmupScheduler(optimizer, multiplier=args.warmup_multiplier, total_epoch=args.warmup_epoch, after_scheduler=scheduler_cosine)
    criterion = nn.CrossEntropyLoss(reduction='sum', ignore_index=PAD_token).to(device)
    """
    bind_model(model, optimizer)
    if args.pause == 1:
        nsml.paused(scope=locals())
    if args.mode != "train":
        return
    data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')
    wav_paths = list()
    script_paths = list()
    with open(data_list, 'r') as f:
        for line in f:
            # line: "aaa.wav,aaa.label"
            wav_path, script_path = line.strip().split(',')
            wav_paths.append(os.path.join(DATASET_PATH, 'train_data',
                                          wav_path))
            script_paths.append(
                os.path.join(DATASET_PATH, 'train_data', script_path))
    best_loss = 1e10
    begin_epoch = 0
    # load all target scripts for reducing disk i/o
    # target_path = os.path.join(DATASET_PATH, 'train_label')
    target_path = TRAIN_LABEL_CHAR_PATH
    if args.word:
        target_path = TRAIN_LABEL_POS_PATH
    load_targets(target_path)
    train_batch_num, train_dataset_list, valid_dataset = split_dataset(
        args, wav_paths, script_paths, valid_ratio=0.05)
    # Optionally resume from an earlier NSML checkpoint / session.
    if args.iteration:
        if args.premodel_session:
            nsml.load(args.iteration, session=args.premodel_session)
            logger.info(f'Load {args.premodel_session} {args.iteration}')
        else:
            nsml.load(args.iteration)
            logger.info(f'Load {args.iteration}')
    logger.info('start')
    train_begin = time.time()
    for epoch in range(begin_epoch, args.max_epochs):
        # learning rate scheduler
        train_queue = queue.Queue(args.workers * 2)
        train_loader = MultiLoader(train_dataset_list, train_queue,
                                   args.batch_size, args.workers)
        train_loader.start()
        train_loss, train_cer = train(model, train_batch_num, train_queue,
                                      optimizer, device, train_begin,
                                      args.workers, 10, args.teacher_forcing)
        logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f' %
                    (epoch, train_loss, train_cer))
        train_loader.join()
        print("~~~~~~~~~~~~")
        # Evaluate only at epoch 10 and every 10th epoch after 48.
        if epoch == 10 or (epoch > 48 and epoch % 10 == 9):
            valid_queue = queue.Queue(args.workers * 2)
            valid_loader = BaseDataLoader(valid_dataset, valid_queue,
                                          args.batch_size, 0)
            valid_loader.start()
            eval_loss, eval_cer = evaluate(model, valid_loader, valid_queue,
                                           device, args.max_len,
                                           args.batch_size)
            logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' %
                        (epoch, eval_loss, eval_cer))
            valid_loader.join()
            nsml.report(False,
                        step=epoch,
                        train_epoch__loss=train_loss,
                        train_epoch__cer=train_cer,
                        eval__loss=eval_loss,
                        eval__cer=eval_cer)
            best_model = (eval_loss < best_loss)
            nsml.save(args.save_name)
            if best_model:
                nsml.save('best')
                best_loss = eval_loss
# Train for ep in range(EP): p_Y = model(tr_X / mean10_val) loss = 0.3 * crit(p_Y[:, 0], tr_Y[:, 0]) + 0.7 * crit( p_Y[:, 1], tr_Y[:, 1]) opt.zero_grad() loss.backward() opt.step() #te_p_Y = model(data.te_X) #np_val = te_p_Y.detach().numpy().reshape(len(te_p_Y)) #te_p_Y = Variable(torch.Tensor(np.where(np_val < 0, 0, np_val))) #te_loss = crit(te_p_Y, data.te_Y) if ep % 10 == 0: nsml.save(ep) print(ep, np.sqrt( loss.data.item())) #, #np.sqrt(te_loss.data.item())) nsml.save(ep) # Save #epoch = 1 #nsml.save(epoch) # If you are using neural networks, you may want to use epoch as checkpoints # Load test (Check if load method works well) #nsml.load(epoch) # Infer test #for file in train_data_files[:10]: # data = np.load(file)
def model_Fit(model, Modelname):
    """Train `model` for `nb_epoch` epochs, rebalancing the training data each
    epoch and saving NSML checkpoints on a fixed schedule.

    Args:
        model: compiled Keras model, trained in place via fit_generator.
        Modelname: 'DenseNet121', 'DenseNet169' or 'DenseNet201'; selects the
            checkpoint-name prefix.

    Returns:
        (model, res): the trained model and the last fit_generator History.

    Raises:
        NotImplementedError: for an unsupported Modelname.
    """
    # Checkpoint-name prefix per supported model. This replaces three
    # duplicated if/elif branches; the original `else: NotImplementedError`
    # was a bare expression that never raised — fixed to raise properly.
    ckpt_prefixes = {
        'DenseNet121': 'DtNet121',
        'DenseNet169': 'DtNet169',
        'DenseNet201': 'DtNet201',
    }
    if Modelname not in ckpt_prefixes:
        raise NotImplementedError('Unsupported model name: {}'.format(Modelname))
    prefix = ckpt_prefixes[Modelname]
    t0 = time.time()
    for e in range(nb_epoch):
        t1 = time.time()
        print('')
        print(Modelname + ' Epochs : ', e + 1)
        # Fetch a class-balanced (x_train, y_train) slice for this epoch.
        x_train, y_train = balancing_process(train_dataset_path, input_shape,
                                             st_epoch, e)
        # train_datagen.fit(x_train)
        train_generator = train_datagen.flow(
            x_train,
            y_train,
            batch_size=batch_size,
            shuffle=True,
        )
        # Re-apply the dataset-wide normalization stats each time the
        # generator is rebuilt with fresh data.
        train_datagen.mean = mean
        train_datagen.std = std
        STEP_SIZE_TRAIN = train_generator.n // train_generator.batch_size
        res = model.fit_generator(
            generator=train_generator,
            steps_per_epoch=STEP_SIZE_TRAIN,
            initial_epoch=e,
            epochs=e + 1,
            callbacks=[reduce_lr],
            verbose=1,
            shuffle=True,
        )
        t2 = time.time()
        print(res.history)
        print('Training time for one epoch : %.1f' % ((t2 - t1)))
        train_loss, train_acc = res.history['loss'][0], res.history['acc'][0]
        #val_loss, val_acc = res.history['val_loss'][0], res.history['val_acc'][0]
        nsml.report(summary=True, epoch=e, epoch_total=nb_epoch,
                    loss=train_loss, acc=train_acc)
        #, val_loss=val_loss, val_acc=val_acc)
        print('Model generated : ' + Modelname + "_" + str(e))
        # Checkpoint every 20th epoch, plus every epoch in (60, 100).
        if (e) % 20 == 0:
            CkptName = prefix + '_' + str(e)
            nsml.save(CkptName)
            print('checkpoint name : ' + str(CkptName))
        if e > 60 and e < 100:
            CkptName = prefix + '_' + str(e)
            nsml.save(str(CkptName))
            print('checkpoint name : ' + str(CkptName))
        # Free the per-epoch balanced arrays before the next iteration.
        del x_train
        del y_train
        gc.collect()
    print('Total training time : %.1f' % (time.time() - t0))
    return model, res
# if C.get()['infer_mode'] == 'face': # targets_only = [] # lbs = CustomDataset(TRAIN_DATASET_PATH).targets # for lb_id in range(num_classes): # if lbs.count(lb_id) > 150: # continue # targets_only.append(lb_id) # print(targets_only) if config.transfer: # nsml.load(checkpoint='transfer', session='team_286/4_cls_food/89') nsml.load(checkpoint='100', session='team_286/4_cls_food/103') # cv=1 cutmix 0.5 # nsml.load(checkpoint='55', session='team_286/7_icls_face/2') # nsml.load(checkpoint='transfer', session='team_286/8_iret_food/12') # nsml.load(checkpoint='20', session='team_286/9_iret_car/16') nsml.save('resave') sys.exit(0) tr_loader, val_loader, val_label = data_loader_with_split(root=TRAIN_DATASET_PATH, cv_ratio=config.ratio, cv=config.cv, batch_size=C.get()['batch']) time_ = datetime.datetime.now() best_val_top1 = 0 dataiter = iter(tr_loader) num_steps = 100000 // C.get()['batch'] from pystopwatch2 import PyStopwatch for epoch in range(C.get()['epochs']): w = PyStopwatch() metrics = Accumulator() scheduler.step()
vocab_clean = set()
# Coverage threshold: keep the most frequent characters until this fraction
# of each corpus' total character mass is covered.
ratio = 0.999


def _fill_vocab(vocab, counter, total):
    """Add chars to `vocab`, most frequent first, until `ratio` coverage."""
    covered = 0
    for ch, cnt in sorted(counter.items(), key=lambda kv: kv[1], reverse=True):
        vocab.add(ch)
        covered += cnt
        if covered / total >= ratio:
            break


_fill_vocab(vocab_noisy, noisy_counter, noisy_total)
_fill_vocab(vocab_unlabeled, unlabeled_counter, unlabeled_total)
_fill_vocab(vocab_clean, clean_counter, clean_total)

# Union of the three vocabularies, ordered by overall frequency (descending).
vocab_total = vocab_noisy | vocab_unlabeled | vocab_clean
vocab_total = sorted(vocab_total, key=lambda c: -total_counter[c])
print(vocab_total)
bind_nsml(vocab_noisy, vocab_unlabeled, vocab_clean, vocab_total)
nsml.save('vocab')
model = EfficientNet.from_name(model_name) #model = EfficientNet.from_pretrained(model_name, num_classes=350) #summary(model,input_size=(3,224,224)) else: model = Baseline(args.hidden_size, args.output_size) #optimizer = optim.Adam(model.parameters(), args.learning_rate) #lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=1,verbose=True) #criterion = nn.CrossEntropyLoss() #multi-class classification task #model = model.to(device) #model.train() # DONOTCHANGE: They are reserved for nsml bind_model(model) # below the nsml load #nsml.load(checkpoint='12',session='team_62/airush1/184') #nsml.save('T') if args.mode == "train": #nsml.load(checkpoint='12',session='team_62/airush1/184') nsml.save('E') print('---end---') if args.pause: nsml.paused(scope=locals())
def train(model, criterion, optimizer, scheduler, train_loader, val_loader, early_stop, n_epochs, print_every, device):
    """Train a 4-class classifier with per-epoch validation, early stopping
    on the abnormal-class F1 score, and NSML checkpointing of improvements.

    Args:
        model: classifier producing 4-class logits (normal/monotone/
            screenshot/unknown); `model.epochs` is updated in place.
        criterion, optimizer, scheduler: training components; CosineAnnealingLR
            is stepped per batch, ReduceLROnPlateau per epoch on val loss.
        early_stop: patience in epochs without F1 improvement.
        n_epochs: maximum epochs.
        print_every: epoch interval for metric printing.
    """
    val_loss_min = np.inf
    val_abnormal_f1_score_max = -np.inf
    stop_count = 0
    val_max_acc = -np.inf
    model.epochs = 0
    for epoch in range(n_epochs):
        train_loss = 0
        val_loss = 0
        train_acc = 0
        val_acc = 0
        model.train()
        ii = 0
        for data, label in train_loader:
            ii += 1
            data, label = data.to(device), label.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, label)
            loss.sum().backward()
            optimizer.step()
            if isinstance(scheduler,
                          torch.optim.lr_scheduler.CosineAnnealingLR):
                scheduler.step()
            # Accumulate batch-size-weighted loss/accuracy for dataset means.
            train_loss += loss.item() * data.size(0)
            _, pred = torch.max(output, dim=1)
            correct_tensor = pred.eq(label.data.view_as(pred))
            accuracy = torch.mean(correct_tensor.type(torch.FloatTensor))
            train_acc += accuracy.item() * data.size(0)
            if ii % 10 == 0:
                print('Epoch: {}\t{:.2f}% complete.'.format(
                    epoch, 100 * (ii + 1) / len(train_loader)))
        model.epochs += 1
        with torch.no_grad():
            model.eval()
            preds = []
            trues = []
            for data, label in val_loader:
                data, label = data.to(device), label.to(device)
                output = model(data)
                loss = criterion(output, label)
                val_loss += loss.item() * data.size(0)
                _, pred = torch.max(output, dim=1)
                correct_tensor = pred.eq(label.data.view_as(pred))
                accuracy = torch.mean(correct_tensor.type(torch.FloatTensor))
                val_acc += accuracy.item() * data.size(0)
                preds.append(pred.detach().cpu())
                trues.append(label.detach().cpu())
            preds = np.concatenate(preds)
            trues = np.concatenate(trues)
            cr = classification_report(
                trues,
                preds,
                labels=[0, 1, 2, 3],
                target_names=['normal', 'monotone', 'screenshot', 'unknown'],
                output_dict=True,
                zero_division=0)
            # Geometric mean of the three "abnormal" per-class F1 scores.
            val_abnormal_f1_score = (cr['monotone']['f1-score'] *
                                     cr['screenshot']['f1-score'] *
                                     cr['unknown']['f1-score'])**(1 / 3)
            train_loss = train_loss / len(train_loader.dataset)
            val_loss = val_loss / len(val_loader.dataset)
            train_acc = train_acc / len(train_loader.dataset)
            val_acc = val_acc / len(val_loader.dataset)
            nsml.report(summary=True,
                        scope=locals(),
                        step=epoch,
                        normal=cr['normal']['f1-score'],
                        monotone=cr['monotone']['f1-score'],
                        screenshot=cr['screenshot']['f1-score'],
                        unknown=cr['unknown']['f1-score'],
                        abnormal=val_abnormal_f1_score)
            if (epoch + 1) % print_every == 0:
                print(
                    '\nEpoch: {}\tTraining Loss: {:.4f}\tValidation Loss: {:.4f}'
                    .format(epoch, train_loss, val_loss))
                print(
                    '\t\tTraining Accuracy: {:.2f}%\tValidation Accuracy: {:.2f}%'
                    .format(100 * train_acc, 100 * val_acc))
                print('\t\tClassification Report:{}'.format(cr))
                print('\t\tF1 Score for Abnormal Class: {}'.format(
                    val_abnormal_f1_score))
            # Checkpoint on F1 improvement; otherwise count toward early stop.
            if val_abnormal_f1_score > val_abnormal_f1_score_max:
                nsml.save(str(epoch))
                stop_count = 0
                val_loss_min = val_loss
                val_max_acc = val_acc
                val_abnormal_f1_score_max = val_abnormal_f1_score
                best_epoch = epoch
            else:
                stop_count += 1
                if stop_count >= early_stop:
                    print(
                        '\nEarly Stopping Total epochs: {}. Best epoch: {} with loss: {:.2f} and ac: {:.2f}% F1 Score for Abnormal Class: {}'
                        .format(epoch, best_epoch, val_loss_min,
                                100 * val_acc, val_abnormal_f1_score_max))
                    return
        if isinstance(scheduler,
                      torch.optim.lr_scheduler.ReduceLROnPlateau):
            scheduler.step(val_loss)
    model.optimizer = optimizer
    print(
        'Best epoch: {} with loss: {:.2f} and ac: {:.2f}% f1-score: {}'.format(
            best_epoch, val_loss_min, 100 * val_acc,
            val_abnormal_f1_score_max))
    return
def main(args, local):
    """Entry point for the xDeepFM CTR model: data prep, model build, NSML bind/load.

    In train mode, reads the tab-separated training data and labels, engineers a
    binned read-history-length feature, label-encodes the sparse features, builds
    the DeepCTR feature columns and the xDeepFM model. In test mode, rebuilds
    state via ``get_item`` and restores a fixed NSML checkpoint.

    Args:
        args: parsed CLI namespace (expects .arch, .mode, .task, .dry_run,
            .num_epochs, .pause).
        local: the caller's local scope, handed to nsml.paused for NSML serving.
    """
    if args.arch == 'xDeepFM' and args.mode == 'train':
        s = time.time()  # wall-clock timer for the data-prep phase
        csv_file = os.path.join(DATASET_PATH, 'train', 'train_data', 'train_data')
        item = pd.read_csv(csv_file,
                           dtype={
                               'article_id': str,
                               'hh': int,
                               'gender': str,
                               'age_range': str,
                               'read_article_ids': str
                           },
                           sep='\t')
        # Label file lives next to the data: "<prefix>_label".
        label_data_path = os.path.join(
            DATASET_PATH, 'train',
            os.path.basename(os.path.normpath(csv_file)).split('_')[0] + '_label')
        label = pd.read_csv(label_data_path, dtype={'label': int}, sep='\t')
        item['label'] = label
        sparse_features = ['article_id', 'hh', 'gender', 'age_range', 'len_bin']
        dense_features = ['image_feature']
        target = ['label']
        # Feature engineering: length of each user's read-article history.
        len_lis = []
        read_article_ids_all = item['read_article_ids'].tolist()
        for i in range(len(item)):
            li = read_article_ids_all[i]
            # NaN (missing history) is read back as float by pandas.
            if type(li) == float:
                len_lis.append(0)
                continue
            len_li = len(li.split(','))
            len_lis.append(len_li)
        item['len'] = len_lis
        # Quantile-bin the history length into (up to) 6 buckets.
        item['len_bin'] = pd.qcut(item['len'], 6, duplicates='drop')
        id_to_artic = dict()
        artics = item['article_id'].tolist()
        with open(os.path.join(DATASET_PATH, 'train', 'train_data',
                               'train_image_features.pkl'), 'rb') as handle:
            image_feature_dict = pickle.load(handle)
        # Integer-encode every sparse feature in place.
        for feat in sparse_features:
            lbe = LabelEncoder()
            item[feat] = lbe.fit_transform(item[feat])
        fixlen_feature_columns = [SparseFeat(feat, item[feat].nunique())
                                  for feat in sparse_features]
        # Dense image feature dimension is taken from an arbitrary sample article.
        fixlen_feature_columns += [DenseFeat(feat, len(image_feature_dict[artics[0]]))
                                   for feat in dense_features]
        # Map encoded article index back to the original article id.
        idx_artics_all = item['article_id'].tolist()
        for i in range(len(artics)):
            idx_artic = idx_artics_all[i]
            if idx_artic not in id_to_artic.keys():
                id_to_artic[idx_artic] = artics[i]
        # Image vectors can then be fetched via image_feature_dict[article_id],
        # so no further preprocessing is needed here.
        linear_feature_columns = fixlen_feature_columns
        dnn_feature_columns = fixlen_feature_columns
        fixlen_feature_names = get_fixlen_feature_names(
            linear_feature_columns + dnn_feature_columns)
        print(fixlen_feature_names)
        global fixlen_feature_names_global
        fixlen_feature_names_global = fixlen_feature_names
        model = xDeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
        print('---model defined---')
        # TODO: also persist the generated artifacts so this prep need not rerun every time.
        print(time.time() - s, 'seconds')
    if use_nsml and args.mode == 'train':
        bind_nsml(model, [], args.task)
    if args.mode == 'test':
        print('_infer root - : ', DATASET_PATH)
        print('test')
        # Rebuild model + lookup state for inference, then restore a fixed checkpoint.
        model, fixlen_feature_names_global, item, image_feature_dict, id_to_artic = get_item(DATASET_PATH)
        bind_nsml(model, [], args.task)
        checkpoint_session = ['401', 'team_62/airush2/176']
        nsml.load(checkpoint=str(checkpoint_session[0]),
                  session=str(checkpoint_session[1]))
        print('successfully loaded')
    if (args.mode == 'train'):
        if args.dry_run:
            print('start dry-running...!')
            args.num_epochs = 1
        else:
            print('start training...!')
        # Pre-build everything up front rather than using a generator.
        nsml.save('infer')
        print('end')
    print('end_main')
    if args.pause:
        nsml.paused(scope=local)
def main():
    """Entry point for the speech-recognition seq2seq baseline.

    Parses CLI arguments, builds an EncoderRNN/DecoderRNN Seq2seq model,
    binds it to NSML, then (in train mode) runs the epoch loop: threaded
    batch loading, training, per-epoch evaluation, NSML reporting and
    checkpointing (always the latest, plus 'best' on improved eval loss).
    """
    # These label-mapping tables are shared with other module-level helpers.
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token
    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')
    parser.add_argument('--hidden_size', type=int, default=512,
                        help='hidden size of model (default: 256)')
    parser.add_argument('--layer_size', type=int, default=3,
                        help='number of layers of model (default: 3)')
    parser.add_argument('--dropout', type=float, default=0.2,
                        help='dropout rate in training (default: 0.2)')
    parser.add_argument(
        '--bidirectional', action='store_true',
        help='use bidirectional RNN for encoder (default: False)')
    parser.add_argument(
        '--use_attention', action='store_true',
        help='use attention between encoder-decoder (default: False)')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='batch size in training (default: 32)')
    parser.add_argument(
        '--workers', type=int, default=4,
        help='number of workers in dataset loader (default: 4)')
    parser.add_argument('--max_epochs', type=int, default=10,
                        help='number of max epochs in training (default: 10)')
    parser.add_argument('--lr', type=float, default=1e-04,
                        help='learning rate (default: 0.0001)')
    parser.add_argument('--teacher_forcing', type=float, default=0.5,
                        help='teacher forcing ratio in decoder (default: 0.5)')
    parser.add_argument('--max_len', type=int, default=80,
                        help='maximum characters of sentence (default: 80)')
    parser.add_argument('--no_cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1,
                        help='random seed (default: 1)')
    parser.add_argument('--save_name', type=str, default='model',
                        help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument("--pause", type=int, default=0)
    parser.add_argument(
        '--feature', type=str, default='mel',
        help='select feature extraction function. mel or log_mel ')
    args = parser.parse_args()
    char2index, index2char = label_loader.load_label('./hackathon.labels')
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']
    # Seed all RNGs for reproducibility.
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')
    # N_FFT: defined in loader.py ; N_FFT = size of the Fourier Transform
    feature_size = N_FFT / 2 + 1  # N_FFT size = 512
    enc = EncoderRNN(feature_size, args.hidden_size,
                     input_dropout_p=args.dropout, dropout_p=args.dropout,
                     n_layers=args.layer_size,
                     bidirectional=args.bidirectional, rnn_cell='gru',
                     variable_lengths=False)
    # Decoder hidden size doubles when the encoder is bidirectional.
    dec = DecoderRNN(len(char2index), args.max_len,
                     args.hidden_size * (2 if args.bidirectional else 1),
                     SOS_token, EOS_token,
                     n_layers=args.layer_size, rnn_cell='gru',
                     bidirectional=args.bidirectional,
                     input_dropout_p=args.dropout, dropout_p=args.dropout,
                     use_attention=args.use_attention)
    model = Seq2seq(enc, dec)
    model.flatten_parameters()
    # initial distribution of model weights
    for param in model.parameters():
        param.data.uniform_(-0.08, 0.08)
    # make tensors able to be computed on multiple devices in parallel and copy tensors to GPU
    model = nn.DataParallel(model).to(device)
    optimizer = optim.Adam(model.module.parameters(), lr=args.lr)
    # Sum-reduced CE; padding positions are excluded from the loss.
    criterion = nn.CrossEntropyLoss(reduction='sum',
                                    ignore_index=PAD_token).to(device)
    bind_model(model, optimizer)
    if args.pause == 1:
        nsml.paused(scope=locals())
    if args.mode != "train":
        return
    data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')
    wav_paths = list()
    script_paths = list()
    with open(data_list, 'r') as f:
        for line in f:
            # line: "aaa.wav,aaa.label"
            wav_path, script_path = line.strip().split(',')
            wav_paths.append(os.path.join(DATASET_PATH, 'train_data', wav_path))
            script_paths.append(
                os.path.join(DATASET_PATH, 'train_data', script_path))
    best_loss = 1e10
    begin_epoch = 0
    # load all target scripts for reducing disk i/o
    target_path = os.path.join(DATASET_PATH, 'train_label')
    load_targets(target_path)
    # val ratio can be adjusted -> 10% ??
    train_batch_num, train_dataset_list, valid_dataset = split_dataset(
        args, wav_paths, script_paths, valid_ratio=0.05)
    logger.info('start')
    train_begin = time.time()
    for epoch in range(begin_epoch, args.max_epochs):
        train_queue = queue.Queue(args.workers * 2)
        # load train data
        train_loader = MultiLoader(train_dataset_list, train_queue,
                                   args.batch_size, args.workers)
        train_loader.start()
        # train epoch
        train_loss, train_cer = train(model, train_batch_num, train_queue,
                                      criterion, optimizer, device,
                                      train_begin, args.workers, 10,
                                      args.teacher_forcing)
        logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f' %
                    (epoch, train_loss, train_cer))
        print('Epoch %d (Training) Loss %0.4f CER %0.4f' %
              (epoch, train_loss, train_cer))
        train_loader.join()
        # eval for each epoch
        valid_queue = queue.Queue(args.workers * 2)
        valid_loader = BaseDataLoader(valid_dataset, valid_queue,
                                      args.batch_size, 0)
        valid_loader.start()
        eval_loss, eval_cer = evaluate(model, valid_loader, valid_queue,
                                       criterion, device)
        logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' %
                    (epoch, eval_loss, eval_cer))
        print('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' %
              (epoch, eval_loss, eval_cer))
        valid_loader.join()
        nsml.report(False, step=epoch,
                    train_epoch__loss=train_loss,
                    train_epoch__cer=train_cer,
                    eval__loss=eval_loss, eval__cer=eval_cer)
        best_model = (eval_loss < best_loss)
        # Always checkpoint the latest epoch; additionally tag the best one.
        nsml.save(args.save_name)
        if best_model:
            nsml.save('best')
            best_loss = eval_loss
def train(args, train_dataset, model, tokenizer, is_pretrain=False):
    """Train (or pretrain) the QA model.

    Runs the standard transformers-style training loop: distributed-aware
    sampling, AdamW with linear warmup/decay, optional apex fp16, gradient
    accumulation and clipping, checkpoint resume from a ``checkpoint-<step>``
    path, periodic evaluation/logging, and NSML (or local) checkpointing.

    Args:
        args: experiment namespace (batch sizes, lr, steps, fp16, ranks, ...).
        train_dataset: torch Dataset of features.
        model: the QA model to optimize.
        tokenizer: tokenizer, saved alongside local checkpoints.
        is_pretrain: when True, skips eval-based best tracking and logs
            pretrain loss instead; with ``args.mix_qa`` the pretrain data is
            reloaded each epoch.

    Returns:
        (global_step, average training loss per step).
    """
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    # Distributed runs shard the data; single-process runs shuffle randomly.
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
    if args.max_steps > 0:
        # A fixed step budget overrides the epoch count.
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    # Bias and LayerNorm weights are conventionally excluded from weight decay.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
        os.path.join(args.model_name_or_path, "scheduler.pt")
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    # Train!
    logger.info("***** Running {}training *****".format("Pre" if is_pretrain else ""))
    logger.info(" Num examples = %d", len(train_dataset))
    logger.info(" Num Epochs = %d", args.num_train_epochs)
    logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        " Total %strain batch size (w. parallel, distributed & accumulation) = %d",
        "Pre" if is_pretrain else "",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info(" Total optimization steps = %d", t_total)
    global_step = 1
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        try:
            # set global_step to global_step of last saved checkpoint from model path
            # (expects a path like ".../checkpoint-<global_step>")
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
            logger.info(" Continuing training from checkpoint, will skip to saved global_step")
            logger.info(" Continuing training from epoch %d", epochs_trained)
            logger.info(" Continuing training from global step %d", global_step)
            logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except ValueError:
            logger.info(" Starting fine-tuning.")
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(
        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
    )
    # Added here for reproducibility
    set_seed(args)
    best_f1, best_exact = -1, -1
    for epoch in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "start_positions": batch[3],
                "end_positions": batch[4],
            }
            # Some architectures take no token_type_ids; XLNet/XLM take extras.
            if args.model_type in ["xlm", "roberta", "distilbert"]:
                del inputs["token_type_ids"]
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[5], "p_mask": batch[6]})
                if args.version_2_with_negative:
                    inputs.update({"is_impossible": batch[7]})
            outputs = model(**inputs)
            # model outputs are always tuple in transformers (see doc)
            loss = outputs[0]
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel (not distributed) training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            tr_loss += loss.item()
            # Only step the optimizer once every accumulation window.
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1
                # Log metrics
                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    current_loss = (tr_loss - logging_loss) / args.logging_steps
                    logging_loss = tr_loss
                    if not is_pretrain:
                        # Only evaluate when single GPU otherwise metrics may not average well
                        if args.evaluate_during_training:
                            logger.info("Validation start for epoch {}".format(epoch))
                            result = evaluate(args, model, tokenizer, prefix=epoch)
                            _f1, _exact = result["f1"], result["exact"]
                            is_best = _f1 > best_f1
                            best_f1 = max(_f1, best_f1)
                            logger.info(
                                "best_f1_val = {}, f1_val = {}, exact_val = {}, loss = {}, global_step = {}, " \
                                .format(best_f1, _f1, _exact, current_loss, global_step))
                            if IS_ON_NSML:
                                nsml.report(summary=True, step=global_step, f1=_f1, exact=_exact, loss=current_loss)
                            if is_best:
                                nsml.save(args.model_type + "_best")
                    else:
                        logger.info("[{}] pretrain loss = {}".format(global_step, current_loss))
                        if IS_ON_NSML:
                            nsml.report(summary=True, step=global_step, pretrain_loss=current_loss)
                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    if IS_ON_NSML:
                        nsml.save(args.model_type + "{}_gs{}".format("_pre" if is_pretrain else "", global_step))
                    else:
                        output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        # Take care of distributed/parallel training
                        model_to_save = model.module if hasattr(model, "module") else model
                        model_to_save.save_pretrained(output_dir)
                        tokenizer.save_pretrained(output_dir)
                        torch.save(args, os.path.join(output_dir, "training_args.bin"))
                        logger.info("Saving model checkpoint to %s", output_dir)
                        torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                        torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                        logger.info("Saving optimizer and scheduler states to %s", output_dir)
            if 0 < args.max_steps < global_step:
                epoch_iterator.close()
                break
        if 0 < args.max_steps < global_step:
            train_iterator.close()
            break
        # For mixed-QA pretraining, resample/reload the pretrain data each epoch.
        if is_pretrain and args.mix_qa:
            t = time.time()
            train_dataset = load_and_cache_examples(model, args, tokenizer, evaluate=False, output_examples=False,
                                                    is_pretrain=True, qa_style=False)
            t = time.time() - t
            logger.info("loading pretrain data takes {:.3f} seconds".format(t))
            args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
            train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
            train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
    if IS_ON_NSML:
        nsml.save(args.model_type + "_last")
    return global_step, tr_loss / global_step
""" Training loop """ for epoch in range(epochs): res = model.fit(x=[anchor_images, positive_images, negative_images], y=[0] * train_size, batch_size=batch_size, initial_epoch=epoch, epochs=epoch + 1, verbose=1, shuffle=True, validation_data=([ anchor_images_val, positive_images_val, negative_images_val ], [0] * val_size)) print(res.history) train_loss = res.history['loss'][0] val_loss = res.history['val_loss'][0] nsml.report(summary=True, step=epoch, epoch=epoch, epoch_total=epochs, loss=train_loss, val_loss=val_loss) if epoch % 5 == 0: nsml.save(epoch) nsml.save('saved!') print("done!") exit()
def main():
    """Entry point for Korean QA fine-tuning with KoELECTRA on NSML.

    Parses all experiment arguments, sets up CUDA / distributed training /
    logging / fp16, loads the pretrained tokenizer and model, binds to NSML,
    then dispatches in order: optional checkpoint load, optional initial
    validation, span pretraining, QA pretraining, and final training.
    """
    parser = argparse.ArgumentParser()
    # Required parameters, we defined additional arguments for experiment
    parser.add_argument(
        "--model_type",
        default=None,
        type=str,
        required=True,
        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name",
    )
    parser.add_argument(
        "--load_cache",
        action="store_true",
        help="load data from cached session",
    )
    parser.add_argument(
        "--save_cache", action="store_true", help="save loaded dataset into cache"
    )
    parser.add_argument(
        "--cached_session_pretrain",
        default="",
        type=str,
        help="Path to cache where 'Span-Pretraining' dataset is stored",
    )
    parser.add_argument(
        "--cached_session_pretrain_qa",
        default="",
        type=str,
        help="Path to cache where 'QA-Pretraining' dataset is stored",
    )
    parser.add_argument(
        "--cached_session_train",
        default="",
        type=str,
        help="Path to cache where given 'training' dataset is stored",
    )
    parser.add_argument(
        "--cached_session_dev",
        default="",
        type=str,
        help="Path to cache where given 'development set' is stored",
    )
    parser.add_argument(
        "--load_model",
        action="store_true",
        help="use pretrained model from previous sessions",
    )
    parser.add_argument(
        "--load_model_session",
        default="",
        type=str,
        help="Path to pre-trained model",
    )
    parser.add_argument(
        "--load_model_checkpoint",
        default="",
        type=str,
        help="Path to pre-trained model",
    )
    parser.add_argument(
        "--just_for_save",
        action="store_true",
        help="save checkpoint and terminate immediately",
    )
    parser.add_argument(
        "--freeze_embedding",
        action="store_true",
        help="finetuning just classification layer",
    )
    parser.add_argument(
        "--mix_qa",
        action="store_true",
        help="mix qa set for variance",
    )
    parser.add_argument(
        "--mix_portion", type=float, default=0.5,
        help="defines portion of qa pairs to be reconstructed"
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints and predictions will be written.",
    )

    # Other parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        help="The input data dir. Should contain the .json files for the task."
        + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
    )
    parser.add_argument(
        "--train_file",
        default=None,
        type=str,
        help="The input training file. If a data dir is specified, will look for the file there"
        + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
    )
    parser.add_argument(
        "--predict_file",
        default=None,
        type=str,
        help="The input evaluation file. If a data dir is specified, will look for the file there"
        + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
    )
    parser.add_argument(
        "--config_name", default="", type=str,
        help="Pretrained config name or path if not the same as model_name"
    )
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help="Where do you want to store the pre-trained models downloaded from s3",
    )
    parser.add_argument(
        "--version_2_with_negative",
        action="store_true",
        help="If true, the SQuAD examples contain some that do not have an answer.",
    )
    parser.add_argument(
        "--null_score_diff_threshold",
        type=float,
        default=0.0,
        help="If null_score - best_non_null is greater than the threshold predict null.",
    )
    parser.add_argument(
        "--max_seq_length",
        default=384,
        type=int,
        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded.",
    )
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help="When splitting up a long document into chunks, how much stride to take between chunks.",
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help="The maximum number of tokens for the question. Questions longer than this will "
        "be truncated to this length.",
    )
    parser.add_argument("--do_pretrain_span", action="store_true", help="Whether to run span-pretraining.")
    parser.add_argument("--do_pretrain_qa", action="store_true", help="Whether to run qa-pretraining.")
    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training", default=True, action="store_true",
        help="Run evaluation during training at each logging step."
    )
    parser.add_argument("--do_initial_validation", action="store_true", help="Whether to run initial validation")
    parser.add_argument(
        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
    )
    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
    parser.add_argument(
        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
    )
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument(
        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
    )
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
    parser.add_argument(
        "--n_best_size",
        default=20,
        type=int,
        help="The total number of n-best predictions to generate in the nbest_predictions.json output file.",
    )
    parser.add_argument(
        "--max_answer_length",
        default=30,
        type=int,
        help="The maximum length of an answer that can be generated. This is needed because the start "
        "and end predictions are not conditioned on one another.",
    )
    parser.add_argument(
        "--verbose_logging",
        action="store_true",
        help="If true, all of the warnings related to data processing will be printed. "
        "A number of warnings are expected for a normal SQuAD evaluation.",
    )
    parser.add_argument("--logging_steps", type=int, default=100, help="Log every X updates steps.")
    parser.add_argument("--save_steps", type=int, default=1000, help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
    )
    parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
    parser.add_argument(
        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
    )
    parser.add_argument(
        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
    )
    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
    parser.add_argument("--threads", type=int, default=1, help="multiple threads for converting example to features")

    ### DO NOT MODIFY THIS BLOCK ###
    # arguments for nsml
    parser.add_argument('--pause', type=int, default=0)
    parser.add_argument('--mode', type=str, default='train')
    ################################

    args = parser.parse_args()

    # for NSML
    args.data_dir = os.path.join(DATASET_PATH, args.data_dir)

    if (
        os.path.exists(args.output_dir)
        and os.listdir(args.output_dir)
        and args.do_train
        and not args.overwrite_output_dir
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir
            )
        )

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
        filename='log.log'
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )
    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    args.model_type = args.model_type.lower()
    # Tokenizer/model are pinned to a KoELECTRA checkpoint already fine-tuned on KorQuAD.
    tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-v3-finetuned-korquad")
    # tokenizer.add_special_tokens({"additional_special_tokens" : ["[QUES]"]})
    # print("vocabsize: {}".format(tokenizer.vocab_size))
    # print("example")
    # print(tokenizer.tokenize("[CLS] 한국어 ELECTRA를 공유합니다. [SEP]"))
    model = ElectraForQuestionAnswering.from_pretrained("monologg/koelectra-base-v3-finetuned-korquad")

    if args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    # Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum if args.fp16 is
    # set. Otherwise it'll default to "promote" mode, and we'll get fp32 operations. Note that running
    # `--fp16_opt_level="O2"` will remove the need for this code, but it is still valid.
    if args.fp16:
        try:
            import apex

            apex.amp.register_half_function(torch, "einsum")
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        # NOTE(review): `optimizer` is not defined in this scope yet — this line
        # would raise NameError if --fp16 is passed; confirm against full file.
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 0:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )
    model.to(args.device)

    ### DO NOT MODIFY THIS BLOCK ###
    if IS_ON_NSML:
        bind_nsml(model, tokenizer, args)
    if args.pause:
        nsml.paused(scope=locals())
    ################################

    logger.info("Training/evaluation parameters %s", args)
    # bind_nsml(model, tokenizer, args)

    if args.load_model:
        # Preserve current CLI args across the NSML checkpoint load, which
        # would otherwise overwrite them with the saved session's args.
        tmp_args = parser.parse_args()
        nsml.copy(args, tmp_args)
        nsml.load(checkpoint=args.load_model_checkpoint, session=args.load_model_session)
        nsml.copy(tmp_args, args)

    if args.just_for_save:
        nsml.save("test")
        return

    # initial validation
    if args.do_initial_validation:
        logger.info("Initinal Validation start")
        result = evaluate(args, model, tokenizer, prefix="")
        _f1, _exact = result["f1"], result["exact"]
        logger.info(
            "f1_val = {}, exact_val = {}" \
            .format(_f1, _exact))
        if IS_ON_NSML:
            nsml.report(summary=True, step=0, f1=_f1, exact=_exact)

    # 'Span' Pretraining
    if args.do_pretrain_span:
        t = time.time()
        train_dataset = load_and_cache_examples(model, args, tokenizer, evaluate=False, output_examples=False,
                                                is_pretrain=True, qa_style=False)
        t = time.time() - t
        logger.info("loading pretrain data takes {:.3f} seconds".format(t))
        global_step, tr_loss = train(args, train_dataset, model, tokenizer, is_pretrain=True)
        logger.info(" pretrain_global_step = %s, pretrain_average loss = %s", global_step, tr_loss)
        nsml.save("pretrained_span")

    # 'QA' Pretraining
    if args.do_pretrain_qa:
        t = time.time()
        train_dataset = load_and_cache_examples(model, args, tokenizer, evaluate=False, output_examples=False,
                                                is_pretrain=True, qa_style=True)
        t = time.time() - t
        logger.info("loading pretrain data takes {:.3f} seconds".format(t))
        global_step, tr_loss = train(args, train_dataset, model, tokenizer, is_pretrain=True)
        logger.info(" pretrain_global_step = %s, pretrain_average loss = %s", global_step, tr_loss)
        nsml.save("pretrained_span+qa")

    # Training
    if args.do_train:
        if args.freeze_embedding:
            # Freeze the ELECTRA backbone; only the QA head remains trainable.
            for param in model.module.electra.parameters():
                param.requires_grad = False
        t = time.time()
        train_dataset = load_and_cache_examples(model, args, tokenizer, evaluate=False, output_examples=False)
        t = time.time() - t
        logger.info("loading train data takes {:.3f} seconds".format(t))
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
def on_epoch_end(self, epoch, logs=None):
    """Report this epoch's metrics to NSML and save a checkpoint.

    Invoked by Keras at the end of every epoch.

    Args:
        epoch: 0-based epoch index.
        logs: metric dict from Keras (loss/acc/f1_score and their val_
            counterparts). Keras may pass None; missing keys report as None.
    """
    # Fix: the original signature used a mutable default (`logs={}`), which is
    # shared across calls. Use None and substitute an empty dict per call.
    if logs is None:
        logs = {}
    nsml.report(summary=True, epoch=epoch,
                loss=logs.get('loss'), val_loss=logs.get('val_loss'),
                acc=logs.get('acc'), val_acc=logs.get('val_acc'),
                f1_score=logs.get('f1_score'),
                val_f1_score=logs.get('val_f1_score'))
    # Checkpoint name: "<prefix>_<epoch>".
    nsml.save(self.prefix + '_' + str(epoch))
def main():
    """Entry point for the speech-hackathon trainer.

    Trains a CTC acoustic model (``Mel2SeqNet_v2``, jamo output) together
    with a seq2seq correction network (``Seq2SeqNet_v2``) on NSML, then
    reports/checkpoints per epoch.

    NOTE(review): this body was reconstructed from whitespace-collapsed
    source; statement nesting below is a best-effort reconstruction —
    confirm against the original repository.
    """
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token
    # ---- CLI options -----------------------------------------------------
    parser = argparse.ArgumentParser(
        description='Speech hackathon lilililill model')
    parser.add_argument(
        '--max_epochs', type=int, default=1000,
        help='number of max epochs in training (default: 1000)')
    parser.add_argument('--no_cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--save_name', type=str, default='model',
                        help='the name of model in nsml or local')
    parser.add_argument('--dropout', type=float, default=0.2,
                        help='dropout rate in training (default: 0.2)')
    parser.add_argument('--lr', type=float, default=1e-03,
                        help='learning rate (default: 0.001)')
    parser.add_argument('--num_mels', type=int, default=80,
                        help='number of the mel bands (default: 80)')
    parser.add_argument('--batch_size', type=int, default=128,
                        help='batch size in training (default: 128)')
    parser.add_argument("--num_thread", type=int, default=4,
                        help='number of the loading thread (default: 4)')
    parser.add_argument('--num_hidden_enc', type=int, default=1024,
                        help='hidden size of model (default: 1024)')
    parser.add_argument('--num_hidden_dec', type=int, default=512,
                        help='hidden size of model decoder (default: 512)')
    parser.add_argument(
        '--nsc_in_ms', type=int, default=50,
        help='Number of sample size per time segment in ms (default: 50)')
    parser.add_argument(
        '--ref_repeat', type=int, default=1,
        help='Number of repetition of reference seq2seq (default: 1)')
    parser.add_argument('--loss_lim', type=float, default=0.05,
                        help='Minimum loss threshold (default: 0.05)')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument("--pause", type=int, default=0)
    parser.add_argument('--memo', type=str, default='',
                        help='Comment you wish to leave')
    parser.add_argument('--debug', type=str, default='False',
                        help='debug mode')
    parser.add_argument('--load', type=str, default=None)
    args = parser.parse_args()

    batch_size = args.batch_size
    num_thread = args.num_thread
    num_mels = args.num_mels

    # ---- Vocabulary / tokenizer -----------------------------------------
    char2index, index2char = load_label('./hackathon.labels')
    SOS_token = char2index['<s>']  # '<sos>' or '<s>'
    EOS_token = char2index['</s>']  # '<eos>' or '</s>'
    PAD_token = char2index['_']  # '-' or '_'
    unicode_jamo_list = My_Unicode_Jamo_v2()
    # logger.info(''.join(unicode_jamo_list))
    # logger.info('This is a new main2.py')
    tokenizer = Tokenizer(unicode_jamo_list)
    jamo_tokens = tokenizer.word2num(unicode_jamo_list)
    # logger.info('Tokens: {}'.format(jamo_tokens))

    # ---- Models, optimizers, losses -------------------------------------
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')
    net = Mel2SeqNet_v2(num_mels, args.num_hidden_enc, args.num_hidden_dec,
                        len(unicode_jamo_list), device)
    net_optimizer = optim.Adam(net.parameters(), lr=args.lr)
    ctc_loss = nn.CTCLoss().to(device)
    # net_B = Seq2SeqNet(512, jamo_tokens, char2index, device) #########
    net_B = Seq2SeqNet_v2(1024, jamo_tokens, char2index, device)  #########
    net_B_optimizer = optim.Adam(net_B.parameters(), lr=args.lr)  #########
    net_B_criterion = nn.NLLLoss(reduction='none').to(device)  #########
    bind_model(net, net_B, net_optimizer, net_B_optimizer, index2char,
               tokenizer)

    # NSML submit/infer mode: hand control to the platform and stop.
    if args.pause == 1:
        nsml.paused(scope=locals())
    if args.mode != "train":
        return

    # Optional warm start from a previous NSML session; fine-tune with a
    # much smaller learning rate.
    if args.load != None:
        # nsml.load(checkpoint='saved', session='team47/sr-hack-2019-dataset/' + args.load)
        nsml.load(checkpoint='model',
                  session='team47/sr-hack-2019-dataset/' + args.load)
        nsml.save('saved')
        for g in net_optimizer.param_groups:
            g['lr'] = 1e-06
        for g in net_B_optimizer.param_groups:
            g['lr'] = 1e-06
    # Log the effective learning rates.
    # NOTE(review): whether these two loops sit inside the load branch is
    # ambiguous in the collapsed source — confirm.
    for g in net_optimizer.param_groups:
        logger.info(g['lr'])
    for g in net_B_optimizer.param_groups:
        logger.info(g['lr'])

    # ---- Data loading and train/eval split ------------------------------
    wav_paths, script_paths, korean_script_paths = get_paths(DATASET_PATH)
    logger.info('Korean script path 0: {}'.format(korean_script_paths[0]))
    logger.info('wav_paths len: {}'.format(len(wav_paths)))
    logger.info('script_paths len: {}'.format(len(script_paths)))
    logger.info('korean_script_paths len: {}'.format(
        len(korean_script_paths)))

    # Load Korean Scripts
    korean_script_list, jamo_script_list = get_korean_and_jamo_list_v2(
        korean_script_paths)
    logger.info('Korean script 0: {}'.format(korean_script_list[0]))
    logger.info('Korean script 0 length: {}'.format(len(
        korean_script_list[0])))
    logger.info('Jamo script 0: {}'.format(jamo_script_list[0]))
    logger.info('Jamo script 0 length: {}'.format(len(jamo_script_list[0])))
    script_path_list = get_script_list(script_paths, SOS_token, EOS_token)
    # Token id sequences wrapped with <s> ... </s> for each jamo script.
    ground_truth_list = [
        (tokenizer.word2num(['<s>'] + list(jamo_script_list[i]) + ['</s>']))
        for i in range(len(jamo_script_list))
    ]

    # 95% of the data will be used as train (the original comment said 90%,
    # but the code splits at 0.95).
    split_index = int(0.95 * len(wav_paths))
    wav_path_list_train = wav_paths[:split_index]
    ground_truth_list_train = ground_truth_list[:split_index]
    korean_script_list_train = korean_script_list[:split_index]
    script_path_list_train = script_path_list[:split_index]
    wav_path_list_eval = wav_paths[split_index:]
    ground_truth_list_eval = ground_truth_list[split_index:]
    korean_script_list_eval = korean_script_list[split_index:]
    script_path_list_eval = script_path_list[split_index:]
    logger.info('Total:Train:Eval = {}:{}:{}'.format(
        len(wav_paths), len(wav_path_list_train), len(wav_path_list_eval)))

    # NOTE(review): the is_train flags look swapped (eval preloader gets
    # is_train=True, train preloader gets is_train=False) — confirm the
    # semantics of Threading_Batched_Preloader_v2 before "fixing".
    preloader_eval = Threading_Batched_Preloader_v2(wav_path_list_eval,
                                                    ground_truth_list_eval,
                                                    script_path_list_eval,
                                                    korean_script_list_eval,
                                                    batch_size,
                                                    num_mels,
                                                    args.nsc_in_ms,
                                                    is_train=True)
    preloader_train = Threading_Batched_Preloader_v2(wav_path_list_train,
                                                     ground_truth_list_train,
                                                     script_path_list_train,
                                                     korean_script_list_train,
                                                     batch_size,
                                                     num_mels,
                                                     args.nsc_in_ms,
                                                     is_train=False)
    best_loss = 1e10
    best_eval_cer = 1e10

    # load all target scripts for reducing disk i/o
    target_path = os.path.join(DATASET_PATH, 'train_label')
    load_targets(target_path)
    logger.info('start')
    train_begin = time.time()

    # ---- Epoch loop ------------------------------------------------------
    for epoch in range(args.max_epochs):
        logger.info((datetime.now().strftime('%m-%d %H:%M:%S')))
        net.train()
        net_B.train()
        preloader_train.initialize_batch(num_thread)
        loss_list_train = list()
        seq2seq_loss_list_train = list()
        seq2seq_loss_list_train_ref = list()
        logger.info("Initialized Training Preloader")
        count = 0
        total_dist = 0
        # Lengths start at 1 to avoid division by zero in the CER ratio.
        total_length = 1
        total_dist_ref = 0
        total_length_ref = 1

        # ---- Training pass ----
        while not preloader_train.end_flag:
            batch = preloader_train.get_batch()
            # logger.info(psutil.virtual_memory())
            # logger.info("Got Batch")
            if batch is not None:
                # logger.info("Training Batch is not None")
                tensor_input, ground_truth, loss_mask, length_list, batched_num_script, batched_num_script_loss_mask = batch
                # CTC step on the acoustic model.
                pred_tensor, loss = train(net, net_optimizer, ctc_loss,
                                          tensor_input.to(device),
                                          ground_truth.to(device),
                                          length_list.to(device), device)
                loss_list_train.append(loss)
                ####################################################
                jamo_result = Decode_Prediction_No_Filtering(
                    pred_tensor, tokenizer)
                true_string_list = Decode_Num_Script(
                    batched_num_script.detach().cpu().numpy(), index2char)
                # "Reference" seq2seq training: feed the ground-truth jamo
                # sequence instead of the CTC output, repeated ref_repeat
                # times.
                for i in range(args.ref_repeat):
                    lev_input_ref = ground_truth
                    lev_pred_ref, attentions_ref, seq2seq_loss_ref = net_B.net_train(
                        lev_input_ref.to(device),
                        batched_num_script.to(device),
                        batched_num_script_loss_mask.to(device),
                        net_B_optimizer, net_B_criterion)
                    pred_string_list_ref = Decode_Lev_Prediction(
                        lev_pred_ref, index2char)
                    seq2seq_loss_list_train_ref.append(seq2seq_loss_ref)
                    dist_ref, length_ref = char_distance_list(
                        true_string_list, pred_string_list_ref)
                pred_string_list = [None]
                dist = 0
                length = 0
                # Only train the corrector on the CTC output once the CTC
                # loss is below the threshold (i.e. output is usable).
                if (loss < args.loss_lim):
                    lev_input = Decode_CTC_Prediction_And_Batch(pred_tensor)
                    lev_pred, attentions, seq2seq_loss = net_B.net_train(
                        lev_input.to(device),
                        batched_num_script.to(device),
                        batched_num_script_loss_mask.to(device),
                        net_B_optimizer, net_B_criterion)
                    pred_string_list = Decode_Lev_Prediction(
                        lev_pred, index2char)
                    seq2seq_loss_list_train.append(seq2seq_loss)
                    dist, length = char_distance_list(true_string_list,
                                                      pred_string_list)
                total_dist_ref += dist_ref
                total_length_ref += length_ref
                total_dist += dist
                total_length += length
                count += 1
                if count % 25 == 0:
                    logger.info("Train: Count {} | {} => {}".format(
                        count, true_string_list[0], pred_string_list_ref[0]))
                    logger.info("Train: Count {} | {} => {} => {}".format(
                        count, true_string_list[0], jamo_result[0],
                        pred_string_list[0]))
            else:
                logger.info("Training Batch is None")
        # del preloader_train
        # logger.info(loss_list_train)
        train_loss = np.mean(np.asarray(loss_list_train))
        train_cer = np.mean(np.asarray(total_dist / total_length))
        train_cer_ref = np.mean(np.asarray(total_dist_ref / total_length_ref))
        logger.info("Mean Train Loss: {}".format(train_loss))
        logger.info("Total Train CER: {}".format(train_cer))
        logger.info("Total Train Reference CER: {}".format(train_cer_ref))

        # ---- Evaluation pass ----
        preloader_eval.initialize_batch(num_thread)
        loss_list_eval = list()
        seq2seq_loss_list_eval = list()
        seq2seq_loss_list_eval_ref = list()
        logger.info("Initialized Evaluation Preloader")
        count = 0
        total_dist = 0
        total_length = 1
        total_dist_ref = 0
        total_length_ref = 1
        net.eval()
        net_B.eval()
        while not preloader_eval.end_flag:
            batch = preloader_eval.get_batch()
            if batch is not None:
                tensor_input, ground_truth, loss_mask, length_list, batched_num_script, batched_num_script_loss_mask = batch
                pred_tensor, loss = evaluate(net, ctc_loss,
                                             tensor_input.to(device),
                                             ground_truth.to(device),
                                             length_list.to(device), device)
                loss_list_eval.append(loss)
                ####################
                jamo_result = Decode_Prediction_No_Filtering(
                    pred_tensor, tokenizer)
                true_string_list = Decode_Num_Script(
                    batched_num_script.detach().cpu().numpy(), index2char)
                lev_input_ref = ground_truth
                lev_pred_ref, attentions_ref, seq2seq_loss_ref = net_B.net_eval(
                    lev_input_ref.to(device),
                    batched_num_script.to(device),
                    batched_num_script_loss_mask.to(device),
                    net_B_criterion)
                pred_string_list_ref = Decode_Lev_Prediction(
                    lev_pred_ref, index2char)
                # NOTE(review): appends to the *train* ref list during eval
                # (seq2seq_loss_list_eval_ref is never filled) — looks like
                # a copy-paste slip; confirm before relying on these lists.
                seq2seq_loss_list_train_ref.append(seq2seq_loss_ref)
                dist_ref, length_ref = char_distance_list(
                    true_string_list, pred_string_list_ref)
                lev_input = Decode_CTC_Prediction_And_Batch(pred_tensor)
                lev_pred, attentions, seq2seq_loss = net_B.net_eval(
                    lev_input.to(device),
                    batched_num_script.to(device),
                    batched_num_script_loss_mask.to(device),
                    net_B_criterion)
                pred_string_list = Decode_Lev_Prediction(lev_pred, index2char)
                # NOTE(review): same here — eval loss goes into the train
                # list; seq2seq_loss_list_eval stays empty.
                seq2seq_loss_list_train.append(seq2seq_loss)
                dist, length = char_distance_list(true_string_list,
                                                  pred_string_list)
                total_dist_ref += dist_ref
                total_length_ref += length_ref
                total_dist += dist
                total_length += length
                count += 1
                ####################
                if count % 10 == 0:
                    logger.info("Eval: Count {} | {} => {}".format(
                        count, true_string_list[0], pred_string_list_ref[0]))
                    logger.info("Eval: Count {} | {} => {} => {}".format(
                        count, true_string_list[0], jamo_result[0],
                        pred_string_list[0]))
            else:
                logger.info("Training Batch is None")
        eval_cer = total_dist / total_length
        eval_cer_ref = total_dist_ref / total_length_ref
        eval_loss = np.mean(np.asarray(loss_list_eval))
        logger.info("Mean Evaluation Loss: {}".format(eval_loss))
        logger.info("Total Evaluation CER: {}".format(eval_cer))
        logger.info("Total Evaluation Reference CER: {}".format(eval_cer_ref))

        # ---- Reporting and checkpointing ----
        nsml.report(False,
                    step=epoch,
                    train_epoch__loss=train_loss,
                    train_epoch__cer=train_cer,
                    train_epoch__cer_ref=train_cer_ref,
                    eval__loss=eval_loss,
                    eval__cer=eval_cer,
                    eval__cer_ref=eval_cer_ref)
        nsml.save(args.save_name)
        best_model = (eval_cer < best_eval_cer)
        if best_model:
            nsml.save('best')
            best_eval_cer = eval_cer
        logger.info("Inference Check")
# CLI options + model binding for the NSML image-classification runner.
# NOTE(review): `arg` is presumably an alias of parser.add_argument defined
# above this excerpt — confirm.
arg('--mode', type=str, default='train',
    help='submit일때 해당값이 test로 설정됩니다.')
arg('--iteration', type=str, default='0',
    help='fork 명령어를 입력할때의 체크포인트로 설정됩니다. 체크포인트 옵션을 안주면 마지막 wall time 의 model 을 가져옵니다.')
arg('--pause', type=int, default=0,
    help='model 을 load 할때 1로 설정됩니다.')
arg('--SEED', type=int, default=43)
arg('--model', type=str, default='efficientnet_b3')
arg('--input_size', type=int, default=512)
arg('--batch_size', type=int, default=8)
arg('--num_workers', type=int, default=4)
arg('--valid_augments', default='horizontal_flip, random_rotate', type=str)
arg('--augment_ratio', default=0.5, type=float,
    help='probability of implementing transforms')
arg('--tta', type=int, default=1, help='test time augmentation')
# NOTE(review): argparse `type=bool` is a known pitfall — bool('False') is
# True, so any non-empty value enables it; confirm intended usage.
arg('--pretrained', default=False, type=bool,
    help='download pretrained model')
arg('--num_classes', type=int, default=1)
arg('--power', type=int, default=1)
args = parser.parse_args()

# Prefer the first GPU when available.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print("device", device)

global model
model = build_model(args, device)
bind_model(model, args)
if args.mode == 'train':
    # Save an initial checkpoint before training starts.
    nsml.save('sillim')
if args.pause:
    # NSML submit/infer mode: hand control to the platform.
    print('Inferring Start...')
    nsml.paused(scope=locals())
# NOTE(review): the line below is the tail of a flow_from_directory(...)
# call whose head lies above this excerpt.
class_mode="categorical", shuffle=True, seed=42)
""" Callback """
# Reduce learning rate when training accuracy plateaus.
monitor = 'acc'
reduce_lr = ReduceLROnPlateau(monitor=monitor, patience=3)
""" Training loop """
# Steps per epoch = number of samples // generator batch size.
STEP_SIZE_TRAIN = train_generator.n // train_generator.batch_size
t0 = time.time()
# One fit_generator call per epoch so NSML can report/save every epoch.
for epoch in range(nb_epoch):
    t1 = time.time()
    res = model.fit_generator(generator=train_generator,
                              steps_per_epoch=STEP_SIZE_TRAIN,
                              initial_epoch=epoch,
                              epochs=epoch + 1,
                              callbacks=[reduce_lr],
                              verbose=1,
                              shuffle=True)
    t2 = time.time()
    print(res.history)
    print('Training time for one epoch : %.1f' % ((t2 - t1)))
    # history holds one entry per epoch of this call, hence index [0].
    train_loss, train_acc = res.history['loss'][0], res.history['acc'][0]
    nsml.report(summary=True,
                epoch=epoch,
                epoch_total=nb_epoch,
                loss=train_loss,
                acc=train_acc)
    nsml.save(epoch)
print('Total training time : %.1f' % (time.time() - t0))
# NOTE(review): fragment — the first statements continue a validation batch
# loop whose head lies above this excerpt; nesting reconstructed.
            label_vector = to_numpy(tags)
            # Element-wise correctness of this batch's predictions.
            bool_vector = predict_vector == label_vector
            # Averaged over batches: each batch contributes loss/len(loader).
            valid_loss += loss.item() / len(valid_loader)
            valid_total_correct += bool_vector.sum()
elapsed = time.time() - start_time
train_acc = train_total_correct / len(train_loader.dataset)
val_acc = valid_total_correct / len(valid_loader.dataset)
# best val_acc checkpoint
if val_acc > best_val_acc:
    print("val_acc has improved")
    best_val_acc = val_acc
    nsml.save('best_acc')
else:
    print("val_acc has not improved")
# Current learning rate(s), one per optimizer param group.
lr = [_['lr'] for _ in optimizer.param_groups]
# ReduceLROnPlateau needs the metric; other schedulers step unconditionally.
if args.scheduler == 'plateau':
    scheduler.step(val_acc)
else:
    scheduler.step()
# Per-epoch checkpoint in addition to 'best_acc'.
nsml.save(epoch_idx)
# NOTE(review): the format() call is truncated here — its remaining
# arguments continue below this excerpt.
print(
    "Epoch {}/{} train_loss: {:.5f} valid_loss {:.5f} train_acc: {:.3f} valid_acc: {:.3f} lr: {:.6f} elapsed: {:.0f}"
    .format(epoch_idx, args.epochs, train_loss, valid_loss,
# NOTE(review): the first lines are the tail of an
# args.add_argument('--iteration', ...) call begun above this excerpt.
    default='0',
    help=
    'fork 명령어를 입력할때의 체크포인트로 설정됩니다. 체크포인트 옵션을 안주면 마지막 wall time 의 model 을 가져옵니다.'
)
args.add_argument('--pause', type=int, default=0,
                  help='model 을 load 할때 1로 설정됩니다.')
config = args.parse_args()

# base model architecture
base_model = "vgg16"
model = util.select_base_model(base_model)

# new architecture code here
model.summary()

# bind model
bind_model(model)
if config.pause:
    # NSML submit/infer mode: hand control to the platform.
    nsml.paused(scope=locals())
if config.mode == 'train':
    bTrainmode = True
    # load weights
    # Pull pretrained VGG16 weights from the matching NSML session, re-save
    # them under this session as checkpoint 'saved', then stop.
    nsml.load(checkpoint=base_model,
              session=util.model_name2session(base_model))
    nsml.save('saved')
    exit()
def train(self, args):
    """Train cyclegan.

    Alternates generator and discriminator updates over paired image
    batches from trainA/trainB, writing TF summaries, periodic samples and
    checkpoints.

    NOTE(review): body reconstructed from whitespace-collapsed source;
    statement nesting is a best-effort reconstruction — confirm.
    """
    # Learning rate is fed per-step so it can be decayed after epoch_step.
    self.lr = tf.placeholder(tf.float32, None, name='learning_rate')
    self.d_optim = tf.train.AdamOptimizer(self.lr, beta1=args.beta1) \
        .minimize(self.d_loss, var_list=self.d_vars)
    self.g_optim = tf.train.AdamOptimizer(self.lr, beta1=args.beta1) \
        .minimize(self.g_loss, var_list=self.g_vars)
    init_op = tf.global_variables_initializer()
    self.sess.run(init_op)
    self.writer = tf.summary.FileWriter("./logs", self.sess.graph)

    counter = 1
    start_time = time.time()

    if args.continue_train:
        if self.load(args.checkpoint_dir):
            print(" [*] Load SUCCESS")
        else:
            print(" [!] Load failed...")

    # ./datasets/face2cartoon/
    for epoch in range(args.epoch):
        dataA = glob('{}{}/*.*'.format(self.data_path,
                                       self.dataset_dir + '/trainA'))
        dataB = glob('{}{}/*.*'.format(self.data_path,
                                       self.dataset_dir + '/trainB'))
        np.random.shuffle(dataA)
        np.random.shuffle(dataB)
        # One batch needs a sample from each side; cap by train_size.
        batch_idxs = min(min(len(dataA), len(dataB)),
                         args.train_size) // self.batch_size
        # Constant lr until epoch_step, then linear decay to zero.
        lr = args.lr if epoch < args.epoch_step else args.lr * (
            args.epoch - epoch) / (args.epoch - args.epoch_step)

        for idx in range(0, batch_idxs):
            batch_files = list(
                zip(
                    dataA[idx * self.batch_size:(idx + 1) * self.batch_size],
                    dataB[idx * self.batch_size:(idx + 1) * self.batch_size]))
            batch_images = [
                load_train_data(batch_file, args.load_size, args.fine_size)
                for batch_file in batch_files
            ]
            batch_images = np.array(batch_images).astype(np.float32)

            # Update G network and record fake outputs
            fake_A, fake_B, _, summary_str, gan_loss, L1_loss = self.sess.run(
                [
                    self.fake_A, self.fake_B, self.g_optim, self.g_sum,
                    self.gan_loss, self.L1_loss
                ],
                feed_dict={
                    self.real_data: batch_images,
                    self.lr: lr
                })
            self.writer.add_summary(summary_str, counter)
            # Image pool: D sees a history of generated images, not only
            # the freshest ones.
            [fake_A, fake_B] = self.pool([fake_A, fake_B])

            # Update D network
            _, summary_str = self.sess.run(
                [self.d_optim, self.d_sum],
                feed_dict={
                    self.real_data: batch_images,
                    self.fake_A_sample: fake_A,
                    self.fake_B_sample: fake_B,
                    self.lr: lr
                })
            self.writer.add_summary(summary_str, counter)

            counter += 1
            if np.mod(idx, 10) == 0:
                print(("Epoch: [%2d] [%4d/%4d] time: %4.4f" %
                       (epoch, idx, batch_idxs, time.time() - start_time)))
                print("GAN_loss: {0:.6f} \tL1_loss: {1:.6f}".format(
                    gan_loss, L1_loss))
            if np.mod(counter, args.print_freq) == 0:
                # self.sample_model(args.sample_dir, epoch, idx)
                self.visualize(args.sample_dir, counter)
            if np.mod(counter, args.save_freq) == 0:
                self.save(args.checkpoint_dir, counter)
        # Per-epoch NSML checkpoint (placement at epoch level reconstructed
        # — confirm).
        if args.nsml == True:
            nsml.save(epoch)
# NOTE(review): fragment — tail of a best-metric bookkeeping branch; the
# enclosing loop/try and the `if` guarding the first statements lie above
# this excerpt. Nesting reconstructed.
                    best_min_first_K = min_first_1_at_K
                    best_min_first_K_step = step
                    print("----> First_K @ 1 recall : %d / %d" %
                          (min_first_1_at_K, len(mean_recall_at_K)))
                    do_save = True
                # New best mAP also triggers a save.
                if best_mAP <= mAP:
                    best_mAP = mAP
                    print(
                        "----> Best mAP : best-mAP {:g}".format(best_mAP))
                    do_save = True
                # Always save on an epoch boundary.
                if epoch - prev_epoch == 1:
                    print("----> Epoch changed saving")
                    do_save = True
                if do_save:
                    # save model
                    nsml.report(summary=True, epoch=str(step),
                                epoch_total=nb_epoch)
                    nsml.save(step)
                    print("Model saved : %d step" % step)
                print(
                    "============================================================================================================="
                )
            # Input pipeline exhausted: training is finished.
            except tf.errors.OutOfRangeError:
                print("finish train!")
                break
# Plain cross-entropy training loop with per-epoch NSML reporting and
# checkpointing.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=config.initial_lr)
for e_i in range(config.max_epoch):
    sum_loss = 0
    accurate_pred = 0
    num_of_instance = 0
    for i, x_y_pair in enumerate(train_loader):
        x, y = x_y_pair
        logit = model(x)
        loss = loss_fn(logit, y)
        # Standard step: clear stale grads, backprop, apply update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Predicted class = argmax over class probabilities.
        prob = F.softmax(logit, dim=1)
        pred = torch.argmax(prob, dim=1)
        accurate_pred += to_np((pred == y).sum())
        num_of_instance += y.shape[0]
        sum_loss += float(to_np(loss))
        # Accuracy of just this batch, for progress logging.
        b_acc = to_np(torch.mean(torch.eq(pred, y).float()))
        print("this batch acc:", b_acc, "total correct answer:",
              accurate_pred, "total instances:", num_of_instance)
    accuracy = (accurate_pred / num_of_instance)
    # Fix: the original metric keys carried accidental trailing colons
    # ("train__loss:" / "train__accuracy:"), copied from the print labels,
    # which produced malformed metric names in NSML. Report with clean
    # keyword arguments instead.
    nsml.report(summary=True,
                step=e_i,
                scope=locals(),
                train__loss=float(sum_loss),
                train__accuracy=float(accuracy))
    print(e_i, "'th epoch acc:", accuracy, "correct answer:", accurate_pred,
          "total instances:", num_of_instance, "loss:", sum_loss)
    nsml.save(e_i)