def __init__(self, vocab_size, data_dir, mode):
    self.sp = spm.SentencePieceProcessor()
    self.vocab_size = vocab_size

    if mode == 'train':
        self.data_file_path = os.path.join(data_dir, 'train/train_data/train_data')

        def save_and_train_tokenizer(path, *args, **kwargs):
            with open(self.data_file_path) as f:
                data = f.read().splitlines()
            corpus = []
            for line in data:
                corpus += line.split("\t")[1:]
            with open('corpus', 'w') as wf:
                for text in corpus:
                    wf.write("%s\n" % text)
            templates = '--input={} --model_prefix={} --vocab_size={} --hard_vocab_limit=false'
            spm.SentencePieceTrainer.Train(
                templates.format('corpus', os.path.join(path, 'SPM'), self.vocab_size))
            self.sp.Load(os.path.join(path, 'SPM.model'))

        nsml.save('tokenizer', save_fn=save_and_train_tokenizer)
    elif mode == 'test':
        def load_tokenizer(path, *args, **kwargs):
            self.sp.Load(os.path.join(path, 'SPM.model'))

        nsml.load(checkpoint='tokenizer', load_fn=load_tokenizer, session=NSML_SESSION)
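# --- Added usage sketch (not part of the original source): once the
# SentencePiece model trained above is available, text round-trips through
# subword ids. The sample sentence and local 'SPM.model' path are illustrative.
import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.Load('SPM.model')                        # model produced by SentencePieceTrainer.Train
ids = sp.EncodeAsIds('a sample sentence')   # list[int] of subword ids
print(ids, sp.DecodeIds(ids))               # decode back to (normalized) text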
def test(config):
    NSML_SESSION = 'team_6/19_tcls_qa/80'  # NOTE: need to hard-code for submit
    NSML_CHECKPOINT = '13800'              # NOTE: need to hard-code for submit
    assert NSML_CHECKPOINT is not None, "You must insert NSML Session's checkpoint for submit"
    assert NSML_SESSION is not None, "You must insert NSML Session's name for submit"

    set_global_seed(config.seed_num)

    token_makers = create_by_factory(TokenMakersFactory, config.token)
    tokenizers = token_makers["tokenizers"]
    del token_makers["tokenizers"]

    config.data_reader.tokenizers = tokenizers
    data_reader = create_by_factory(DataReaderFactory, config.data_reader)

    def bind_load_vocabs(config, token_makers):
        CHECKPOINT_FNAME = "checkpoint.bin"

        def load(dir_path):
            checkpoint_path = os.path.join(dir_path, CHECKPOINT_FNAME)
            checkpoint = torch.load(checkpoint_path)

            vocabs = {}
            token_config = config.token
            for token_name in token_config.names:
                token = getattr(token_config, token_name, {})
                vocab_config = getattr(token, "vocab", {})
                texts = checkpoint["vocab_texts"][token_name]
                if type(vocab_config) != dict:
                    vocab_config = vars(vocab_config)
                vocabs[token_name] = Vocab(token_name, **vocab_config).from_texts(texts)

            for token_name, token_maker in token_makers.items():
                token_maker.set_vocab(vocabs[token_name])
            return token_makers

        nsml.bind(load=load)

    bind_load_vocabs(config, token_makers)
    nsml.load(checkpoint=NSML_CHECKPOINT, session=NSML_SESSION)

    # Raw to Tensor Function
    text_handler = TextHandler(token_makers, lazy_indexing=False)
    raw_to_tensor_fn = text_handler.raw_to_tensor_fn(data_reader, cuda_device=device)

    # Model & Optimizer
    model = create_model(token_makers, ModelFactory, config.model, device)
    trainer = Trainer(model, metric_key="f1")

    if nsml.IS_ON_NSML:
        bind_nsml(model, trainer=trainer, raw_to_tensor_fn=raw_to_tensor_fn)
        if config.nsml.pause:
            nsml.paused(scope=locals())
def _infer(model, root_path, test_loader=None):
    if test_loader is None:
        test_loader = data_loader(root=os.path.join(root_path, 'test_data'), phase='test')

    ensembles_xy = []
    ensembles_w = []
    for sess, chkp, w in archives:
        nsml.load(checkpoint=chkp, session=sess)
        model.eval()

        outputs = []
        outputs_w = []
        num_data = 0
        for idx, (image, _) in enumerate(test_loader):
            with torch.no_grad():
                locs, scores = model(image.cuda())
                all_images_boxes, all_scores = model.detect_objects(locs, scores)
            for box in all_images_boxes:
                box = box.detach().cpu().numpy()
                # convert (x, y, w, h) to corner format (x1, y1, x2, y2)
                box_xy = np.array([box[0], box[1], box[0] + box[2], box[1] + box[3]],
                                  dtype=np.float32)
                outputs.append(box_xy)
            outputs_w.extend(all_scores)
            num_data += len(all_images_boxes)
        ensembles_xy.append(np.array(outputs))
        ensembles_w.append(outputs_w)

    # ensembles_xy = np.mean(ensembles_xy, axis=0)
    # keep, per image, the box from the model with the highest confidence
    ensemble_result = [None] * len(ensembles_xy[0])
    best_w = defaultdict(lambda: 0)
    for xys, ws in zip(ensembles_xy, ensembles_w):
        for i, (xy, w) in enumerate(zip(xys, ws)):
            if best_w[i] > w:
                continue
            ensemble_result[i] = xy
            best_w[i] = w

    ensembles_xy = np.array(ensemble_result)
    print(ensembles_xy.shape)
    assert ensembles_xy.shape[0] == num_data
    assert ensembles_xy.shape[1] == 4

    # convert the winning boxes back to (x, y, w, h)
    ensembles = []
    for xy in ensembles_xy:
        box = np.array([xy[0], xy[1], xy[2] - xy[0], xy[3] - xy[1]])
        ensembles.append(box)
    outputs = np.stack(ensembles, axis=0)
    assert outputs.shape[0] == num_data
    assert outputs.shape[1] == 4
    print(outputs.shape)
    return outputs
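# --- Added helper sketch (assumption, not from the original source): the
# inline conversions above, between (x, y, w, h) and corner (x1, y1, x2, y2)
# box formats, written as small vectorized helpers.
import numpy as np

def xywh_to_xyxy(boxes: np.ndarray) -> np.ndarray:
    """(N, 4) boxes from (x, y, w, h) to (x1, y1, x2, y2)."""
    out = boxes.copy()
    out[:, 2] = boxes[:, 0] + boxes[:, 2]
    out[:, 3] = boxes[:, 1] + boxes[:, 3]
    return out

def xyxy_to_xywh(boxes: np.ndarray) -> np.ndarray:
    """(N, 4) boxes from (x1, y1, x2, y2) back to (x, y, w, h)."""
    out = boxes.copy()
    out[:, 2] = boxes[:, 2] - boxes[:, 0]
    out[:, 3] = boxes[:, 3] - boxes[:, 1]
    return out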
def _infer(model, data):
    start = time.time()

    print('test preprocessing start!')
    # data: [a, b, c, ...]
    data_bc = []
    bc_func, _ = preprocess_dict["ben_clahe"]
    for d in data:
        d = cv2.resize(d, (704, 544))
        data_bc.append(bc_func(d))
    elapsed = time.time() - start
    print('test preprocessing time: %d hours %d minutes %d seconds' %
          (elapsed // 3600, (elapsed % 3600) // 60, (elapsed % 3600) % 60))
    print('test preprocessing ended!')
    del data

    final = []
    for sess, ckpt, config_path in ensemble_checkpoints:
        config = utils.config.load(config_path)
        model = get_model(config).cuda()
        bind_model(model)
        nsml.load(checkpoint=ckpt, session=sess)

        # per-model preprocessing (disabled; all models share the "ben_clahe" input):
        # data_processed = []
        # _func, _ = preprocess_dict[config.DATA.PREPROCESS]
        # for d in data:
        #     d = cv2.resize(d, (config.DATA.IMG_W, config.DATA.IMG_H))
        #     data_processed.append(_func(d))

        out = run(model, data_bc, config)
        final.append(out)
        del model

    # averaging instead of summing would not change the argmax:
    # final = sum(final) / float(len(ensemble_checkpoints))
    final = sum(final)
    final = np.argmax(final, axis=1)
    print(final.shape)
    print(final)

    elapsed = time.time() - start
    print('Total inference time: %d hours %d minutes %d seconds' %
          (elapsed // 3600, (elapsed % 3600) // 60, (elapsed % 3600) % 60))
    return final
def model_infer(image_path, args):
    # fix seed for reproducibility
    seed_everything(args.SEED)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    default_transforms = transforms.Compose([transforms.Resize(args.input_size)])
    test_transforms = get_transform(target_size=(args.input_size, args.input_size),
                                    transform_list=args.valid_augments,
                                    augment_ratio=args.augment_ratio,
                                    is_train=False)
    test_dataset = PathDataset(image_paths=image_path, labels=None,
                               default_transforms=default_transforms,
                               transforms=test_transforms, is_test=True)
    test_loader = DataLoader(dataset=test_dataset, batch_size=args.batch_size,
                             shuffle=False, num_workers=args.num_workers, pin_memory=True)

    total_pred = []
    # ensemble models
    for i, (load_session, load_checkpoint) in enumerate(model_list):
        try:
            nsml.load(checkpoint=load_checkpoint, session=load_session)
            print(f'{i}th model loaded {load_session} {load_checkpoint}')
        except Exception:
            print(f'{i}th model load cancel')
        model.to(device)
        model.eval()

        fold_pred = []
        # test-time augmentation
        for _ in range(args.tta):
            tta_pred = []
            with torch.no_grad():
                for images in test_loader:
                    output = torch.sigmoid(model(images.to(device))).cpu().detach().numpy()
                    tta_pred.append(output)
            tta_pred = np.concatenate(tta_pred).flatten()
            fold_pred.append(tta_pred)
        total_pred.append(np.array(fold_pred))

    # power-mean over all models and TTA passes, then threshold
    total_pred = np.concatenate(total_pred)
    total_pred = np.mean(total_pred ** args.power, axis=0)
    threshold = 0.5
    total_pred = np.where(np.array(total_pred) >= threshold, 1, 0)
    total_pred = total_pred.astype(np.int64)
    return total_pred
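# --- Added sketch (assumption, not from the original source): the
# mean-of-powers reduction above is a generalized (power) mean; power=1.0 is
# a plain average, larger powers weight confident predictions more heavily
# before thresholding.
import numpy as np

def power_mean_ensemble(preds: np.ndarray, power: float = 1.0,
                        threshold: float = 0.5) -> np.ndarray:
    """preds: (n_predictions, n_samples) sigmoid outputs -> hard 0/1 labels."""
    fused = np.mean(preds ** power, axis=0)
    return (fused >= threshold).astype(np.int64)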
def _infer(root, phase, model, task):
    # root: csv file path (TODO: change soon)
    print('_infer root - : ', root)

    # model, fixlen_feature_names_global, item = get_xDeepFM()
    model, fixlen_feature_names_global, item, image_feature_dict, id_to_artic = get_item(root)
    print('--get item finished---')

    checkpoint_session = ['3', 'team_62/airush2/258']
    nsml.load(checkpoint=str(checkpoint_session[0]), session=str(checkpoint_session[1]))
    print('-- model_load completed --')

    s = time.time()
    data_1_article_idxs = item['article_id'].tolist()
    li = []
    for i in range(len(data_1_article_idxs)):
        image_feature = image_feature_dict[id_to_artic[data_1_article_idxs[i]]]
        li.append(image_feature)

    # sanity check: the same article id should map to the same image feature
    print('------------is same image picture? let me check---------------')
    print('article_id : ', '757518f4a3da')
    print('article_id feature : ', image_feature_dict['757518f4a3da'])
    print('--------------------------------------------------------------')

    item['image_feature'] = li
    li = []
    print(f'finished data_1_image_feature : {time.time() - s} sec')

    test_generator = data_generator_test(item)
    predicts = model.predict_generator(test_generator, steps=len(item), workers=4)
    print(f'y_pred shape : {predicts.shape}')
    print(f'y_pred type : {type(predicts)}')
    print(predicts)

    predicts = predicts.reshape((len(item), ))
    pl = predicts.tolist()
    print(pl[:50])
    print(pl[-50:])
    return predicts
def self_training(self):
    self.data.prepare()
    dataset = 'train'
    img_list, src_dir_list = self.data.get_unlabeled_data(dataset)
    nsml.load(checkpoint='best', session='Ye-Ji-Kim/spam-3/87')

    for idx, img in enumerate(img_list):
        img = np.array([img])
        predict = self.network.predict(img)[0]
        # pseudo-label only confident, non-background predictions
        if np.max(predict) > 0.90 and np.argmax(predict) != 0:
            self.data.save_unlabeled_data(dataset, src_dir_list[idx], np.argmax(predict))
            print(f'save in {np.argmax(predict)}')
    self.data.show_data_size(dataset)
def model_infer(data):
    batch_size = 1
    num_workers = 4
    target_size = (384, 384)

    test_transforms = transforms.Compose([
        transforms.CenterCrop(2000),
        transforms.Resize(target_size),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])
    test_dataset = TestDataset(data, test_transforms)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False,
                             num_workers=num_workers, pin_memory=True)

    load_list = (('team012/KHD2019_FUNDUS/307', 'best_acc_4'),
                 ('team012/KHD2019_FUNDUS/321', 'best_acc_4'))

    last_pred = []
    for load_session, load_checkpoint in load_list:
        try:
            nsml.load(checkpoint=load_checkpoint, session=load_session)
        except Exception:
            print('load cancel')
        model.to(device)
        model.eval()

        preds = np.zeros((len(test_loader.dataset), args.num_classes))
        with torch.no_grad():
            for i, image in enumerate(test_loader):
                image = image.to(device)
                output = model(image)  # output shape: (batch_num, num_classes)
                preds[i * batch_size:(i + 1) * batch_size] = output.detach().cpu().numpy()
        last_pred.append(preds)

    # soft-voting ensemble: average class scores over the loaded checkpoints
    last_pred = np.mean(last_pred, axis=0)
    print(last_pred.shape)
    predictions = np.argmax(last_pred, axis=1)
    print(predictions)
    return predictions
def train(experiment_name: str = 'v1', pause: bool = False, mode: str = 'train'):
    config = import_module(f'spam.training.experiments.{experiment_name}').config
    model = config['model'](**config['model_kwargs'])
    bind_model(model)
    if pause:
        nsml.paused(scope=locals())

    if mode == 'train':
        # nsml.load(checkpoint='last_layer_tuning', session='hi-space/spam-2/14')
        # nsml.load(checkpoint='best', session='hi-space/spam-1/147')
        nsml.load(checkpoint='full_tuning_21', session='hi-space/spam-3/3')
        nsml.save('best')
        print('best model saved')
        # exit()

        print('-----------')
        print(config)
        print('-----------')
        model.fit(**config['fit_kwargs'])
def infer(test_image_data_path, test_meta_data_path):
    # DONOTCHANGE This Line
    test_meta_data = pd.read_csv(test_meta_data_path, delimiter=',', header=0)

    input_size = 224  # you can change this according to your model
    batch_size = 200  # you can change this; 'nsml submit --test' only feeds 200 samples
    device = 0

    we = 0.25  # equal weight for each of the 4 ensemble members
    ensemble = [['team_62/airush1/320', '02'], ['team_62/airush1/320', '12'],
                ['team_62/airush1/320', '22'], ['team_62/airush1/320', '32']]
    # ensemble = [['team_62/airush1/415', '03'], ['team_62/airush1/415', '13'],
    #             ['team_62/airush1/415', '23'], ['team_62/airush1/415', '33']]

    predict_list = []
    for i in range(4):
        dataloader = DataLoader(
            AIRushDataset(test_image_data_path, test_meta_data, label_path=None,
                          transform=transforms.Compose([
                              transforms.Resize((input_size, input_size)),
                              transforms.RandomRotation(20),
                              transforms.ToTensor(),
                              transforms.Normalize([0.485, 0.456, 0.406],
                                                   [0.229, 0.224, 0.225])])),
            batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)

        # Let's do ensemble!!!
        nsml.load(checkpoint=str(ensemble[i][1]), session=str(ensemble[i][0]))  # model load
        model_nsml.to(device)
        model_nsml.eval()

        predict_output_list = []
        with torch.no_grad():
            for batch_idx, image in enumerate(dataloader):
                image = image.to(device)
                output = model_nsml(image).double()
                output_prob = to_np(F.softmax(output, dim=1))
                predict_output_list.append(output_prob * we)
        predict_output_list = np.concatenate(predict_output_list, axis=0)
        predict_list.append(predict_output_list)

    # weighted sum of the per-model softmax outputs, then argmax
    predict_vector = np.argmax(np.sum(predict_list, axis=0), axis=1)
    return predict_vector  # a numpy array with shape (138343,)
def main():
    seed_everything()
    config = utils.config.load(ensemble_checkpoints[0][2])
    model = get_model(config).cuda()
    bind_model(model)

    args = get_args()
    if args.pause:  # test mode
        print('Inferring Start...')
        nsml.paused(scope=locals())

    if args.mode == 'train':  # training mode
        print('Training Start...')
        nsml.load(session=ensemble_checkpoints[0][0], checkpoint=ensemble_checkpoints[0][1])
        nsml.save(0)
        exit()
def main():
    seed_everything()
    pprint.pprint(config, indent=2)
    model = get_model(config).cuda()
    bind_model(model)

    args = get_args()
    if args.pause:  # test mode
        print('Inferring Start...')
        nsml.paused(scope=locals())

    if args.mode == 'train':  # training mode
        print('Training Start...')
        nsml.load(checkpoint='18', session='team146/KHD2019_FUNDUS/20')
        nsml.save(0)
        exit()
def fit(self, epochs_finetune, epochs_full, batch_size, debug=False):
    session_name = 'qkek984/spam-3/59'
    nsml.load(checkpoint='best', session=session_name)
    print(session_name, "model load!")
    # nsml.save(checkpoint='saved')
    # exit()

    self.debug = debug
    self.data.prepare(unlabeledset=True)
    print("len unlabeled : ", self.data.lenUnlabeled('unlabeled'))  # check unlabeled data

    self.network.compile(
        loss=self.loss(),
        optimizer=self.optimizer('full'),
        metrics=self.fit_metrics()
    )
    val_gen = self.data.ST_val_gen(batch_size)
    self.myMetrics(val_gen=val_gen, batch_size=batch_size)  # do self-training
    return self.data.base_dir
def _infer(model, root_path, test_loader=None):
    if test_loader is None:
        test_loader = data_loader(root=os.path.join(root_path, 'test_data'), phase='test')

    res_fcs = []
    for sess, chkp, w in archives:
        nsml.load(checkpoint=chkp, session=sess)
        model.eval()

        res_fc = None
        res_id = None
        for idx, (data_id, image, _) in enumerate(tqdm(test_loader)):
            image = image.cuda()
            with torch.no_grad():
                fc = model(image)
            fc = fc.detach().cpu().numpy()
            fc = np_softmax(fc)

            # with torch.no_grad():
            #     fc2 = model(torch.flip(image, (3, )))  # TTA: horizontal flip
            # fc2 = fc2.detach().cpu().numpy()
            # fc2 = np_softmax(fc2)
            # fc = fc + fc2

            if C.get()['infer_mode'] == 'face':
                fc[:, range(60)] = -1  # target_lb = list(range(60, 100))

            if idx == 0:
                res_fc = fc
                res_id = data_id
            else:
                res_fc = np.concatenate((res_fc, fc), axis=0)
                res_id = res_id + data_id
        res_fcs.append(res_fc * w)

    # weighted soft-voting over the archived checkpoints
    res_cls = np.argmax(np.sum(res_fcs, axis=0), axis=1)
    return [res_id, res_cls]
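# --- Added sketch (assumption): np_softmax is referenced above but not
# defined in this snippet; a numerically stable NumPy version would look
# like the following.
import numpy as np

def np_softmax(x: np.ndarray, axis: int = -1) -> np.ndarray:
    """Row-wise softmax; subtracting the max avoids overflow in exp."""
    z = x - np.max(x, axis=axis, keepdims=True)
    e = np.exp(z)
    return e / np.sum(e, axis=axis, keepdims=True)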
def load_finetuned(self):
    for model, sess_ in zip(self.models, self.session):
        print(sess_)
        if model:
            bind_model(model)
            nsml.load(checkpoint='best', session=sess_)
            model = model.cuda()

    if self.mode == 'xgb':
        # freeze all parameters in 'xgb' mode
        for model in self.models:
            if model:
                for name, param in model.named_parameters():
                    param.requires_grad = False
    else:
        # in 'soft' mode, keep only the fc layers trainable
        for model in self.models:
            for name, param in model.named_parameters():
                if 'fc' in name and self.mode == 'soft':
                    param.requires_grad = True
                else:
                    param.requires_grad = False

    print("Pretrained weight loaded!")
    bind_ensemble_model(self)
# opt = optimizers.SGD(lr=learning_rate, momentum=0.9, nesterov=True)
# opt = optimizers.adamax(lr=learning_rate)
model.compile(loss='categorical_crossentropy', optimizer=opt,
              metrics=['categorical_accuracy'])
bind_model(model)

if config.pause:  # test mode
    print('Inferring Start...')
    nsml.paused(scope=locals())

if config.mode == 'train':  # training mode
    print('Training Start...')
    img_path = DATASET_PATH + '/train/'

    if config.load_model:
        nsml.load(checkpoint=config.load_model_ckpt, session=config.load_model)
        if nb_epoch == 0:
            nsml.save("zero")
            exit()

    if config.load_from:  # load preprocessed data from a saved session
        data = {}

        def nsml_load(dir_path, **kwargs):
            images = np.load(os.path.join(dir_path, 'data_x.npy'))
            labels = np.load(os.path.join(dir_path, 'data_y.npy'))
            data['x'] = images
            data['y'] = labels
            print("Data Loaded!!!")

        nsml.load(checkpoint='data', load_fn=nsml_load, session=config.load_from)
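# --- Added sketch (assumption): the custom load_fn above implies a matching
# save_fn that wrote data_x.npy / data_y.npy to the 'data' checkpoint in the
# source session; it would look roughly like this.
def nsml_save(dir_path, **kwargs):
    np.save(os.path.join(dir_path, 'data_x.npy'), data['x'])
    np.save(os.path.join(dir_path, 'data_y.npy'), data['y'])
    print("Data Saved!!!")

# nsml.save(checkpoint='data', save_fn=nsml_save)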
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints and predictions will be written.")

    # Other parameters
    parser.add_argument("--data_dir", default=None, type=str,
                        help="The input data dir. Should contain the .json files for the task. "
                             "If no data dir or train/predict files are specified, will run with tensorflow_datasets.")
    parser.add_argument("--train_file", default=None, type=str,
                        help="The input training file. If a data dir is specified, will look for the file there. "
                             "If no data dir or train/predict files are specified, will run with tensorflow_datasets.")
    parser.add_argument("--predict_file", default=None, type=str,
                        help="The input evaluation file. If a data dir is specified, will look for the file there. "
                             "If no data dir or train/predict files are specified, will run with tensorflow_datasets.")
    parser.add_argument("--config_name", default="", type=str,
                        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument("--tokenizer_name", default="", type=str,
                        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--version_2_with_negative", action="store_true",
                        help="If true, the SQuAD examples contain some that do not have an answer.")
    parser.add_argument("--null_score_diff_threshold", type=float, default=0.0,
                        help="If null_score - best_non_null is greater than the threshold predict null.")
    parser.add_argument("--max_seq_length", default=384, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
                             "longer than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument("--doc_stride", default=128, type=int,
                        help="When splitting up a long document into chunks, how much stride to take between chunks.")
    parser.add_argument("--max_query_length", default=64, type=int,
                        help="The maximum number of tokens for the question. Questions longer than this will "
                             "be truncated to this length.")
    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
    parser.add_argument("--evaluate_during_training", default=True, action="store_true",
                        help="Run evaluation during training at each logging step.")
    parser.add_argument("--do_lower_case", action="store_true",
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=24, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
    parser.add_argument("--n_best_size", default=20, type=int,
                        help="The total number of n-best predictions to generate in the nbest_predictions.json output file.")
    parser.add_argument("--max_answer_length", default=30, type=int,
                        help="The maximum length of an answer that can be generated. This is needed because the start "
                             "and end predictions are not conditioned on one another.")
    parser.add_argument("--verbose_logging", action="store_true",
                        help="If true, all of the warnings related to data processing will be printed. "
                             "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--logging_steps", type=int, default=100, help="Log every X updates steps.")
    parser.add_argument("--save_steps", type=int, default=1000, help="Save checkpoint every X updates steps.")
    parser.add_argument("--eval_all_checkpoints", action="store_true",
                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
    parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
    parser.add_argument("--overwrite_output_dir", action="store_true",
                        help="Overwrite the content of the output directory")
    parser.add_argument("--overwrite_cache", action="store_true",
                        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--fp16", action="store_true",
                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
    parser.add_argument("--fp16_opt_level", type=str, default="O1",
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
    parser.add_argument("--threads", type=int, default=1,
                        help="multiple threads for converting example to features")

    ### DO NOT MODIFY THIS BLOCK ###
    # arguments for nsml
    parser.add_argument('--pause', type=int, default=0)
    parser.add_argument('--mode', type=str, default='train')
    ################################

    args = parser.parse_args()

    # for NSML
    args.data_dir = os.path.join(DATASET_PATH, args.data_dir)

    if (os.path.exists(args.output_dir)
            and os.listdir(args.output_dir)
            and args.do_train
            and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
        filename='log.log')
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-small-v3-discriminator")
    model_SC = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-small-v3-discriminator")
    model_QA = ElectraForQuestionAnswering.from_pretrained("monologg/koelectra-small-v3-discriminator")

    if args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    model_SC.to(args.device)
    model_QA.to(args.device)

    ### DO NOT MODIFY THIS BLOCK ###
    if IS_ON_NSML:
        bind_nsml(model_SC, model_QA, tokenizer, args)
        if args.pause:
            nsml.paused(scope=locals())
    ################################

    # Before loading, save models using 'run_save_model.py' to gather models in separate sessions.
    nsml.load(checkpoint='saved', session="kaist006/korquad-open-ldbd3/160")
    nsml.save('best_model')

    logger.info("Training/evaluation parameters %s", args)

    # Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum
    # if args.fp16 is set. Otherwise it'll default to "promote" mode, and we'll get fp32 operations.
    # Note that running `--fp16_opt_level="O2"` will remove the need for this code, but it is still valid.
    if args.fp16:
        try:
            import apex

            apex.amp.register_half_function(torch, "einsum")
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")

    if args.do_eval:
        result = evaluate(args, model_SC, model_QA, tokenizer)
        _f1, _exact = result["f1"], result["exact"]
        print('f1: {}, exact: {}'.format(_f1, _exact))
# model setting
# NOTE: the models must be loaded at exactly this point
model1 = build_xception()
model2 = build_xception()
model3 = build_xception2()

# Loss and optimizer
'''
model.compile(tf.keras.optimizers.Adam(learning_rate=learning_rate),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(),
              metrics=['accuracy', recall, precision, f1, sp, ntv, custom])
'''

############ DONOTCHANGE ###############
bind_model(model1)
nsml.load(checkpoint='19', session='KHD032/Breast_Pathology/392')
bind_model(model2)
nsml.load(checkpoint='29', session='KHD032/Breast_Pathology/336')
bind_model(model3)
nsml.load(checkpoint='62', session='KHD032/Breast_Pathology/223')

alpha = 1.2
input_ = tf.keras.Input(shape=(299, 299, 3))
m1 = model1(input_)
m2 = model2(input_)
m3 = model3(input_)
# expand m3 (a single score) into two class columns, biasing the positive
# class by alpha, before summing with the two-class models
m3_out = tf.keras.layers.concatenate([1 - m3, m3 + alpha])
# m3_out = tf.keras.layers.concatenate([1 - m3, m3])
out = tf.keras.layers.add([m1, m2, m3_out])
# out = tf.keras.layers.add([m1, m2, m3])
img_list = pickle.load(img_f)
with open(output_path[1], 'rb') as label_f:
    label_list = pickle.load(label_f)

mean_arr = None  # np.zeros(input_shape)
# for img in img_list:
#     mean_arr += img.astype('float32')
# mean_arr /= len(img_list)
# print('mean shape:', mean_arr.shape, 'mean mean:', mean_arr.mean(), 'mean max:', mean_arr.max())
# mean_arr /= 255
# np.save('./mean.npy', mean_arr)

if config.pause:
    nsml.paused(scope=locals())

bTrainmode = False
if config.mode == 'train':
    bTrainmode = True

    # nsml.load(checkpoint='86', session='Zonber/ir_ph1_v2/204')  # NASNet Large 222
    nsml.load(checkpoint='0', session='Zonber/ir_ph2/222')  # InceptionResNetV2 222

    print('convert start model')
    # cut the network at the triplet-loss layer to obtain an embedding model
    intermediate_layer_model = Model(inputs=model.input[0],
                                     outputs=model.get_layer('triplet_loss_layer').input[0])
    model_r = reduce_keras_model(intermediate_layer_model)
    model_r.summary()
    print('convert complete reduce model')

    bind_model(model_r)
    print('bind reduce model complete')
    nsml.save(0)  # this checkpoint name is displayed on the leaderboard
def infer(test_image_data_path, test_meta_data_path):
    # DONOTCHANGE This Line
    test_meta_data = pd.read_csv(test_meta_data_path, delimiter=',', header=0)
    device = 0

    models = args.models.split(",")
    model_weights = [float(w) for w in args.model_weights.split(",")]
    nsml_sessionss = args.nsml_sessionss.split(",")
    nsml_checkpoints = args.nsml_checkpoints.split(",")
    loss_types = args.loss_types.split(",")
    transform_random_crop = args.transform_random_crop.split(",")
    transform_random_sized_crop = args.transform_random_sized_crop.split(",")
    transform_norm = args.transform_norm.split(",")
    infer_transform_center_crop = args.infer_transform_center_crop.split(",")

    total_output_probs = None
    for i, model_name in enumerate(models):
        batch_size = batch_size_map[model_name] // 2

        # build the per-model inference transform
        infer_transform_list = []
        if infer_transform_center_crop[i] == "True":
            infer_transform_list.append(transforms.Resize((248, 248)))
            infer_transform_list.append(transforms.CenterCrop((args.input_size, args.input_size)))
            infer_transform_list.append(transforms.ToTensor())
            if transform_norm[i] == "True":
                infer_transform_list.append(
                    transforms.Normalize([0.44097832, 0.44847423, 0.42528335],
                                         [0.25748107, 0.26744914, 0.30532702]))
        else:
            if transform_random_crop[i] == "True":
                infer_transform_list.append(transforms.Resize((256, 256)))
                infer_transform_list.append(transforms.CenterCrop((args.input_size, args.input_size)))
            elif transform_random_sized_crop[i] == "True":
                infer_transform_list.append(transforms.Resize((256, 256)))
                infer_transform_list.append(transforms.CenterCrop((args.input_size, args.input_size)))
            else:
                infer_transform_list.append(transforms.Resize((args.input_size, args.input_size)))
            infer_transform_list.append(transforms.ToTensor())
            if transform_norm[i] == "True":
                infer_transform_list.append(
                    transforms.Normalize([0.44097832, 0.44847423, 0.42528335],
                                         [0.25748107, 0.26744914, 0.30532702]))
        print("transform", infer_transform_list)

        dataloader = DataLoader(
            AIRushDataset(test_image_data_path, test_meta_data, label_path=None,
                          transform=transforms.Compose(infer_transform_list)),
            batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=True)

        if model_name == "Resnet18":
            model = Resnet18(args.output_size)
        elif model_name == "Resnet152":
            model = Resnet152(args.output_size)
        elif model_name == "baseline":
            model = Baseline(args.hidden_size, args.output_size)
        elif model_name.split("-")[0] == "efficientnet":
            model = EfficientNet.from_pretrained(args.model, args.output_size)
        else:
            raise Exception("model type is invalid : " + args.model)
        model.to(device)

        def load_fn(dir_name):
            save_state_path = os.path.join(dir_name, 'state_dict.pkl')
            state = torch.load(save_state_path)
            model.load_state_dict(state['model'])
            print("model loaded", dir_name)

        model.eval()
        nsml.load(checkpoint=nsml_checkpoints[i], load_fn=load_fn,
                  session="team_13/airush1/" + nsml_sessionss[i])

        output_probs = None
        for batch_idx, image in enumerate(dataloader):
            image = image.to(device)
            output = model(image).double()
            if loss_types[i] == "cross_entropy":
                output_prob = F.softmax(output, dim=1)
            else:
                output_prob = torch.sigmoid(output)
            if output_probs is None:
                output_probs = to_np(output_prob)
            else:
                output_probs = np.concatenate([output_probs, to_np(output_prob)], axis=0)

        if total_output_probs is None:
            total_output_probs = output_probs * model_weights[i]
        else:
            total_output_probs += (output_probs * model_weights[i])

    predict = np.argmax(total_output_probs, axis=1)
    return predict  # this return type should be a numpy array which has shape of (138343,)
                  default='0',
                  help='Set to the checkpoint given to the fork command; if no checkpoint '
                       'option is passed, the model from the last wall time is used.')
args.add_argument('--pause', type=int, default=0,
                  help='Set to 1 when the model is being loaded.')
config = args.parse_args()

# base model architecture
base_model = "vgg16"
model = util.select_base_model(base_model)

# new architecture code here

model.summary()

# bind model
bind_model(model)

if config.pause:
    nsml.paused(scope=locals())

if config.mode == 'train':
    bTrainmode = True

    # load weights
    nsml.load(checkpoint=base_model, session=util.model_name2session(base_model))
    nsml.save('saved')
    exit()
def main():
    parser = argparse.ArgumentParser()

    # Required parameters; we defined additional arguments for the experiment
    parser.add_argument("--model_type", default=None, type=str, required=True,
                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
                        help="Path to pre-trained model or shortcut name")
    parser.add_argument("--load_cache", action="store_true", help="load data from cached session")
    parser.add_argument("--save_cache", action="store_true", help="save loaded dataset into cache")
    parser.add_argument("--cached_session_pretrain", default="", type=str,
                        help="Path to cache where 'Span-Pretraining' dataset is stored")
    parser.add_argument("--cached_session_pretrain_qa", default="", type=str,
                        help="Path to cache where 'QA-Pretraining' dataset is stored")
    parser.add_argument("--cached_session_train", default="", type=str,
                        help="Path to cache where given 'training' dataset is stored")
    parser.add_argument("--cached_session_dev", default="", type=str,
                        help="Path to cache where given 'development set' is stored")
    parser.add_argument("--load_model", action="store_true",
                        help="use pretrained model from previous sessions")
    parser.add_argument("--load_model_session", default="", type=str, help="Path to pre-trained model")
    parser.add_argument("--load_model_checkpoint", default="", type=str, help="Path to pre-trained model")
    parser.add_argument("--just_for_save", action="store_true",
                        help="save checkpoint and terminate immediately")
    parser.add_argument("--freeze_embedding", action="store_true",
                        help="finetuning just classification layer")
    parser.add_argument("--mix_qa", action="store_true", help="mix qa set for variance")
    parser.add_argument("--mix_portion", type=float, default=0.5,
                        help="defines portion of qa pairs to be reconstructed")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints and predictions will be written.")

    # Other parameters
    parser.add_argument("--data_dir", default=None, type=str,
                        help="The input data dir. Should contain the .json files for the task. "
                             "If no data dir or train/predict files are specified, will run with tensorflow_datasets.")
    parser.add_argument("--train_file", default=None, type=str,
                        help="The input training file. If a data dir is specified, will look for the file there. "
                             "If no data dir or train/predict files are specified, will run with tensorflow_datasets.")
    parser.add_argument("--predict_file", default=None, type=str,
                        help="The input evaluation file. If a data dir is specified, will look for the file there. "
                             "If no data dir or train/predict files are specified, will run with tensorflow_datasets.")
    parser.add_argument("--config_name", default="", type=str,
                        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument("--tokenizer_name", default="", type=str,
                        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--version_2_with_negative", action="store_true",
                        help="If true, the SQuAD examples contain some that do not have an answer.")
    parser.add_argument("--null_score_diff_threshold", type=float, default=0.0,
                        help="If null_score - best_non_null is greater than the threshold predict null.")
    parser.add_argument("--max_seq_length", default=384, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
                             "longer than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument("--doc_stride", default=128, type=int,
                        help="When splitting up a long document into chunks, how much stride to take between chunks.")
    parser.add_argument("--max_query_length", default=64, type=int,
                        help="The maximum number of tokens for the question. Questions longer than this will "
                             "be truncated to this length.")
    parser.add_argument("--do_pretrain_span", action="store_true", help="Whether to run span-pretraining.")
    parser.add_argument("--do_pretrain_qa", action="store_true", help="Whether to run qa-pretraining.")
    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
    parser.add_argument("--evaluate_during_training", default=True, action="store_true",
                        help="Run evaluation during training at each logging step.")
    parser.add_argument("--do_initial_validation", action="store_true",
                        help="Whether to run initial validation")
    parser.add_argument("--do_lower_case", action="store_true",
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
    parser.add_argument("--n_best_size", default=20, type=int,
                        help="The total number of n-best predictions to generate in the nbest_predictions.json output file.")
    parser.add_argument("--max_answer_length", default=30, type=int,
                        help="The maximum length of an answer that can be generated. This is needed because the start "
                             "and end predictions are not conditioned on one another.")
    parser.add_argument("--verbose_logging", action="store_true",
                        help="If true, all of the warnings related to data processing will be printed. "
                             "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--logging_steps", type=int, default=100, help="Log every X updates steps.")
    parser.add_argument("--save_steps", type=int, default=1000, help="Save checkpoint every X updates steps.")
    parser.add_argument("--eval_all_checkpoints", action="store_true",
                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
    parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
    parser.add_argument("--overwrite_output_dir", action="store_true",
                        help="Overwrite the content of the output directory")
    parser.add_argument("--overwrite_cache", action="store_true",
                        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--fp16", action="store_true",
                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
    parser.add_argument("--fp16_opt_level", type=str, default="O1",
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
    parser.add_argument("--threads", type=int, default=1,
                        help="multiple threads for converting example to features")

    ### DO NOT MODIFY THIS BLOCK ###
    # arguments for nsml
    parser.add_argument('--pause', type=int, default=0)
    parser.add_argument('--mode', type=str, default='train')
    ################################

    args = parser.parse_args()

    # for NSML
    args.data_dir = os.path.join(DATASET_PATH, args.data_dir)

    if (os.path.exists(args.output_dir)
            and os.listdir(args.output_dir)
            and args.do_train
            and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
        filename='log.log')
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    args.model_type = args.model_type.lower()
    tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-v3-finetuned-korquad")
    # tokenizer.add_special_tokens({"additional_special_tokens": ["[QUES]"]})
    # print("vocabsize: {}".format(tokenizer.vocab_size))
    # print("example")
    # print(tokenizer.tokenize("[CLS] 한국어 ELECTRA를 공유합니다. [SEP]"))  # "Sharing a Korean ELECTRA."
    model = ElectraForQuestionAnswering.from_pretrained("monologg/koelectra-base-v3-finetuned-korquad")

    if args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    # Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum
    # if args.fp16 is set. Otherwise it'll default to "promote" mode, and we'll get fp32 operations.
    # Note that running `--fp16_opt_level="O2"` will remove the need for this code, but it is still valid.
    if args.fp16:
        try:
            import apex

            apex.amp.register_half_function(torch, "einsum")
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        # NOTE: `optimizer` must already be constructed at this point for amp.initialize
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 0:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank,
            find_unused_parameters=True)

    model.to(args.device)

    ### DO NOT MODIFY THIS BLOCK ###
    if IS_ON_NSML:
        bind_nsml(model, tokenizer, args)
        if args.pause:
            nsml.paused(scope=locals())
    ################################

    logger.info("Training/evaluation parameters %s", args)

    # bind_nsml(model, tokenizer, args)
    if args.load_model:
        # preserve the current CLI arguments across nsml.load, which restores
        # the args saved with the checkpoint
        tmp_args = parser.parse_args()
        nsml.copy(args, tmp_args)
        nsml.load(checkpoint=args.load_model_checkpoint, session=args.load_model_session)
        nsml.copy(tmp_args, args)

    if args.just_for_save:
        nsml.save("test")
        return

    # initial validation
    if args.do_initial_validation:
        logger.info("Initial Validation start")
        result = evaluate(args, model, tokenizer, prefix="")
        _f1, _exact = result["f1"], result["exact"]
        logger.info("f1_val = {}, exact_val = {}".format(_f1, _exact))
        if IS_ON_NSML:
            nsml.report(summary=True, step=0, f1=_f1, exact=_exact)

    # 'Span' Pretraining
    if args.do_pretrain_span:
        t = time.time()
        train_dataset = load_and_cache_examples(model, args, tokenizer, evaluate=False,
                                                output_examples=False, is_pretrain=True,
                                                qa_style=False)
        t = time.time() - t
        logger.info("loading pretrain data takes {:.3f} seconds".format(t))
        global_step, tr_loss = train(args, train_dataset, model, tokenizer, is_pretrain=True)
        logger.info(" pretrain_global_step = %s, pretrain_average loss = %s", global_step, tr_loss)
        nsml.save("pretrained_span")

    # 'QA' Pretraining
    if args.do_pretrain_qa:
        t = time.time()
        train_dataset = load_and_cache_examples(model, args, tokenizer, evaluate=False,
                                                output_examples=False, is_pretrain=True,
                                                qa_style=True)
        t = time.time() - t
        logger.info("loading pretrain data takes {:.3f} seconds".format(t))
        global_step, tr_loss = train(args, train_dataset, model, tokenizer, is_pretrain=True)
        logger.info(" pretrain_global_step = %s, pretrain_average loss = %s", global_step, tr_loss)
        nsml.save("pretrained_span+qa")

    # Training
    if args.do_train:
        if args.freeze_embedding:
            # freeze the ELECTRA encoder so only the task head is finetuned
            for param in model.module.electra.parameters():
                param.requires_grad = False
        t = time.time()
        train_dataset = load_and_cache_examples(model, args, tokenizer, evaluate=False,
                                                output_examples=False)
        t = time.time() - t
        logger.info("loading train data takes {:.3f} seconds".format(t))
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
def main():
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(description='Speech hackathon lilililill model')
    parser.add_argument('--max_epochs', type=int, default=1000,
                        help='number of max epochs in training (default: 1000)')
    parser.add_argument('--no_cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--save_name', type=str, default='model',
                        help='the name of model in nsml or local')
    parser.add_argument('--dropout', type=float, default=0.2,
                        help='dropout rate in training (default: 0.2)')
    parser.add_argument('--lr', type=float, default=1e-03,
                        help='learning rate (default: 0.001)')
    parser.add_argument('--num_mels', type=int, default=80,
                        help='number of the mel bands (default: 80)')
    parser.add_argument('--batch_size', type=int, default=128,
                        help='batch size in training (default: 128)')
    parser.add_argument('--num_thread', type=int, default=4,
                        help='number of the loading thread (default: 4)')
    parser.add_argument('--num_hidden_enc', type=int, default=1024,
                        help='hidden size of model (default: 1024)')
    parser.add_argument('--num_hidden_dec', type=int, default=512,
                        help='hidden size of model decoder (default: 512)')
    parser.add_argument('--nsc_in_ms', type=int, default=50,
                        help='Number of sample size per time segment in ms (default: 50)')
    parser.add_argument('--ref_repeat', type=int, default=1,
                        help='Number of repetition of reference seq2seq (default: 1)')
    parser.add_argument('--loss_lim', type=float, default=0.05,
                        help='Minimum loss threshold (default: 0.05)')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument('--pause', type=int, default=0)
    parser.add_argument('--memo', type=str, default='', help='Comment you wish to leave')
    parser.add_argument('--debug', type=str, default='False', help='debug mode')
    parser.add_argument('--load', type=str, default=None)
    args = parser.parse_args()

    batch_size = args.batch_size
    num_thread = args.num_thread
    num_mels = args.num_mels

    char2index, index2char = load_label('./hackathon.labels')
    SOS_token = char2index['<s>']   # '<sos>' or '<s>'
    EOS_token = char2index['</s>']  # '<eos>' or '</s>'
    PAD_token = char2index['_']     # '-' or '_'

    unicode_jamo_list = My_Unicode_Jamo_v2()
    tokenizer = Tokenizer(unicode_jamo_list)
    jamo_tokens = tokenizer.word2num(unicode_jamo_list)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    # CTC-based mel-to-jamo network and its optimizer
    net = Mel2SeqNet_v2(num_mels, args.num_hidden_enc, args.num_hidden_dec,
                        len(unicode_jamo_list), device)
    net_optimizer = optim.Adam(net.parameters(), lr=args.lr)
    ctc_loss = nn.CTCLoss().to(device)

    # seq2seq correction network ("net_B")
    # net_B = Seq2SeqNet(512, jamo_tokens, char2index, device)
    net_B = Seq2SeqNet_v2(1024, jamo_tokens, char2index, device)
    net_B_optimizer = optim.Adam(net_B.parameters(), lr=args.lr)
    net_B_criterion = nn.NLLLoss(reduction='none').to(device)

    bind_model(net, net_B, net_optimizer, net_B_optimizer, index2char, tokenizer)

    if args.pause == 1:
        nsml.paused(scope=locals())
    if args.mode != "train":
        return

    if args.load is not None:
        # nsml.load(checkpoint='saved', session='team47/sr-hack-2019-dataset/' + args.load)
        nsml.load(checkpoint='model', session='team47/sr-hack-2019-dataset/' + args.load)
        nsml.save('saved')
        for g in net_optimizer.param_groups:
            g['lr'] = 1e-06
        for g in net_B_optimizer.param_groups:
            g['lr'] = 1e-06

    for g in net_optimizer.param_groups:
        logger.info(g['lr'])
    for g in net_B_optimizer.param_groups:
        logger.info(g['lr'])

    wav_paths, script_paths, korean_script_paths = get_paths(DATASET_PATH)
    logger.info('Korean script path 0: {}'.format(korean_script_paths[0]))
    logger.info('wav_paths len: {}'.format(len(wav_paths)))
    logger.info('script_paths len: {}'.format(len(script_paths)))
    logger.info('korean_script_paths len: {}'.format(len(korean_script_paths)))

    # Load Korean scripts
    korean_script_list, jamo_script_list = get_korean_and_jamo_list_v2(korean_script_paths)
    logger.info('Korean script 0: {}'.format(korean_script_list[0]))
    logger.info('Korean script 0 length: {}'.format(len(korean_script_list[0])))
    logger.info('Jamo script 0: {}'.format(jamo_script_list[0]))
    logger.info('Jamo script 0 length: {}'.format(len(jamo_script_list[0])))

    script_path_list = get_script_list(script_paths, SOS_token, EOS_token)
    ground_truth_list = [
        tokenizer.word2num(['<s>'] + list(jamo_script_list[i]) + ['</s>'])
        for i in range(len(jamo_script_list))
    ]

    # 95% of the data will be used for training, the rest for evaluation
    split_index = int(0.95 * len(wav_paths))

    wav_path_list_train = wav_paths[:split_index]
    ground_truth_list_train = ground_truth_list[:split_index]
    korean_script_list_train = korean_script_list[:split_index]
    script_path_list_train = script_path_list[:split_index]

    wav_path_list_eval = wav_paths[split_index:]
    ground_truth_list_eval = ground_truth_list[split_index:]
    korean_script_list_eval = korean_script_list[split_index:]
    script_path_list_eval = script_path_list[split_index:]

    logger.info('Total:Train:Eval = {}:{}:{}'.format(
        len(wav_paths), len(wav_path_list_train), len(wav_path_list_eval)))

    preloader_eval = Threading_Batched_Preloader_v2(
        wav_path_list_eval, ground_truth_list_eval, script_path_list_eval,
        korean_script_list_eval, batch_size, num_mels, args.nsc_in_ms, is_train=True)
    preloader_train = Threading_Batched_Preloader_v2(
        wav_path_list_train, ground_truth_list_train, script_path_list_train,
        korean_script_list_train, batch_size, num_mels, args.nsc_in_ms, is_train=False)

    best_loss = 1e10
    best_eval_cer = 1e10

    # load all target scripts up front to reduce disk I/O
    target_path = os.path.join(DATASET_PATH, 'train_label')
    load_targets(target_path)

    logger.info('start')
    train_begin = time.time()

    for epoch in range(args.max_epochs):
        logger.info(datetime.now().strftime('%m-%d %H:%M:%S'))

        net.train()
        net_B.train()
        preloader_train.initialize_batch(num_thread)
        loss_list_train = list()
        seq2seq_loss_list_train = list()
        seq2seq_loss_list_train_ref = list()
        logger.info("Initialized Training Preloader")

        count = 0
        total_dist = 0
        total_length = 1
        total_dist_ref = 0
        total_length_ref = 1

        while not preloader_train.end_flag:
            batch = preloader_train.get_batch()
            if batch is not None:
                (tensor_input, ground_truth, loss_mask, length_list,
                 batched_num_script, batched_num_script_loss_mask) = batch
                pred_tensor, loss = train(net, net_optimizer, ctc_loss,
                                          tensor_input.to(device), ground_truth.to(device),
                                          length_list.to(device), device)
                loss_list_train.append(loss)

                jamo_result = Decode_Prediction_No_Filtering(pred_tensor, tokenizer)
                true_string_list = Decode_Num_Script(
                    batched_num_script.detach().cpu().numpy(), index2char)

                # reference pass: feed the ground-truth jamo into the seq2seq net
                for i in range(args.ref_repeat):
                    lev_input_ref = ground_truth
                    lev_pred_ref, attentions_ref, seq2seq_loss_ref = net_B.net_train(
                        lev_input_ref.to(device), batched_num_script.to(device),
                        batched_num_script_loss_mask.to(device),
                        net_B_optimizer, net_B_criterion)
                    pred_string_list_ref = Decode_Lev_Prediction(lev_pred_ref, index2char)
                    seq2seq_loss_list_train_ref.append(seq2seq_loss_ref)

                dist_ref, length_ref = char_distance_list(true_string_list, pred_string_list_ref)

                pred_string_list = [None]
                dist = 0
                length = 0
                # only feed the CTC prediction into the seq2seq net once it is good enough
                if loss < args.loss_lim:
                    lev_input = Decode_CTC_Prediction_And_Batch(pred_tensor)
                    lev_pred, attentions, seq2seq_loss = net_B.net_train(
                        lev_input.to(device), batched_num_script.to(device),
                        batched_num_script_loss_mask.to(device),
                        net_B_optimizer, net_B_criterion)
                    pred_string_list = Decode_Lev_Prediction(lev_pred, index2char)
                    seq2seq_loss_list_train.append(seq2seq_loss)
                    dist, length = char_distance_list(true_string_list, pred_string_list)

                total_dist_ref += dist_ref
                total_length_ref += length_ref
                total_dist += dist
                total_length += length
                count += 1
                if count % 25 == 0:
                    logger.info("Train: Count {} | {} => {}".format(
                        count, true_string_list[0], pred_string_list_ref[0]))
                    logger.info("Train: Count {} | {} => {} => {}".format(
                        count, true_string_list[0], jamo_result[0], pred_string_list[0]))
            else:
                logger.info("Training Batch is None")

        train_loss = np.mean(np.asarray(loss_list_train))
        train_cer = total_dist / total_length
        train_cer_ref = total_dist_ref / total_length_ref
        logger.info("Mean Train Loss: {}".format(train_loss))
        logger.info("Total Train CER: {}".format(train_cer))
        logger.info("Total Train Reference CER: {}".format(train_cer_ref))

        preloader_eval.initialize_batch(num_thread)
        loss_list_eval = list()
        seq2seq_loss_list_eval = list()
        seq2seq_loss_list_eval_ref = list()
        logger.info("Initialized Evaluation Preloader")

        count = 0
        total_dist = 0
        total_length = 1
        total_dist_ref = 0
        total_length_ref = 1

        net.eval()
        net_B.eval()
        while not preloader_eval.end_flag:
            batch = preloader_eval.get_batch()
            if batch is not None:
                (tensor_input, ground_truth, loss_mask, length_list,
                 batched_num_script, batched_num_script_loss_mask) = batch
                pred_tensor, loss = evaluate(net, ctc_loss, tensor_input.to(device),
                                             ground_truth.to(device),
                                             length_list.to(device), device)
                loss_list_eval.append(loss)

                jamo_result = Decode_Prediction_No_Filtering(pred_tensor, tokenizer)
                true_string_list = Decode_Num_Script(
                    batched_num_script.detach().cpu().numpy(), index2char)

                lev_input_ref = ground_truth
                lev_pred_ref, attentions_ref, seq2seq_loss_ref = net_B.net_eval(
                    lev_input_ref.to(device), batched_num_script.to(device),
                    batched_num_script_loss_mask.to(device), net_B_criterion)
                pred_string_list_ref = Decode_Lev_Prediction(lev_pred_ref, index2char)
                seq2seq_loss_list_eval_ref.append(seq2seq_loss_ref)
                dist_ref, length_ref = char_distance_list(true_string_list, pred_string_list_ref)

                lev_input = Decode_CTC_Prediction_And_Batch(pred_tensor)
                lev_pred, attentions, seq2seq_loss = net_B.net_eval(
                    lev_input.to(device), batched_num_script.to(device),
                    batched_num_script_loss_mask.to(device), net_B_criterion)
                pred_string_list = Decode_Lev_Prediction(lev_pred, index2char)
                seq2seq_loss_list_eval.append(seq2seq_loss)
                dist, length = char_distance_list(true_string_list, pred_string_list)

                total_dist_ref += dist_ref
                total_length_ref += length_ref
                total_dist += dist
                total_length += length
                count += 1
                if count % 10 == 0:
                    logger.info("Eval: Count {} | {} => {}".format(
                        count, true_string_list[0], pred_string_list_ref[0]))
                    logger.info("Eval: Count {} | {} => {} => {}".format(
                        count, true_string_list[0], jamo_result[0], pred_string_list[0]))
            else:
                logger.info("Evaluation Batch is None")

        eval_cer = total_dist / total_length
        eval_cer_ref = total_dist_ref / total_length_ref
        eval_loss = np.mean(np.asarray(loss_list_eval))
        logger.info("Mean Evaluation Loss: {}".format(eval_loss))
        logger.info("Total Evaluation CER: {}".format(eval_cer))
        logger.info("Total Evaluation Reference CER: {}".format(eval_cer_ref))

        nsml.report(False, step=epoch,
                    train_epoch__loss=train_loss,
                    train_epoch__cer=train_cer,
                    train_epoch__cer_ref=train_cer_ref,
                    eval__loss=eval_loss,
                    eval__cer=eval_cer,
                    eval__cer_ref=eval_cer_ref)
        nsml.save(args.save_name)

        best_model = (eval_cer < best_eval_cer)
        if best_model:
            nsml.save('best')
            best_eval_cer = eval_cer

    logger.info("Inference Check")
def load_and_cache_examples(model, args, tokenizer, evaluate=False, output_examples=False,
                            val_or_test="val", is_pretrain=False, qa_style=False):
    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training processes the
        # dataset; the others will use the cache.
        torch.distributed.barrier()

    cached_features_file = "cache_{}".format("dev" if evaluate else "train")

    # Decide whether QA mixing should be applied.
    do_mix = (args.mix_qa and not evaluate) and (is_pretrain and val_or_test == "val")

    # Load from cache if possible.
    if val_or_test == "val" and args.load_cache:
        cached_session = args.cached_session_dev if evaluate else args.cached_session_train
        if is_pretrain:
            cached_session = args.cached_session_pretrain
            if qa_style:
                cached_session = args.cached_session_pretrain_qa
        logger.info("Loading features from cached file %s in %s",
                    cached_features_file, cached_session)

        features_and_datasets = {}

        def load_data(dir_name):
            tmp = torch.load(os.path.join(dir_name, '{}.pt'.format(cached_features_file)))
            print(tmp.keys())
            nsml.copy(tmp, features_and_datasets)

        nsml.bind(load=load_data)
        nsml.load(checkpoint=cached_features_file, session=cached_session)
        bind_nsml(model, tokenizer, args)
        print(features_and_datasets.keys())
        features, dataset, examples = (
            features_and_datasets["features"],
            features_and_datasets["dataset"],
            features_and_datasets["examples"],
        )
    else:
        logger.info("Creating features from dataset file at %s", cached_features_file)
        if not args.data_dir and ((evaluate and not args.predict_file)
                                  or (not evaluate and not args.train_file)):
            try:
                import tensorflow_datasets as tfds
            except ImportError:
                raise ImportError("If no data_dir is specified, tensorflow_datasets needs to be installed.")
            if args.version_2_with_negative:
                logger.warning("tensorflow_datasets does not handle version 2 of SQuAD.")
            tfds_examples = tfds.load("squad")
            examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
        else:
            processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()
            if evaluate:
                filename = args.predict_file if val_or_test == "val" else "test_data/korquad_open_test.json"
                examples = processor.get_eval_examples(args.data_dir, filename=filename)
            else:
                if is_pretrain:
                    examples = processor.get_pretrain_examples(args.data_dir,
                                                               filename=args.train_file,
                                                               qa_style=qa_style)
                else:
                    examples = processor.get_train_examples(args.data_dir, filename=args.train_file)

    # Apply QA mixing: swap contexts between randomly sampled example pairs;
    # a pair whose answers differ becomes unanswerable.
    if do_mix:
        num_qa = len(examples)
        mix_batch_size = int(args.mix_portion * num_qa)
        if mix_batch_size % 2 == 1:
            mix_batch_size -= 1
        mix_batch = np.array(random.sample(range(num_qa), mix_batch_size)).reshape(-1, 2)
        for i, (k, v) in enumerate(mix_batch):
            example_k, example_v = examples[k], examples[v]
            ans_k, ans_v = example_k.answer_text, example_v.answer_text
            example_k.context_text, example_v.context_text = example_v.context_text, example_k.context_text
            assert not (example_k.is_impossible or example_v.is_impossible)
            if ans_k != ans_v:
                example_k.is_impossible, example_v.is_impossible = True, True
                example_k.start_position_character, example_v.start_position_character = None, None
            else:
                example_k.start_position, example_v.end_position = example_v.start_position, example_k.end_position

    if do_mix or not (val_or_test == "val" and args.load_cache):
        print("Starting squad_convert_examples_to_features")
        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
            return_dataset="pt",
            threads=args.threads,
        )
        print("Complete squad_convert_examples_to_features")

        # Cache the features in the session if requested.
        if val_or_test == "val" and args.save_cache:
            features_and_datasets = {"dataset": dataset, "examples": examples, "features": features}

            def save_data(dir_name):
                os.makedirs(dir_name, exist_ok=True)
                torch.save(features_and_datasets,
                           os.path.join(dir_name, '{}.pt'.format(cached_features_file)))
                logger.info("Save data at {}".format(dir_name))

            nsml.bind(save=save_data)
            nsml.save(cached_features_file)
            bind_nsml(model, tokenizer, args)

    if args.local_rank == 0 and not evaluate:
        # The first process has finished; release the others to load from cache.
        torch.distributed.barrier()

    if output_examples:
        return dataset, examples, features
    return dataset
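# The caching above relies on NSML's bind/save/load hooks (nsml.bind,
# nsml.save, nsml.load are the real NSML entry points used throughout this
# repo). A minimal, self-contained sketch of that pattern, runnable inside
# an NSML session; checkpoint name and payload are illustrative:
import os
import torch
import nsml

payload = {"features": [1, 2, 3]}

def save_cache(dir_name, *args, **kwargs):
    # NSML calls this with a scratch directory; whatever is written there
    # is stored under the checkpoint name passed to nsml.save().
    torch.save(payload, os.path.join(dir_name, 'cache.pt'))

def load_cache(dir_name, *args, **kwargs):
    payload.update(torch.load(os.path.join(dir_name, 'cache.pt')))

nsml.bind(save=save_cache, load=load_cache)
nsml.save('cache_train')  # write the checkpoint
# Later, possibly from another session:
# nsml.load(checkpoint='cache_train', session='team/dataset/123')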
def main(args, local):
    if args.arch == 'xDeepFM' and args.mode == 'train':
        s = time.time()
        csv_file = os.path.join(DATASET_PATH, 'train', 'train_data', 'train_data')
        item = pd.read_csv(csv_file,
                           dtype={
                               'article_id': str,
                               'hh': int,
                               'gender': str,
                               'age_range': str,
                               'read_article_ids': str
                           },
                           sep='\t')
        label_data_path = os.path.join(
            DATASET_PATH, 'train',
            os.path.basename(os.path.normpath(csv_file)).split('_')[0] + '_label')
        label = pd.read_csv(label_data_path, dtype={'label': int}, sep='\t')
        item['label'] = label

        sparse_features = ['article_id', 'hh', 'gender', 'age_range', 'len_bin']
        dense_features = ['image_feature']
        target = ['label']

        # Bucket each user's reading-history length into quantile bins.
        len_lis = []
        read_article_ids_all = item['read_article_ids'].tolist()
        for i in range(len(item)):
            li = read_article_ids_all[i]
            if type(li) == float:  # NaN: no reading history
                len_lis.append(0)
                continue
            len_lis.append(len(li.split(',')))
        item['len'] = len_lis
        item['len_bin'] = pd.qcut(item['len'], 6, duplicates='drop')

        id_to_artic = dict()
        artics = item['article_id'].tolist()
        with open(os.path.join(DATASET_PATH, 'train', 'train_data',
                               'train_image_features.pkl'), 'rb') as handle:
            image_feature_dict = pickle.load(handle)

        for feat in sparse_features:
            lbe = LabelEncoder()
            item[feat] = lbe.fit_transform(item[feat])

        fixlen_feature_columns = [SparseFeat(feat, item[feat].nunique())
                                  for feat in sparse_features]
        fixlen_feature_columns += [DenseFeat(feat, len(image_feature_dict[artics[0]]))
                                   for feat in dense_features]

        # Map each encoded article index back to its original article_id so
        # image features can be fetched via image_feature_dict[article_id].
        idx_artics_all = item['article_id'].tolist()
        for i in range(len(artics)):
            idx_artic = idx_artics_all[i]
            if idx_artic not in id_to_artic.keys():
                id_to_artic[idx_artic] = artics[i]

        linear_feature_columns = fixlen_feature_columns
        dnn_feature_columns = fixlen_feature_columns
        fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns)
        print(fixlen_feature_names)
        global fixlen_feature_names_global
        fixlen_feature_names_global = fixlen_feature_names
        model = xDeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
        print('---model defined---')
        # TODO: also save the generated intermediate files, since this
        # preprocessing is too slow to rerun every time.
        print(time.time() - s, 'seconds')

    if use_nsml and args.mode == 'train':
        bind_nsml(model, [], args.task)

    if args.mode == 'test':
        print('_infer root - : ', DATASET_PATH)
        print('test')
        model, fixlen_feature_names_global, item, image_feature_dict, id_to_artic = get_item(DATASET_PATH)
        bind_nsml(model, [], args.task)
        checkpoint_session = ['401', 'team_62/airush2/176']
        nsml.load(checkpoint=str(checkpoint_session[0]), session=str(checkpoint_session[1]))
        print('successfully loaded')

    if args.mode == 'train':
        if args.dry_run:
            print('start dry-running...!')
            args.num_epochs = 1
        else:
            print('start training...!')
        # Build everything up front; no need for a generator here.
        nsml.save('infer')
        print('end')

    print('end_main')
    if args.pause:
        nsml.paused(scope=local)
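# A self-contained illustration of the reading-history length binning used
# above: the comma-separated ID string is mapped to its length (NaN counts
# as 0), then quantile-binned with pd.qcut. Toy data and 3 bins instead of
# the 6 used in main():
import numpy as np
import pandas as pd

toy = pd.DataFrame({'read_article_ids':
                    ['a,b,c', np.nan, 'a', 'a,b', np.nan, 'a,b,c,d']})
lengths = toy['read_article_ids'].map(
    lambda v: 0 if isinstance(v, float) else len(v.split(',')))
# duplicates='drop' tolerates repeated quantile edges (many zero-length histories).
toy['len_bin'] = pd.qcut(lengths, 3, duplicates='drop')
print(toy)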
if mode != 'train':
    sys.exit(0)

# if C.get()['infer_mode'] == 'face':
#     targets_only = []
#     lbs = CustomDataset(TRAIN_DATASET_PATH).targets
#     for lb_id in range(num_classes):
#         if lbs.count(lb_id) > 150:
#             continue
#         targets_only.append(lb_id)
#     print(targets_only)

if config.transfer:
    # nsml.load(checkpoint='transfer', session='team_286/4_cls_food/89')
    nsml.load(checkpoint='100', session='team_286/4_cls_food/103')  # cv=1 cutmix 0.5
    # nsml.load(checkpoint='55', session='team_286/7_icls_face/2')
    # nsml.load(checkpoint='transfer', session='team_286/8_iret_food/12')
    # nsml.load(checkpoint='20', session='team_286/9_iret_car/16')
    nsml.save('resave')
    sys.exit(0)

tr_loader, val_loader, val_label = data_loader_with_split(
    root=TRAIN_DATASET_PATH, cv_ratio=config.ratio, cv=config.cv,
    batch_size=C.get()['batch'])

time_ = datetime.datetime.now()
best_val_top1 = 0

dataiter = iter(tr_loader)
num_steps = 100000 // C.get()['batch']

from pystopwatch2 import PyStopwatch
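# dataiter/num_steps above suggest fixed-step "epochs" of roughly 100k
# samples each. A minimal sketch of how such a loop is typically driven,
# restarting the loader when it runs out (the step body is illustrative;
# the actual forward/backward pass is defined elsewhere):
for step in range(num_steps):
    try:
        data, label = next(dataiter)
    except StopIteration:
        dataiter = iter(tr_loader)  # loader exhausted: start a new pass
        data, label = next(dataiter)
    # ... forward / backward / optimizer step on (data, label) ...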
                                  max_to_keep=cf.keep_checkpoint_max)
saver_for_restore.restore(sess, checkpoint_path)
saver = tf.train.Saver(tf.global_variables(), max_to_keep=cf.keep_checkpoint_max)

num_trained_images = 0
bind_model(saver, sess, images_ph, embeddings_op, cf)
if cf.pause:
    nsml.paused(scope=locals())

bTrainmode = False
if cf.mode == 'train':
    bTrainmode = True
    if cf.nsml_checkpoint is not None and cf.nsml_session is not None:
        nsml.load(checkpoint=cf.nsml_checkpoint, session=cf.nsml_session)

    while True:
        try:
            start = time.time()
            if cf.use_pair_sampling:
                print("pair sampling")
                tmp_images, tmp_labels = sess.run([images, labels])
                pair_indices = set()
                single_index_map = {}
                label_buffer = {}
                for i, tmp_label in enumerate(tmp_labels):
                    if tmp_label in label_buffer:
                        pair_indices.add(i)
                        pair_indices.add(label_buffer[tmp_label])
                    if tmp_label in single_index_map:
def main():
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='batch size in training (default: 32)')
    parser.add_argument('--workers', type=int, default=4,
                        help='number of workers in dataset loader (default: 4)')
    parser.add_argument('--max_epochs', type=int, default=10,
                        help='number of max epochs in training (default: 10)')
    parser.add_argument('--lr', type=float, default=0.0001,
                        help='learning rate (default: 0.0001)')
    parser.add_argument('--teacher_forcing', type=float, default=0.5,
                        help='teacher forcing ratio in decoder (default: 0.5)')
    parser.add_argument('--max_len', type=int, default=WORD_MAXLEN,
                        help='maximum characters of sentence (default: 80)')
    parser.add_argument('--no_cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1,
                        help='random seed (default: 1)')
    parser.add_argument('--save_name', type=str, default='model',
                        help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument('--pause', type=int, default=0)
    parser.add_argument('--word', action='store_true',
                        help='Train/Predict model using word based label (default: False)')
    parser.add_argument('--gen_label_index', action='store_true',
                        help='Generate word label index map (default: False)')
    parser.add_argument('--iteration', type=str, help='Iteration')
    parser.add_argument('--premodel_session', type=str,
                        help='Session name of premodel')

    # Transformer model parameters
    parser.add_argument('--d_model', type=int, default=128, help='transformer_d_model')
    parser.add_argument('--n_head', type=int, default=8, help='transformer_n_head')
    parser.add_argument('--num_encoder_layers', type=int, default=4, help='num_encoder_layers')
    parser.add_argument('--num_decoder_layers', type=int, default=4, help='transformer_num_decoder_layers')
    parser.add_argument('--dim_feedforward', type=int, default=2048, help='transformer_dim_feedforward')
    parser.add_argument('--dropout', type=float, default=0.1, help='transformer_dropout')

    # Transformer warmup parameters
    parser.add_argument('--warmup_multiplier', type=int, default=3, help='transformer_warmup_multiplier')
    parser.add_argument('--warmup_epoch', type=int, default=10, help='transformer_warmup_epoch')

    args = parser.parse_args()

    char_loader = CharLabelLoader()
    char_loader.load_char2index('./hackathon.labels')
    label_loader = char_loader
    if args.word:
        if args.gen_label_index:
            generate_word_label_index_file(char_loader, TRAIN_LABEL_CHAR_PATH)
            from subprocess import call
            call(f'cat {TRAIN_LABEL_CHAR_PATH}', shell=True)
        word_loader = CharLabelLoader()
        word_loader.load_char2index('./hackathon.pos.labels')
        label_loader = word_loader
        if os.path.exists(TRAIN_LABEL_CHAR_PATH):
            generate_word_label_file(char_loader, word_loader,
                                     TRAIN_LABEL_POS_PATH, TRAIN_LABEL_CHAR_PATH)

    char2index = label_loader.char2index
    index2char = label_loader.index2char
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    # Model
    print("model: transformer")
    # model = Transformer(d_model=args.d_model, n_head=args.n_head,
    #                     num_encoder_layers=args.num_encoder_layers,
    #                     num_decoder_layers=args.num_decoder_layers,
    #                     dim_feedforward=args.dim_feedforward, dropout=args.dropout,
    #                     vocab_size=len(char2index), sound_maxlen=SOUND_MAXLEN,
    #                     word_maxlen=WORD_MAXLEN)
    encoder = Encoder(d_input=128, n_layers=6, n_head=4, d_k=128, d_v=128,
                      d_model=128, d_inner=2048, dropout=0.1, pe_maxlen=SOUND_MAXLEN)
    decoder = Decoder(sos_id=SOS_token, eos_id=EOS_token, n_tgt_vocab=len(char2index),
                      d_word_vec=128, n_layers=6, n_head=4, d_k=128, d_v=128,
                      d_model=128, d_inner=2048, dropout=0.1,
                      tgt_emb_prj_weight_sharing=True, pe_maxlen=SOUND_MAXLEN)
    model = Transformer(encoder, decoder)

    optimizer = TransformerOptimizer(
        torch.optim.Adam(model.parameters(), lr=0.0004, betas=(0.9, 0.98), eps=1e-09))

    for param in model.parameters():
        param.data.uniform_(-0.08, 0.08)

    model = nn.DataParallel(model).to(device)

    """
    optimizer = optim.Adam(model.module.parameters(), lr=args.lr)
    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.max_epochs)
    scheduler_warmup = GradualWarmupScheduler(optimizer,
                                              multiplier=args.warmup_multiplier,
                                              total_epoch=args.warmup_epoch,
                                              after_scheduler=scheduler_cosine)
    criterion = nn.CrossEntropyLoss(reduction='sum', ignore_index=PAD_token).to(device)
    """

    bind_model(model, optimizer)

    if args.pause == 1:
        nsml.paused(scope=locals())

    if args.mode != "train":
        return

    data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')
    wav_paths = list()
    script_paths = list()
    with open(data_list, 'r') as f:
        for line in f:
            # line: "aaa.wav,aaa.label"
            wav_path, script_path = line.strip().split(',')
            wav_paths.append(os.path.join(DATASET_PATH, 'train_data', wav_path))
            script_paths.append(os.path.join(DATASET_PATH, 'train_data', script_path))

    best_loss = 1e10
    begin_epoch = 0

    # Load all target scripts up front to reduce disk I/O.
    target_path = TRAIN_LABEL_CHAR_PATH
    if args.word:
        target_path = TRAIN_LABEL_POS_PATH
    load_targets(target_path)

    train_batch_num, train_dataset_list, valid_dataset = split_dataset(
        args, wav_paths, script_paths, valid_ratio=0.05)

    if args.iteration:
        if args.premodel_session:
            nsml.load(args.iteration, session=args.premodel_session)
            logger.info(f'Load {args.premodel_session} {args.iteration}')
        else:
            nsml.load(args.iteration)
            logger.info(f'Load {args.iteration}')

    logger.info('start')
    train_begin = time.time()
    for epoch in range(begin_epoch, args.max_epochs):
        train_queue = queue.Queue(args.workers * 2)
        train_loader = MultiLoader(train_dataset_list, train_queue,
                                   args.batch_size, args.workers)
        train_loader.start()

        train_loss, train_cer = train(model, train_batch_num, train_queue,
                                      optimizer, device, train_begin,
                                      args.workers, 10, args.teacher_forcing)
        logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f'
                    % (epoch, train_loss, train_cer))
        train_loader.join()

        # Evaluate only periodically rather than every epoch.
        if epoch == 10 or (epoch > 48 and epoch % 10 == 9):
            valid_queue = queue.Queue(args.workers * 2)
            valid_loader = BaseDataLoader(valid_dataset, valid_queue, args.batch_size, 0)
            valid_loader.start()

            eval_loss, eval_cer = evaluate(model, valid_loader, valid_queue,
                                           device, args.max_len, args.batch_size)
            logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f'
                        % (epoch, eval_loss, eval_cer))
            valid_loader.join()

            nsml.report(False,
                        step=epoch,
                        train_epoch__loss=train_loss,
                        train_epoch__cer=train_cer,
                        eval__loss=eval_loss,
                        eval__cer=eval_cer)

            best_model = (eval_loss < best_loss)
            nsml.save(args.save_name)
            if best_model:
                nsml.save('best')
                best_loss = eval_loss
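# The TransformerOptimizer wrapper above presumably applies the
# Noam/"Attention Is All You Need" warmup schedule on top of Adam. A
# minimal sketch under that assumption (illustrative, not the project's
# actual class): the learning rate rises linearly for `warmup` steps, then
# decays as the inverse square root of the step count.
class NoamOptSketch:
    def __init__(self, optimizer, d_model=128, warmup=4000):
        self.optimizer = optimizer
        self.d_model = d_model
        self.warmup = warmup
        self.step_num = 0

    def step(self):
        self.step_num += 1
        lr = self.d_model ** -0.5 * min(self.step_num ** -0.5,
                                        self.step_num * self.warmup ** -1.5)
        for group in self.optimizer.param_groups:
            group['lr'] = lr
        self.optimizer.step()

    def zero_grad(self):
        self.optimizer.zero_grad()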
        # summary(model, input_size=(3, 224, 224))
    else:
        model = Baseline(args.hidden_size, args.output_size)
    optimizer = optim.Adam(model.parameters(), args.learning_rate)
    lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=1, verbose=True)
    criterion = nn.CrossEntropyLoss()  # multi-class classification task
    model = model.to(device)
    model.train()

    # DONOTCHANGE: They are reserved for nsml
    bind_model(model)
    # Load a previously trained checkpoint from NSML and re-save it under a new name.
    nsml.load(checkpoint='15', session='team_62/airush1/40')
    nsml.save('stillgoing')
    if args.pause:
        nsml.paused(scope=locals())

    if args.mode == "train":
        # Warning: Do not load data before this line
        dataloader = train_dataloader(args.input_size, args.batch_size, args.num_workers)
        for epoch_idx in range(1, args.epochs + 1):
            total_loss = 0
            total_correct = 0
            for batch_idx, (image, tags) in enumerate(dataloader):
                optimizer.zero_grad()
                image = image.to(device)  # torch.Size([64, 3, 224, 224])
def infer(test_image_data_path, test_meta_data_path):
    # DONOTCHANGE This Line
    test_meta_data = pd.read_csv(test_meta_data_path, delimiter=',', header=0)

    # Checkpoints to ensemble, as [session, checkpoint] pairs.
    # dropout-ratio ensemble
    ensemble0 = [['team_62/airush1/320', '02'], ['team_62/airush1/320', '12'],
                 ['team_62/airush1/320', '22'], ['team_62/airush1/98', '4']]
    # EfficientNet (not used in the loop below)
    ensemble1 = [['team_62/airush1/415', '03'], ['team_62/airush1/415', '13'],
                 ['team_62/airush1/415', '23'], ['team_62/airush1/415', '33']]
    # ResNeXt-50
    ensemble2 = [['team_62/airush1/678', '02'], ['team_62/airush1/678', '12'],
                 ['team_62/airush1/185', '17']]
    # OctResNet
    ensemble3 = [['team_62/airush1/683', '02'], ['team_62/airush1/683', '12']]
    # ensemble4 = [['team_62/airush1/605', '8']]  # SKNet

    # Be sure to remove normalize from the transforms (if the models were
    # trained without it).
    input_size = 224  # you can change this according to your model.
    batch_size = 512  # you can change this; 'nsml submit --test' only feeds 200 samples.
    device = 0

    # Per-model ensemble weights.
    w0 = 0.125
    w2 = 0.166
    w3 = 0.25

    predict_list = []
    for i in range(4):  # number of ensemble groups (i == 3 currently has no branch)
        dataloader = DataLoader(
            AIRushDataset(test_image_data_path, test_meta_data, label_path=None,
                          transform=transforms.Compose([
                              transforms.Resize((input_size, input_size)),
                              transforms.RandomRotation(20),
                              transforms.ToTensor(),
                              transforms.Normalize([0.485, 0.456, 0.406],
                                                   [0.229, 0.224, 0.225])
                          ])),
            batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)

        # Decide whether to drop a group after checking the 9:10 results.
        if i == 0:
            # EfficientNet-b0: 4-fold ensemble
            for j in range(4):
                model_name = 'efficientnet-b0'
                model = EfficientNet.from_name(model_name)
                bind_model(model)
                nsml.load(checkpoint=str(ensemble0[j][1]), session=str(ensemble0[j][0]))
                model.to(device)
                model.eval()
                predict_output_list = []
                with torch.no_grad():
                    for batch_idx, image in enumerate(dataloader):
                        image = image.to(device)
                        output = model(image).double()
                        output_prob = to_np(F.softmax(output, dim=1))
                        predict_output_list.append(output_prob * w0)
                predict_output_list = np.concatenate(predict_output_list, axis=0)
                predict_list.append(predict_output_list)
        elif i == 1:
            # ResNeXt-50
            for j in range(3):
                model = resnext50(num_classes=args.output_size)  # adjust to match the model
                bind_model(model)
                nsml.load(checkpoint=str(ensemble2[j][1]), session=str(ensemble2[j][0]))
                model.to(device)
                model.eval()
                predict_output_list = []
                with torch.no_grad():
                    for batch_idx, image in enumerate(dataloader):
                        image = image.to(device)
                        output = model(image).double()
                        output_prob = to_np(F.softmax(output, dim=1))
                        predict_output_list.append(output_prob * w2)
                predict_output_list = np.concatenate(predict_output_list, axis=0)
                predict_list.append(predict_output_list)
        elif i == 2:
            # OctResNet
            for j in range(2):
                model = OctResNet(Bottleneck, [3, 4, 6, 3],
                                  num_classes=args.output_size)  # adjust to match the model
                bind_model(model)
                nsml.load(checkpoint=str(ensemble3[j][1]), session=str(ensemble3[j][0]))
                model.to(device)
                model.eval()
                predict_output_list = []
                with torch.no_grad():
                    for batch_idx, image in enumerate(dataloader):
                        image = image.to(device)
                        output = model(image).double()
                        output_prob = to_np(F.softmax(output, dim=1))
                        predict_output_list.append(output_prob * w3)
                predict_output_list = np.concatenate(predict_output_list, axis=0)
                predict_list.append(predict_output_list)
        # more ensemble groups (e.g. the SENet one above) can be added here

    # Weighted-sum the softmax probabilities and take the argmax per sample.
    predict_vector = np.argmax(np.sum(predict_list, axis=0), axis=1)
    return predict_vector  # a numpy array with shape (138343,)
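# A minimal, self-contained sketch of the weighted soft-voting that infer()
# performs above (function name, weights, and shapes are illustrative):
import numpy as np

def soft_vote(prob_matrices, weights):
    # prob_matrices: list of (num_samples, num_classes) softmax outputs,
    # one per model; weights: one scalar weight per model.
    weighted = [p * w for p, w in zip(prob_matrices, weights)]
    return np.argmax(np.sum(weighted, axis=0), axis=1)

# Example: two models, three samples, two classes.
# probs_a = np.array([[0.9, 0.1], [0.4, 0.6], [0.2, 0.8]])
# probs_b = np.array([[0.6, 0.4], [0.7, 0.3], [0.1, 0.9]])
# preds = soft_vote([probs_a, probs_b], weights=[0.5, 0.5])  # -> [0, 0, 1]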