def handler(context):
    print(
        f'start training with parameters : {Parameters.as_dict()}, context : {context}'
    )

    X_train, y_train, cols_train = train_data_loader(DATALAKE_CHANNEL_ID,
                                                     DATALAKE_TRAIN_FILE_ID,
                                                     LABEL_FIELD, INPUT_FIELDS)
    dtrain = lgb.Dataset(X_train, y_train)

    if DATALAKE_VAL_FILE_ID:
        X_val, y_val, _ = train_data_loader(DATALAKE_CHANNEL_ID,
                                            DATALAKE_VAL_FILE_ID,
                                            LABEL_FIELD, INPUT_FIELDS)
    else:
        X_val, y_val = None, None

    extraction_cb = ModelExtractionCallback()
    tensorboard_cb = TensorBoardCallback(statistics, writer)
    tensorboard_cb.set_valid(X_val, y_val, Parameters.IS_CLASSIFICATION,
                             IS_MULTI, Parameters.NUM_CLASS)
    callbacks = [
        extraction_cb,
        tensorboard_cb,
    ]

    lgb.cv(PARAMS,
           dtrain,
           nfold=Parameters.NFOLD,
           early_stopping_rounds=Parameters.EARLY_STOPPING_ROUNDS,
           verbose_eval=Parameters.VERBOSE_EVAL,
           stratified=STRATIFIED,
           callbacks=callbacks,
           metrics=Parameters.METRIC,
           seed=Parameters.SEED)

    models = extraction_cb.raw_boosters
    for i, model in enumerate(models):
        model.save_model(
            os.path.join(ABEJA_TRAINING_RESULT_DIR, f'model_{i}.txt'))

    di = {**(Parameters.as_dict()), 'cols_train': cols_train}
    with open(os.path.join(ABEJA_TRAINING_RESULT_DIR, 'lgb_env.json'), 'w') as lgb_env:
        json.dump(di, lgb_env)
    writer.close()
def predict():
    weight = np.load("./result/weight.npy")
    tr_data_loader = train_data_loader()
    te_data_loader = test_data_loader(tr_data_loader.mean, tr_data_loader.std)
    question = te_data_loader.get_data()

    # predict
    pre = np.dot(question, weight)
    pre = (pre * te_data_loader.std[9]) + te_data_loader.mean[9]
    for i in range(len(pre)):
        print("id:", i, pre[i])

    # save file
    with open("./result/predict.csv", "w") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["id", "value"])
        for i in range(len(pre)):
            id_name = 'id_' + str(i)
            answer = float(pre[i])
            if answer < 0:
                answer = 0
            writer.writerow([id_name, answer])
hyper_params = {
    "num_epochs": config.num_epochs,
    "batch_size": config.batch_size,
    "learning_rate": config.learning_rate,
    "hidden_size": config.hidden_size,
    "pretrained": config.pretrained
}

# define a path to save experiment logs
experiment_path = "./{}".format(config.exp)
if not os.path.exists(experiment_path):
    os.mkdir(experiment_path)

# create data loaders
train_dataloader = data_loader.train_data_loader()
test_dataloader = data_loader.test_data_loader()

Model = model.newModel()
Model.to(config.device)

# define loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adadelta(Model.parameters(), lr=config.learning_rate)


def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(config.device), y.to(config.device)
        # compute prediction error
bind_model(model)
# model.summary()

""" Load data """
print('dataset path', DATASET_PATH)
output_path = ['./img_list.pkl', './label_list.pkl']
train_dataset_path = DATASET_PATH + '/train/train_data'

if nsml.IS_ON_NSML:
    # Caching file
    nsml.cache(train_data_loader,
               data_path=train_dataset_path,
               img_size=input_shape[:2],
               output_path=output_path)
else:
    # When running locally, provide the local path to the dataset.
    train_data_loader(train_dataset_path, input_shape[:2],
                      output_path=output_path)

with open(output_path[0], 'rb') as img_f:
    img_list = pickle.load(img_f)
with open(output_path[1], 'rb') as label_f:
    label_list = pickle.load(label_f)

mean_arr = None  # np.zeros(input_shape)
# for img in img_list:
#     mean_arr += img.astype('float32')
# mean_arr /= len(img_list)
# print('mean shape:', mean_arr.shape, 'mean mean:', mean_arr.mean(), 'mean max:', mean_arr.max())
# mean_arr /= 255
# np.save('./mean.npy', mean_arr)
bTrainmode = False
if config.mode == 'train':
    bTrainmode = True

    """ Load data """
    print(DATASET_PATH)
    output_path = ['./img_list.pkl', './label_list.pkl']
    train_dataset_path = DATASET_PATH + '/train/train_data'
    if nsml.IS_ON_NSML:
        # Caching file
        nsml.cache(train_data_loader,
                   data_path=train_dataset_path,
                   output_path=output_path)
    else:
        train_dataset_path = config.debug_data
        train_data_loader(train_dataset_path, output_path=output_path)

    with open(output_path[0], 'rb') as img_f:
        img_list = pickle.load(img_f)
    with open(output_path[1], 'rb') as label_f:
        label_list = pickle.load(label_f)

    queries, references, queries_img, reference_img \
        = convert_to_query_db_data_for_generator(img_list, label_list,
                                                 input_shape,
                                                 config.dev_querynum,
                                                 config.dev_referencenum)
    print("mAP devset : query(%d), reference(%d) " % (len(queries), len(references)))

    dataset = get_triplet_dataset(train_dataset_path, batch_size, nb_epoch,
                                  num_classes=num_classes)
def train(model,
          train_inp_tuple,
          validation_inp_tuple,
          checkpoint_dir,
          checkpoint_prefix,
          device,
          epoches=5,
          batch_size=1024,
          logger=None,
          epoch_start=0,
          max_seq_len=100,
          lr=1e-3):
    """
    : model (torch.nn.Module): model to be trained
    : train_inp_tuple (list[tuple(str, list[str], list[str])]): list of inputs for train_data_loader
        : str: path to label data
        : list[str]: list of embedding variables
        : list[str]: list of paths to a pkl file
    : validation_inp_tuple (list[tuple(str, list[str], list[str])]): list of inputs for train_data_loader
        : str: path to label data
        : list[str]: list of embedding variables
        : list[str]: list of paths to a pkl file
    : checkpoint_dir (str): path to checkpoint directory
    : checkpoint_prefix (str): prefix of checkpoint file
    : device (str): device to train the model on
    : epoches (int): number of epochs to train
    : batch_size (int): size of mini batch
    : epoch_start (int): if 0, train a new model; otherwise load an existing
      checkpoint and continue training, default 0
    : max_seq_len (int): max length for sequence input, default 100
    : lr (float): learning rate for Adam, default 1e-3
    """
    global w2v_registry, model_path
    gc.enable()

    # Check checkpoint directory
    if not os.path.isdir(checkpoint_dir):
        os.mkdir(checkpoint_dir)

    # Load model if not training from scratch
    if epoch_start != 0:
        model_artifact_path = os.path.join(
            checkpoint_dir, '{}_{}.pth'.format(checkpoint_prefix, epoch_start))
        model.load_state_dict(torch.load(model_artifact_path))
        if logger:
            logger.info('Start retraining from epoch {}'.format(epoch_start))

    # Set up loss function and optimizer
    model.to(device)
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, amsgrad=True)
    div, mod = divmod(810000, batch_size)
    n_batch_estimate = div + min(mod, 1)

    # Main loop
    for epoch in range(1 + epoch_start, epoches + 1 + epoch_start):
        if logger:
            logger.info('=========================')
            logger.info('Processing Epoch {}/{}'.format(
                epoch, epoches + epoch_start))
            logger.info('=========================')

        # Train model
        model.train()
        train_running_loss, train_n_batch = 0, 0
        for index, (label_artifact_path, seq_inp_target,
                    seq_inp_path) in enumerate(train_inp_tuple, start=1):
            train_loader = train_data_loader(label_artifact_path,
                                             seq_inp_target,
                                             seq_inp_path,
                                             w2v_registry,
                                             batch_size=batch_size,
                                             max_seq_len=max_seq_len)
            train_iterator = iter(train_loader)
            while True:
                try:
                    y, x_seq, x_last_idx = next(train_iterator)
                    y = torch.from_numpy(y).long().to(device)
                    x = []
                    for s in x_seq:
                        x.append(s.to(device))
                    x.append(x_last_idx)

                    optimizer.zero_grad()
                    yp = F.softmax(model(*x), dim=1)
                    loss = loss_fn(yp, y)
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   max_norm=100)
                    optimizer.step()

                    train_running_loss += loss.item()
                    train_n_batch += 1
                    if train_n_batch % 100 == 0 and logger:
                        logger.info(
                            'Epoch {}/{} - Batch {}/{} Done - Train Loss: {:.6f}'
                            .format(epoch, epoches + epoch_start,
                                    train_n_batch, n_batch_estimate,
                                    train_running_loss / train_n_batch))

                    del x, y, yp, x_seq, x_last_idx
                    _ = gc.collect()
                    torch.cuda.empty_cache()
                except StopIteration:
                    break

            del train_loader, train_iterator
            _ = gc.collect()
            torch.cuda.empty_cache()

        if logger:
            logger.info(
                'Epoch {}/{} - Batch {}/{} Done - Train Loss: {:.6f}'.format(
                    epoch, epoches + epoch_start, train_n_batch,
                    n_batch_estimate, train_running_loss / train_n_batch))

        # Evaluate model
        model.eval()
        test_running_loss, test_n_batch = 0, 0
        true_y, pred_y = [], []
        for index, (label_artifact_path, seq_inp_target,
                    seq_inp_path) in enumerate(validation_inp_tuple, start=1):
            train_loader = train_data_loader(label_artifact_path,
                                             seq_inp_target,
                                             seq_inp_path,
                                             w2v_registry,
                                             batch_size=batch_size,
                                             max_seq_len=max_seq_len)
            train_iterator = iter(train_loader)
            while True:
                try:
                    y, x_seq, x_last_idx = next(train_iterator)
                    y = torch.from_numpy(y).long().to(device)
                    x = []
                    for s in x_seq:
                        x.append(s.to(device))
                    x.append(x_last_idx)

                    yp = F.softmax(model(*x), dim=1)
                    loss = loss_fn(yp, y)
                    pred_y.extend(list(yp.cpu().detach().numpy()))
                    true_y.extend(list(y.cpu().detach().numpy()))
                    test_running_loss += loss.item()
                    test_n_batch += 1

                    del x, y, yp, x_seq, x_last_idx
                    _ = gc.collect()
                    torch.cuda.empty_cache()
                except StopIteration:
                    break

            del train_loader, train_iterator
            _ = gc.collect()
            torch.cuda.empty_cache()

        pred = np.argmax(np.array(pred_y), 1)
        true = np.array(true_y).reshape((-1, ))
        acc_score = accuracy_score(true, pred)
        del pred, true, pred_y, true_y
        _ = gc.collect()
        torch.cuda.empty_cache()

        if logger:
            logger.info(
                'Epoch {}/{} Done - Test Loss: {:.6f}, Test Accuracy: {:.6f}'.format(
                    epoch, epoches + epoch_start,
                    test_running_loss / test_n_batch, acc_score))

        # Save model state dict
        ck_file_name = '{}_{}.pth'.format(checkpoint_prefix, epoch)
        ck_file_path = os.path.join(checkpoint_dir, ck_file_name)
        torch.save(model.state_dict(), ck_file_path)
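A minimal usage sketch for train() based on its docstring; the file paths, the embedding name 'creative_id', and MyModel below are hypothetical placeholders rather than part of the original project.

import logging

import torch

# Hypothetical inputs, following the documented shape:
# (path to label data, list of embedding variables, list of pkl paths)
train_inputs = [('data/train_label.pkl', ['creative_id'],
                 ['data/train_creative_seq.pkl'])]
valid_inputs = [('data/valid_label.pkl', ['creative_id'],
                 ['data/valid_creative_seq.pkl'])]

model = MyModel()  # placeholder for the actual model class
train(model,
      train_inputs,
      valid_inputs,
      checkpoint_dir='./checkpoints',
      checkpoint_prefix='seq_model',
      device='cuda:0' if torch.cuda.is_available() else 'cpu',
      epoches=5,
      batch_size=1024,
      logger=logging.getLogger(__name__))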
if config.model_to_test:
    model = load(file_path=config.model_to_test)

if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)

if config.mode == 'train':
    # torch.autograd.set_detect_anomaly(True)

    """ Load data """
    print('dataset path', dataset_path)
    train_dataset_path = dataset_path  # + '/train/train_data'
    img_dataset = train_data_loader(data_path=train_dataset_path,
                                    img_size=input_size,
                                    use_augment=use_augmentation)

    # Balanced batch sampler and online train loader
    train_batch_sampler = BalancedBatchSampler(img_dataset,
                                               n_classes=num_classes,
                                               n_samples=num_samples)
    # train_batch_sampler = NegativeClassMiningBatchSampler(img_dataset, n_classes=num_classes, n_samples=num_samples)
    online_train_loader = torch.utils.data.DataLoader(img_dataset,
                                                      batch_sampler=train_batch_sampler,
                                                      num_workers=4,
                                                      pin_memory=True)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Gather the parameters to be optimized/updated.
    params_to_update = model.parameters()
    print("Params to learn:")
    if feature_extracting:
def main():
    t_data_loader = train_data_loader()
    question, answer = t_data_loader.load_all_data()
    train(question, answer, 300000, 1, True)
t2 = time.time()
print(res.history)
print('Training time for one epoch : %.1f' % (t2 - t1))
train_loss, train_acc = res.history['loss'][0], res.history['acc'][0]
nsml.report(summary=True, epoch=epoch, epoch_total=nb_epoch,
            loss=train_loss, acc=train_acc)
nsml.save(epoch + 1)

print('Total training time : %.1f' % (time.time() - t0))

""" Test with a subset of training data """
print('dataset path', DATASET_PATH)
output_path = ['./img_list.pkl', './label_list.pkl']
train_dataset_path = DATASET_PATH + '/train/train_data'
train_data_loader(train_dataset_path, input_shape[:2],
                  output_path=output_path, num_samples=5000)

with open(output_path[0], 'rb') as img_f:
    img_list = pickle.load(img_f)
with open(output_path[1], 'rb') as label_f:
    label_list = pickle.load(label_f)

x_train = np.asarray(img_list)
labels = np.asarray(label_list)
label_binarizer = LabelBinarizer()
y_train = label_binarizer.fit_transform(labels)
x_train = x_train.astype('float32')
# x_train /= 255
x_train = preprocess_input(x_train)
print(len(labels), 'validation samples')
model.compile(loss='categorical_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])

""" Load data """
print('dataset path', DATASET_PATH)
output_path = ['./triplets.txt']
train_dataset_path = DATASET_PATH + '/train/train_data'
if nsml.IS_ON_NSML:
    # Caching file
    nsml.cache(tripletSampler,
               data_path=train_dataset_path,
               img_size=input_shape[:2],
               output_path=output_path)
else:
    # When running locally, provide the local path to the dataset.
    train_data_loader('/home/donghoon/Downloads/image-similarity-deep-ranking/dataset',
                      input_shape[:2], output_path=output_path)

with open(output_path[0], 'rb') as img_f:
    img_list = pickle.load(img_f)

# x_train = np.asarray(img_list)
# labels = np.asarray(label_list)
# y_train = keras.utils.to_categorical(labels, num_classes=num_classes)
# x_train = x_train.astype('float32')
# x_train /= 255
# print(len(labels), 'train samples')
#
# """ Callback """
# monitor = 'acc'
# reduce_lr = ReduceLROnPlateau(monitor=monitor, patience=3)
def main_training(log_tuple,
                  validation_set=0,
                  threshold=0.5,
                  layers=3,
                  lr=1e-2,
                  nb_epoch=5,
                  nb_samples_per_epoch=100,
                  nb_val_samples=20,
                  patience=20,
                  path='models/weights'):
    best_val_loss = np.inf
    not_done_looping = True
    nb_perf_not_improved = 0
    demo_dict = {}
    log_train, log_valid = log_tuple

    for epoch in range(nb_epoch):
        print("Epoch: {}/{}".format(epoch + 1, nb_epoch))
        if not_done_looping:
            progbar = Progbar(target=nb_samples_per_epoch)
            seen = 0
            count_train_samples = 0
            decay = math.pow(0.5, epoch / 50)
            lr = lr * decay
            set_lr(lr)
            mean_accuracy = 0
            mean_val_loss = 0
            mean_dice_score = 0
            mean_precision = 0
            mean_recall = 0
            count_valid_samples = 0
            no_of_patches_seen = 0
            mean_train_loss = 0
            mean_train_recall = 0
            mean_train_precision = 0
            mean_train_dice_score = 0

            for X_train, Y_train, weights in train_data_loader(
                    train_batch_size, combine_label):
                if count_train_samples == nb_samples_per_epoch:
                    break
                if seen < nb_samples_per_epoch:
                    log_values = []
                    xs = X_train.shape[2]
                    ys = Y_train.shape[3]
                    Y_train = Y_train.reshape((train_batch_size * xs * ys, ))
                    weights = weights.reshape((train_batch_size * xs * ys, ))
                    train_loss = train_fn(X_train.astype('float32'),
                                          Y_train.astype('int32'),
                                          weights.astype('float32'))
                    Y_pred = predict_fn(X_train.astype('float32'))
                    Y_pred_class = np.argmax(Y_pred, axis=1)
                    dice_score = get_dice_score(Y_train, Y_pred_class)
                    mean_train_loss += train_loss
                    mean_train_dice_score += dice_score
                    count_train_samples += X_train.shape[0]
                    seen += X_train.shape[0]
                    log_values.append(('train_loss', train_loss))
                    if seen < nb_samples_per_epoch:
                        progbar.update(seen, log_values)

            log_values.append(('train_loss', train_loss))
            progbar.update(seen, log_values, force=True)
            mean_train_loss = mean_train_loss / (nb_samples_per_epoch /
                                                 train_batch_size)
            mean_train_dice_score = mean_train_dice_score / (
                nb_samples_per_epoch / train_batch_size)
            log_train.post('train_loss', mean_train_loss, epoch)
            log_train.post("mean_train_dice_score", mean_train_dice_score,
                           epoch)

            if epoch % 5 == 0:
                validation_start = time.time()
                count_valid_samples = 0
                for X_valid, Y_valid in valid_data_loader(
                        nb_val_samples, valid_batch_size, combine_label):
                    xs = X_valid.shape[2]
                    ys = Y_valid.shape[3]
                    Y_valid = Y_valid.reshape((valid_batch_size * xs * ys, ))
                    Y_pred = test_predict_fn(X_valid.astype('float32'))
                    val_loss = loss(
                        Y_pred.astype('float32'), Y_valid.astype('int32'),
                        np.ones((Y_valid.shape[0], )).astype('float32')).eval()
                    Y_pred_class = np.argmax(Y_pred, axis=1)
                    dice_score = get_dice_score(Y_valid, Y_pred_class)
                    Y_pred = Y_pred_class.reshape(valid_batch_size, 1, xs, ys)
                    Y_valid = Y_valid.reshape(valid_batch_size, 1, xs, ys)
                    save_image_path = os.path.join(
                        save_path, str(epoch),
                        '{}.png'.format(count_valid_samples))
                    if not os.path.exists(os.path.join(save_path, str(epoch))):
                        os.makedirs(os.path.join(save_path, str(epoch)))
                    vis_detections(X_valid[5][0], Y_valid[5][0], Y_pred[5][0],
                                   save_image_path)
                    mean_val_loss += val_loss
                    mean_dice_score += dice_score
                    count_valid_samples += 1

                mean_val_loss = mean_val_loss / (nb_val_samples /
                                                 valid_batch_size)
                mean_dice_score = mean_dice_score / (nb_val_samples /
                                                     valid_batch_size)
                print(mean_val_loss, mean_dice_score)
                log_valid.post("val_loss", mean_val_loss, epoch)
                log_valid.post("mean_val_dice_score", mean_dice_score, epoch)
                print("mean_val_loss: {} , mean_dice_score: {}".format(
                    mean_val_loss, mean_dice_score))
                validation_end = time.time()
                validation_time = validation_end - validation_start
                print('validation time : %ds' % validation_time)

                if mean_val_loss < best_val_loss:
                    best_val_loss = mean_val_loss
                    best_epoch = epoch
                    nb_perf_not_improved = 0
                    dpath = os.path.join(
                        path, "Unet_vald_set_{}_val_loss_{}_epoch_{}".format(
                            validation_set, best_val_loss, best_epoch))
                    save_params(dpath)
                else:
                    nb_perf_not_improved += 1

            if nb_perf_not_improved > patience:
                print("Exiting training as performance not improving for {} loops"
                      .format(patience))
                not_done_looping = False

    return best_val_loss, best_epoch
STRATIFIED = Parameters.STRATIFIED and Parameters.IS_CLASSIFICATION
IS_MULTI = Parameters.OBJECTIVE.startswith("multi")
statistics = Statistics(Parameters.NUM_ITERATIONS)

log_path = os.path.join(ABEJA_TRAINING_RESULT_DIR, 'logs')
writer = SummaryWriter(log_dir=log_path)

# In[4]:

print(f'start training with parameters : {Parameters.as_dict()}')

# In[5]:

X_train, y_train, cols_train = train_data_loader(DATALAKE_CHANNEL_ID,
                                                 DATALAKE_TRAIN_FILE_ID,
                                                 LABEL_FIELD, INPUT_FIELDS)

# In[8]:

dtrain = lgb.Dataset(X_train, y_train)

if DATALAKE_VAL_FILE_ID:
    X_val, y_val, _ = train_data_loader(DATALAKE_CHANNEL_ID,
                                        DATALAKE_VAL_FILE_ID,
                                        LABEL_FIELD, INPUT_FIELDS)
else:
    X_val, y_val = None, None

extraction_cb = ModelExtractionCallback()
tensorboard_cb = TensorBoardCallback(statistics, writer)
def handler(context):
    print(
        f'start training with parameters : {Parameters.as_dict()}, context : {context}'
    )

    X_train, y_train, cols_train = train_data_loader(DATALAKE_CHANNEL_ID,
                                                     DATALAKE_TRAIN_FILE_ID,
                                                     LABEL_FIELD, INPUT_FIELDS)

    models = []
    pred = np.zeros(len(X_train))

    if DATALAKE_VAL_FILE_ID:
        X_val, y_val, _ = train_data_loader(DATALAKE_CHANNEL_ID,
                                            DATALAKE_VAL_FILE_ID,
                                            LABEL_FIELD, INPUT_FIELDS)
        if IS_MULTI:
            pred_val = np.zeros((len(X_val), NUM_CLASS))
        else:
            pred_val = np.zeros(len(X_val))
    else:
        X_val, y_val, pred_val = None, None, None

    for i, (train_index, valid_index) in enumerate(skf.split(X_train, y_train)):
        model = classifier(**PARAMS)
        model.fit(X_train.iloc[train_index], y_train[train_index])
        pred[valid_index] = model.predict(X_train.iloc[valid_index])
        score, loss = evaluator(y_train[valid_index], pred[valid_index])
        score_val = 0.0
        loss_val = 0.0

        filename = os.path.join(ABEJA_TRAINING_RESULT_DIR, f'model_{i}.pkl')
        with open(filename, 'wb') as f:
            pickle.dump(model, f)
        models.append(model)

        if DATALAKE_VAL_FILE_ID:
            pred_val_cv = model.predict(X_val)
            if IS_MULTI:
                pred_val += np.identity(NUM_CLASS)[pred_val_cv]
            else:
                pred_val += pred_val_cv
            score_val, loss_val = evaluator(y_val, pred_val_cv)

        print('-------------')
        print(
            'cv {} || score:{:.4f} || loss:{:.4f} || val_score:{:.4f} || val_loss:{:.4f}'
            .format(i + 1, score, loss, score_val, loss_val))
        writer.add_scalar('main/acc', score, i + 1)
        writer.add_scalar('main/loss', loss, i + 1)
        writer.add_scalar('test/acc', score_val, i + 1)
        writer.add_scalar('test/loss', loss_val, i + 1)
        statistics(i + 1, loss, score, loss_val, score_val)
        writer.flush()

    score, loss = evaluator(y_train, pred)
    score_val = 0.0
    loss_val = 0.0
    if DATALAKE_VAL_FILE_ID:
        if IS_MULTI:
            pred_val = np.argmax(pred_val, axis=1)
        else:
            pred_val /= len(models)
        score_val, loss_val = evaluator(y_val, pred_val)

    print('-------------')
    print(
        'cv total score:{:.4f} || cv total loss:{:.4f} || cv total val_score:{:.4f} || cv total val_loss:{:.4f}'
        .format(score, loss, score_val, loss_val))
    statistics(Parameters.NFOLD, None, score, None, score_val)
    writer.add_scalar('main/acc', score, Parameters.NFOLD)
    writer.add_scalar('main/loss', loss, Parameters.NFOLD)
    writer.add_scalar('test/acc', score_val, Parameters.NFOLD)
    writer.add_scalar('test/loss', loss_val, Parameters.NFOLD)
    writer.close()

    di = {**(Parameters.as_dict()), 'cols_train': cols_train}
    with open(os.path.join(ABEJA_TRAINING_RESULT_DIR, 'skf_env.json'), 'w') as skf_env:
        json.dump(di, skf_env)
    return
from sklearn import metrics
from matplotlib import pyplot as plt

from model import Model
from hyperparams import Hyperparams
from data_loader import train_data_loader, test_data_loader, prediction_dataframe, hist_data, untransformed_price

# logger configuration
FORMAT = "[%(filename)s: %(lineno)3s] %(levelname)s: %(message)s"
logging.basicConfig(level=logging.INFO, format=FORMAT)
logger = logging.getLogger(__name__)

H = Hyperparams()

train_batch_generator = train_data_loader(H.train_batch_size, H.num_train)
test_batch_generator = test_data_loader(H.test_batch_size, H.num_train)
prediction_dataframe_gen = prediction_dataframe()
scaler = prediction_dataframe_gen.get_scaler()
logger.info("Generators instantiated")

model = Model().get_model()
logger.info("Model loaded")

model.compile(optimizer='RMSProp', loss='mean_squared_error')
logger.info("Model compiled")

logger.info("Beginning training")
train_num_batch = H.num_train // H.train_batch_size
train_shuffled_batch = np.array([
    np.random.choice(train_num_batch, size=(train_num_batch), replace=False)
    for _ in range(H.num_epochs)
])