def select_train_data(filename, methods=['POST']):
    global train_data
    try:
        with open("./datasets/" + filename['filename'], 'r') as file:
            train_data = TrainData()
            train_data.from_metadata(json.load(file))
        socketio.emit('selected_train_data', 1)
    except Exception as ex:
        print(ex)
        socketio.emit('selected_train_data', 0)
def main():
    learning_rate = 5e-2
    num_epoch = 20
    save_dir = 'models'
    h5file = '/home/song/workspace/datasets/recog-alzheimer/train/train_pre_data.h5'
    csvfile = '/home/song/workspace/datasets/recog-alzheimer/train/train_pre_label.csv'

    dataset = TrainData(h5file, csvfile, img_size=48)
    train_loader = DataLoader(dataset, batch_size=5, shuffle=True)

    solver = Solver(learning_rate, num_epoch, save_dir)
    solver.train(train_loader)
def traintest():
    if not os.path.isfile(train_data_pickle):
        # training data
        train_features, train_labels = get_features(['train_data'], "train")
        traindata = TrainData(train_features, train_labels)
        with open(train_data_pickle, mode='wb') as f:
            pickle.dump(traindata, f)
    else:
        print("loading: %s" % (train_data_pickle))
        with open(train_data_pickle, mode='rb') as f:
            traindata = pickle.load(f)
        train_features = traindata.train_inputs
        train_labels = traindata.train_targets

    if not os.path.isfile(test_data_pickle):
        # testing data
        test_features, _ = get_features(['test_data'], "test")
        testdata = TestData(test_features)
        with open(test_data_pickle, mode='wb') as f:
            pickle.dump(testdata, f)
    else:
        print("loading: %s" % (test_data_pickle))
        with open(test_data_pickle, mode='rb') as f:
            testdata = pickle.load(f)
        test_features = testdata.test_inputs

    train_labels = one_hot_encode(train_labels)

    n_dim = train_features.shape[1]
    print("input dim: %s" % (n_dim))

    # random train and test sets.
    '''
    train_test_split = np.random.rand(len(train_features)) < 0.80
    Xtr = train_features[train_test_split]
    Ytr = train_labels[train_test_split]
    Xte = train_features[~train_test_split]
    Yte = train_labels[~train_test_split]
    '''
    Xtr = train_features
    Ytr = train_labels
    Xte = test_features

    knn(n_dim, Xtr, Ytr, Xte)
def add_train():
    status = 100  # = fail
    try:
        content = request.files['file']
        # get the file name without its extension
        filename = (content.filename).replace(".json", "")
        content = content.read().decode('utf-8')
        content = json.loads(content)

        global train_data
        train_data = TrainData(filename)

        if not train_data.filter_json(content):
            message = "incorrect data structure (1)"
            print(message)
            return make_response(jsonify({"message": message}), status)

        if not train_data.is_correct():
            message = "incorrect data structure (2)"
            print(message)
            return make_response(jsonify({"message": message}), status)

        if not train_data.metadata():
            message = "failed to create metadata"
            print(message)
            return make_response(jsonify({"message": message}), status)

        # save metafile
        train_data.create_metafile()

        status = 200  # = success
        return make_response(jsonify({"message": "JSON received"}), status)
    except Exception as ex:
        print(ex)
        return make_response(jsonify({"message": str(ex)}), status)
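# For reference only: a minimal client-side sketch of how the add_train endpoint
# above might be exercised. The route path "/add_train" and the host/port are
# assumptions for illustration, not taken from the snippet itself.
import requests

def upload_train_json(path, url="http://localhost:5000/add_train"):
    # send the JSON file as multipart form data under the "file" field,
    # which is what request.files['file'] in add_train reads
    with open(path, "rb") as f:
        response = requests.post(url, files={"file": (path, f, "application/json")})
    print(response.status_code, response.json().get("message"))
    return response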
def main():
    input_dir = "/amit/kaggle/tgs"
    output_dir = "/artifacts"
    image_size_target = 128
    batch_size = 32
    epochs_to_train = 300
    bce_loss_weight_gamma = 0.98
    sgdr_min_lr = 0.0001  # 0.0001, 0.001
    sgdr_max_lr = 0.001  # 0.001, 0.03
    sgdr_cycle_epochs = 20
    sgdr_cycle_epoch_prolongation = 3
    sgdr_cycle_end_patience = 3
    train_abort_epochs_without_improval = 30
    ensemble_model_count = 3
    swa_epoch_to_start = 30

    model_dir = sys.argv[1] if len(sys.argv) > 1 else None

    train_data = TrainData(input_dir)

    train_set = TrainDataset(train_data.train_set_df, image_size_target, augment=True)
    train_set_data_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=8)

    val_set = TrainDataset(train_data.val_set_df, image_size_target, augment=False)
    val_set_data_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False, num_workers=2)

    if model_dir:
        model = create_model(pretrained=False).to(device)
        model.load_state_dict(torch.load("{}/model.pth".format(model_dir), map_location=device))
    else:
        model = create_model(pretrained=True).to(device)
    torch.save(model.state_dict(), "{}/model.pth".format(output_dir))

    swa_model = create_model(pretrained=False).to(device)

    print("train_set_samples: %d, val_set_samples: %d" % (len(train_set), len(val_set)))

    global_val_precision_best_avg = float("-inf")
    global_swa_val_precision_best_avg = float("-inf")
    sgdr_cycle_val_precision_best_avg = float("-inf")

    epoch_iterations = len(train_set) // batch_size

    # optimizer = optim.SGD(model.parameters(), lr=sgdr_max_lr, weight_decay=0, momentum=0.9, nesterov=True)
    optimizer = optim.Adam(model.parameters(), lr=sgdr_max_lr)
    lr_scheduler = CosineAnnealingLR(optimizer, T_max=sgdr_cycle_epochs, eta_min=sgdr_min_lr)

    optim_summary_writer = SummaryWriter(log_dir="{}/logs/optim".format(output_dir))
    train_summary_writer = SummaryWriter(log_dir="{}/logs/train".format(output_dir))
    val_summary_writer = SummaryWriter(log_dir="{}/logs/val".format(output_dir))
    swa_val_summary_writer = SummaryWriter(log_dir="{}/logs/swa_val".format(output_dir))

    sgdr_iterations = 0
    sgdr_reset_count = 0
    batch_count = 0
    epoch_of_last_improval = 0

    sgdr_next_cycle_end_epoch = sgdr_cycle_epochs + sgdr_cycle_epoch_prolongation
    swa_update_count = 0

    ensemble_model_index = 0
    for model_file_path in glob.glob("{}/model-*.pth".format(output_dir)):
        model_file_name = os.path.basename(model_file_path)
        model_index = int(model_file_name.replace("model-", "").replace(".pth", ""))
        ensemble_model_index = max(ensemble_model_index, model_index + 1)

    print('{"chart": "best_val_precision", "axis": "epoch"}')
    print('{"chart": "val_precision", "axis": "epoch"}')
    print('{"chart": "val_loss", "axis": "epoch"}')
    print('{"chart": "sgdr_reset", "axis": "epoch"}')
    print('{"chart": "precision", "axis": "epoch"}')
    print('{"chart": "loss", "axis": "epoch"}')
    print('{"chart": "swa_val_precision", "axis": "epoch"}')
    print('{"chart": "swa_val_loss", "axis": "epoch"}')

    train_start_time = time.time()

    criterion = nn.BCEWithLogitsLoss()

    for epoch in range(epochs_to_train):
        epoch_start_time = time.time()
        model.train()

        train_loss_sum = 0.0
        train_precision_sum = 0.0
        train_step_count = 0
        for batch in train_set_data_loader:
            images, masks, mask_weights = \
                batch[0].to(device, non_blocking=True), \
                batch[1].to(device, non_blocking=True), \
                batch[2].to(device, non_blocking=True)

            lr_scheduler.step(epoch=min(sgdr_cycle_epochs, sgdr_iterations / epoch_iterations))

            optimizer.zero_grad()
            prediction_logits = model(images)
            predictions = torch.sigmoid(prediction_logits)
            criterion.weight = mask_weights
            loss = criterion(prediction_logits, masks)
            loss.backward()
            optimizer.step()

            train_loss_sum += loss.item()
            train_precision_sum += np.mean(precision_batch(predictions, masks))
            sgdr_iterations += 1
            train_step_count += 1
            batch_count += 1

            optim_summary_writer.add_scalar("lr", get_learning_rate(optimizer), batch_count + 1)

        train_loss_avg = train_loss_sum / train_step_count
        train_precision_avg = train_precision_sum / train_step_count

        val_loss_avg, val_precision_avg = evaluate(model, val_set_data_loader, criterion)

        model_improved_within_sgdr_cycle = val_precision_avg > sgdr_cycle_val_precision_best_avg
        if model_improved_within_sgdr_cycle:
            torch.save(model.state_dict(), "{}/model-{}.pth".format(output_dir, ensemble_model_index))
            sgdr_cycle_val_precision_best_avg = val_precision_avg

        model_improved = val_precision_avg > global_val_precision_best_avg
        ckpt_saved = False
        if model_improved:
            torch.save(model.state_dict(), "{}/model.pth".format(output_dir))
            global_val_precision_best_avg = val_precision_avg
            ckpt_saved = True

        swa_model_improved = False
        if epoch + 1 >= swa_epoch_to_start:
            if model_improved_within_sgdr_cycle:
                swa_update_count += 1
                moving_average(swa_model, model, 1.0 / swa_update_count)
                bn_update(train_set_data_loader, swa_model)
            swa_model_improved = val_precision_avg > global_swa_val_precision_best_avg
            if swa_model_improved:
                torch.save(swa_model.state_dict(), "{}/swa_model.pth".format(output_dir))
                global_swa_val_precision_best_avg = val_precision_avg

        if model_improved or swa_model_improved:
            epoch_of_last_improval = epoch

        sgdr_reset = False
        if (epoch + 1 >= sgdr_next_cycle_end_epoch) and (epoch - epoch_of_last_improval >= sgdr_cycle_end_patience):
            sgdr_iterations = 0
            sgdr_next_cycle_end_epoch = epoch + 1 + sgdr_cycle_epochs + sgdr_cycle_epoch_prolongation
            ensemble_model_index += 1
            sgdr_cycle_val_precision_best_avg = float("-inf")
            sgdr_reset_count += 1
            sgdr_reset = True

        swa_val_loss_avg, swa_val_precision_avg = evaluate(swa_model, val_set_data_loader, criterion)

        optim_summary_writer.add_scalar("sgdr_reset", sgdr_reset_count, epoch + 1)

        train_summary_writer.add_scalar("loss", train_loss_avg, epoch + 1)
        train_summary_writer.add_scalar("precision", train_precision_avg, epoch + 1)
        val_summary_writer.add_scalar("loss", val_loss_avg, epoch + 1)
        val_summary_writer.add_scalar("precision", val_precision_avg, epoch + 1)
        swa_val_summary_writer.add_scalar("loss", swa_val_loss_avg, epoch + 1)
        swa_val_summary_writer.add_scalar("precision", swa_val_precision_avg, epoch + 1)

        epoch_end_time = time.time()
        epoch_duration_time = epoch_end_time - epoch_start_time

        print(
            "[%03d/%03d] %ds, lr: %.6f, loss: %.3f, val_loss: %.3f|%.3f, prec: %.3f, val_prec: %.3f|%.3f, ckpt: %d, rst: %d" % (
                epoch + 1,
                epochs_to_train,
                epoch_duration_time,
                get_learning_rate(optimizer),
                train_loss_avg,
                val_loss_avg,
                swa_val_loss_avg,
                train_precision_avg,
                val_precision_avg,
                swa_val_precision_avg,
                int(ckpt_saved),
                int(sgdr_reset)),
            flush=True)

        print('{"chart": "best_val_precision", "x": %d, "y": %.3f}' % (epoch + 1, global_val_precision_best_avg))
        print('{"chart": "val_precision", "x": %d, "y": %.3f}' % (epoch + 1, val_precision_avg))
        print('{"chart": "val_loss", "x": %d, "y": %.3f}' % (epoch + 1, val_loss_avg))
        print('{"chart": "sgdr_reset", "x": %d, "y": %.3f}' % (epoch + 1, sgdr_reset_count))
        print('{"chart": "precision", "x": %d, "y": %.3f}' % (epoch + 1, train_precision_avg))
        print('{"chart": "loss", "x": %d, "y": %.3f}' % (epoch + 1, train_loss_avg))
        print('{"chart": "swa_val_precision", "x": %d, "y": %.3f}' % (epoch + 1, swa_val_precision_avg))
        print('{"chart": "swa_val_loss", "x": %d, "y": %.3f}' % (epoch + 1, swa_val_loss_avg))

        if sgdr_reset and sgdr_reset_count >= ensemble_model_count and epoch - epoch_of_last_improval >= train_abort_epochs_without_improval:
            print("early abort")
            break

    optim_summary_writer.close()
    train_summary_writer.close()
    val_summary_writer.close()

    train_end_time = time.time()
    print()
    print("Train time: %s" % str(datetime.timedelta(seconds=train_end_time - train_start_time)))

    eval_start_time = time.time()

    print()
    print("evaluation of the training model")

    model.load_state_dict(torch.load("{}/model.pth".format(output_dir), map_location=device))
    analyze(Ensemble([model]), train_data.val_set_df, use_tta=False)
    analyze(Ensemble([model]), train_data.val_set_df, use_tta=True)

    score_to_model = {}
    ensemble_model_candidates = glob.glob("{}/model-*.pth".format(output_dir))
    ensemble_model_candidates.append("{}/swa_model.pth".format(output_dir))
    for model_file_path in ensemble_model_candidates:
        model_file_name = os.path.basename(model_file_path)
        m = create_model(pretrained=False).to(device)
        m.load_state_dict(torch.load(model_file_path, map_location=device))
        val_loss_avg, val_precision_avg = evaluate(m, val_set_data_loader, criterion)
        print("ensemble '%s': val_loss=%.3f, val_precision=%.3f" % (model_file_name, val_loss_avg, val_precision_avg))
        if len(score_to_model) < ensemble_model_count or min(score_to_model.keys()) < val_precision_avg:
            # evict the weakest candidate only once the pool is full, then add the new one
            if len(score_to_model) >= ensemble_model_count:
                del score_to_model[min(score_to_model.keys())]
            score_to_model[val_precision_avg] = m

    ensemble_models = list(score_to_model.values())
    for ensemble_model in ensemble_models:
        val_loss_avg, val_precision_avg = evaluate(ensemble_model, val_set_data_loader, criterion)
        print("ensemble: val_loss=%.3f, val_precision=%.3f" % (val_loss_avg, val_precision_avg))

    model = Ensemble(ensemble_models)
    mask_threshold_global, mask_threshold_per_cc = analyze(model, train_data.val_set_df, use_tta=True)

    eval_end_time = time.time()
    print()
    print("Eval time: %s" % str(datetime.timedelta(seconds=eval_end_time - eval_start_time)))

    print()
    print("submission preparation")

    submission_start_time = time.time()

    test_data = TestData(input_dir)
    calculate_predictions(test_data.df, model, use_tta=True)
    calculate_prediction_masks(test_data.df, mask_threshold_global)

    print()
    print(test_data.df.groupby("predictions_cc").agg({"predictions_cc": "count"}))

    write_submission(test_data.df, "prediction_masks", "{}/{}".format(output_dir, "submission.csv"))
    write_submission(test_data.df, "prediction_masks_best", "{}/{}".format(output_dir, "submission_best.csv"))

    submission_end_time = time.time()
    print()
    print("Submission time: %s" % str(datetime.timedelta(seconds=submission_end_time - submission_start_time)))
def main():
    args = argparser.parse_args()
    print("Arguments:")
    for arg in vars(args):
        print(" {}: {}".format(arg, getattr(args, arg)))
    print()

    input_dir = args.input_dir
    output_dir = args.output_dir
    base_model_dir = args.base_model_dir
    image_size = args.image_size
    augment = args.augment
    use_dummy_image = args.use_dummy_image
    use_progressive_image_sizes = args.use_progressive_image_sizes
    progressive_image_size_min = args.progressive_image_size_min
    progressive_image_size_step = args.progressive_image_size_step
    progressive_image_epoch_step = args.progressive_image_epoch_step
    batch_size = args.batch_size
    batch_iterations = args.batch_iterations
    test_size = args.test_size
    train_on_unrecognized = args.train_on_unrecognized
    num_category_shards = args.num_category_shards
    category_shard = args.category_shard
    eval_train_mapk = args.eval_train_mapk
    mapk_topk = args.mapk_topk
    num_shard_preload = args.num_shard_preload
    num_shard_loaders = args.num_shard_loaders
    num_workers = args.num_workers
    pin_memory = args.pin_memory
    epochs_to_train = args.epochs
    lr_scheduler_type = args.lr_scheduler
    lr_patience = args.lr_patience
    lr_min = args.lr_min
    lr_max = args.lr_max
    lr_min_decay = args.lr_min_decay
    lr_max_decay = args.lr_max_decay
    optimizer_type = args.optimizer
    loss_type = args.loss
    loss2_type = args.loss2
    loss2_start_sgdr_cycle = args.loss2_start_sgdr_cycle
    model_type = args.model
    patience = args.patience
    sgdr_cycle_epochs = args.sgdr_cycle_epochs
    sgdr_cycle_epochs_mult = args.sgdr_cycle_epochs_mult
    sgdr_cycle_end_prolongation = args.sgdr_cycle_end_prolongation
    sgdr_cycle_end_patience = args.sgdr_cycle_end_patience
    max_sgdr_cycles = args.max_sgdr_cycles

    use_extended_stroke_channels = model_type in ["cnn", "residual_cnn", "fc_cnn", "hc_fc_cnn"]

    train_data = TrainData(
        data_dir=input_dir,
        shard=0,
        test_size=test_size,
        train_on_unrecognized=train_on_unrecognized,
        confusion_set=None,
        num_category_shards=num_category_shards,
        category_shard=category_shard)

    val_set = TrainDataset(train_data.val_set_df, image_size, use_extended_stroke_channels, False, use_dummy_image)
    val_set_data_loader = \
        DataLoader(val_set, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=pin_memory)

    categories = train_data.categories

    criterion = create_criterion(loss_type, len(categories))

    model_dir = "/storage/models/quickdraw/seresnext50"
    model = load_ensemble_model(model_dir, 3, val_set_data_loader, criterion, model_type, image_size, len(categories))

    cs_entry_categories = [
        'angel', 'arm', 'bat', 'bathtub', 'bottlecap', 'hospital', 'police car',
        'spider', 'sun', 'tent', 'triangle', 'windmill'
    ]
    cs_categories = read_confusion_set(
        "/storage/models/quickdraw/seresnext50_confusion/confusion_set_{}.txt".format(0))

    predicted_words = predict(model, val_set_data_loader, categories, tta=True)

    prediction_mask = []
    cs_entry_match_count = 0
    cs_match_count = 0
    for i, p in enumerate(predicted_words):
        predicted_word = p.split(" ")[0].replace("_", " ")
        cond1 = predicted_word in cs_entry_categories
        prediction_mask.append(cond1)
        if cond1 and categories[train_data.val_set_df["category"][i]] in cs_entry_categories:
            cs_entry_match_count += 1
        if cond1 and categories[train_data.val_set_df["category"][i]] in cs_categories:
            cs_match_count += 1

    print("matched {} of {}".format(sum(prediction_mask), len(prediction_mask)), flush=True)
    print("cs_entry_match_count: {}".format(cs_entry_match_count), flush=True)
    print("cs_match_count: {}".format(cs_match_count), flush=True)

    df = {
        "category": train_data.val_set_df["category"][prediction_mask],
        "drawing": train_data.val_set_df["drawing"][prediction_mask]
    }

    val_set = TrainDataset(df, image_size, use_extended_stroke_channels, False, use_dummy_image)
    val_set_data_loader = \
        DataLoader(val_set, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=pin_memory)

    loss_avg, mapk_avg, accuracy_top1_avg, accuracy_top3_avg, accuracy_top5_avg, accuracy_top10_avg = \
        evaluate(model, val_set_data_loader, criterion, mapk_topk)
    print(
        "loss: {:.3f}, map@3: {:.3f}, acc@1: {:.3f}, acc@3: {:.3f}, acc@5: {:.3f}, acc@10: {:.3f}"
        .format(loss_avg, mapk_avg, accuracy_top1_avg, accuracy_top3_avg, accuracy_top5_avg, accuracy_top10_avg),
        flush=True)

    predicted_words = predict(model, val_set_data_loader, categories, tta=True)
    match_count = 0
    for i, p in enumerate(predicted_words):
        predicted_word = p.split(" ")[0].replace("_", " ")
        true_word = categories[df["category"][i]]
        if predicted_word == true_word:
            match_count += 1
        if predicted_word not in cs_entry_categories:
            print("predicted unexpected word: '{}'".format(predicted_word), flush=True)
    print("acc@1: {}".format(match_count / len(predicted_words)), flush=True)

    criterion = create_criterion(loss_type, len(cs_categories))

    model_dir = "/storage/models/quickdraw/seresnext50_cs_0"
    model = load_ensemble_model(model_dir, 3, val_set_data_loader, criterion, "seresnext50_cs", image_size, len(cs_categories))

    predicted_words = predict(model, val_set_data_loader, cs_categories, tta=True)
    match_count = 0
    for i, p in enumerate(predicted_words):
        predicted_word = p.split(" ")[0].replace("_", " ")
        true_word = categories[df["category"][i]]
        if predicted_word == true_word:
            match_count += 1
    print("acc@1: {}".format(match_count / len(predicted_words)), flush=True)
def create_datasets(args):
    train_data = TrainData(args.train_path)
    dev_data = ValidData(args.validation_path)
    return dev_data, train_data
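# Illustrative usage sketch for create_datasets above, not part of the original
# source. It assumes the datasets implement __len__; the command-line flags
# simply mirror the attributes the function reads (train_path, validation_path).
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--train_path", required=True)
    parser.add_argument("--validation_path", required=True)
    args = parser.parse_args()

    # note the return order: validation split first, then training split
    dev_data, train_data = create_datasets(args)
    print("train examples: {}, dev examples: {}".format(len(train_data), len(dev_data)))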
def train():
    if not os.path.isfile(train_data_pickle):
        # training data
        train_features, train_labels = features(['fold0', 'fold1', 'fold2'])
        traindata = TrainData(train_features, train_labels)
        with open(train_data_pickle, mode='wb') as f:
            pickle.dump(traindata, f)
    else:
        print("loading: %s" % (train_data_pickle))
        with open(train_data_pickle, mode='rb') as f:
            traindata = pickle.load(f)
        train_features = traindata.train_inputs
        train_labels = traindata.train_targets

    if not os.path.isfile(test_data_pickle):
        test_features, test_labels = features(['fold3'])
        testdata = TestData(test_features, test_labels)
        with open(test_data_pickle, mode='wb') as f:
            pickle.dump(testdata, f)
    else:
        print("loading: %s" % (test_data_pickle))
        with open(test_data_pickle, mode='rb') as f:
            testdata = pickle.load(f)
        test_features = testdata.test_inputs
        test_labels = testdata.test_targets

    # TODO change to use train and test
    train_labels = one_hot_encode(train_labels)
    test_labels = one_hot_encode(test_labels)

    # random train and test sets.
    train_test_split = np.random.rand(len(train_features)) < 0.70
    train_x = train_features[train_test_split]
    train_y = train_labels[train_test_split]
    test_x = train_features[~train_test_split]
    test_y = train_labels[~train_test_split]

    n_dim = train_features.shape[1]
    print("input dim: %s" % (n_dim))

    # create placeholders
    X = tf.placeholder(tf.float32, [None, n_dim])
    Y = tf.placeholder(tf.float32, [None, FLAGS.num_classes])

    # build graph
    logits = model.inference(X, n_dim)

    weights = tf.all_variables()
    saver = tf.train.Saver(weights)

    # create loss
    loss = model.loss(logits, Y)
    tf.scalar_summary('loss', loss)

    accracy = model.accuracy(logits, Y)
    tf.scalar_summary('test accuracy', accracy)

    # train operation
    train_op = model.train_op(loss)

    # variable initializer
    init = tf.initialize_all_variables()

    # get session
    sess = tf.Session()

    # summary merge and writer
    merged = tf.merge_all_summaries()
    train_writer = tf.train.SummaryWriter(FLAGS.summaries_dir)

    # initialize
    sess.run(init)

    for step in xrange(MAX_STEPS):
        t_pred = sess.run(tf.argmax(logits, 1), feed_dict={X: train_features})
        t_true = sess.run(tf.argmax(train_labels, 1))
        print("train samples pred: %s" % t_pred[:30])
        print("train samples target: %s" % t_true[:30])
        print('Train accuracy: ',
              sess.run(accracy, feed_dict={X: train_x, Y: train_y}))

        for epoch in xrange(training_epochs):
            summary, logits_val, _, loss_val = sess.run(
                [merged, logits, train_op, loss],
                feed_dict={X: train_x, Y: train_y})
            train_writer.add_summary(summary, step)

        print("step:%d, loss: %s" % (step, loss_val))

        y_pred = sess.run(tf.argmax(logits, 1), feed_dict={X: test_x})
        y_true = sess.run(tf.argmax(test_y, 1))
        print("test samples pred: %s" % y_pred[:10])
        print("test samples target: %s" % y_true[:10])

        accracy_val = sess.run([accracy], feed_dict={X: test_x, Y: test_y})
        # print('Test accuracy: ', accracy_val)
        # train_writer.add_summary(accracy_val, step)

        p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average='micro')
        print("F-score: %s" % f)

        if step % 1000 == 0:
            saver.save(sess, FLAGS.ckpt_dir, global_step=step)
def main():
    args = argparser.parse_args()
    log_args(args)

    input_dir = args.input_dir
    output_dir = args.output_dir
    base_model_dir = args.base_model_dir
    image_size = args.image_size
    crop_images = args.crop_images
    augment = args.augment
    use_progressive_image_sizes = args.use_progressive_image_sizes
    progressive_image_size_min = args.progressive_image_size_min
    progressive_image_size_step = args.progressive_image_size_step
    progressive_image_epoch_step = args.progressive_image_epoch_step
    batch_size = args.batch_size
    batch_iterations = args.batch_iterations
    num_workers = args.num_workers
    pin_memory = args.pin_memory
    epochs_to_train = args.epochs
    lr_scheduler_type = args.lr_scheduler
    lr_patience = args.lr_patience
    lr_min = args.lr_min
    lr_max = args.lr_max
    lr_min_decay = args.lr_min_decay
    lr_max_decay = args.lr_max_decay
    optimizer_type = args.optimizer
    loss_type = args.loss
    focal_loss_gamma = args.focal_loss_gamma
    use_class_weights = args.use_class_weights
    use_weighted_sampling = args.use_weighted_sampling
    model_type = args.model
    patience = args.patience
    sgdr_cycle_epochs = args.sgdr_cycle_epochs
    sgdr_cycle_epochs_mult = args.sgdr_cycle_epochs_mult
    sgdr_cycle_end_prolongation = args.sgdr_cycle_end_prolongation
    sgdr_cycle_end_patience = args.sgdr_cycle_end_patience
    max_sgdr_cycles = args.max_sgdr_cycles

    if optimizer_type == "adam":
        lr_scheduler_type = "adam"

    progressive_image_sizes = list(
        range(progressive_image_size_min, image_size + 1, progressive_image_size_step))

    train_data = TrainData(input_dir)

    train_set = TrainDataset(train_data.train_set_df, input_dir, 28, image_size, crop_images, augment)

    balance_weights, balance_class_weights = calculate_balance_weights(
        train_data.df, train_data.train_set_df, 28)
    train_set_sampler = WeightedRandomSampler(balance_weights, len(balance_weights))

    train_set_data_loader = DataLoader(
        train_set,
        batch_size=batch_size,
        shuffle=False if use_weighted_sampling else True,
        sampler=train_set_sampler if use_weighted_sampling else None,
        num_workers=num_workers,
        pin_memory=pin_memory)

    val_set = TrainDataset(train_data.val_set_df, input_dir, 28, image_size, crop_images, False)
    val_set_data_loader = \
        DataLoader(val_set, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=pin_memory)

    if base_model_dir:
        for base_file_path in glob.glob("{}/*.pth".format(base_model_dir)):
            shutil.copyfile(
                base_file_path,
                "{}/{}".format(output_dir, os.path.basename(base_file_path)))
        model = create_model(type=model_type, num_classes=28).to(device)
        model.load_state_dict(
            torch.load("{}/model.pth".format(output_dir), map_location=device))
        optimizer = create_optimizer(optimizer_type, model, lr_max)
        if os.path.isfile("{}/optimizer.pth".format(output_dir)):
            try:
                optimizer.load_state_dict(
                    torch.load("{}/optimizer.pth".format(output_dir)))
                adjust_initial_learning_rate(optimizer, lr_max)
                adjust_learning_rate(optimizer, lr_max)
            except:
                log("Failed to load the optimizer weights")
    else:
        model = create_model(type=model_type, num_classes=28).to(device)
        optimizer = create_optimizer(optimizer_type, model, lr_max)
        torch.save(model.state_dict(), "{}/model.pth".format(output_dir))

    ensemble_model_index = 0
    for model_file_path in glob.glob("{}/model-*.pth".format(output_dir)):
        model_file_name = os.path.basename(model_file_path)
        model_index = int(model_file_name.replace("model-", "").replace(".pth", ""))
        ensemble_model_index = max(ensemble_model_index, model_index + 1)

    epoch_iterations = ceil(len(train_set) / batch_size)

    log("train_set_samples: {}, val_set_samples: {}".format(len(train_set), len(val_set)))
    log()

    global_val_score_best_avg = float("-inf")
    sgdr_cycle_val_score_best_avg = float("-inf")

    lr_scheduler = CosineAnnealingLR(optimizer, T_max=sgdr_cycle_epochs, eta_min=lr_min)

    optim_summary_writer = SummaryWriter(log_dir="{}/logs/optim".format(output_dir))
    train_summary_writer = SummaryWriter(log_dir="{}/logs/train".format(output_dir))
    val_summary_writer = SummaryWriter(log_dir="{}/logs/val".format(output_dir))

    current_sgdr_cycle_epochs = sgdr_cycle_epochs
    sgdr_next_cycle_end_epoch = current_sgdr_cycle_epochs + sgdr_cycle_end_prolongation
    sgdr_iterations = 0
    sgdr_cycle_count = 0
    batch_count = 0
    epoch_of_last_improval = 0

    lr_scheduler_plateau = \
        ReduceLROnPlateau(optimizer, mode="max", min_lr=lr_min, patience=lr_patience, factor=0.5, threshold=1e-4)
    lr_scheduler_step = StepLR(optimizer, step_size=10, gamma=0.1)

    log('{"chart": "best_val_score", "axis": "epoch"}')
    log('{"chart": "val_score", "axis": "epoch"}')
    log('{"chart": "val_loss", "axis": "epoch"}')
    log('{"chart": "sgdr_cycle", "axis": "epoch"}')
    log('{"chart": "score", "axis": "epoch"}')
    log('{"chart": "loss", "axis": "epoch"}')
    log('{"chart": "lr_scaled", "axis": "epoch"}')
    log('{"chart": "mem_used", "axis": "epoch"}')
    log('{"chart": "epoch_time", "axis": "epoch"}')

    train_start_time = time.time()

    loss_weight = CLASS_WEIGHTS_TENSOR if use_class_weights else None
    criterion = create_criterion(loss_type, loss_weight, focal_loss_gamma)

    for epoch in range(epochs_to_train):
        epoch_start_time = time.time()

        log("memory used: {:.2f} GB".format(psutil.virtual_memory().used / 2**30))

        if use_progressive_image_sizes:
            next_image_size = \
                progressive_image_sizes[min(epoch // progressive_image_epoch_step, len(progressive_image_sizes) - 1)]
            if train_set.image_size != next_image_size:
                log("changing image size to {}".format(next_image_size))
                train_set.image_size = next_image_size
                val_set.image_size = next_image_size

        model.train()

        train_loss_sum_t = zero_item_tensor()
        epoch_batch_iter_count = 0

        if lr_scheduler_type == "lr_finder":
            new_lr = lr_max * 0.5**(sgdr_cycle_epochs - min(
                sgdr_cycle_epochs, sgdr_iterations / epoch_iterations))
            adjust_learning_rate(optimizer, new_lr)

        all_predictions = []
        all_targets = []
        for b, batch in enumerate(train_set_data_loader):
            images, categories = \
                batch[0].to(device, non_blocking=True), \
                batch[1].to(device, non_blocking=True)

            if lr_scheduler_type == "cosine_annealing":
                lr_scheduler.step(
                    epoch=min(current_sgdr_cycle_epochs, sgdr_iterations / epoch_iterations))

            if b % batch_iterations == 0:
                optimizer.zero_grad()

            prediction_logits = model(images)
            criterion.weight = CLASS_WEIGHTS_TENSOR
            loss = criterion(prediction_logits, categories)
            loss.backward()

            with torch.no_grad():
                train_loss_sum_t += loss
                all_predictions.extend(torch.sigmoid(prediction_logits).cpu().data.numpy())
                all_targets.extend(categories.cpu().data.numpy())

            if (b + 1) % batch_iterations == 0 or (b + 1) == len(train_set_data_loader):
                optimizer.step()

            sgdr_iterations += 1
            batch_count += 1
            epoch_batch_iter_count += 1

            optim_summary_writer.add_scalar("lr", get_learning_rate(optimizer), batch_count + 1)

        train_loss_avg = train_loss_sum_t.item() / epoch_batch_iter_count
        train_score_avg = f1_score_from_probs(torch.tensor(all_predictions), torch.tensor(all_targets))

        val_loss_avg, val_score_avg = evaluate(model, val_set_data_loader, criterion)

        if lr_scheduler_type == "reduce_on_plateau":
            lr_scheduler_plateau.step(val_score_avg)
        elif lr_scheduler_type == "step":
            lr_scheduler_step.step(epoch)

        model_improved_within_sgdr_cycle = check_model_improved(sgdr_cycle_val_score_best_avg, val_score_avg)
        if model_improved_within_sgdr_cycle:
            torch.save(model.state_dict(), "{}/model-{}.pth".format(output_dir, ensemble_model_index))
            sgdr_cycle_val_score_best_avg = val_score_avg

        model_improved = check_model_improved(global_val_score_best_avg, val_score_avg)
        ckpt_saved = False
        if model_improved:
            torch.save(model.state_dict(), "{}/model.pth".format(output_dir))
            torch.save(optimizer.state_dict(), "{}/optimizer.pth".format(output_dir))
            np.save("{}/train_predictions.npy".format(output_dir), all_predictions)
            np.save("{}/train_targets.npy".format(output_dir), all_targets)
            global_val_score_best_avg = val_score_avg
            epoch_of_last_improval = epoch
            ckpt_saved = True

        sgdr_reset = False
        if (lr_scheduler_type == "cosine_annealing") \
                and (epoch + 1 >= sgdr_next_cycle_end_epoch) \
                and (epoch - epoch_of_last_improval >= sgdr_cycle_end_patience):
            sgdr_iterations = 0
            current_sgdr_cycle_epochs = int(current_sgdr_cycle_epochs * sgdr_cycle_epochs_mult)
            sgdr_next_cycle_end_epoch = epoch + 1 + current_sgdr_cycle_epochs + sgdr_cycle_end_prolongation

            ensemble_model_index += 1
            sgdr_cycle_val_score_best_avg = float("-inf")
            sgdr_cycle_count += 1
            sgdr_reset = True

            new_lr_min = lr_min * (lr_min_decay**sgdr_cycle_count)
            new_lr_max = lr_max * (lr_max_decay**sgdr_cycle_count)
            new_lr_max = max(new_lr_max, new_lr_min)

            adjust_learning_rate(optimizer, new_lr_max)
            lr_scheduler = CosineAnnealingLR(optimizer, T_max=current_sgdr_cycle_epochs, eta_min=new_lr_min)

        optim_summary_writer.add_scalar("sgdr_cycle", sgdr_cycle_count, epoch + 1)

        train_summary_writer.add_scalar("loss", train_loss_avg, epoch + 1)
        train_summary_writer.add_scalar("score", train_score_avg, epoch + 1)
        val_summary_writer.add_scalar("loss", val_loss_avg, epoch + 1)
        val_summary_writer.add_scalar("score", val_score_avg, epoch + 1)

        epoch_end_time = time.time()
        epoch_duration_time = epoch_end_time - epoch_start_time

        log("[%03d/%03d] %ds, lr: %.6f, loss: %.4f, val_loss: %.4f, score: %.4f, val_score: %.4f, ckpt: %d, rst: %d" % (
            epoch + 1,
            epochs_to_train,
            epoch_duration_time,
            get_learning_rate(optimizer),
            train_loss_avg,
            val_loss_avg,
            train_score_avg,
            val_score_avg,
            int(ckpt_saved),
            int(sgdr_reset)))

        log('{"chart": "best_val_score", "x": %d, "y": %.4f}' % (epoch + 1, global_val_score_best_avg))
        log('{"chart": "val_loss", "x": %d, "y": %.4f}' % (epoch + 1, val_loss_avg))
        log('{"chart": "val_score", "x": %d, "y": %.4f}' % (epoch + 1, val_score_avg))
        log('{"chart": "sgdr_cycle", "x": %d, "y": %d}' % (epoch + 1, sgdr_cycle_count))
        log('{"chart": "loss", "x": %d, "y": %.4f}' % (epoch + 1, train_loss_avg))
        log('{"chart": "score", "x": %d, "y": %.4f}' % (epoch + 1, train_score_avg))
        log('{"chart": "lr_scaled", "x": %d, "y": %.4f}' % (epoch + 1, 1000 * get_learning_rate(optimizer)))
        log('{"chart": "mem_used", "x": %d, "y": %.2f}' % (epoch + 1, psutil.virtual_memory().used / 2**30))
        log('{"chart": "epoch_time", "x": %d, "y": %d}' % (epoch + 1, epoch_duration_time))

        if (sgdr_reset or lr_scheduler_type in ("reduce_on_plateau", "step")) \
                and epoch - epoch_of_last_improval >= patience:
            log("early abort due to lack of improvement")
            break

        if max_sgdr_cycles is not None and sgdr_cycle_count >= max_sgdr_cycles:
            log("early abort due to maximum number of sgdr cycles reached")
            break

    optim_summary_writer.close()
    train_summary_writer.close()
    val_summary_writer.close()

    train_end_time = time.time()
    log()
    log("Train time: %s" % str(datetime.timedelta(seconds=train_end_time - train_start_time)))

    model.load_state_dict(torch.load("{}/model.pth".format(output_dir), map_location=device))

    val_predictions, val_targets = predict(model, val_set_data_loader)
    np.save("{}/val_predictions.npy".format(output_dir), val_predictions)
    np.save("{}/val_targets.npy".format(output_dir), val_targets)

    best_threshold, best_threshold_score, all_threshold_scores = calculate_best_threshold(
        val_predictions, val_targets)
    log("All threshold scores: {}".format(all_threshold_scores))
    log("Best threshold / score: {} / {}".format(best_threshold, best_threshold_score))

    test_data = TestData(input_dir)
    test_set = TestDataset(test_data.test_set_df, input_dir, image_size, crop_images)
    test_set_data_loader = \
        DataLoader(test_set, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=pin_memory)

    test_predictions, _ = predict(model, test_set_data_loader)
    np.save("{}/test_predictions.npy".format(output_dir), test_predictions)

    predicted_categories = calculate_categories_from_predictions(test_predictions, threshold=best_threshold)

    submission_df = test_data.test_set_df.copy()
    submission_df["Predicted"] = [" ".join(map(str, pc)) for pc in predicted_categories]
    submission_df.to_csv("{}/submission.csv".format(output_dir))
def main(args):
    """Main function for training the saliency model (SRM)."""
    # print(args)  # uncomment to test arg inputs
    bsize = args.batch_size
    train_dir = args.train_dir
    test_dir = args.test_dir
    model_dir = args.ckpt_dir
    tensorboard_dir = args.tensorboard_dir
    device = args.device

    if not os.path.exists(model_dir):
        os.mkdir(model_dir)

    train_loader = torch.utils.data.DataLoader(TrainData(train_dir, transform=True),
                                               batch_size=bsize,
                                               shuffle=True,
                                               num_workers=4,
                                               pin_memory=True)
    val_loader = torch.utils.data.DataLoader(TrainData(test_dir, transform=True),
                                             batch_size=bsize,
                                             shuffle=True,
                                             num_workers=4,
                                             pin_memory=True)

    model = SRM()
    if device == 'gpu':
        model.cuda()

    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    train_loss = []
    evaluation = []
    result = {'epoch': [], 'F_measure': [], 'MAE': []}

    progress = tqdm(range(0, args.epochs + 1),
                    miniters=1,
                    ncols=100,
                    desc='Overall Progress',
                    leave=True,
                    position=0)
    offset = 1
    best = 0

    writer = SummaryWriter(tensorboard_dir)

    for epoch in progress:
        if epoch != 0:
            print("load parameters")
            model.load_state_dict(torch.load(model_dir + 'current_network.pth'))
            optimizer.load_state_dict(torch.load(model_dir + 'current_optimizer.pth'))

        title = 'Training Epoch {}'.format(epoch)
        progress_epoch = tqdm(train_loader,
                              ncols=120,
                              total=len(train_loader),
                              smoothing=0.9,
                              miniters=1,
                              leave=True,
                              position=offset,
                              desc=title)

        for ib, (img, gt) in enumerate(progress_epoch):
            # inputs = Variable(img).cuda()           # GPU version
            # gt = Variable(gt.unsqueeze(1)).cuda()   # GPU version
            inputs = Variable(img)          # CPU version
            gt = Variable(gt.unsqueeze(1))  # CPU version

            output1, output2 = model.forward(inputs)
            output1 = get_pred(output1)
            output2 = get_pred(output2)
            loss = criterion(output1, gt) + criterion(output2, gt)

            model.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss.append(round(float(loss.data.cpu()), 3))
            title = '{} Epoch {}/{}'.format('Training', epoch, args.epochs)
            progress_epoch.set_description(title + ' ' + 'loss:' + str(loss.data.cpu().numpy()))

        writer.add_scalar('Train/Loss', loss.data.cpu(), epoch)

        filename = model_dir + 'current_network.pth'
        filename_opti = model_dir + 'current_optimizer.pth'
        torch.save(model.state_dict(), filename)            # save current model params
        torch.save(optimizer.state_dict(), filename_opti)   # save current optimizer params

        if epoch % args.val_rate == 0:
            # start validation
            params = model_dir + 'current_network.pth'
            model.load_state_dict(torch.load(params))
            pred_list = []
            gt_list = []
            for img, gt in val_loader:
                # inputs = Variable(img).cuda()  # GPU version
                inputs = Variable(img)           # CPU version
                _, output = model.forward(inputs)
                output = get_pred(output)
                out = output.data.cpu().numpy()
                pred_list.extend(out)
                gt = gt.numpy()
                gt_list.extend(gt)

            pred_list = np.array(pred_list)
            pred_list = np.squeeze(pred_list)
            gt_list = np.array(gt_list)
            F_measure = get_f_measure(pred_list, gt_list)
            mae = get_mae(pred_list, gt_list)
            evaluation.append([int(epoch), float(F_measure), float(mae)])
            result['epoch'].append(int(epoch))
            result['F_measure'].append(round(float(F_measure), 3))
            result['MAE'].append(round(float(mae), 3))
            df = pd.DataFrame(result).set_index('epoch')
            df.to_csv('./eval.csv')

            if epoch == 0:
                best = F_measure - mae
            elif F_measure - mae > best:
                # save model with best performance
                best = F_measure - mae
                filename = ('%s/best_network.pth' % model_dir)
                filename_opti = ('%s/best_optimizer.pth' % model_dir)
                torch.save(model.state_dict(), filename)
                torch.save(optimizer.state_dict(), filename_opti)
def main():
    batch_size = 16

    generator = Generator().cuda()
    discriminator = Discriminator(96, 96).cuda()

    optimizer_G = optim.Adam(generator.parameters(), lr=1e-4)
    optimizer_D = optim.Adam(discriminator.parameters(), lr=1e-4)

    # dataset = FaceData('train')
    dataset = TrainData()
    data_loader = DataLoader(dataset, batch_size, shuffle=True, num_workers=0, pin_memory=True, drop_last=True)

    MSE = nn.MSELoss()
    BCE = nn.BCELoss()

    # content loss, perceptual loss: VGG feature maps (i, j == 5, 4)
    vgg_net = vgg19(pretrained=True).features[:36].cuda()
    vgg_net.eval()
    for param in vgg_net.parameters():
        param.requires_grad = False

    discriminator.train()
    generator.train()
    optimizer_G.zero_grad()
    optimizer_D.zero_grad()

    print("Start Training")
    current_epoch = 0
    for epoch in range(current_epoch, 100):
        for step, (img_Input, img_GT) in tqdm(enumerate(data_loader)):
            img_GT = img_GT.cuda()
            img_Input = img_Input.cuda()

            # # Discriminator update
            # img_SR = generator(img_Input)
            # fake = discriminator(img_SR)
            # real = discriminator(img_GT)
            # loss_Dfake = 0.001 * BCE(fake, torch.zeros(batch_size, 1).cuda())
            # loss_Dreal = 0.001 * BCE(real, torch.ones(batch_size, 1).cuda())
            # loss_D = 0.001 * (loss_Dfake + loss_Dreal)
            #
            # if epoch > 0:
            #     discriminator.zero_grad()
            #     loss_D.backward(retain_graph=True)
            #     optimizer_D.step()
            #
            # # Generator update
            # img_SR = generator(img_Input)
            # loss_content = MSE(img_SR, img_GT)
            # loss_vgg = 0.006 * MSE(vgg_net(img_SR), vgg_net(img_GT))
            # fake = discriminator(img_SR)
            # loss_Dfake = 0.001 * BCE(fake, torch.zeros(batch_size, 1).cuda())
            # loss_G = loss_content + loss_vgg + loss_Dfake
            # generator.zero_grad()
            # loss_G.backward()
            # # loss_Dfake.backward()
            # optimizer_G.step()

            if epoch < 10:
                # SRResNet initialization: generator update with content loss only
                generator.zero_grad()
                img_SR = generator(img_Input)
                loss_content = MSE(img_SR, img_GT)
                loss_content.backward()
                optimizer_G.step()
                if step % 100 == 0:
                    print()
                    print("Loss_content : {}".format(loss_content.item()))
                continue

            # Discriminator update
            discriminator.zero_grad()

            D_real = discriminator(img_GT)
            loss_Dreal = 0.1 * BCE(D_real, torch.ones(batch_size, 1).cuda())
            loss_Dreal.backward()
            D_x = D_real.mean().item()

            img_SR = generator(img_Input)
            D_fake = discriminator(img_SR.detach())
            loss_Dfake = 0.1 * BCE(D_fake, torch.zeros(batch_size, 1).cuda())
            loss_Dfake.backward()
            DG_z = D_fake.mean().item()

            loss_D = (loss_Dfake + loss_Dreal)
            optimizer_D.step()

            # Generator update
            generator.zero_grad()
            loss_content = MSE(img_SR, img_GT)
            loss_vgg = MSE(vgg_net(img_SR), vgg_net(img_GT))
            # img_SR = generator(img_Input)
            G_fake = discriminator(img_SR)
            # adversarial loss for the generator uses real labels (ones), so the
            # generator is pushed to make the discriminator classify SR images as real
            loss_Gfake = BCE(G_fake, torch.ones(batch_size, 1).cuda())
            loss_G = loss_content + 0.006 * loss_vgg + 0.001 * loss_Gfake
            loss_G.backward()
            # loss_Dfake.backward()
            optimizer_G.step()

            if step % 100 == 0:
                print()
                print("fake out : {}".format(DG_z))
                print("real out : {}".format(D_x))
                print("Loss_Dfake : {}".format(loss_Dfake.item()))
                print("Loss_Dreal : {}".format(loss_Dreal.item()))
                print("Loss_D : {}".format(loss_D.item()))
                print("Loss_content : {}".format(loss_content.item()))
                print("Loss_vgg : {}".format(0.006 * loss_vgg.item()))
                print("Loss_Gfake : {}".format(0.001 * loss_Gfake.item()))
                print("Loss_G : {}".format(loss_G.item()))
                print("Loss_Total : {}".format((loss_G + loss_D).item()))
                # print("Loss_D : {:.4f}".format(loss_D.item()))
                # print("Loss : {:.4f}".format(loss_total.item()))

        with torch.no_grad():
            generator.eval()
            save_image(denorm(img_SR[0].cpu()), "./Result/{0}_SR.png".format(epoch))
            save_image(denorm(img_GT[0].cpu()), "./Result/{0}_GT.png".format(epoch))
            generator.train()