def flickr_objective_50(param):
    """Hyperopt objective: fit a 50-factor KBPR model on the Flickr data.

    The data returned by ``flickr()`` is split by ``preprocess`` using
    ``portion=[8, 1, 1, 0]``; the model is trained through ``early_stop``
    with the negated first element of ``recall`` on the validation split
    as the value being tracked.

    Args:
        param: Extra keyword arguments forwarded verbatim to ``KBPRModel``
            (the hyper-parameters being searched).

    Returns:
        A dict in the shape hyperopt expects: ``"loss"`` holds the best
        metric reported by ``early_stop``, ``"attachments"["model"]`` holds
        the serialized best model as a byte string, and ``"status"`` is
        ``STATUS_OK``.
    """
    # Imports stay local, in their original order, so the module can be
    # loaded without pulling in the heavy dependencies.
    from model import KBPRModel
    from utils import early_stop, flickr, preprocess
    import gc
    import theano.misc.pkl_utils
    from hyperopt import STATUS_OK
    # io.BytesIO exists on both Python 2.6+ and Python 3, whereas the
    # previously used cStringIO module is Python 2 only.
    import io

    # The test split and exclusion dict are produced by preprocess but are
    # not needed for the objective itself.
    n_items, n_users, train_dict, valid_dict, _test_dict, _exclude_dict = \
        preprocess(flickr(), portion=[8, 1, 1, 0])

    flickr_50 = KBPRModel(50, n_users, n_items,
                          per_user_sample=50,
                          learning_rate=0.1,
                          variance_mu=1,
                          update_mu=True,
                          lambda_variance=1,
                          use_warp=True,
                          **param)

    # Recall is negated so that a better model yields a smaller value.
    best_metric, best_model = early_stop(
        flickr_50, train_dict,
        lambda m: -m.recall(valid_dict, train_dict, n_users=3000)[0],
        n_epochs=10000, patience=500, validation_frequency=100)

    # Serialize the best model into an in-memory binary buffer.
    output = io.BytesIO()
    theano.misc.pkl_utils.dump(best_model, output)

    # Drop the local reference and force a collection before returning
    # (presumably to limit memory growth across trials).
    del flickr_50
    gc.collect()

    return {"loss": best_metric,
            "attachments": {"model": output.getvalue()},
            "status": STATUS_OK}
def main():
    """Train the selected network and log per-epoch metrics.

    Reads the run configuration from ``parse_args()``, builds the train and
    test loaders, instantiates the model named by ``args.model``, then
    alternates ``train.train`` and ``test.test`` for up to ``args.epoch``
    epochs.  Training stops early once the counter returned by
    ``utils.early_stop`` drops to zero or below.  The model and the metric
    histories are handed to ``utils.save_log`` at the end.

    Raises:
        ValueError: If ``args.model`` is not one of the supported names.
    """
    args = parse_args()

    train_dataset, test_dataset = dataset.get_dataset(
        args.path, args.use_augmentation, args.use_fivecrop)
    train_loader = DataLoader(train_dataset, args.batch, True,
                              num_workers=args.worker, pin_memory=True)
    test_loader = DataLoader(test_dataset, args.batch, False,
                             num_workers=args.worker, pin_memory=True)

    if args.cuda:
        torch.cuda.set_device(0)
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    if args.model == 'ResNet18':
        mymodel = model.ResNet18(args.frozen_layers).to(device)
    elif args.model == 'ResNet34':
        mymodel = model.ResNet34(args.frozen_layers).to(device)
    elif args.model == 'ResNet50':
        mymodel = model.ResNet50(args.frozen_layers).to(device)
    elif args.model == 'DenseNet':
        mymodel = model.DenseNet().to(device)
    else:
        # Previously this branch was a silent `pass`, which left `mymodel`
        # unbound and surfaced as a NameError at the optimizer line below.
        raise ValueError(
            "Unsupported model %r; expected one of 'ResNet18', 'ResNet34', "
            "'ResNet50', 'DenseNet'" % (args.model,))

    op = optim.Adam(mymodel.parameters(), lr=args.lr)

    train_losses, test_mF1s, test_precisions, test_recalls = [], [], [], []
    early = args.early
    for i in range(args.epoch):
        train_loss = train.train(mymodel, op, train_loader, i, device,
                                 args.log, utils.pos_weight)
        # test.test returns (mF1, recall, precision) in that order.
        mF1, recall, precision = test.test(mymodel, test_loader, device,
                                           args.use_fivecrop)

        train_losses.append(train_loss)
        test_mF1s.append(mF1)
        test_precisions.append(precision)
        test_recalls.append(recall)

        # utils.early_stop returns the updated counter; stop once it is
        # exhausted.
        early = utils.early_stop(test_mF1s, early)
        if early <= 0:
            break

    utils.save_log(mymodel, train_losses, test_mF1s, test_precisions,
                   test_recalls)
else: metrics_te = defaultdict(float) metrics_tr = {'loss': loss} metrics_all = (metrics, metrics_te, metrics_tr) for name in metrics_all[0].keys(): metrics_hist[name].append(metrics_all[0][name]) for name in metrics_all[1].keys(): metrics_hist_te[name].append(metrics_all[1][name]) for name in metrics_all[2].keys(): metrics_hist_tr[name].append(metrics_all[2][name]) metrics_hist_all = (metrics_hist, metrics_hist_te, metrics_hist_tr) save_everything(args, metrics_hist_all, model, model_dir, None, args.criterion, test_only) sys.stdout.flush() if test_only: break if args.criterion in metrics_hist.keys(): if early_stop(metrics_hist, args.criterion, args.patience): #stop training, do tests on test and train sets, and then stop the script print("%s hasn't improved in %d epochs, early stopping..." % (args.criterion, args.patience)) test_only = True args.test_model = '%s/model_best_%s.pth' % (model_dir, args.criterion) model = pick_model(args, dicts)
def train_f(self, x, y, validation_split=0.2, batch_size=0.2, epochs=1, verbose=1):
    """Train on ``(x, y)`` in mini-batches and return the metric history.

    The first ``1 - validation_split`` share of the samples is used for
    training and the remainder for validation (no shuffling is done here).
    Each epoch runs ``self._train_op_f`` batch by batch through
    ``self._sess.run``, averages the per-batch loss and MAE, then evaluates
    loss and MAE once on the validation slice.

    Args:
        x: Sliceable inputs fed to ``self._xe``.
        y: Sliceable targets fed to ``self._yf``; ``len(y)`` defines the
            number of samples.
        validation_split: Fraction of samples held out for validation.
        batch_size: Either a sample count (``>= 1``) or a fraction in
            ``(0, 1)`` of the training split.  The fractional form is what
            the default ``0.2`` uses; a float cannot be used as a slice
            bound, so it is converted to a whole number of samples.
        epochs: Maximum number of epochs.
        verbose: When greater than zero, print one summary line per epoch.

    Returns:
        A dict of lists keyed by ``loss``, ``mae``, ``rmse``, ``val_loss``,
        ``val_mae`` and ``val_rmse``.  Only the loss and MAE entries are
        filled by this method; the two RMSE lists stay empty.

    Raises:
        ValueError: If training is requested but the training split is
            empty.
    """
    n_train = int(len(y) * (1 - validation_split))
    xe_train = x[:n_train]
    yf_train = y[:n_train]
    xe_val = x[n_train:]
    yf_val = y[n_train:]

    if epochs > 0 and n_train == 0:
        # Without this guard the epoch average divides by zero batches.
        raise ValueError('train_f: training split is empty')

    # Slice bounds must be integers: turn a fractional batch size into a
    # sample count (at least one sample per batch).
    if 0 < batch_size < 1:
        batch_size = max(1, int(n_train * batch_size))
    else:
        batch_size = int(batch_size)

    # Ceiling division: a trailing partial batch still counts as a batch.
    n_batches = -(-n_train // batch_size)

    history = {
        'loss': [],
        'mae': [],
        'rmse': [],
        'val_loss': [],
        'val_mae': [],
        'val_rmse': []
    }

    for e in range(epochs):
        start_epoch_time = time.time()
        loss = 0.0
        mae = 0.0

        for b in range(n_batches):
            start = b * batch_size
            stop = start + batch_size
            input_feed = {
                self._xe: xe_train[start:stop],
                self._yf: yf_train[start:stop]
            }
            output_feed = [self._loss_f, self._mae_f, self._train_op_f]
            l, m, _ = self._sess.run(output_feed, input_feed)
            loss += l
            mae += m

        loss /= n_batches
        mae /= n_batches
        history['loss'].append(loss)
        history['mae'].append(mae)

        # Validation pass: metrics only, no training op.
        val_loss, val_mae = self._sess.run(
            [self._loss_f, self._mae_f],
            {self._xe: xe_val, self._yf: yf_val})
        history['val_loss'].append(val_loss)
        history['val_mae'].append(val_mae)

        epoch_time = time.time() - start_epoch_time
        if verbose > 0:
            print(
                "Epoch {}/{}: time={:.2f}s, loss={:.5f}, mae={:.5f}, val_loss={:.5f}, val_mae={:.5f}".format(
                    e + 1, epochs, epoch_time, loss, mae, val_loss, val_mae))

        if utils.early_stop(history['val_loss'], e, patience=self._patience):
            print('Early stop at epoch', (e + 1))
            break

        # A NaN training loss cannot recover; abandon the run.
        if np.isnan(loss):
            break

    return history
def main():
    """Run the training/evaluation grid search described by ``CONFIG``.

    Sets up the process environment and random seeds, loads the datasets
    and loaders, then iterates over every combination of learning rate,
    weight decay, message dropout and node dropout listed in ``CONFIG``.
    For each combination a model is built and trained for up to
    ``CONFIG['epochs']`` epochs, evaluated every ``CONFIG['test_interval']``
    epochs, and stopped early on overfitting or when the early-stop counter
    runs out.  A run that fails with ``RuntimeError`` is retried up to
    ``CONFIG['retry']`` additional times.

    Raises:
        ValueError: If ``CONFIG['model']`` names an unsupported model.
    """
    # set env
    setproctitle.setproctitle(f"train{CONFIG['name']}")
    os.environ["CUDA_VISIBLE_DEVICES"] = CONFIG['gpu_id']
    device = torch.device('cuda')

    # fix seed
    seed = 123
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

    # load data
    bundle_train_data, bundle_test_data, item_data, assist_data = \
        dataset.get_dataset(CONFIG['path'], CONFIG['dataset_name'],
                            task=CONFIG['task'])
    train_loader = DataLoader(bundle_train_data, 2048, True,
                              num_workers=8, pin_memory=True)
    test_loader = DataLoader(bundle_test_data, 4096, False,
                             num_workers=16, pin_memory=True)

    # pretrain
    # NOTE(review): the loaded weights are never used below -- the model is
    # always built with pretrain=None.  Confirm whether that is intended.
    if 'pretrain' in CONFIG:
        pretrain = torch.load(CONFIG['pretrain'], map_location='cpu')
        print('load pretrain')

    # graph
    ub_graph = bundle_train_data.ground_truth_u_b
    ui_graph = item_data.ground_truth_u_i
    bi_graph = assist_data.ground_truth_b_i

    # metric
    metrics = [
        Recall(20), NDCG(20),
        Recall(40), NDCG(40),
        Recall(80), NDCG(80),
    ]
    # NOTE(review): no metric above uses a cutoff of 10 -- confirm that the
    # logger really tracks a 'Recall@10' entry.
    TARGET = 'Recall@10'

    # loss
    loss_func = loss.BPRLoss('mean')

    # log
    log = logger.Logger(
        os.path.join(CONFIG['log'], CONFIG['dataset_name'],
                     f"{CONFIG['model']}_{CONFIG['task']}", TAG),
        'best', checkpoint_target=TARGET)

    time_path = time.strftime("%y%m%d-%H%M%S", time.localtime(time.time()))

    for lr, decay, message_dropout, node_dropout \
            in product(CONFIG['lrs'], CONFIG['decays'],
                       CONFIG['message_dropouts'], CONFIG['node_dropouts']):

        visual_path = os.path.join(
            CONFIG['visual'],
            CONFIG['dataset_name'],
            f"{CONFIG['model']}_{CONFIG['task']}",
            f"{time_path}@{CONFIG['note']}",
            f"lr{lr}_decay{decay}_medr{message_dropout}_nodr{node_dropout}")

        # model
        if CONFIG['model'] == 'BGCN':
            graph = [ub_graph, ui_graph, bi_graph]
            info = BGCN_Info(64, decay, message_dropout, node_dropout, 1)
            model = BGCN(info, assist_data, graph, device,
                         pretrain=None).to(device)
        else:
            # Previously an unknown name fell through and failed later with
            # a NameError on `model`.
            raise ValueError(f"Unsupported model: {CONFIG['model']!r}")

        assert model.__class__.__name__ == CONFIG['model']

        # op
        op = optim.Adam(model.parameters(), lr=lr)

        # env
        env = {
            'lr': lr,
            'op': str(op).split(' ')[0],  # Adam
            'dataset': CONFIG['dataset_name'],
            'model': CONFIG['model'],
            'sample': CONFIG['sample'],
        }

        # continue train
        if CONFIG['sample'] == 'hard' and 'conti_train' in CONFIG:
            model.load_state_dict(torch.load(CONFIG['conti_train']))
            print('load model and continue to train')

        retry = CONFIG['retry']  # =1
        while retry >= 0:
            # log
            log.update_modelinfo(info, env, metrics)
            try:
                # train & test
                early = CONFIG['early']
                train_writer = SummaryWriter(log_dir=visual_path,
                                             comment='train')
                test_writer = SummaryWriter(log_dir=visual_path,
                                            comment='test')
                try:
                    for epoch in range(CONFIG['epochs']):
                        # train
                        trainloss = train(model, epoch + 1, train_loader, op,
                                          device, CONFIG, loss_func)
                        train_writer.add_scalars(
                            'loss/single', {"loss": trainloss}, epoch)

                        # test
                        if epoch % CONFIG['test_interval'] == 0:
                            output_metrics = test(model, epoch + 1,
                                                  test_loader, device,
                                                  CONFIG, metrics)

                            for metric in output_metrics:
                                test_writer.add_scalars(
                                    'metric/all',
                                    {metric.get_title(): metric.metric},
                                    epoch)
                                # The first metric also gets its own chart.
                                if metric == output_metrics[0]:
                                    test_writer.add_scalars(
                                        'metric/single',
                                        {metric.get_title(): metric.metric},
                                        epoch)

                            # log
                            log.update_log(metrics, model)

                            # check overfitting
                            if epoch > 10:
                                if check_overfitting(log.metrics_log, TARGET,
                                                     1, show=False):
                                    break

                            # early stop
                            early = early_stop(log.metrics_log[TARGET],
                                               early, threshold=0)
                            if early <= 0:
                                break
                finally:
                    # Close the writers on failure as well, so a retry does
                    # not leave the previous attempt's writers open.
                    train_writer.close()
                    test_writer.close()

                log.close_log(TARGET)
                # Success: leave the retry loop.
                retry = -1
            except RuntimeError as err:
                retry -= 1
                # Report the failure instead of swallowing it silently.
                print(f"RuntimeError during training ({err}); "
                      f"retries left: {max(retry + 1, 0)}")

    log.close()