import os
import random
import time
from concurrent.futures import ProcessPoolExecutor

import numpy as np
import torch

# Project-level helpers (parse_arguments, WorkerInitObj, setup_training,
# prepare_test_data, prepare_model_and_optimizer, ml_1mTrainDataLoader,
# subsetDataloader, evaluate) are assumed importable from the surrounding repo.


# Training entry point for the ml-1m NCF-style model
# (ranking evaluation with HR@K / NDCG@K).
def main():
    args = parse_arguments()

    # Seed all RNG sources for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    worker_init = WorkerInitObj(args.seed)

    device, args = setup_training(args)
    test_data = prepare_test_data(args)
    model, optimizer, criterion = prepare_model_and_optimizer(args, device)

    # Single-worker executor that builds the next epoch's dataloader in the
    # background while the current epoch trains.
    pool = ProcessPoolExecutor(1)
    train_iter = ml_1mTrainDataLoader(path=args.train_path,
                                      num_negs=args.num_negs,
                                      batch_size=args.train_batch_size,
                                      seed=args.seed,
                                      worker_init=worker_init)

    # Dump the run configuration and model architecture.
    print('-' * 50 + 'args' + '-' * 50)
    for k, v in vars(args).items():
        print('{0}: {1}'.format(k, v))
    print('-' * 30)
    print(model)
    print('-' * 50 + 'args' + '-' * 50)

    global_step = 0
    global_HR = 0.0
    global_NDCG = 0.0
    s_time_train = time.time()

    for epoch in range(args.epoch):
        # Kick off construction of the next epoch's dataloader.
        dataset_future = pool.submit(ml_1mTrainDataLoader, args.train_path,
                                     args.num_negs, args.train_batch_size,
                                     args.seed, worker_init)
        for step, batch in enumerate(train_iter):
            model.train()
            batch = [t.to(device) for t in batch]
            users, items, labels = batch

            logits = model(users, items)
            loss = criterion(logits, labels.float())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Evaluate periodically and checkpoint on improvement.
            if global_step != 0 and global_step % args.eval_freq == 0:
                s_time_eval = time.time()
                model.eval()
                hits, ndcgs = evaluate(model, test_data, device, args.topk)
                e_time_eval = time.time()
                print('-' * 68)
                print('Epoch:[{0}] Step:[{1}] HR:[{2}] NDCG:[{3}] time:[{4}s]'.format(
                    epoch, global_step, format(hits, '.4f'),
                    format(ndcgs, '.4f'),
                    format(e_time_eval - s_time_eval, '.4f')))
                # Save a checkpoint only when both metrics improve.
                if hits > global_HR and ndcgs > global_NDCG:
                    model_to_save = model.module if hasattr(model, 'module') else model
                    output_save_file = os.path.join(
                        args.output_dir,
                        '{}_hr_{}_ndcg_{}_step_{}_ckpt.pt'.format(
                            args.model_name, format(hits, '.4f'),
                            format(ndcgs, '.4f'), global_step))
                    if os.path.exists(output_save_file):
                        os.remove(output_save_file)
                    torch.save({'model': model_to_save.state_dict(),
                                'name': args.model_name}, output_save_file)
                    print('Epoch:[{0}] Step:[{1}] SavePath:[{2}]'.format(
                        epoch, global_step, output_save_file))
                    global_HR = hits
                    global_NDCG = ndcgs
                print('-' * 68)

            # Log training progress periodically.
            if global_step != 0 and global_step % args.log_freq == 0:
                e_time_train = time.time()
                print('Epoch:[{0}] Step:[{1}] Loss:[{2}] Lr:[{3}] time:[{4}s]'.format(
                    epoch, global_step, format(loss.item(), '.4f'),
                    format(optimizer.param_groups[0]['lr'], '.6'),
                    format(e_time_train - s_time_train, '.4f')))
                s_time_train = time.time()

            global_step += 1

        # Swap in the dataloader that was built in the background.
        del train_iter
        train_iter = dataset_future.result(timeout=None)
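import math


# The HR/NDCG `evaluate` helper above is defined elsewhere in the repo. Below
# is a minimal sketch of what it is assumed to compute, following the standard
# NCF leave-one-out protocol: for each user, one held-out positive item is
# ranked against sampled negatives, and index 0 of each candidate list is
# assumed to be the positive. Function name, batch layout, and score shape are
# illustrative assumptions, not the repo's actual implementation.
def evaluate_sketch(model, test_data, device, topk):
    hits, ndcgs = [], []
    with torch.no_grad():
        for users, items in test_data:
            users = users.to(device)
            items = items.to(device)
            scores = model(users, items).view(-1)  # one score per candidate
            _, top_idx = torch.topk(scores, topk)  # indices of top-k candidates
            top_idx = top_idx.tolist()
            if 0 in top_idx:
                # Positive item was retrieved: HR = 1, NDCG discounts by rank.
                rank = top_idx.index(0)
                hits.append(1.0)
                ndcgs.append(math.log(2) / math.log(rank + 2))
            else:
                hits.append(0.0)
                ndcgs.append(0.0)
    return np.mean(hits), np.mean(ndcgs)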
# Training entry point for the feature-dict (CTR-style) model, evaluated with
# AUC on a validation split. Defined in a separate script from the loop above.
def main():
    args = parse_arguments()

    # Seed every RNG source and force deterministic cuDNN kernels.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    os.environ['PYTHONHASHSEED'] = str(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    worker_init = WorkerInitObj(args.seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.enabled = True

    device, args = setup_training(args)
    model, optimizer, criterion = prepare_model_and_optimizer(args, device)

    # Single-worker executor that builds the next epoch's dataloader in the
    # background while the current epoch trains.
    pool = ProcessPoolExecutor(1)
    train_iter = subsetDataloader(path=args.train_path,
                                  batch_size=args.batch_size,
                                  worker_init=worker_init)
    test_iter = subsetDataloader(path=args.val_path,
                                 batch_size=args.batch_size,
                                 worker_init=worker_init)

    # Dump the run configuration and model architecture.
    print('-' * 50 + 'args' + '-' * 50)
    for k, v in vars(args).items():
        print('{0}: {1}'.format(k, v))
    print('-' * 30)
    print(model)
    print('-' * 50 + 'args' + '-' * 50)

    global_step = 0
    global_auc = 0.0
    s_time_train = time.time()

    for epoch in range(args.epoch):
        # Kick off construction of the next epoch's dataloader.
        dataset_future = pool.submit(subsetDataloader, args.train_path,
                                     args.batch_size, worker_init)
        for step, batch in enumerate(train_iter):
            model.train()
            labels = batch['label'].to(device).float()
            # Move every feature-group dict to the device; non-dict entries
            # (e.g. the raw label tensor) are excluded from the model input.
            batch = {
                t: {k: v.to(device) for k, v in d.items()}
                for t, d in batch.items() if isinstance(d, dict)
            }

            optimizer.zero_grad()
            logits = model(batch)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            # Evaluate periodically and checkpoint on improvement.
            if global_step != 0 and global_step % args.eval_freq == 0:
                s_time_eval = time.time()
                model.eval()
                auc = evaluate(model, test_iter, device)
                e_time_eval = time.time()
                print('-' * 68)
                print('Epoch:[{0}] Step:[{1}] AUC:[{2}] time:[{3}s]'.format(
                    epoch, global_step, format(auc, '.4f'),
                    format(e_time_eval - s_time_eval, '.4f')))
                # Save a checkpoint only when AUC improves.
                if auc > global_auc:
                    model_to_save = model.module if hasattr(model, 'module') else model
                    output_save_file = os.path.join(
                        args.output_dir,
                        '{}_auc_{}_step_{}_ckpt.pt'.format(
                            args.model_name, format(auc, '.4f'), global_step))
                    if os.path.exists(output_save_file):
                        os.remove(output_save_file)
                    torch.save({'model': model_to_save.state_dict(),
                                'name': args.model_name}, output_save_file)
                    print('Epoch:[{0}] Step:[{1}] SavePath:[{2}]'.format(
                        epoch, global_step, output_save_file))
                    global_auc = auc
                print('-' * 68)

            # Log training progress periodically.
            if global_step != 0 and global_step % args.log_freq == 0:
                e_time_train = time.time()
                print('Epoch:[{0}] Step:[{1}] Loss:[{2}] Lr:[{3}] time:[{4}s]'.format(
                    epoch, global_step, format(loss.item(), '.4f'),
                    format(optimizer.param_groups[0]['lr'], '.6'),
                    format(e_time_train - s_time_train, '.4f')))
                s_time_train = time.time()

            global_step += 1

        # Swap in the dataloader that was built in the background.
        del train_iter
        train_iter = dataset_future.result(timeout=None)
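# The AUC `evaluate` helper above is likewise defined elsewhere in the repo.
# A minimal sketch under the assumptions that `test_iter` yields the same
# dict-of-dicts batches as the training loader and that scikit-learn is
# available; the function name and use of roc_auc_score are illustrative
# assumptions, not the repo's actual implementation.
from sklearn.metrics import roc_auc_score


def evaluate_auc_sketch(model, test_iter, device):
    all_labels, all_scores = [], []
    with torch.no_grad():
        for batch in test_iter:
            labels = batch['label'].float()
            inputs = {
                t: {k: v.to(device) for k, v in d.items()}
                for t, d in batch.items() if isinstance(d, dict)
            }
            logits = model(inputs)
            # Flatten logits in case the model emits shape (N, 1).
            all_scores.extend(torch.sigmoid(logits).view(-1).cpu().tolist())
            all_labels.extend(labels.view(-1).tolist())
    return roc_auc_score(all_labels, all_scores)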