def test(model_name: str,
         dataset_folder: str,
         save_folder: str,
         hypers: HyperParameters,
         batch_size: Optional[int],
         max_num_batches: Optional[int],
         series: DataSeries = DataSeries.TEST):
    # Create the dataset
    dataset = get_dataset(hypers.dataset_type, dataset_folder)

    # Build model and restore trainable parameters
    model = get_model(hypers, save_folder=save_folder, is_train=False)
    model.restore(name=model_name, is_train=False, is_frozen=False)

    # Test the model
    print('Starting evaluation on {0} set...'.format(series.name.capitalize()))
    test_results = model.predict(dataset=dataset,
                                 test_batch_size=batch_size,
                                 max_num_batches=max_num_batches,
                                 series=series)

    # Close the dataset
    dataset.close()

    if series == DataSeries.TRAIN:
        result_file = os.path.join(save_folder, FINAL_TRAIN_LOG_PATH.format(model_name))
    elif series == DataSeries.VALID:
        result_file = os.path.join(save_folder, FINAL_VALID_LOG_PATH.format(model_name))
    else:
        result_file = os.path.join(save_folder, TEST_LOG_PATH.format(model_name))

    save_by_file_suffix([test_results], result_file)
    print('Completed evaluation.')
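# Usage sketch for test() above -- a minimal example, not part of the original
# source. How a HyperParameters object is constructed is project-specific, and
# every name below (constructor arguments, checkpoint name, folders) is a
# hypothetical placeholder.
hypers = HyperParameters(dataset_type='standard')  # hypothetical constructor arguments
test(model_name='model-best',            # hypothetical checkpoint name
     dataset_folder='data/my_dataset',   # hypothetical dataset folder
     save_folder='saved_models',         # folder the model was saved into
     hypers=hypers,
     batch_size=None,                    # None falls through to model.predict's default
     max_num_batches=None,               # None -> evaluate every batch in the series
     series=DataSeries.TEST)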
def main():
    args = get_args()
    cfg = Config.fromfile(args.config)
    cfg.fold = args.fold

    global device
    cfg.device = device
    log.info(cfg)

    # torch.cuda.set_device(cfg.gpu)
    util.set_seed(cfg.seed)
    log.info(f'setting seed = {cfg.seed}')

    # setup -------------------------------------
    for f in ['checkpoint', 'train', 'valid', 'test', 'backup']:
        os.makedirs(cfg.workdir + '/' + f, exist_ok=True)

    if 0:  # does not work perfectly
        file.backup_project_as_zip(
            PROJECT_PATH,
            cfg.workdir + '/backup/code.train.%s.zip' % IDENTIFIER)

    ## model ------------------------------------
    model = model_factory.get_model(cfg)

    # multi-gpu----------------------------------
    if torch.cuda.device_count() > 1 and len(cfg.gpu) > 1:
        log.info(f"Let's use {torch.cuda.device_count()} GPUs!")
        model = nn.DataParallel(model)
    model.to(device)

    ## train model-------------------------------
    do_train(cfg, model)
def main(args):
    torch.cuda.set_device(args.gpu)
    dataset = load_corafull_amazon_dataset(args)
    continuum_data = continuum_corafull_amazon_dataset(dataset, args)
    task_manager = semi_task_manager(continuum_data.dataset_info())

    g, features, labels, train_mask, val_mask, test_mask = dataset
    task_manager.add_g(g)

    model = get_model(dataset, args, task_manager).cuda()
    life_model = importlib.import_module(f'LifeModel.{args.method}_model')
    life_model_ins = life_model.NET(model, task_manager, args)

    acc_matrix = np.zeros([args.n_tasks, args.n_tasks])
    meanas = []
    prev_model = None
    for task_i, dataset_i in enumerate(continuum_data):
        current_task, (g, features, labels, train_mask, val_mask, test_mask) = dataset_i
        task_manager.add_task(current_task, {"test_mask": test_mask})
        label_offset1, label_offset2 = task_manager.get_label_offset(current_task)

        dur = []
        for epoch in range(args.epochs):
            if args.method == 'lwf':
                life_model_ins.observe(features, labels, task_i, train_mask, prev_model)
            else:
                life_model_ins.observe(features, labels, task_i, train_mask)

        acc_mean = []
        for t in range(task_i + 1):
            test_mask = task_manager.retrieve_task(t)['test_mask']
            label_offset1, label_offset2 = task_manager.get_label_offset(t)
            acc = evaluate(model, features, labels, test_mask, label_offset1, label_offset2)
            acc_matrix[task_i][t] = round(acc * 100, 2)
            acc_mean.append(acc)
            print(f"T{t:02d} {acc*100:.2f}|", end="")

        accs = acc_mean[:task_i + 1]
        meana = round(np.mean(accs) * 100, 2)
        meanas.append(meana)

        acc_mean = round(np.mean(acc_mean) * 100, 2)
        print(f"acc_mean: {acc_mean}", end="")
        print()
        prev_model = copy.deepcopy(life_model_ins).cuda()

    print('AP: ', acc_mean)
    backward = []
    forward = []
    for t in range(args.n_tasks - 1):
        b = acc_matrix[args.n_tasks - 1][t] - acc_matrix[t][t]
        backward.append(round(b, 2))
    mean_backward = round(np.mean(backward), 2)
    print('AF: ', mean_backward)
def make_submission(cfg):
    predictions = []

    # setting dataset ---------------------------
    loader_test = dataset_factory.get_dataloader(cfg.data.test)

    ## model ------------------------------------
    model = model_factory.get_model(cfg)
    util.load_model(model_paths[0], model)
    model.to(device)
    model.eval()

    train_df = pd.read_csv(cfg.train_csv)
    regr_model = kaggle.get_regr_model(train_df)

    for img, _, _ in tqdm(loader_test):
        with torch.no_grad():
            output = model(img.to(device))
        output = output.data.cpu().numpy()
        for out in output:
            coords = kaggle.extract_coords(out, regr_model)
            s = kaggle.coords2str(coords)
            predictions.append(s)

    test = pd.read_csv(cfg.data.test.dataframe)
    test['PredictionString'] = predictions
    test.to_csv('predictions.csv', index=False)
    log.info(test.head())
def run(config):
    model = get_model(config).cuda()
    criterion = get_loss(config)
    optimizer = get_optimizer(config, model.parameters())

    checkpoint = utils.checkpoint.get_initial_checkpoint(config)
    if checkpoint is not None:
        last_epoch, score = utils.checkpoint.load_checkpoint(config, model, checkpoint)
    else:
        print('[*] no checkpoint found')
        last_epoch, score = -1, -1
    print('last epoch:{} score:{:.4f}'.format(last_epoch, score))

    optimizer.param_groups[0]['initial_lr'] = config.OPTIMIZER.LR
    scheduler = get_scheduler(config, optimizer, last_epoch)
    if last_epoch != -1:
        scheduler.step()

    writer = SummaryWriter(os.path.join(config.TRAIN_DIR, 'logs'))
    train_loader = get_dataloader(config, 'train',
                                  transform=transforms.Compose([Albu(), Normalize(), ToTensor()]))
    test_loader = get_dataloader(config, 'val',
                                 transform=transforms.Compose([Normalize(), ToTensor()]))

    train(config, model, train_loader, test_loader, criterion, optimizer,
          scheduler, writer, last_epoch + 1, score)
def _infer(model, data):
    start = time.time()

    ################################################################################
    print('test preprocessing start!')
    # data: [a, b, c, ...]
    data_bc = []
    bc_func, _ = preprocess_dict["ben_clahe"]
    for d in data:
        d = cv2.resize(d, (704, 544))
        data_bc.append(bc_func(d))
        # del d
    # del data

    elapsed = time.time() - start
    print('test preprocessing time: %d hours %d minutes %d seconds' %
          (elapsed // 3600, (elapsed % 3600) // 60, (elapsed % 3600) % 60))
    print('test preprocessing ended!')
    del data
    ################################################################################

    # n_ensemble = len(ensemble_checkpoints)
    final = []
    for sess, ckpt, config_path in ensemble_checkpoints:
        config = utils.config.load(config_path)
        model = get_model(config).cuda()
        bind_model(model)
        nsml.load(checkpoint=ckpt, session=sess)

        # data_processed = []
        # _func, _ = preprocess_dict[config.DATA.PREPROCESS]
        # for d in data:
        #     d = cv2.resize(d, (config.DATA.IMG_W, config.DATA.IMG_H))
        #     data_processed.append(_func(d))

        out = run(model, data_bc, config)
        final.append(out)
        del model

    # final = sum(final) / float(n_ensemble)
    final = sum(final)
    final = np.argmax(final, axis=1)
    print(final.shape)
    print(final)

    elapsed = time.time() - start
    print('Total inference time: %d hours %d minutes %d seconds' %
          (elapsed // 3600, (elapsed % 3600) // 60, (elapsed % 3600) % 60))
    return final
def run(config, num_checkpoint, epoch_end, output_filename):
    dataloader = get_dataloader(config, split='val', transform=None)

    model = get_model(config).cuda()
    checkpoints = get_checkpoints(config, num_checkpoint, epoch_end)

    utils.checkpoint.load_checkpoint(config, model, checkpoints[0])
    for i, checkpoint in enumerate(checkpoints[1:]):
        model2 = get_model(config).cuda()
        last_epoch, _, _ = utils.checkpoint.load_checkpoint(config, model2, checkpoint)
        swa.moving_average(model, model2, 1. / (i + 2))

    with torch.no_grad():
        swa.bn_update(dataloader, model)

    # output_name = '{}.{}.{:03d}'.format(output_filename, num_checkpoint, last_epoch)
    # print('save {}'.format(output_name))
    utils.checkpoint.save_checkpoint(config, model, None, None, epoch_end,
                                     weights_dict={'state_dict': model.state_dict()},
                                     name=output_filename)
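# Why 1. / (i + 2) in run() above: folding checkpoint i+2 into the running
# average with that weight keeps the result equal to the plain arithmetic mean
# of all checkpoints processed so far. A minimal self-contained numpy sketch
# (a standalone demo, not part of the original source):
import numpy as np

checkpoints = [np.array([1.0]), np.array([4.0]), np.array([10.0])]  # toy "weights"
avg = checkpoints[0]
for i, w in enumerate(checkpoints[1:]):
    alpha = 1.0 / (i + 2)                 # same schedule as swa.moving_average above
    avg = (1 - alpha) * avg + alpha * w   # running-average update
assert np.allclose(avg, np.mean(checkpoints, axis=0))  # equals the arithmetic mean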
def _get_model(config, inp, label, bsize, is_training, name_scope, reuse):
    """Builds models."""
    model_cls = 'resnet'
    trn_kwargs = {
        'is_training': is_training,
        'inp': inp,
        'label': label,
        'batch_size': bsize,
    }
    with tf.name_scope(name_scope):
        with tf.variable_scope('Model', reuse=reuse):
            m = get_model(model_cls, config, **trn_kwargs)
    return m
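# Usage sketch for _get_model() above -- the usual TF1 pattern it enables:
# build the training graph first, then an evaluation graph that shares the
# same variables via reuse=True. The input tensors and batch size named here
# are hypothetical placeholders, not part of the original source.
train_m = _get_model(config, inp=train_inp, label=train_label, bsize=batch_size,
                     is_training=True, name_scope='Train', reuse=None)
eval_m = _get_model(config, inp=eval_inp, label=eval_label, bsize=batch_size,
                    is_training=False, name_scope='Eval', reuse=True)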
def run(config):
    model = get_model(config).to(device)
    criterion = get_loss(config.LOSS.NAME)
    optimizer = get_optimizer(config, model.parameters())

    checkpoint = utils.checkpoint.get_initial_checkpoint(config)
    if checkpoint is not None:
        last_epoch, score, loss = utils.checkpoint.load_checkpoint(
            config, model, checkpoint)
    else:
        print('[*] no checkpoint found')
        last_epoch, score, loss = -1, -1, float('inf')
    print('last epoch:{} score:{:.4f} loss:{:.4f}'.format(
        last_epoch, score, loss))

    optimizer.param_groups[0]['initial_lr'] = config.OPTIMIZER.LR
    scheduler = get_scheduler(config, optimizer, last_epoch)

    if config.SCHEDULER.NAME == 'multi_step':
        milestones = scheduler.state_dict()['milestones']
        step_count = len([i for i in milestones if i < last_epoch])
        optimizer.param_groups[0]['lr'] *= scheduler.state_dict()['gamma'] ** step_count

    if last_epoch != -1:
        scheduler.step()

    writer = SummaryWriter(os.path.join(config.TRAIN_DIR, 'logs'))
    train_loader = get_dataloader(config, 'train',
                                  transform=transforms.Compose([
                                      Albu(),
                                      CV2_Resize(config.DATA.IMG_W, config.DATA.IMG_H),
                                      Normalize(),
                                      ToTensor()
                                  ]))
    val_loader = get_dataloader(config, 'val',
                                transform=transforms.Compose([
                                    CV2_Resize(config.DATA.IMG_W, config.DATA.IMG_H),
                                    Normalize(),
                                    ToTensor()
                                ]))

    train(config, model, train_loader, val_loader, criterion, optimizer,
          scheduler, writer, last_epoch + 1, score, loss)
def forward_model(best_model, method):
    args = best_model['args']
    torch.cuda.set_device(args.gpu)
    set_seed(args)

    # load and preprocess dataset
    all_data = load_dataset(args)
    training = all_data[:int(len(all_data) * 0.7)]
    validation = all_data[int(len(all_data) * 0.7):int(len(all_data) * 0.8)]
    testing = all_data[int(len(all_data) * 0.8):]

    train_loader = DataLoader(training, batch_size=1000, shuffle=True, collate_fn=collate)
    val_loader = DataLoader(validation, batch_size=1000, shuffle=True, collate_fn=collate)
    test_loader = DataLoader(testing, batch_size=4000, shuffle=False, collate_fn=collate)

    dataset = (None, np.zeros((15, 15)), np.zeros((1, args.num_factors)),
               None, None, None, None)

    # create model
    model = get_model(dataset, args, mode='multilabel').cuda()

    for step, (g, labels, gt_adjs) in enumerate(test_loader):
        model.load_state_dict(best_model['model_state_dict'])
        model.eval()

        # update the new graph
        model.g = g
        features = g.ndata['feat'].float().cuda()
        labels = labels.cuda()
        logits = model(features)  # .view(-1, n_class, n_latent)

        hidden = model.get_hidden_feature()
        matrix = hidden[0]  # sample x dim
        correlation = np.zeros((matrix.shape[1], matrix.shape[1]))
        for i in range(matrix.shape[1]):
            for j in range(matrix.shape[1]):
                cof = scipy.stats.pearsonr(matrix[:, i], matrix[:, j])[0]
                correlation[i][j] = cof

        plot_corr(np.abs(correlation), save=f'{method}.png')
def evaluate(model_enum=Models.Vgg16GAP,
             dataset_enum=Datasets.cityscapes,
             loader_split=LoaderSplit.val):
    # Set dataset
    dataset = get_loader(dataset_enum, LoaderType.classification, loader_split)
    dataloader = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=0)

    # Set up model
    model = get_model(dataset_enum, model_enum)
    model.train()
    model.load()
    model.to(device)

    output_name = model_enum.name + '_' + dataset_enum.name + '_' + loader_split.name
    evaluate_model(model, dataloader, output_name)
def _get_assign_weighted_model(config, inp, label, weights, weights_dict,
                               bsize, is_training, name_scope, reuse):
    """Builds models."""
    model_cls = 'assign-wts-resnet'
    trn_kwargs = {
        'is_training': is_training,
        'inp': inp,
        'label': label,
        'ex_wts': weights,
        'batch_size': bsize
    }
    with tf.name_scope(name_scope):
        with tf.variable_scope('Model', reuse=reuse):
            m = get_model(model_cls, config, weights_dict, **trn_kwargs)
    return m
def main():
    seed_everything()
    pprint.pprint(config, indent=2)

    model = get_model(config).cuda()
    bind_model(model)

    args = get_args()
    if args.pause:  ## when in test mode
        print('Inferring Start...')
        nsml.paused(scope=locals())

    if args.mode == 'train':  ### when in training mode
        print('Training Start...')
        nsml.load(checkpoint='18', session='team146/KHD2019_FUNDUS/20')
        nsml.save(0)
        exit()
def main():
    seed_everything()

    config = utils.config.load(ensemble_checkpoints[0][2])
    model = get_model(config).cuda()
    bind_model(model)

    args = get_args()
    if args.pause:  ## when in test mode
        print('Inferring Start...')
        nsml.paused(scope=locals())

    if args.mode == 'train':  ### when in training mode
        print('Training Start...')
        nsml.load(session=ensemble_checkpoints[0][0],
                  checkpoint=ensemble_checkpoints[0][1])
        nsml.save(0)
        exit()
def __init__(self, device, model_name, logger):
    """ Creates and opens the Neural Compute device and creates
    a graph that can execute inferences on it. """
    if device is None:
        raise Exception("No devices found.")
    else:
        self.device = device
    self.logger = logger

    # Init model
    self.logger.info("Initializing %s model.", model_name)
    self.model = model_factory.get_model(model_name.lower())
    graph_file_path = self.model.graph_path

    # Load graph file
    start_time = time.time()
    try:
        with open(graph_file_path, mode="rb") as graph_file:
            in_memory_graph = graph_file.read()
    except Exception:
        self.logger.error("Error reading graph file: %s.", graph_file_path)
        raise

    self.graph = None
    self.input_fifo = None
    self.output_fifo = None

    self.initialization_queue.put(0)
    self.graph = mvnc2.Graph("mvnc2 graph")
    self.input_fifo, self.output_fifo = self.graph.allocate_with_fifos(
        self.device, in_memory_graph,
        input_fifo_data_type=mvnc2.FifoDataType.FP16,
        output_fifo_data_type=mvnc2.FifoDataType.FP16)
    _ = self.initialization_queue.get()

    graph_alloc_time = (time.time() - start_time) * 1000
    self.logger.info("Graph allocated in %f ms.", graph_alloc_time)

    if self.graph is None or self.input_fifo is None or self.output_fifo is None:
        raise Exception("Could not initialize device.")

    self.inference_results = 0
def train(data_folder: str,
          save_folder: str,
          hypers: HyperParameters,
          should_print: bool,
          max_epochs: Optional[int] = None) -> str:
    model = get_model(hypers, save_folder=save_folder, is_train=True)

    # Create dataset
    dataset = get_dataset(hypers.dataset_type, data_folder)

    if max_epochs is not None:
        hypers.epochs = max_epochs

    # Train the model
    train_label = model.train(dataset=dataset, should_print=should_print)

    # Close the dataset files
    dataset.close()

    return train_label
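# Usage sketch for train() above -- a minimal example, not part of the original
# source; the folder names and the hypers construction are hypothetical
# placeholders.
hypers = HyperParameters(dataset_type='standard')  # hypothetical constructor arguments
train_label = train(data_folder='data/train',      # hypothetical folder
                    save_folder='saved_models',    # hypothetical folder
                    hypers=hypers,
                    should_print=True,
                    max_epochs=5)                  # optional override of hypers.epochs
print('Saved model under label: {0}'.format(train_label))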
def run(config):
    model = get_model(config).to(device)
    # model_params = [{'params': model.encoder.parameters(), 'lr': config.OPTIMIZER.ENCODER_LR},
    #                 {'params': model.decoder.parameters(), 'lr': config.OPTIMIZER.DECODER_LR}]
    optimizer = get_optimizer(config, model.parameters())
    # optimizer = get_optimizer(config, model_params)

    checkpoint = utils.checkpoint.get_initial_checkpoint(config)
    if checkpoint is not None:
        last_epoch, score, loss = utils.checkpoint.load_checkpoint(config, model, checkpoint)
    else:
        print('[*] no checkpoint found')
        last_epoch, score, loss = -1, -1, float('inf')
    print('last epoch:{} score:{:.4f} loss:{:.4f}'.format(last_epoch, score, loss))

    optimizer.param_groups[0]['initial_lr'] = config.OPTIMIZER.LR
    # optimizer.param_groups[0]['initial_lr'] = config.OPTIMIZER.ENCODER_LR
    # optimizer.param_groups[1]['initial_lr'] = config.OPTIMIZER.DECODER_LR

    scheduler = get_scheduler(config, optimizer, last_epoch)
    if config.SCHEDULER.NAME == 'multi_step':
        milestones = scheduler.state_dict()['milestones']
        step_count = len([i for i in milestones if i < last_epoch])
        optimizer.param_groups[0]['lr'] *= scheduler.state_dict()['gamma'] ** step_count
        # optimizer.param_groups[0]['lr'] *= scheduler.state_dict()['gamma'] ** step_count
        # optimizer.param_groups[1]['lr'] *= scheduler.state_dict()['gamma'] ** step_count

    if last_epoch != -1:
        scheduler.step()

    log_train = Logger()
    log_val = Logger()
    log_train.open(os.path.join(config.TRAIN_DIR, 'log_train.txt'), mode='a')
    log_val.open(os.path.join(config.TRAIN_DIR, 'log_val.txt'), mode='a')

    train_loader = get_dataloader(config, 'train', transform=Albu(config.ALBU))
    val_loader = get_dataloader(config, 'val')

    train(config, model, train_loader, val_loader, optimizer, scheduler,
          log_train, log_val, last_epoch + 1, score, loss)
def __init__(self, kind, device=None):
    """
    args:
        kind (str): one of {'sk', 'sw'}
        device (torch.device): device to use in inference
    """
    self._kind = kind
    self._transform = Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.732], std=[0.129])
    ])
    if device:
        self._device = device
    else:
        self._device = torch.device('cpu')
    self._model = get_model(kind=kind, device=self._device, cache=True)
    self._model.eval()
    self._radius = config['DATA INGESTION'].getint('radius')
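# Construction sketch for the wrapper above -- a minimal example, not part of
# the original source. The class name Predictor is a hypothetical placeholder
# for whatever class this __init__ belongs to.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
predictor = Predictor(kind='sk', device=device)  # kind is one of {'sk', 'sw'}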
def run(config):
    model = get_model(config).to(device)
    optimizer = get_optimizer(config, model.parameters())

    checkpoint = utils.checkpoint.get_initial_checkpoint(config)
    if checkpoint is not None:
        last_epoch, score, loss = utils.checkpoint.load_checkpoint(
            config, model, checkpoint)
    else:
        print('[*] no checkpoint found')
        last_epoch, score, loss = -1, -1, float('inf')
    print('last epoch:{} score:{:.4f} loss:{:.4f}'.format(
        last_epoch, score, loss))

    optimizer.param_groups[0]['initial_lr'] = config.OPTIMIZER.LR
    scheduler = get_scheduler(config, optimizer, last_epoch)

    if config.SCHEDULER.NAME == 'multi_step':
        milestones = scheduler.state_dict()['milestones']
        step_count = len([i for i in milestones if i < last_epoch])
        optimizer.param_groups[0]['lr'] *= scheduler.state_dict()['gamma'] ** step_count

    if last_epoch != -1:
        scheduler.step()

    # writer = SummaryWriter(os.path.join(config.TRAIN_DIR, 'logs'))
    log_train = Logger()
    log_val = Logger()
    log_train.open(os.path.join(config.TRAIN_DIR, 'log_train.txt'), mode='a')
    log_val.open(os.path.join(config.TRAIN_DIR, 'log_val.txt'), mode='a')

    augmentation = Albu_Seg() if config.TASK == 'seg' else Albu_Cls()
    train_loader = get_dataloader(config, 'train', transform=augmentation)
    val_loader = get_dataloader(config, 'val')

    train(config, model, train_loader, val_loader, optimizer, scheduler,
          log_train, log_val, last_epoch + 1, score, loss)
# resnest
# for model_name in model_names:
#     print('model_name', model_name)
#     model = get_model(model_name=model_name, input_shape=input_shape, n_classes=n_classes,
#                       verbose=True, fc_activation=fc_activation, using_cb=True)
#     print('-' * 10)

# RegNetY600 set
# model = get_model(model_name="RegNet", input_shape=input_shape, n_classes=n_classes,
#                   verbose=True, fc_activation=fc_activation, stage_depth=[1, 3, 7, 4],
#                   stage_width=[48, 112, 256, 608], stage_G=16, SEstyle_atten="SE", active='mish')
# print('-' * 10)

# DETR
# model_name = 'res34_DETR'
# print('model_name', model_name)
# model = get_model(model_name=model_name, input_shape=input_shape,
#                   n_classes=n_classes, verbose=True, training=None,
#                   fc_activation=fc_activation)
# print('-' * 10)

model_names = ['ResNest50_DETR', 'res34_DETR']
for model_name in model_names:
    print('model_name', model_name)
    model = get_model(model_name=model_name, input_shape=input_shape,
                      n_classes=n_classes, verbose=True,
                      fc_activation=fc_activation, using_cb=True)
def main(args):
    torch.cuda.set_device(args.gpu)
    set_seed(args)

    log_dir = make_log_dir(args.model_name, args.dataset, args.log_subdir)
    log_file = os.path.join(log_dir, "log.txt")
    sys.stdout = open(log_file, 'w')
    backup_model = f"cp -r ./models {log_dir}"
    os.system(backup_model)

    # load and preprocess dataset
    zinc_data = load_dataset(args)
    train_loader = DataLoader(zinc_data.train, batch_size=1000, shuffle=True,
                              collate_fn=zinc_data.collate, num_workers=4)
    val_loader = DataLoader(zinc_data.val, batch_size=1000, shuffle=False,
                            collate_fn=zinc_data.collate)
    test_loader = DataLoader(zinc_data.test, batch_size=1000, shuffle=False,
                             collate_fn=zinc_data.collate)

    # placeholder of dataset
    dataset = (None, None, None, None, None, None, None)

    # create model
    model = get_model(dataset, args, mode='zinc').cuda()
    print(model)

    # define loss func
    loss_fcn = torch.nn.L1Loss()

    # define optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr,
                                 weight_decay=args.weight_decay)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                                     factor=0.5, patience=50,
                                                     verbose=True)

    best_val_loss = sys.maxsize
    best_test_mae = sys.maxsize
    dur = []
    for epoch in range(args.epochs):
        model.train()
        epoch_loss = 0
        epoch_train_mae = 0
        t0 = time.time()
        for iter, (batch_graphs, batch_targets, batch_snorm_n,
                   batch_snorm_e) in enumerate(train_loader):
            batch_x = batch_graphs.ndata['feat'].cuda()  # num x feat
            batch_e = batch_graphs.edata['feat'].cuda()
            batch_snorm_e = batch_snorm_e.cuda()
            batch_targets = batch_targets.cuda()
            batch_snorm_n = batch_snorm_n.cuda()  # num x 1

            optimizer.zero_grad()
            model.g = batch_graphs
            batch_scores = model.forward(batch_x, batch_e, batch_snorm_n, batch_snorm_e)

            loss = loss_fcn(batch_scores, batch_targets)
            if args.model_name == "FactorGNN" and args.dis_weight > 0.0:
                losses = model.compute_disentangle_loss()
                dis_loss = model.merge_loss(losses) * args.dis_weight
                loss = loss + dis_loss

            loss.backward()
            optimizer.step()

            iter_loss = loss.item()
            iter_mae = F.l1_loss(batch_scores, batch_targets).item()
            epoch_loss += iter_loss
            epoch_train_mae += iter_mae

        dur.append(time.time() - t0)
        epoch_loss /= (iter + 1)
        epoch_train_mae /= (iter + 1)
        # print(f"loss {epoch_loss:.4f}, mae {epoch_train_mae:.4f}")

        val_loss, val_mae = test(model, val_loader)
        test_loss, test_mae = test(model, test_loader)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_test_mae = test_mae
            torch.save(
                {
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': best_test_mae,
                    'args': args
                }, os.path.join(log_dir, 'best_model.pt'))

        print(f"time {np.mean(dur):.2f} epoch {epoch:03d} | " +
              f"train ({epoch_loss:.4f}, {epoch_train_mae:.4f}) | " +
              f"val ({val_loss:.4f}, {val_mae:.4f}) | " +
              f"test ({test_loss:.4f}, {test_mae:.4f}) | " +
              f"best: {best_test_mae:.4f}")
        sys.stdout.flush()

        if optimizer.param_groups[0]['lr'] > 1e-5:
            scheduler.step(val_loss)
def main():
    seed_everything()

    # yml = 'configs/base.yml'
    # config = utils.config.load(yml)
    # pprint.pprint(config, indent=2)

    model = get_model(config).cuda()
    bind_model(model)

    args = get_args()
    if args.pause:  ## when in test mode
        print('Inferring Start...')
        nsml.paused(scope=locals())

    if args.mode == 'train':  ### when in training mode
        print('Training Start...')

        # no bias decay
        if config.OPTIMIZER.NO_BIAS_DECAY:
            group_decay, group_no_decay = group_weight(model)
            params = [{'params': group_decay},
                      {'params': group_no_decay, 'weight_decay': 0.0}]
        else:
            params = model.parameters()

        optimizer = get_optimizer(config, params)
        optimizer.param_groups[0]['initial_lr'] = config.OPTIMIZER.LR
        if config.OPTIMIZER.NO_BIAS_DECAY:
            optimizer.param_groups[1]['initial_lr'] = config.OPTIMIZER.LR

        ###############################################################################
        if IS_LOCAL:
            prepare_train_directories(config)
            # NOTE: 'yml' is defined by the commented-out lines at the top of main()
            # when running locally.
            utils.config.save_config(yml, config.LOCAL_TRAIN_DIR)
            checkpoint = utils.checkpoint.get_initial_checkpoint(config)
            if checkpoint is not None:
                last_epoch, score, loss = utils.checkpoint.load_checkpoint(config, model, checkpoint)
            else:
                print('[*] no checkpoint found')
                last_epoch, score, loss = -1, -1, float('inf')
            print('last epoch:{} score:{:.4f} loss:{:.4f}'.format(last_epoch, score, loss))
        else:
            last_epoch = -1
        ###############################################################################

        scheduler = get_scheduler(config, optimizer, last_epoch=last_epoch)

        ###############################################################################
        if IS_LOCAL:
            if config.SCHEDULER.NAME == 'multi_step':
                if config.SCHEDULER.WARMUP:
                    scheduler_dict = scheduler.state_dict()['after_scheduler'].state_dict()
                else:
                    scheduler_dict = scheduler.state_dict()
                milestones = scheduler_dict['milestones']
                step_count = len([i for i in milestones if i < last_epoch])
                optimizer.param_groups[0]['lr'] *= scheduler_dict['gamma'] ** step_count
                if config.OPTIMIZER.NO_BIAS_DECAY:
                    optimizer.param_groups[1]['initial_lr'] *= scheduler_dict['gamma'] ** step_count
            if last_epoch != -1:
                scheduler.step()
        ###############################################################################

        # for dirname, _, filenames in os.walk(DATASET_PATH):
        #     for filename in filenames:
        #         print(os.path.join(dirname, filename))

        # if preprocessing possible
        preprocess_type = config.DATA.PREPROCESS
        cv2_size = (config.DATA.IMG_W, config.DATA.IMG_H)
        if not IS_LOCAL:
            preprocess(os.path.join(DATASET_PATH, 'train', 'train_data', 'NOR'),
                       os.path.join(preprocess_type, 'NOR'), preprocess_type, cv2_size)
            preprocess(os.path.join(DATASET_PATH, 'train', 'train_data', 'AMD'),
                       os.path.join(preprocess_type, 'AMD'), preprocess_type, cv2_size)
            preprocess(os.path.join(DATASET_PATH, 'train', 'train_data', 'RVO'),
                       os.path.join(preprocess_type, 'RVO'), preprocess_type, cv2_size)
            preprocess(os.path.join(DATASET_PATH, 'train', 'train_data', 'DMR'),
                       os.path.join(preprocess_type, 'DMR'), preprocess_type, cv2_size)
            data_dir = preprocess_type
            # data_dir = os.path.join(DATASET_PATH, 'train/train_data')
        else:  # IS_LOCAL
            data_dir = os.path.join(DATASET_PATH, preprocess_type)

        # eda
        # train_std(data_dir, preprocess_type, cv2_size)

        fold_df = split_cv(data_dir, n_splits=config.NUM_FOLDS)
        val_fold_idx = config.IDX_FOLD

        ###############################################################################
        train_loader = get_dataloader(config, data_dir, fold_df, val_fold_idx, 'train',
                                      transform=Albu())
        val_loader = get_dataloader(config, data_dir, fold_df, val_fold_idx, 'val')

        postfix = dict()
        num_epochs = config.TRAIN.NUM_EPOCHS
        val_acc_list = []
        for epoch in range(last_epoch + 1, num_epochs):
            if epoch >= config.LOSS.FINETUNE_EPOCH:
                criterion = get_loss(config.LOSS.FINETUNE_LOSS)
            else:
                criterion = get_loss(config.LOSS.NAME)

            train_values = train_single_epoch(config, model, train_loader, criterion,
                                              optimizer, scheduler, epoch)
            val_values = evaluate_single_epoch(config, model, val_loader, criterion, epoch)
            val_acc_list.append((epoch, val_values[2]))

            if config.SCHEDULER.NAME != 'one_cyle_lr':
                scheduler.step()

            if IS_LOCAL:
                utils.checkpoint.save_checkpoint(config, model, epoch, val_values[1], val_values[0])
            else:
                postfix['train_loss'] = train_values[0]
                postfix['train_res'] = train_values[1]
                postfix['train_acc'] = train_values[2]
                postfix['train_sens'] = train_values[3]
                postfix['train_spec'] = train_values[4]
                postfix['val_loss'] = val_values[0]
                postfix['val_res'] = val_values[1]
                postfix['val_acc'] = val_values[2]
                postfix['val_sens'] = val_values[3]
                postfix['val_spec'] = val_values[4]
                nsml.report(**postfix, summary=True, step=epoch)

                val_res = '%.10f' % val_values[1]
                val_res = val_res.replace(".", "")
                val_res = val_res[:4] + '.' + val_res[4:8] + '.' + val_res[8:]
                save_name = 'epoch_%02d_score%s_loss%.4f.pth' % (epoch, val_res, val_values[0])
                # nsml.save(save_name)
                nsml.save(epoch)

        for e, val_acc in val_acc_list:
            print('%02d %s' % (e, val_acc))
def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args = get_args()
    cfg = Config.fromfile(args.config)
    cfg.device = device

    train = pd.read_csv(cfg.train_csv)
    camera_matrix_inv = np.linalg.inv(kaggle.camera_matrix)

    if 0:
        points_df = pd.DataFrame()
        for col in ['x', 'y', 'z', 'yaw', 'pitch', 'roll']:
            arr = []
            for ps in train['PredictionString']:
                coords = kaggle.str2coords(ps)
                arr += [c[col] for c in coords]
            points_df[col] = arr
        log.info(f'len(points_df): {len(points_df)}')
        log.info(points_df.head())

    img = imread(opj(cfg.train_images, train.iloc[0]['ImageId'] + '.jpg'))
    # plt.figure(figsize=(15, 8))
    # plt.imshow(img)
    # plt.show()

    # log.info(train.head())
    # log.info(kaggle.camera_matrix)

    pred_string = train.iloc[0]['PredictionString']
    coords = kaggle.str2coords(pred_string)
    # log.info(coords)

    lens = [len(kaggle.str2coords(s)) for s in train['PredictionString']]

    ############
    plt.figure(figsize=(15, 6))
    sns.countplot(lens)
    # plt.xlabel('Number of cars in image')
    # plt.show()
    plt.savefig('eda/number_cars_in_image.png')

    ############
    plt.figure(figsize=(15, 6))
    sns.distplot(functools.reduce(lambda a, b: a + b,
                                  [[c['x'] for c in kaggle.str2coords(s)]
                                   for s in train['PredictionString']]),
                 bins=500)
    # sns.distplot([kaggle.str2coords(s)[0]['x'] for s in train['PredictionString']])
    plt.xlabel('x')
    # plt.show()
    plt.savefig('eda/x.png')

    ############
    plt.figure(figsize=(15, 6))
    sns.distplot(functools.reduce(lambda a, b: a + b,
                                  [[c['y'] for c in kaggle.str2coords(s)]
                                   for s in train['PredictionString']]),
                 bins=500)
    plt.xlabel('y')
    # plt.show()
    plt.savefig('eda/y.png')

    ############
    plt.figure(figsize=(15, 6))
    sns.distplot(functools.reduce(lambda a, b: a + b,
                                  [[c['z'] for c in kaggle.str2coords(s)]
                                   for s in train['PredictionString']]),
                 bins=500)
    plt.xlabel('z')
    # plt.show()
    plt.savefig('eda/z.png')

    ############
    plt.figure(figsize=(15, 6))
    sns.distplot(functools.reduce(lambda a, b: a + b,
                                  [[c['yaw'] for c in kaggle.str2coords(s)]
                                   for s in train['PredictionString']]))
    plt.xlabel('yaw')
    # plt.show()
    plt.savefig('eda/yaw.png')

    ############
    plt.figure(figsize=(15, 6))
    sns.distplot(functools.reduce(lambda a, b: a + b,
                                  [[c['roll'] for c in kaggle.str2coords(s)]
                                   for s in train['PredictionString']]))
    plt.xlabel('roll')
    # plt.show()
    plt.savefig('eda/roll.png')

    ############
    plt.figure(figsize=(15, 6))
    sns.distplot(functools.reduce(lambda a, b: a + b,
                                  [[c['pitch'] for c in kaggle.str2coords(s)]
                                   for s in train['PredictionString']]))
    plt.xlabel('pitch')
    # plt.show()
    plt.savefig('eda/pitch.png')

    ############
    plt.figure(figsize=(15, 6))
    sns.distplot(functools.reduce(lambda a, b: a + b,
                                  [[kaggle.rotate(c['roll'], np.pi)
                                    for c in kaggle.str2coords(s)]
                                   for s in train['PredictionString']]))
    plt.xlabel('roll rotated by pi')
    # plt.show()
    plt.savefig('eda/roll_rotated_by_pi.png')

    plt.figure(figsize=(14, 14))
    plt.imshow(imread(opj(cfg.train_images, train.iloc[2217]['ImageId'] + '.jpg')))
    plt.scatter(*kaggle.get_img_coords(train.iloc[2217]['PredictionString']),
                color='red', s=100)
    # plt.show()
    # log.info(kaggle.get_img_coords(train.iloc[2217]['PredictionString']))

    ############
    xs, ys = [], []
    for ps in train['PredictionString']:
        x, y = kaggle.get_img_coords(ps)
        xs += list(x)
        ys += list(y)
    plt.figure(figsize=(18, 18))
    plt.imshow(imread(opj(cfg.train_images, train.iloc[2217]['ImageId'] + '.jpg')),
               alpha=0.3)
    plt.scatter(xs, ys, color='red', s=10, alpha=0.2)
    # plt.show()
    plt.savefig('eda/xs-ys_distribution.png')

    ############
    # view distribution from the sky
    road_width = 3
    road_xs = [-road_width, road_width, road_width, -road_width, -road_width]
    road_ys = [0, 0, 500, 500, 0]

    plt.figure(figsize=(16, 16))
    plt.axes().set_aspect(1)
    plt.xlim(-50, 50)
    plt.ylim(0, 100)

    # View road
    plt.fill(road_xs, road_ys, alpha=0.2, color='gray')
    plt.plot([road_width / 2, road_width / 2], [0, 100],
             alpha=0.4, linewidth=4, color='white', ls='--')
    plt.plot([-road_width / 2, -road_width / 2], [0, 100],
             alpha=0.4, linewidth=4, color='white', ls='--')

    # View cars
    # plt.scatter(points_df['x'], np.sqrt(points_df['z']**2 + points_df['y']**2),
    #             color='red', s=10, alpha=0.1)
    # plt.savefig('eda/view_from_sky.png')

    ############
    # NOTE: points_df is built inside the disabled 'if 0:' block near the top of
    # main(); enable that block before running the scatter_3d/regression code below.
    fig = px.scatter_3d(points_df, x='x', y='y', z='z', color='pitch',
                        range_x=(-50, 50), range_y=(0, 50), range_z=(0, 250),
                        opacity=0.1)
    # fig.show()

    zy_slope = LinearRegression()
    X = points_df[['z']]
    y = points_df[['y']]
    zy_slope.fit(X, y)
    print('MAE without x:', mean_absolute_error(y, zy_slope.predict(X)))

    # Will use this model later
    xzy_slope = LinearRegression()
    X = points_df[['x', 'z']]
    y = points_df['y']
    xzy_slope.fit(X, y)
    print('MAE with x:', mean_absolute_error(y, xzy_slope.predict(X)))
    print('\ndy/dx = {:.3f} \ndy/dz = {:.3f}'.format(*xzy_slope.coef_))

    plt.figure(figsize=(16, 16))
    plt.xlim(0, 500)
    plt.ylim(0, 100)
    plt.scatter(points_df['z'], points_df['y'], label='Real points')
    X_line = np.linspace(0, 500, 10)
    plt.plot(X_line, zy_slope.predict(X_line.reshape(-1, 1)),
             color='orange', label='Regression')
    plt.legend()
    plt.xlabel('z coordinate')
    plt.ylabel('y coordinate')
    plt.savefig('eda/linear_regression.png')

    # 3d view
    n_rows = 6
    for idx in range(n_rows):
        fig, axes = plt.subplots(1, 2, figsize=(20, 20))
        img = imread(opj(cfg.train_images, train['ImageId'].iloc[idx] + '.jpg'))
        axes[0].imshow(img)
        img_vis = kaggle.visualize(img, kaggle.str2coords(train['PredictionString'].iloc[idx]))
        axes[1].imshow(img_vis)
        # plt.show()
        plt.savefig(f'eda/img-view_coords_{idx}.png')

    if 0:
        img0 = imread(opj(cfg.train_images, train.iloc[0]['ImageId'] + '.jpg'))
        img = kaggle.preprocess_image(img0)
        print(train.iloc[0]['PredictionString'])
        mask, regr = kaggle.get_mask_and_regr(img0, train.iloc[0]['PredictionString'])
        # print('img.shape', img.shape, 'std:', np.std(img))
        # print('mask.shape', mask.shape, 'std:', np.std(mask))
        # print('regr.shape', regr.shape, 'std:', np.std(regr))

        plt.figure(figsize=(16, 16))
        plt.title('Processed image')
        plt.imshow(img)
        # plt.show()
        plt.savefig('eda/processed_image.png')

        plt.figure(figsize=(16, 16))
        plt.title('Detection Mask')
        plt.imshow(mask)
        # plt.show()
        plt.savefig('eda/detection_mask.png')

        plt.figure(figsize=(16, 16))
        plt.title('Yaw values')
        plt.imshow(regr[:, :, -2])
        # plt.show()
        plt.savefig('eda/yaw_values.png')

    #############
    if 0:
        regr_model = kaggle.get_regr_model(train)
        for idx in range(2):
            fig, axes = plt.subplots(1, 2, figsize=(20, 20))
            for ax_i in range(2):
                img0 = imread(opj(cfg.train_images, train['ImageId'].iloc[idx] + '.jpg'))
                if ax_i == 1:
                    img0 = img0[:, ::-1]
                img = kaggle.preprocess_image(img0, ax_i == 1)
                mask, regr = kaggle.get_mask_and_regr(img0, train['PredictionString'][idx], ax_i == 1)
                regr = np.rollaxis(regr, 2, 0)
                coords = kaggle.extract_coords(np.concatenate([mask[None], regr], 0),
                                               regr_model, ax_i == 1)
                axes[ax_i].set_title('Flip = {}'.format(ax_i == 1))
                axes[ax_i].imshow(kaggle.visualize(img0, coords))
                # plt.show()
                plt.savefig(f'eda/{idx}_{ax_i}.png')

    if 0:
        dataset = dataset_factory.CarDataset(cfg.data.train)
        img, mask, regr = dataset[0]

        plt.figure(figsize=(16, 16))
        plt.imshow(np.rollaxis(img, 0, 3))
        # plt.show()
        plt.savefig(f'eda/img.png')

        plt.figure(figsize=(16, 16))
        plt.imshow(mask)
        # plt.show()
        plt.savefig(f'eda/mask.png')

        plt.figure(figsize=(16, 16))
        plt.imshow(regr[:, :, -2])
        # plt.show()
        plt.savefig(f'eda/regr.png')

    #########
    if 1:
        # initial -----------------------------------
        best = {
            'loss': float('inf'),
            'score': 0.0,
            'epoch': -1,
        }

        train_loader = dataset_factory.get_dataloader(cfg.data.train)
        valid_loader = dataset_factory.get_dataloader(cfg.data.valid)
        test_loader = dataset_factory.get_dataloader(cfg.data.test)

        for i, (img, mask, regr) in enumerate(tqdm(test_loader)):
            print(i)
            if i == 3:
                break

        model = model_factory.get_model(cfg)
        optimizer = optimizer_factory.get_optimizer(model, cfg)
        scheduler = scheduler_factory.get_scheduler(cfg, optimizer, best['epoch'])
def main():
    global args, best_prec1
    args = parser.parse_args()

    args.distributed = args.world_size > 1
    if args.distributed:
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size)

    # create model
    print("=> creating model '{}'".format(args.model))
    if 'sn' in args.model:
        model = model_factory.get_model(
            args.model, using_moving_average=args.using_moving_average)
    else:
        model = model_factory.get_model(args.model)

    if not args.distributed:
        if args.model.startswith('alexnet') or args.model.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()
    else:
        model.cuda()
        model = torch.nn.parallel.DistributedDataParallel(model)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # auto resume from a checkpoint
    model_dir = args.model_dir
    start_epoch = 0
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    if args.evaluate:
        util.load_state_ckpt(args.checkpoint_path, model)
    else:
        best_prec1, start_epoch = util.load_state(model_dir, model, optimizer=optimizer)
    writer = SummaryWriter(model_dir)

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion, 0, writer)
        return

    train_dataset_multi_scale = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            ColorAugmentation(),
            normalize,
        ]))
    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            ColorAugmentation(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader_multi_scale = torch.utils.data.DataLoader(
        train_dataset_multi_scale, batch_size=args.batch_size,
        shuffle=(train_sampler is None), num_workers=args.workers,
        pin_memory=True, sampler=train_sampler)
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size,
        shuffle=(train_sampler is None), num_workers=args.workers,
        pin_memory=True, sampler=train_sampler)

    if not args.using_moving_average:
        train_dataset_snhelper = datasets.ImageFolder(
            traindir,
            transforms.Compose([
                transforms.Resize(256),
                transforms.CenterCrop(224),
                transforms.ToTensor(),
                normalize,
            ]))
        train_loader_snhelper = torch.utils.data.DataLoader(
            train_dataset_snhelper,
            batch_size=args.batch_size * torch.cuda.device_count(),
            shuffle=(train_sampler is None), num_workers=args.workers,
            pin_memory=True, sampler=train_sampler)

    for epoch in range(start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch)

        # train for one epoch
        if epoch < 95:
            train(train_loader_multi_scale, model, criterion, optimizer, epoch, writer)
        else:
            train(train_loader, model, criterion, optimizer, epoch, writer)

        if not args.using_moving_average:
            sn_helper(train_loader_snhelper, model)

        # evaluate on validation set
        prec1 = validate(val_loader, model, criterion, epoch, writer)

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        util.save_checkpoint(
            model_dir, {
                'epoch': epoch + 1,
                'model': args.model,
                'state_dict': model.state_dict(),
                'best_prec1': best_prec1,
                'optimizer': optimizer.state_dict(),
            }, is_best)
train_loader = torch.utils.data.DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True,
                                           num_workers=WORKERS, pin_memory=True)
valid_loader = torch.utils.data.DataLoader(valid_ds, batch_size=BATCH_SIZE,
                                           num_workers=WORKERS, pin_memory=True)

loaders = collections.OrderedDict()
loaders["train"] = train_loader
loaders["valid"] = valid_loader

model = get_model('se_resnext50')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.7,
                                                       patience=10, verbose=True)

# the only tricky part
n_epochs = 120
# logdir = "/tmp/runs/"
logdir = "/tmp/runs_se_resnext50/"

callbacks = collections.OrderedDict()
callbacks['f1_score'] = F1ScoreCallback()
def main(args):
    torch.cuda.set_device(args.gpu)
    set_seed(args)

    if args.log_subdir != "":
        log_dir = make_log_dir(args.model_name, args.dataset, args.log_subdir)
        log_file = os.path.join(log_dir, "log.txt")
        sys.stdout = open(log_file, 'w')
        backup_model = f"cp -r ./models {log_dir}"
        os.system(backup_model)

    # load and preprocess dataset
    train_loader, val_loader = load_gin_dataset(args)
    # num_feats = features.shape[1]
    # n_classes = torch.max(labels).item() + 1

    # create model
    sample = next(iter(train_loader))
    n_class_dict = {'MUTAG': 2, "IMDBBINARY": 2, "COLLAB": 3}
    in_dim = sample[0].ndata['attr'].shape[1]
    if args.dataset == "IMDBBINARY":
        in_dim = 150
    if args.dataset == "COLLAB":
        in_dim = 500
    feat = torch.ones(1, in_dim)
    dataset = (None, feat, torch.tensor([n_class_dict[args.dataset] - 1]),
               None, None, None, None)
    model = get_model(dataset, args).cuda()

    # define loss func
    loss_fcn = torch.nn.CrossEntropyLoss()

    # define optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr,
                                 weight_decay=args.weight_decay)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.5)

    dur = []
    best_acc = 0
    for epoch in range(args.epochs):
        total_loss = []
        # total_edges = 0
        # total_nodes = 0
        for step, (g, labels) in enumerate(train_loader):
            # total_nodes += g.number_of_nodes()
            # total_edges += g.number_of_edges()
            # continue
            model.train()

            # update the new graph
            model.g = g
            # print(max(g.in_degrees()))
            t0 = time.time()

            if args.dataset in ["IMDBBINARY", "COLLAB"]:
                if args.dataset == "IMDBBINARY":
                    in_dim = 150
                if args.dataset == "COLLAB":
                    in_dim = 500
                y = g.in_degrees().long().unsqueeze(-1)
                y_onehot = torch.FloatTensor(g.number_of_nodes(), in_dim)
                y_onehot.zero_()
                y_onehot.scatter_(1, y, 1)
                features = y_onehot.float().cuda()
            else:
                features = g.ndata['attr'].float().cuda()

            labels = labels.cuda()
            logits = model(features)  # .view(-1, n_class, n_latent)
            loss = loss_fcn(logits, labels)

            if args.model_name == 'FactorGNN' and args.dis_weight > 0.0:
                losses = model.compute_disentangle_loss()
                dis_loss = model.merge_loss(losses) * args.dis_weight
                loss = loss + dis_loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss.append(loss.item())
            dur.append(time.time() - t0)

        loss, acc = eval_net(args, model, train_loader, loss_fcn)
        val_loss, val_acc = eval_net(args, model, val_loader, loss_fcn)
        if val_acc > best_acc:
            best_acc = val_acc
            if args.log_subdir != "":
                torch.save(
                    {
                        'epoch': epoch,
                        'model_state_dict': model.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'loss': best_acc,
                        'args': args
                    }, os.path.join(log_dir, 'best_model.pt'))

        print(f"epoch {epoch:03d} | train_loss {np.mean(total_loss):.3f} | "
              f"train acc {acc:.3f} | val acc {val_acc:.3f} | best {best_acc:.3f}")
        if args.log_subdir != "":
            sys.stdout.flush()
        scheduler.step()
def make_model(model_name: str, hypers: HyperParameters, save_folder: str) -> Model:
    model = get_model(hypers, save_folder, is_train=False)
    model.restore(name=model_name, is_train=False, is_frozen=False)
    return model
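# Usage sketch for make_model() above -- a minimal example, not part of the
# original source. The checkpoint name and folders are hypothetical
# placeholders; the predict() call mirrors the signature used in test() above.
model = make_model(model_name='model-best',     # hypothetical checkpoint name
                   hypers=hypers,               # a HyperParameters instance
                   save_folder='saved_models')  # folder containing the checkpoint
results = model.predict(dataset=dataset, test_batch_size=None,
                        max_num_batches=None, series=DataSeries.TEST)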
def forward_model(best_model, method):
    args = best_model['args']
    torch.cuda.set_device(args.gpu)
    set_seed(args)

    # load and preprocess dataset
    all_data = load_dataset(args)
    training = all_data[:int(len(all_data) * 0.7)]
    validation = all_data[int(len(all_data) * 0.7):int(len(all_data) * 0.8)]
    testing = all_data[int(len(all_data) * 0.8):]

    train_loader = DataLoader(training, batch_size=1000, shuffle=True, collate_fn=collate)
    val_loader = DataLoader(validation, batch_size=1000, shuffle=True, collate_fn=collate)
    test_loader = DataLoader(testing, batch_size=4000, shuffle=False, collate_fn=collate)

    dataset = (None, np.zeros((15, 15)), np.zeros((1, args.num_factors)),
               None, None, None, None)

    # create model
    model = get_model(dataset, args, mode='multilabel').cuda()

    g, labels, gt_adjs = next(iter(test_loader))
    model.load_state_dict(best_model['model_state_dict'])
    model.eval()

    # update the new graph
    model.g = g
    features = g.ndata['feat'].float().cuda()
    labels = labels.cuda()
    logits = model(features)  # .view(-1, n_class, n_latent)

    factors = model.get_factor()
    batch_g = factors[0]
    unbatch_g = dgl.unbatch(batch_g)

    ged_ins = compute_GED()
    total_ged = []
    total_factor_map = collections.defaultdict(list)
    sample_n = 0
    for gt_list, pred_g in tqdm.tqdm(zip(gt_adjs, unbatch_g)):
        # dgl graph to adj
        pred_list = generate_adj_factor_graph(pred_g)
        ged, factor_map = ged_ins.hungarian_match(gt_list, pred_list, sample_n)
        for edge_id in factor_map.keys():
            total_factor_map[edge_id] = total_factor_map[edge_id] + factor_map[edge_id]
        total_ged.append(ged / len(gt_list))
        sample_n += 1

    c_score = compute_consistant(total_factor_map)
    print(f" c_score {c_score:.3f} | ged: {np.mean(total_ged):.3f} $\pm$ {np.std(total_ged):.3f}")
train_data = get_dataset(args, args.train_img_transformers,
                         args.train_tnsr_transformers, 'train')
dev_data = get_dataset(args, args.test_img_transformers,
                       args.test_tnsr_transformers, 'dev')

if args.test_phase:
    if not args.train_phase:
        print("\nLoading dev data...")
        dev_data = get_dataset(args, args.test_img_transformers,
                               args.test_tnsr_transformers, 'dev')
    print("\nLoading test data...")
    test_data = get_dataset(args, args.test_img_transformers,
                            args.test_tnsr_transformers, 'test')

if args.forward_thru_convs:
    # Get model output dimensions, before classification
    args.rolled_size = get_rolled_out_size(args)

# Load model
model, optimizer = get_model(args)
if args.snapshot_path is not None:
    try:
        model, optimizer, args.lr, args.epoch_stats = load_model(
            args.snapshot_path, model, optimizer, args)
    except:
        print("\nError loading snapshot... Starting run from scratch.")
else:
    args.epoch_stats = None
print(model)

args.run_time = time.strftime(RESULTS_DATE_FORMAT, time.localtime())

# Train model and get statistics
model_stats = {}
if args.train_phase:
    print("\nBeginning Training Phase:")
def main(args):
    torch.cuda.set_device(args.gpu)
    set_seed(args)

    log_dir = make_log_dir(args.model_name, args.dataset, args.log_subdir)
    log_file = os.path.join(log_dir, "log.txt")
    sys.stdout = open(log_file, 'w')
    backup_model = f"cp -r ./models {log_dir}"
    os.system(backup_model)

    # load and preprocess dataset
    all_data = load_dataset(args)
    training = all_data[:int(len(all_data) * 0.7)]
    validation = all_data[int(len(all_data) * 0.7):int(len(all_data) * 0.8)]
    testing = all_data[int(len(all_data) * 0.8):]

    train_loader = DataLoader(training, batch_size=1000, shuffle=True, collate_fn=collate)
    val_loader = DataLoader(validation, batch_size=1000, shuffle=True, collate_fn=collate)
    test_loader = DataLoader(testing, batch_size=1000, shuffle=False, collate_fn=collate)

    dataset = (None, np.zeros((15, 15)), np.zeros((1, args.num_factors)),
               None, None, None, None)

    # create model
    model = get_model(dataset, args, mode='multilabel').cuda()
    print(model)

    # define loss func
    loss_fcn = torch.nn.BCEWithLogitsLoss()

    # define optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr,
                                 weight_decay=args.weight_decay)

    best_val_f1 = 0
    best_test_f1 = 0
    dur = []
    for epoch in range(args.epochs):
        for step, (g, labels, gt_adjs) in enumerate(train_loader):
            model.train()

            # update the new graph
            model.g = g
            t0 = time.time()

            features = g.ndata['feat'].float().cuda()
            labels = labels.cuda()
            logits = model(features)  # .view(-1, n_class, n_latent)
            loss = loss_fcn(logits, labels)

            if args.model_name == 'FactorGNN' and args.dis_weight > 0.0:
                losses = model.compute_disentangle_loss()
                dis_loss = model.merge_loss(losses) * args.dis_weight
                loss = loss + dis_loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            dur.append(time.time() - t0)

        val_micro_f1 = test(model, val_loader)
        test_micro_f1 = test(model, test_loader)  # evaluate on the held-out test split
        if val_micro_f1 > best_val_f1:
            best_val_f1 = val_micro_f1
            best_test_f1 = test_micro_f1
            torch.save(
                {
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': best_test_f1,
                    'args': args
                }, os.path.join(log_dir, 'best_model.pt'))

        print(f"time {np.mean(dur):.2f} epoch {epoch:03d} | " +
              f"val ({val_micro_f1:.4f}) | " +
              f"test ({test_micro_f1:.4f}) | " +
              f"best: {best_test_f1:.4f}")
        sys.stdout.flush()