def wrapper(*args, **kwargs):
    from src.utils.check import check_all
    check_all()
    import src.utils.logger as log
    log.init()
    func(*args, **kwargs)
    log.shutdown()
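# The fragment above is the inner `wrapper` of a decorator: `func` is a free variable
# supplied by an enclosing function that is not shown here. Below is a minimal sketch of
# what the surrounding decorator could look like; the outer name `with_env_checks` and the
# use of functools.wraps are illustrative assumptions, not code from this repo.
import functools


def with_env_checks(func):  # hypothetical decorator name
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        from src.utils.check import check_all
        check_all()                       # verify the environment before running
        import src.utils.logger as log
        log.init()                        # open the project logger
        func(*args, **kwargs)             # note: the original wrapper discards the return value
        log.shutdown()                    # flush and close log handlers

    return wrapper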
def setUp(self) -> None:
    logger.init()
def train_and_evaluate_model(arguments):
    """ Main Pipeline for training and cross-validation. """

    """ Setup result directory and enable logging to file in it """
    logger.init(arguments['outdir'], filename_prefix='train_cls',
                log_level=logging.INFO)  # keep logs at root dir.
    outdir = os.path.join(arguments['outdir'], 'train_cls')
    os.makedirs(outdir, exist_ok=True)
    logger.info('Arguments:\n{}'.format(pformat(arguments)))

    """ Set random seed throughout python """
    utils.set_random_seed(random_seed=arguments['random_seed'])

    """ Create tensorboard writer """
    tb_writer = initialize_tensorboard(outdir)

    """ Set device - cpu or gpu """
    device = torch.device(
        arguments['cuda_device'] if torch.cuda.is_available() else "cpu")
    logger.info(f'Using device - {device}')

    """ Load parameters for the Dataset """
    dataset = create_dataset(arguments['dataset_args'],
                             arguments['train_data_args'],
                             arguments['val_data_args'])

    """ Load Model with weights (if available) """
    model: torch.nn.Module = models_utils.get_model(
        arguments.get('model_args'), device,
        arguments['dataset_args']).to(device)

    """ Create optimizer and scheduler """
    optimizer = optimizer_utils.create_optimizer(model.parameters(),
                                                 arguments['optimizer_args'])
    lr_scheduler: _LRScheduler = optimizer_utils.create_scheduler(
        optimizer, arguments['scheduler_args'])

    """ Create loss function """
    logger.info(f"Loss weights {dataset.pos_neg_balance_weights()}")
    criterion = loss_utils.create_loss(arguments['loss_args'])

    """ Sample and view the inputs to the model """
    dataset.debug()

    """ Pipeline - loop over the dataset multiple times """
    max_validation_acc, best_validation_model_path = 0, None
    batch_index = 0
    nb_epochs = 1 if is_debug_mode() else arguments['nb_epochs']
    for epoch in range(nb_epochs):

        """ Train the model """
        logger.info(f"Training, Epoch {epoch + 1}/{nb_epochs}")
        train_dataloader = dataset.train_dataloader
        model.train()
        start = time.time()
        total, correct = 0, 0
        epoch_loss = 0
        for i, data in enumerate(tqdm(train_dataloader)):
            # get the inputs
            inputs, labels = data
            inputs = inputs.to(device)
            labels = labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward pass
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()

            tb_writer.save_scalar('batch_training_loss', loss.item(), batch_index)
            batch_index += 1

            epoch_loss += loss.item() * labels.size(0)
            total += labels.size(0)
            _, predicted = torch.max(outputs.data, 1)
            correct += (predicted == labels).sum().item()

            optimizer.step()

        epoch_loss = epoch_loss / total
        logger.info(f"Epoch = {epoch}, Train_loss = {epoch_loss}, "
                    f"Time taken = {time.time() - start} seconds.")
        logger.info(f"Train_accuracy = {100 * correct / total}")
        tb_writer.save_scalar('training_loss', epoch_loss, epoch)
        tb_writer.save_scalar('training_acc', 100 * correct / total, epoch)

        """ Validate the model """
        val_data_args = arguments['val_data_args']
        if val_data_args['validate_step_size'] > 0 and \
                epoch % val_data_args['validate_step_size'] == 0:
            model.eval()
            validation_dataloader = dataset.validation_dataloader
            logger.info(
                f"Validation, Epoch {epoch + 1}/{arguments['nb_epochs']}")
            val_loss, val_accuracy = evaluate_single_class(
                device, model, validation_dataloader, criterion)
            logger.info(f'validation images: {dataset.val_dataset_size}, '
                        f'val_auc: {val_accuracy} %% '
                        f'val_loss: {val_loss}')
            tb_writer.save_scalar('validation_acc', val_accuracy, epoch)
            tb_writer.save_scalar('validation_loss', val_loss, epoch)

            """ Save Model """
            if val_accuracy > max_validation_acc:
                max_validation_acc = val_accuracy
                if best_validation_model_path is not None:
                    os.remove(best_validation_model_path)
                best_validation_model_path = os.path.join(
                    outdir,
                    f'epoch_{epoch:04}-model-val_acc_{val_accuracy}.pth')
                torch.save(model.state_dict(), best_validation_model_path)
                logger.info(f'Model saved at: {best_validation_model_path}')

        if lr_scheduler:
            prev_lr = lr_scheduler.get_last_lr()
            lr_scheduler.step()
            if lr_scheduler.get_last_lr() != prev_lr:
                logger.warn(
                    f'Updated LR from {prev_lr} to {lr_scheduler.get_last_lr()}')

    logger.info('Finished Training')
    logger.info(f'Max Validation accuracy is {max_validation_acc}')

    """ Create a symbolic link to the best model at a static path 'best_model.pth' """
    symlink_path = os.path.join(outdir, 'best_model.pth')
    if os.path.islink(symlink_path):
        os.unlink(symlink_path)
    os.symlink(best_validation_model_path.rsplit('/')[-1], symlink_path)
    logger.info(f'Best Model saved at: {best_validation_model_path} '
                f'and symlinked to {symlink_path}')

    """ Evaluate model on test set """
    model.load_state_dict(torch.load(best_validation_model_path), strict=False)
    test_dataloader = dataset.test_dataloader
    test_loss, test_accuracy = evaluate_single_class(device, model,
                                                     test_dataloader, criterion)
    logger.info(f'Accuracy of the network on the {dataset.test_dataset_size} '
                f'test images: {test_accuracy} %%')
    return test_loss, test_accuracy
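# For orientation, a sketch of the `arguments` dictionary that train_and_evaluate_model()
# expects, inferred only from the keys accessed above. Every concrete value below is a
# placeholder and not a configuration shipped with this repo.
example_arguments = dict(
    outdir='./results',                                 # hypothetical output directory
    random_seed=42,
    cuda_device='cuda:0',
    nb_epochs=50,
    dataset_args=dict(name='cifar10'),                  # forwarded to create_dataset / get_model
    train_data_args=dict(batch_size=32, shuffle=True),
    val_data_args=dict(batch_size=32, shuffle=False,
                       validate_step_size=1),           # validate after every epoch
    model_args=dict(name='resnet8'),
    optimizer_args=dict(name='sgd', lr=0.1),
    scheduler_args=dict(name='step_lr'),
    loss_args=dict(name='cross_entropy'),
)

test_loss, test_accuracy = train_and_evaluate_model(example_arguments)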
def dump_saliency_data():
    """ Main pipeline for extracting input images and attribution maps per dataset split. """
    parser = argparse.ArgumentParser(description="config")
    parser.add_argument("--config",
                        type=str,
                        default="config/roar_cifar10_resnet8.yml",
                        help="Configuration file to use.")
    args = parser.parse_args()

    with open(args.config) as fp:
        cfg = yaml.load(fp, Loader=Loader)
    roar_core.validate_configuration(cfg, validate_attribution_methods=True)

    # Common configuration
    dataset_args = cfg['data']
    train_data_args = dict(
        batch_size=4,
        shuffle=False,
        enable_augmentation=False,
    )
    assert not train_data_args['enable_augmentation'], \
        'Augmentation of dataset should be disabled for generating dataset'

    val_data_args = dict(
        batch_size=4,
        shuffle=False,
    )
    # Shuffling should be off
    assert not val_data_args['shuffle']

    arguments = dict(
        dataset_args=dataset_args,
        train_data_args=train_data_args,
        val_data_args=val_data_args,
        model_args=cfg['extract_cams']['model'],
        outdir=cfg['outdir'],
    )

    """ Setup result directory """
    outdir = os.path.join(arguments.get("outdir"), 'extract_cams')
    logger.init(outdir, filename_prefix='extract_cams',
                log_level=logging.INFO)  # keep logs at root dir.
    logger.info('Arguments:\n{}'.format(pformat(arguments)))

    """ Set random seed throughout python """
    utils.set_random_seed(random_seed=random.randint(0, 1000))

    """ Set device - cpu or gpu """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    logger.info(f'Using device - {device}')

    """ Load parameters for the Dataset """
    dataset = create_dataset(dataset_args, train_data_args, val_data_args)

    """ Sample and view the inputs to the model """
    dataset.debug()

    dataloaders = [dataset.train_dataloader,
                   dataset.validation_dataloader,
                   dataset.test_dataloader]
    dataset_modes = ['train', 'validation', 'test']

    # Some datasets do not have a uniform image size, so a crop is needed before computing
    # attribution maps. Since the same input image must be reused when retraining the model
    # with different attribution maps, the applied transform has to be deterministic: a
    # center crop is fine, but RandomCrop/RandomScale are not.
    save_input_images = True
    save_attribution = True
    assert save_input_images or save_attribution, \
        'Either save input images or save attribution flag should be enabled.'

    attribution_methods = cfg['extract_cams']['attribution_methods']
    logger.info(f'Computing attribution maps for {attribution_methods}')

    # Save attribution maps in ./outdir/[train, validation, test]/[input/AttributionName]/Class/ImageIndex.png
    for attribution_method in attribution_methods:
        for dataloader, dataset_mode in zip(dataloaders, dataset_modes):
            """ Load Model with weights (if available) """
            model: torch.nn.Module = model_utils.get_model(
                arguments.get('model_args'), device,
                arguments['dataset_args']).to(device)

            """ Create cropped dataset and attribution directories """
            # The cropped dataset only needs to be saved once. Keeping another copy of the
            # dataset is not optimal, but we preferred the simpler design of having parallel
            # attribution and image datasets.
            if save_input_images:
                # The classification datasets (CIFAR/Birdsnap) are written in
                # torchvision.datasets.ImageFolder format.
                images_output_dirs = [os.path.join(outdir, f'{dataset_mode}/input/', str(cls))
                                      for cls in dataset.classes]
                for images_output_dir in images_output_dirs:
                    os.makedirs(images_output_dir, exist_ok=True)
                counter = 0

            # Create labelled attribution folders
            attribution_output_dirs = [
                os.path.join(outdir, f'{dataset_mode}/{attribution_method["name"]}', str(cls))
                for cls in dataset.classes]
            for attribution_output_dir in attribution_output_dirs:
                os.makedirs(attribution_output_dir, exist_ok=True)

            counters = defaultdict(int)

            """ Thank god, finally let CAM extraction begin """
            logger.info(f"Generating images and attribution for {attribution_method} "
                        f"in {dataset_mode} split.")
            model.eval()
            for i, data in enumerate(tqdm(dataloader)):
                inputs, labels = data
                inputs = inputs.to(device)
                labels = labels.to(device)
                outputs = model(inputs)
                _, max_prob_indices = torch.max(outputs.data, 1)

                # ToDo: Add support for methods that can compute attributions for a whole batch
                # ToDo: Check width and height
                for preprocessed_image, max_prob_index, label in zip(inputs, max_prob_indices, labels):
                    """ Save CAMs and input images. """
                    if save_input_images:
                        # Denormalize the image and save it
                        rgb_image = dataset.denormalization_transform(preprocessed_image.cpu())
                        rgb_image = (torch.clamp(rgb_image, 0.0, 1.0).numpy() * 255.0).astype('uint8')
                        skimage.io.imsave(f'{images_output_dirs[label]}/'
                                          f'{str(counters[label.item()]).zfill(5)}.png',
                                          rgb_image.transpose(1, 2, 0),
                                          check_contrast=False)

                    if save_attribution:
                        attribution_map = attribution_loader.generate_attribution(
                            model, preprocessed_image.unsqueeze(0), max_prob_index,
                            attribution_method)

                        # Save in attribution_output_dir as a uint8 image
                        percentiled_image = convert_float_to_percentiled_3channel_image(attribution_map)
                        skimage.io.imsave(f'{attribution_output_dirs[label]}/'
                                          f'{str(counters[label.item()]).zfill(5)}.png',
                                          percentiled_image.transpose(1, 2, 0),
                                          check_contrast=False)

                    counters[label.item()] += 1

        save_input_images = False  # No need to resave input images for the next attribution method
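# A rough sketch of the configuration that dump_saliency_data() expects after yaml.load,
# based only on the keys accessed above. The real config/roar_cifar10_resnet8.yml likely
# carries more fields, and all values here are placeholders.
example_cfg = {
    'outdir': './results/roar_cifar10_resnet8',
    'data': {'name': 'cifar10'},
    'extract_cams': {
        'model': {'name': 'resnet8'},
        # each attribution method needs at least a "name"; it is used as the output folder name
        'attribution_methods': [{'name': 'GradCAM'}],
    },
}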
from elasticsearch import Elasticsearch
from elasticsearch import helpers
import jsonlines
from pathlib import Path
import gzip
import src.utils.logger as logger
import re
import pandas as pd

log = logger.init("./log/create_corpus.log")

data_path = "./semanticscholar"

# Recover the list of already-processed archive files from the previous run's log.
processed = pd.read_csv("./log/create_corpus.log", sep=' ', header=None)[9]
processed = [re.sub('semanticscholar/', '', obj) for obj in processed]


def doc_generator(reader):
    for doc in reader.iter(type=dict, skip_invalid=True):
        author_names = []
        author_ids = []
        for obj in doc.get('authors'):
            author_ids.extend(obj.get('ids'))
            author_names.append(obj.get('name'))
        yield {
            "_index": 'semanticscholar',
            "_type": "document",
            "_id": doc.get('id'),
            "title": doc.get('title'),
            "paperAbstract": doc.get("paperAbstract"),
            "entities": doc.get("entities"),
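# The script is truncated here: the rest of the yielded document and the indexing loop are
# missing. For context, this is a minimal sketch of how doc_generator() is typically consumed
# with helpers.bulk, assuming the Semantic Scholar dump is a directory of gzipped JSON-lines
# shards. The loop below and the skipping of already-`processed` shards are assumptions based
# on the imports and variables above, not recovered code.
es = Elasticsearch()  # assumes a cluster reachable on the default host/port

for shard in sorted(Path(data_path).glob("*.gz")):
    if shard.name in processed:
        continue  # shard already indexed according to the previous run's log
    with gzip.open(shard, mode="rt") as fp:
        reader = jsonlines.Reader(fp)
        helpers.bulk(es, doc_generator(reader))  # stream documents into the index
    log.info(f"finished semanticscholar/{shard.name}")  # assumes the logger exposes .info()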