Example No. 1
def load_data(data_dir):
    """
    Function that takes a folder, finds all .jpg files inside the folder,
    and creates a dataframe.
    """
    # Reproducibility
    myutils.myseed(seed=42)

    # Get the image paths
    filenames = myutils.run_fast_scandir(data_dir, [".jpg"])
    df = pd.DataFrame(data=filenames, columns=['filenames'])

    # Get the label from nth folder starting from the parent:
    outlevel = 4  # fname = '/scratch/s181423_data/data_bin/label/image.jpg'
    df['label'] = df['filenames'].apply(lambda x: x.split('/')[outlevel])

    # Get the id from the basename
    df['id'] = df['filenames'].apply(lambda x: os.path.basename(x))

    # Get label as one hot encoded values
    df = df.set_index(['id', 'filenames'])
    df['label'] = df['label'].astype('category')
    df = pd.get_dummies(df, prefix='', prefix_sep='')
    df = df.reset_index()

    # Save the data as a .csv file
    df.to_csv(f'{data_dir}.csv', index=False)
    logging_data_process.info(f'Saved: {data_dir}.csv')
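
A minimal, self-contained sketch of the one-hot step used above: the identifiers are moved into the index so pd.get_dummies only encodes the label column (the toy paths and labels below are illustrative, not from the project).

import pandas as pd

df = pd.DataFrame({
    'id': ['a.jpg', 'b.jpg', 'c.jpg'],
    'filenames': ['/x/skin/a.jpg', '/x/nails/b.jpg', '/x/skin/c.jpg'],
    'label': ['skin', 'nails', 'skin'],
})

# Keep id and filenames out of the encoding by moving them into the index
df = df.set_index(['id', 'filenames'])
df['label'] = df['label'].astype('category')
df = pd.get_dummies(df, prefix='', prefix_sep='')  # one indicator column per label
df = df.reset_index()
print(df.columns.tolist())  # ['id', 'filenames', 'nails', 'skin']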
Example No. 2
def get_loaders(dfs, size=100, batch_size=1, num_workers=1):
    """
    Function that takes a dictionary of dataframes and
    returns two dictionaries: PyTorch dataloaders and dataset sizes.
    """
    # Reproducibility
    myutils.myseed(seed=42)

    # Custom pytorch dataloader for this dataset
    class Derm(Dataset):
        """
        Read a pandas dataframe with
        images paths and labels
        """
        def __init__(self, df, transform=None):
            self.df = df
            self.transform = transform

        def __len__(self):
            return len(self.df)

        def __getitem__(self, index):
            try:
                # Load image data and get label
                X = Image.open(self.df['image_path'][index]).convert('RGB')
                y = torch.tensor(int(self.df['label_code'][index]))
                print(f"{self.df['image_path'][index]}\t{self.df['label_code'][index]}")
            except IOError as err:
                pass

            if self.transform:
                X = self.transform(X)

            return X, y

    # ImageNet statistics
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]

    # Transforms
    data_transforms = {'train' : transforms.Compose([transforms.Resize(size),
                                              transforms.CenterCrop((size,size)),
                                              transforms.RandomHorizontalFlip(),
                                              transforms.ToTensor(),
                                              transforms.Normalize(mean,std)]),
                       'val' : transforms.Compose([transforms.Resize(size),
                                              transforms.CenterCrop((size,size)),
                                              transforms.ToTensor(),
                                              transforms.Normalize(mean,std)])}

    # Sets
    image_datasets = {x: Derm(dfs[x], transform=data_transforms[x]) for x in dfs.keys()}
    # Sizes
    dataset_sizes = {x: len(image_datasets[x]) for x in dfs.keys()}
    # Loaders
    dataloaders = {x: DataLoader(image_datasets[x], batch_size=batch_size, num_workers=num_workers) for x in dfs.keys()}

    return dataloaders, dataset_sizes
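
A sketch of how the Derm dataset and transform pipeline above fit together, using a dummy image written to a temporary folder; the image_path and label_code column names follow the convention used above, everything else is illustrative.

import os
import tempfile

import pandas as pd
import torch
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

# One dummy RGB image on disk
tmp_dir = tempfile.mkdtemp()
path = os.path.join(tmp_dir, 'dummy.jpg')
Image.new('RGB', (128, 128), color=(120, 30, 30)).save(path)
df = pd.DataFrame({'image_path': [path], 'label_code': [0]})

class ToyDerm(Dataset):
    def __init__(self, df, transform=None):
        self.df, self.transform = df, transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        X = Image.open(self.df['image_path'][index]).convert('RGB')
        y = torch.tensor(int(self.df['label_code'][index]))
        return (self.transform(X) if self.transform else X), y

size = 100
tf = transforms.Compose([transforms.Resize(size),
                         transforms.CenterCrop((size, size)),
                         transforms.ToTensor(),
                         transforms.Normalize([0.485, 0.456, 0.406],
                                              [0.229, 0.224, 0.225])])
loader = DataLoader(ToyDerm(df, transform=tf), batch_size=1, num_workers=0)
X, y = next(iter(loader))
print(X.shape, y)  # torch.Size([1, 3, 100, 100]) tensor([0])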
def get_all_weights(data_dir, folds):
    """
    Function that creates a dictionary with the weights for each training fold.
    """
    # Reproducibility
    myutils.myseed(seed=42)
    weights_dict = {}
    for fold in range(1, folds+1):
        path = os.path.join(data_dir, f'train{fold}.csv')
        train = pd.read_csv(path)
        weights = myutils.get_weights(train)
        weights_dict[f'weights{fold}'] = weights

    # Save weights_dict to a .json file
    with open('dicts/weights_dict.json', 'w') as f:
        f.write(json.dumps(weights_dict))
        logging_data_process.info('Saved: dicts/weights_dict.json')
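
myutils.get_weights is not shown in this listing; assuming it derives one weight per class from the training labels, a common inverse-frequency version looks roughly like the hypothetical sketch below (label_code is the column name used in the other examples).

import pandas as pd

def get_weights(train):
    # Hypothetical inverse-frequency weights: rare classes get larger values
    counts = train['label_code'].value_counts().sort_index()
    weights = len(train) / (len(counts) * counts)
    return [float(w) for w in weights]  # JSON-serialisable, one weight per class

train = pd.DataFrame({'label_code': [0, 0, 0, 1, 2, 2]})
print(get_weights(train))  # [0.666..., 2.0, 1.0]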
def load_data(data_dir):
    """
    Function that takes a folder, finds all .jpg files inside the folder,
    and creates a dataframe.
    """
    # Reproducibility
    myutils.myseed(seed=42)

    # Get the image paths
    filenames = myutils.run_fast_scandir(data_dir, [".jpg"])
    df = pd.DataFrame(data=filenames, columns=['filenames'])

    # Get the label from nth folder starting from the parent:
    outlevel = 4  # fname = '/scratch/s181423/data/label/image.jpg'
    df['label'] = df['filenames'].apply(lambda x: x.split('/')[outlevel])

    # Resample the minority classes so they appear in both training and testing
    counts_df = pd.DataFrame(df['label'].value_counts())
    labels_with_one_example = list(counts_df[counts_df['label'] < 2].index)
    duplicates_df = df[df['label'].isin(labels_with_one_example)]
    # Quintuplicate these rows: with 5 copies, a stratified 80/20 split
    # keeps at least one copy in train (4) and in test (1)
    df_copy = duplicates_df
    df = pd.concat([df, df_copy, df_copy, df_copy, df_copy])

    # Get the id from the basename
    df['id'] = df['filenames'].apply(lambda x: os.path.basename(x))

    # Get label as one hot encoded values
    df = df.set_index(['id','filenames'])
    df['label'] = df['label'].astype('category')
    mapping = dict(enumerate(df['label'].cat.categories))
    df['label'] = pd.Categorical(df['label']).codes
    #df = pd.get_dummies(df, prefix='', prefix_sep='')
    df = df.reset_index()

    # Save the data as a .csv file
    df.to_csv(f'{data_dir}.csv', index=False)
    logging_data_process.info(f'Saved: {data_dir}.csv')

    # Save the mappings as a .json file
    with open('dicts/mapping.json', 'w') as f:
        f.write(json.dumps(mapping))
        logging_data_process.info('Saved: dicts/mapping.json')
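
The mapping saved above makes the integer label codes reversible; a small sketch of the round trip, including the fact that json turns the integer keys into strings.

import json

import pandas as pd

labels = pd.Series(['skin', 'nails', 'skin', 'hair']).astype('category')
mapping = dict(enumerate(labels.cat.categories))  # {0: 'hair', 1: 'nails', 2: 'skin'}
codes = pd.Categorical(labels).codes              # [2, 1, 2, 0]

# Keys come back as strings after a json round trip, so cast them to int
loaded = {int(k): v for k, v in json.loads(json.dumps(mapping)).items()}
print([loaded[int(c)] for c in codes])  # ['skin', 'nails', 'skin', 'hair']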
Example No. 5
def load_data(data_dir):
    """
    Function that takes a folder, finds all .jpg files inside the folder,
    and creates a dataframe.
    """
    # Reproducibility
    myutils.myseed(seed=42)

    # Get the image paths
    filenames = myutils.run_fast_scandir(data_dir, [".jpg"])
    df1 = pd.DataFrame(data=filenames, columns=['image_path'])
    df1['image_id'] = df1['image_path'].apply(
        lambda x: os.path.splitext(os.path.basename(x))[0])
    df1 = df1.set_index('image_id')

    # Get the labels
    fname = os.path.join(data_dir, 'labels.csv')
    df2 = pd.read_csv(fname)
    df2 = df2.set_index('image')

    # Do not move this function: it closes over df2, which is defined just above
    def get_disease(row):
        for c in df2.columns:
            if row[c] == 1:
                return c

    df2 = df2.apply(get_disease, axis=1).to_frame(name='label')
    df = pd.merge(df1, df2, left_index=True, right_index=True)
    df['label'] = df['label'].astype('category')
    mapping = dict(enumerate(df['label'].cat.categories))
    df['label_code'] = pd.Categorical(df['label']).codes

    # Save the data as a .csv file
    df.to_csv(f'{data_dir}.csv', index=False)
    logging_data_process.info(f'Saved: {data_dir}.csv')

    # Save the mapping as a .json file
    with open(f'{data_dir}.json', 'w') as f:
        f.write(json.dumps(mapping))
        logging_data_process.info(f'Saved: {data_dir}.json')
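
The get_disease helper above collapses one-hot disease columns back into a single label; on a toy labels table the row-wise scan gives the same result as idxmax (the disease names below are made up for illustration).

import pandas as pd

df2 = pd.DataFrame({'image': ['img1', 'img2', 'img3'],
                    'melanoma': [1, 0, 0],
                    'nevus': [0, 1, 0],
                    'keratosis': [0, 0, 1]}).set_index('image')

# Row-wise scan, as in get_disease above ...
by_scan = df2.apply(lambda row: next(c for c in df2.columns if row[c] == 1),
                    axis=1).to_frame(name='label')
# ... which is equivalent to taking the column holding the maximum per row
by_idxmax = df2.idxmax(axis=1).to_frame(name='label')
print(by_scan.equals(by_idxmax))  # True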
Example No. 6
def eval(file, dataloaders, dataset_sizes, net):
    """
    Evaluate a net.
    """
    # Reproducibility
    myutils.myseed(seed=42)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load network and restore settings from .tar file
    net = net.to(device)
    fname = f'{args.restore_file}.tar'
    restore_path = os.path.join(args.model_dir, fname)
    checkpoint = torch.load(restore_path)
    net.load_state_dict(checkpoint['net_state_dict'])
    net.eval()

    # Validation phase
    phase = 'val'
    with torch.no_grad():
        #indexes, predictions, probabilities, all_probabilities, in_labels, in_targets = [],[],[],[],[],[]
        indexes, predictions, probabilities, all_probabilities, in_labels = [],[],[],[],[]
        for index, inputs, labels in tqdm(dataloaders[phase]):
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = net(inputs)

            #_, targets = torch.max(labels, 1)
            probs, preds = torch.max(outputs, 1)

            indexes.extend(index.cpu().detach().numpy())
            all_probabilities.extend(outputs.cpu().detach().numpy())
            probabilities.extend(probs.cpu().detach().numpy())
            predictions.extend(preds.cpu().detach().numpy())
            in_labels.extend(labels.cpu().detach().numpy())
            #in_targets.extend(targets.cpu().detach().numpy())

    #return indexes, probabilities, predictions, all_probabilities, in_labels, in_targets
    return indexes, probabilities, predictions, all_probabilities, in_labels
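
The parallel lists returned by eval can be collected into one predictions table; a minimal sketch with made-up values (the column names are illustrative).

import pandas as pd

# Stand-ins for the lists returned by eval()
indexes = [0, 1, 2]
in_labels = [1, 0, 0]
predictions = [1, 0, 1]
probabilities = [2.3, 1.1, 0.7]  # max logit per sample

results = pd.DataFrame({'index': indexes,
                        'label': in_labels,
                        'prediction': predictions,
                        'max_output': probabilities})
results['correct'] = results['label'] == results['prediction']
print('accuracy:', results['correct'].mean())  # accuracy: 0.666...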
def train_eval(fold, dataloaders, dataset_sizes, net, criterion, optimizer,
               scheduler, net_name, num_epochs):
    """
    Train and evaluate a net.
    """
    # Initialize logs
    fname = os.path.join(args.model_dir, f'train{fold}.log')
    logging_train = myutils.setup_logger(fname)
    fname = os.path.join(args.model_dir, f'lr{fold}.log')
    logging_lr = myutils.setup_logger(fname)
    # Reproducibility
    myutils.myseed(seed=42)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load initial weights
    net = net.to(device)
    #best_net_wts = copy.deepcopy(net.state_dict())
    best_acc, epoch = 0.0, 1

    # Initialize .tar files to save settings
    fname = f'last{fold}.tar'
    last_path = os.path.join(args.model_dir, fname)
    fname = f'best{fold}.tar'
    best_path = os.path.join(args.model_dir, fname)

    # To resume training for more epochs
    if args.resume:
        try:
            # Load last settings from .tar file
            last_checkpoint = torch.load(last_path)
            net.load_state_dict(last_checkpoint['net_state_dict'])
            optimizer.load_state_dict(last_checkpoint['optimizer_state_dict'])
            epoch = last_checkpoint[
                'epoch'] + 1  # Since the last epoch was saved we start with the next one
            logging_process.info(
                f'Model: {args.model_dir}\tLast epoch saved: {epoch-1}, resuming training from epoch: {epoch}'
            )

            # Load best settings from .tar file
            best_checkpoint = torch.load(best_path)
            #best_net_wts = best_checkpoint['net_state_dict']
            best_acc = best_checkpoint['acc']

        except FileNotFoundError as err:
            # This happens with cross-validation folds: if training was
            # interrupted on fold 1, the best checkpoint for fold 2 does not
            # exist yet, so we just log the error and continue
            logging_process.info(f'Model: {args.model_dir}\tError: {err}')

    # TRAINING LOOP
    for epoch in range(epoch, num_epochs + 1):

        print(f'Epoch {epoch}/{num_epochs}')
        logging_train.info(f'Epoch {epoch}/{num_epochs}')

        # Each epoch has a training phase and a validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                net.train()  # Set net to training mode
                mylr_value = optimizer.param_groups[0]['lr']
                logging_lr.info(f'Epoch {epoch}\tlr: {mylr_value}')
            else:
                net.eval()  # Set net to evaluate mode

            # Track statistics
            running_loss = 0.0
            running_corrects = 0

            # Iterate over data
            for index, inputs, labels in tqdm(dataloaders[phase]):
                inputs = inputs.to(device)
                labels = labels.to(device)

                # Zero the parameter gradients
                optimizer.zero_grad()

                # Forward
                # Track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = net(inputs)
                    _, targets = torch.max(labels, 1)
                    _, preds = torch.max(outputs, 1)

                    #if net_name.startswith('vgg16_ft_no_soft'):
                    #    outputs = torch.reshape(outputs, (-1,)) # reshape added for binary
                    #    loss = criterion(outputs, targets.float()) # float added for binary

                    #else:
                    loss = criterion(outputs, targets)

                    # Backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # Batch statistics
                running_loss += loss.detach().item() * inputs.size(
                    0)  # This is batch loss
                running_corrects += torch.sum(
                    preds == targets.data)  # This is batch accuracy

            # efficientnetb
            if net_name.startswith('efficientnetb'):
                if phase == 'train':
                    scheduler.step()

            # inceptionv
            if net_name.startswith('inceptionv'):
                if phase == 'train':
                    if (epoch % 2) == 0:
                        scheduler.step()

            # Epoch statistics
            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss,
                                                       epoch_acc))
            logging_train.info('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            if phase == 'val':

                # Save last settings to .tar file
                torch.save(
                    {
                        'epoch': epoch,
                        'net_state_dict': net.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'loss': epoch_loss
                    }, last_path)

                if epoch_acc > best_acc:
                    best_acc = epoch_acc
                    #best_net_wts = net.state_dict()

                    # Save best settings to .tar file
                    torch.save(
                        {
                            'epoch': epoch,
                            'net_state_dict': net.state_dict(),  #best_net_wts
                            'optimizer_state_dict': optimizer.state_dict(),
                            'loss': epoch_loss,
                            'acc': best_acc
                        },
                        best_path)

                    # Save best settings to .json file
                    best_metrics = {
                        f'loss{fold}': epoch_loss,
                        f'acc{fold}': best_acc.item()
                    }
                    fname = os.path.join(args.model_dir, f'metrics{fold}.json')
                    with open(fname, 'w') as f:
                        f.write(json.dumps(best_metrics))

                # vgg
                if net_name.startswith('vgg'):
                    scheduler.step(epoch_acc)

                # resnet
                if net_name.startswith('resnet'):
                    scheduler.step(epoch_loss)

    print('Best val Acc: {:.4f}'.format(best_acc))
    logging_process.info('Model: {}\tFold: {}\tBest val Acc: {:.4f}'.format(
        args.model_dir, fold, best_acc))
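
The checkpointing pattern above (a dict holding the epoch, the model state and the optimizer state, written with torch.save and reloaded to resume) can be exercised on a tiny model; a self-contained sketch.

import torch
import torch.nn as nn
import torch.optim as optim

net = nn.Linear(4, 2)
optimizer = optim.SGD(net.parameters(), lr=0.01)

# Save the "last" checkpoint, as done in the validation phase above
torch.save({'epoch': 3,
            'net_state_dict': net.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': 0.42}, 'last.tar')

# Resume: restore the weights and optimizer state, continue from the next epoch
checkpoint = torch.load('last.tar')
net.load_state_dict(checkpoint['net_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch'] + 1
print(f'resuming training from epoch: {epoch}')  # 4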
Example No. 8
def data_split(data_dir, folds):
    """
    Function that takes a data_dir and a number of folds,
    and splits images in data_dir into
    training(80%) and testing(20%) data.

    For fit.py training data is further splitted into
    training and validation sets.

    If cross validation is needed, training data is also splitted into
    train and validation folds.
    """
    # Reproducibility
    myutils.myseed(seed=42)
    seed = 42

    # Load the data with image paths and labels
    df = pd.read_csv(f'{data_dir}.csv')
    df = df.sample(frac=1, random_state=seed).reset_index(drop=True)
    logging_data_process.info(f"all data size:{len(df)}")

    # Test
    train_val, test = train_test_split(df,
                                       test_size=0.2,
                                       random_state=seed,
                                       shuffle=True)
    train_val, test = train_val.reset_index(drop=True), test.reset_index(
        drop=True)
    test.to_csv(os.path.join(data_dir, 'test.csv'), index=False)
    logging_data_process.info(f"test size:{len(test)}")
    logging_data_process.info(f"train_val size:{len(train_val)}")
    logging_data_process.info(f"Saved: {os.path.join(data_dir, 'test.csv')}")

    # Train and validation
    #X = train_val[['id','filenames']]
    #y = train_val.iloc[:,2:].apply(lambda x: np.argmax(x), axis=1)  # argmax is necessary for stratification
    # Stratification is not used here because some classes have only 1 example
    train, val = train_test_split(train_val,
                                  test_size=0.2,
                                  random_state=seed,
                                  shuffle=True)
    train, val = train.reset_index(drop=True), val.reset_index(drop=True)
    train.to_csv(os.path.join(data_dir, 'train.csv'), index=False)
    val.to_csv(os.path.join(data_dir, 'val.csv'), index=False)
    logging_data_process.info(f"train size:{len(train)}")
    logging_data_process.info(f"Saved: {os.path.join(data_dir, 'train.csv')}")
    logging_data_process.info(f"val size:{len(val)}")
    logging_data_process.info(f"Saved: {os.path.join(data_dir, 'val.csv')}")

    # Cross validation folds
    if folds > 1:
        logging_data_process.info(f'Folds: {folds}')
        X = train_val[['id', 'filenames']]
        y = train_val.iloc[:, 2:].apply(
            lambda x: np.argmax(x),
            axis=1)  # argmax is necessary for stratification
        skf = StratifiedKFold(n_splits=folds, random_state=seed, shuffle=True)
        fold = 0
        for train_idx, val_idx in skf.split(X, y):
            fold += 1
            train_idx, val_idx = list(train_idx), list(val_idx)
            train, val = train_val.iloc[train_idx, :], train_val.iloc[
                val_idx, :]
            train, val = train.reset_index(drop=True), val.reset_index(
                drop=True)
            train.to_csv(os.path.join(data_dir, f'train{fold}.csv'),
                         index=False)
            val.to_csv(os.path.join(data_dir, f'val{fold}.csv'), index=False)
            logging_data_process.info(f"train{fold} size:{len(train)}")
            logging_data_process.info(
                f"Saved: {os.path.join(data_dir, f'train{fold}.csv')}")
            logging_data_process.info(f"val{fold} size:{len(val)}")
            logging_data_process.info(
                f"Saved: {os.path.join(data_dir, f'val{fold}.csv')}")
Example No. 9
def train_eval(fold, dataloaders, dataset_sizes, net, criterion, optimizer,
               scheduler, net_name, num_epochs):
    """
    Train and evaluate a net.
    """
    # Initialize logs
    fname = os.path.join(args.model_dir, f'train{fold}.log')
    logging_train = myutils.setup_logger(fname)
    fname = os.path.join(args.model_dir, f'lr{fold}.log')
    logging_lr = myutils.setup_logger(fname)
    fname = os.path.join(args.model_dir, f'bins{fold}.log')
    logging_bins = myutils.setup_logger(fname)
    fname = os.path.join(args.model_dir, f'cats{fold}.log')
    logging_cats = myutils.setup_logger(fname)

    # Reproducibility
    myutils.myseed(seed=42)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load initial weights
    net = net.to(device)
    #best_net_wts = copy.deepcopy(net.state_dict())
    best_acc, epoch = 0.0, 1

    # Initialize .tar files to save settings
    fname = f'last{fold}.tar'
    last_path = os.path.join(args.model_dir, fname)
    fname = f'best{fold}.tar'
    best_path = os.path.join(args.model_dir, fname)

    # To resume training for more epochs
    if args.resume:
        try:
            # Load last settings from .tar file
            last_checkpoint = torch.load(last_path)
            net.load_state_dict(last_checkpoint['net_state_dict'])
            optimizer.load_state_dict(last_checkpoint['optimizer_state_dict'])
            epoch = last_checkpoint[
                'epoch'] + 1  # Since the last epoch was saved we start with the next one
            logging_process.info(
                f'Model: {args.model_dir}\tLast epoch saved: {epoch-1}, resuming training from epoch: {epoch}'
            )

            # Load best settings from .tar file
            best_checkpoint = torch.load(best_path)
            #best_net_wts = best_checkpoint['net_state_dict']
            best_acc = best_checkpoint['acc']

        except FileNotFoundError as err:
            # This happens with cross-validation folds: if training was
            # interrupted on fold 1, the best checkpoint for fold 2 does not
            # exist yet, so we just log the error and continue
            logging_process.info(f'Model: {args.model_dir}\tError: {err}')

    # TRAINING LOOP
    for epoch in range(epoch, num_epochs + 1):

        print(f'Epoch {epoch}/{num_epochs}')

        # To track values in each epoch
        tloss, tacc, vloss, vacc = '', '', '', ''
        tloss0, tacc0, vloss0, vacc0 = '', '', '', ''
        tloss1, tacc1, vloss1, vacc1 = '', '', '', ''

        # Each epoch has a training phase and a validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                net.train()  # Set net to training mode

                # Track learning rate for plot
                mylr_value = optimizer.param_groups[0]['lr']
                logging_lr.info(f'Epoch {epoch}\tlr: {mylr_value}')

            else:
                net.eval()  # Set net to evaluate mode

            # Track statistics
            running_loss0 = 0.0
            running_loss1 = 0.0

            running_corrects0 = 0
            running_corrects1 = 0

            # Iterate over data
            for index, inputs, bins_labels, cats_labels in tqdm(
                    dataloaders[phase]):
                inputs = inputs.to(device)
                bins_labels = bins_labels.to(device)
                cats_labels = cats_labels.to(device)

                # Zero the parameter gradients
                optimizer.zero_grad()

                # Forward
                # Track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs_bins, outputs_cats = net(inputs)
                    #outputs_bins = torch.reshape(outputs_bins, (-1,)) # reshape added for binary
                    outputs_bins = outputs_bins.to(device)
                    outputs_cats = outputs_cats.to(device)

                    #loss0 = criterion[0](outputs_bins, bins_labels.float())# float added for binary
                    loss0 = criterion[0](outputs_bins, bins_labels)
                    loss1 = criterion[1](outputs_cats, cats_labels)
                    loss0 = loss0 * (2 / 307)
                    loss1 = loss1 * (305 / 307)

                    #loss0 = loss0 * (2/306)
                    #loss1 = loss1 * (304/306)

                    # Backward + optimize only if in training phase
                    if phase == 'train':
                        loss = (loss0 + loss1) / 2
                        loss.backward()
                        optimizer.step()

                # Batch statistics
                running_loss0 += loss0.detach().item() * inputs.size(0)
                running_loss1 += loss1.detach().item() * inputs.size(0)

                #running_corrects0 += torch.sum(torch.round(outputs_bins) == bins_labels.data)
                running_corrects0 += torch.sum(
                    torch.max(outputs_bins, 1)[1] == bins_labels.data)
                running_corrects1 += torch.sum(
                    torch.max(outputs_cats, 1)[1] == cats_labels.data)

            # efficientnetb
            #if net_name.startswith('efficientnetb'):
            #    if phase == 'train':
            #        scheduler.step()

            # inceptionv
            #if net_name.startswith('inceptionv'):
            #    if phase == 'train':
            #        if (epoch % 2) == 0:
            #            scheduler.step()

            # Epoch statistics
            epoch_loss0 = running_loss0 / dataset_sizes[phase]
            epoch_loss1 = running_loss1 / dataset_sizes[phase]

            epoch_loss = epoch_loss0 + epoch_loss1

            epoch_acc0 = (running_corrects0.double() /
                          dataset_sizes[phase]) * (2 / 307)
            epoch_acc1 = (running_corrects1.double() /
                          dataset_sizes[phase]) * (305 / 307)

            epoch_acc = (epoch_acc0 + epoch_acc1) / 2

            #print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss,
                                                       epoch_acc))
            #logging_train.info('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

            print('{} bin_loss: {:.4f} bin_acc: {:.4f}'.format(
                phase, epoch_loss0, epoch_acc0))
            #logging_bins.info('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss0, epoch_acc0))

            print('{} cat_loss: {:.4f} cat_acc: {:.4f}'.format(
                phase, epoch_loss1, epoch_acc1))
            #logging_cats.info('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss1, epoch_acc1))
            if phase == 'train':
                tloss = epoch_loss
                tloss0 = epoch_loss0
                tloss1 = epoch_loss1

                tacc = epoch_acc
                tacc0 = epoch_acc0
                tacc1 = epoch_acc1

            if phase == 'val':
                vloss = epoch_loss
                vloss0 = epoch_loss0
                vloss1 = epoch_loss1

                vacc = epoch_acc
                vacc0 = epoch_acc0
                vacc1 = epoch_acc1

                logging_train.info(
                    'Epoch: {}\ttloss: {:.4f}\ttacc: {:.4f}\tvloss: {:.4f}\tvacc: {:.4f}'
                    .format(epoch, tloss, tacc, vloss, vacc))
                logging_bins.info(
                    'Epoch: {}\ttloss: {:.4f}\ttacc: {:.4f}\tvloss: {:.4f}\tvacc: {:.4f}'
                    .format(epoch, tloss0, tacc0, vloss0, vacc0))
                logging_cats.info(
                    'Epoch: {}\ttloss: {:.4f}\ttacc: {:.4f}\tvloss: {:.4f}\tvacc: {:.4f}'
                    .format(epoch, tloss1, tacc1, vloss1, vacc1))

                # Save last settings to .tar file
                torch.save(
                    {
                        'epoch': epoch,
                        'net_state_dict': net.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'loss': epoch_loss
                    }, last_path)

                if epoch_acc > best_acc:
                    best_acc = epoch_acc
                    #best_net_wts = net.state_dict()

                    # Save best settings to .tar file
                    torch.save(
                        {
                            'epoch': epoch,
                            'net_state_dict': net.state_dict(),  #best_net_wts
                            'optimizer_state_dict': optimizer.state_dict(),
                            'loss': epoch_loss,
                            'acc': best_acc
                        },
                        best_path)

                    # Save best settings to .json file
                    best_metrics = {
                        f'loss{fold}': epoch_loss,
                        f'acc{fold}': best_acc.item()
                    }
                    fname = os.path.join(args.model_dir, f'metrics{fold}.json')
                    with open(fname, 'w') as f:
                        f.write(json.dumps(best_metrics))

                #vgg
                if net_name.startswith('vgg'):
                    scheduler.step(epoch_acc)

                # resnet
                #if net_name.startswith('resnet'):
                #    scheduler.step(epoch_loss)

    print('Best val Acc: {:.4f}'.format(best_acc))
    logging_process.info('Model: {}\tFold: {}\tBest val Acc: {:.4f}'.format(
        args.model_dir, fold, best_acc))
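
The two heads above are trained on a weighted sum of two cross-entropy losses; the 2/307 and 305/307 factors appear to weight the binary head and the category head by their share of the 307 outputs. A small sketch of that combination on dummy logits.

import torch
import torch.nn as nn

criterion = [nn.CrossEntropyLoss(), nn.CrossEntropyLoss()]

batch = 4
outputs_bins = torch.randn(batch, 2)    # binary head logits
outputs_cats = torch.randn(batch, 305)  # category head logits
bins_labels = torch.randint(0, 2, (batch,))
cats_labels = torch.randint(0, 305, (batch,))

loss0 = criterion[0](outputs_bins, bins_labels) * (2 / 307)
loss1 = criterion[1](outputs_cats, cats_labels) * (305 / 307)
loss = (loss0 + loss1) / 2  # joint loss backpropagated in the train phase
print(loss0.item(), loss1.item(), loss.item())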
Example No. 10
def get_loaders(dfs, mean, std, size, batch_size, num_workers):
    """
    Function that takes a dictionary of dataframes and
    returns two dictionaries: PyTorch dataloaders and dataset sizes.
    """
    # Reproducibility
    myutils.myseed(seed=42)

    # Custom pytorch dataloader for this dataset
    class Derm(Dataset):
        """
        Read a pandas dataframe with
        images paths and labels
        """
        def __init__(self, df, transform=None):
            self.df = df
            self.transform = transform

        def __len__(self):
            return len(self.df)

        def __getitem__(self, index):
            try:
                # Load image data and get label
                X = Image.open(self.df['filenames'][index]).convert('RGB')
                y = torch.tensor(self.df.iloc[index,2:])
            except IOError as err:
                # Re-raise so a corrupt image fails loudly instead of
                # leaving X and y undefined below
                raise

            if self.transform:
                X = self.transform(X)
            # Sanity check
            #print('id:', self.df['id'][index], 'label', y)
            return index, X, y

    # Transforms
    data_transforms = {'train' : transforms.Compose([transforms.Resize(size),
                                              transforms.CenterCrop((size,size)),
                                              transforms.ToTensor(),
                                              transforms.Normalize(mean,std)]),
                       'val' : transforms.Compose([transforms.Resize(size),
                                              transforms.CenterCrop((size,size)),
                                              transforms.ToTensor(),
                                              transforms.Normalize(mean,std)]),
                       'test' : transforms.Compose([transforms.Resize(size),
                                              transforms.CenterCrop((size,size)),
                                              transforms.ToTensor(),
                                              transforms.Normalize(mean,std)]),
                       'unknown' : transforms.Compose([transforms.Resize(size),
                                              transforms.CenterCrop((size,size)),
                                              transforms.ToTensor(),
                                              transforms.Normalize(mean,std)])}

    # Sets
    image_datasets = {x: Derm(dfs[x], transform=data_transforms[x]) for x in dfs.keys()}
    # Sizes
    dataset_sizes = {x: len(image_datasets[x]) for x in dfs.keys()}
    # Loaders
    dataloaders = {x: DataLoader(image_datasets[x], batch_size=batch_size, num_workers=num_workers, pin_memory=False) for x in dfs.keys()}

    return dataloaders, dataset_sizes
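
Unlike Example No. 2, this version takes mean and std as arguments instead of hard-coding the ImageNet statistics; a sketch of how channel-wise statistics could be computed from a batch of unnormalised image tensors (an assumption about how those values are produced, not code from the project).

import torch

# Pretend batch of 8 RGB images already scaled to [0, 1], shape (N, C, H, W)
images = torch.rand(8, 3, 100, 100)

mean = images.mean(dim=(0, 2, 3))  # one value per channel
std = images.std(dim=(0, 2, 3))
print(mean.tolist(), std.tolist())
# These values would then be passed on to get_loaders(dfs, mean, std, size, ...)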
Example No. 11
def data_split(data_dir, folds):
    """
    Function that takes a data_dir and a number of folds,
    and splits images in data_dir into
    training(80%) and testing(20%) data.

    For fit.py training data is further splitted into
    training and validation sets.

    If cross validation is needed, training data is also splitted into
    train and validation folds.
    """
    # Reproducibility
    myutils.myseed(seed=42)
    seed = 42

    # Load the data with image paths and labels
    df = pd.read_csv(f'{data_dir}.csv')
    df = df.sample(frac=1, random_state=seed).reset_index(drop=True)

    # Test
    train_val, test = train_test_split(df,
                                       test_size=0.2,
                                       random_state=seed,
                                       shuffle=True)
    train_val, test = train_val.reset_index(drop=True), test.reset_index(
        drop=True)
    test.to_csv(os.path.join(data_dir, 'test.csv'), index=False)
    logging_data_process.info(f"Saved: {os.path.join(data_dir, 'test.csv')}")

    # Train and validation
    train, val = train_test_split(train_val,
                                  test_size=0.2,
                                  random_state=seed,
                                  shuffle=True)
    train, val = train.reset_index(drop=True), val.reset_index(drop=True)
    train.to_csv(os.path.join(data_dir, 'train.csv'), index=False)
    val.to_csv(os.path.join(data_dir, 'val.csv'), index=False)
    logging_data_process.info(f"Saved: {os.path.join(data_dir, 'train.csv')}")
    logging_data_process.info(f"Saved: {os.path.join(data_dir, 'val.csv')}")

    # Cross validation folds
    if folds > 1:
        logging_data_process.info(f'Folds: {folds}')
        X = train_val[['image_path']]
        y = train_val[['label_code']]
        skf = StratifiedKFold(n_splits=folds, random_state=seed, shuffle=True)
        fold = 0
        for train_idx, val_idx in skf.split(X, y):
            fold += 1
            train_idx, val_idx = list(train_idx), list(val_idx)
            train, val = train_val.iloc[train_idx, :], train_val.iloc[
                val_idx, :]
            train, val = train.reset_index(drop=True), val.reset_index(
                drop=True)
            train.to_csv(os.path.join(data_dir, f'train{fold}.csv'),
                         index=False)
            val.to_csv(os.path.join(data_dir, f'val{fold}.csv'), index=False)
            logging_data_process.info(
                f"Saved: {os.path.join(data_dir, f'train{fold}.csv')}")
            logging_data_process.info(
                f"Saved: {os.path.join(data_dir, f'val{fold}.csv')}")
Example No. 12
def train_eval(fold, dataloaders, dataset_sizes, net, criterion, optimizer,
               scheduler, num_epochs):
    """
    Train and evaluate a net.
    """
    # Initialize logs
    fname = os.path.join(args.model_dir, f'train{fold}.log')
    logging_train = myutils.setup_logger(fname)

    # Reproducibility
    myutils.myseed(seed=42)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Load initial weights
    net = net.to(device)
    best_net_wts = copy.deepcopy(net.state_dict())
    best_acc, epoch = 0.0, 1

    # Initialize .tar files to save settings
    fname = f'last{fold}.tar'
    last_path = os.path.join(args.model_dir, fname)
    fname = f'best{fold}.tar'
    best_path = os.path.join(args.model_dir, fname)

    # To resume training for more epochs
    if args.resume:
        try:
            # Load last settings from .tar file
            last_checkpoint = torch.load(last_path)
            net.load_state_dict(last_checkpoint['net_state_dict'])
            optimizer.load_state_dict(last_checkpoint['optimizer_state_dict'])
            epoch = last_checkpoint[
                'epoch'] + 1  # Since the last epoch was saved we start with the next one
            logging_process.info(
                f'Model: {args.model_dir}\tLast epoch saved: {epoch-1}, resuming training from epoch: {epoch}'
            )

            # Load best settings from .tar file
            best_checkpoint = torch.load(best_path)
            best_net_wts = best_checkpoint['net_state_dict']
            best_acc = best_checkpoint['acc']

        except FileNotFoundError as err:
            # This happens with cross-validation folds: if training was
            # interrupted on fold 1, the best checkpoint for fold 2 does not
            # exist yet, so we just log the error and continue
            logging_process.info(f'Model: {args.model_dir}\tError: {err}')

    # Initialize early stop settings
    best_val_loss, epochs_no_improve, patience = np.inf, 0, 5

    # TRAINING LOOP
    for epoch in range(epoch, num_epochs + 1):

        # Early stop
        if epochs_no_improve == patience:
            print('Early stop')
            logging_process.info(
                f'Model: {args.model_dir}\tFold:{fold}\tEarly stop: {epoch}')
            break

        print(f'Epoch {epoch}/{num_epochs}')
        logging_train.info(f'Epoch {epoch}/{num_epochs}')

        # Each epoch has a training phase and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                net.train()  # Set net to training mode
            else:
                net.eval()  # Set net to evaluate mode

            # Track statistics
            running_loss = 0.0
            running_corrects = 0

            # Iterate over data
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # Zero the parameter gradients
                optimizer.zero_grad()

                # Forward
                # Track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = net(inputs)
                    probs, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # Backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # Batch statistics
                running_loss += loss.item() * inputs.size(
                    0)  # This is batch loss
                running_corrects += torch.sum(
                    preds == labels.data)  # This is batch accuracy

            if phase == 'train':
                scheduler.step()

            # Epoch statistics
            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss,
                                                       epoch_acc))
            logging_train.info('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            if phase == 'val':
                # Best loss tracking for early stop
                if epoch_loss < best_val_loss:
                    best_val_loss = epoch_loss
                    epochs_no_improve = 0
                else:
                    epochs_no_improve += 1

                # Save last settings to .tar file
                torch.save(
                    {
                        'epoch': epoch,
                        'net_state_dict': net.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'loss': epoch_loss
                    }, last_path)

                if epoch_acc > best_acc:
                    best_acc = epoch_acc
                    best_net_wts = net.state_dict()

                    # Save best settings to .tar file
                    torch.save(
                        {
                            'epoch': epoch,
                            'net_state_dict': best_net_wts,
                            'optimizer_state_dict': optimizer.state_dict(),
                            'loss': epoch_loss,
                            'acc': best_acc
                        }, best_path)

                    # Save best settings to .json file
                    best_metrics = {
                        f'loss{fold}': epoch_loss,
                        f'acc{fold}': best_acc.item()
                    }
                    fname = os.path.join(args.model_dir, f'metrics{fold}.json')
                    with open(fname, 'w') as f:
                        f.write(json.dumps(best_metrics))

    print('Best val Acc: {:.4f}'.format(best_acc))
    logging_process.info('Model: {}\tFold: {}\tBest val Acc: {:.4f}'.format(
        args.model_dir, fold, best_acc))
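
This version adds early stopping: the validation loss is tracked and training breaks after patience epochs without improvement. The counter logic in isolation, on a made-up loss sequence.

import numpy as np

best_val_loss, epochs_no_improve, patience = np.inf, 0, 5

# Made-up validation losses, one per epoch
val_losses = [0.90, 0.75, 0.70, 0.71, 0.72, 0.73, 0.74, 0.76, 0.80]
for epoch, epoch_loss in enumerate(val_losses, start=1):
    if epochs_no_improve == patience:
        print(f'Early stop at epoch {epoch}')
        break
    if epoch_loss < best_val_loss:
        best_val_loss = epoch_loss
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
print('best val loss:', best_val_loss)  # 0.7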
def load_data(data_dir):
    """
    Function that takes a folder, finds all .jpg files inside the folder,
    and creates a dataframe.
    """
    # Reproducibility
    myutils.myseed(seed=42)

    # Get the image paths
    filenames = myutils.run_fast_scandir(data_dir, [".jpg"])

    #   BINARY
    dfa = pd.DataFrame(data=filenames, columns=['filenames'])
    # Get the label from nth folder starting from the parent:
    outlevel = 4  # fname = '/scratch/s181423_data/data_bin/label/image.jpg'
    dfa['label0'] = dfa['filenames'].apply(lambda x: x.split('/')[outlevel])
    # Get the id from the basename
    dfa['id'] = dfa['filenames'].apply(lambda x: os.path.basename(x))
    # Get label as one hot encoded values
    dfa = dfa.set_index(['id', 'filenames'])
    #dfa['label0'] = dfa['label0'].astype('category')

    # Create a subset of skin to get the disease categories only for these pictures
    df_skin_only = dfa[dfa['label0'] == 'skin']

    #   CATEGORIES
    dfb = pd.DataFrame(data=filenames, columns=['filenames'])
    # Get the label from nth folder starting from the parent:
    outlevel = 5  # the disease category folder, one level below the binary label folder
    dfb['label1'] = dfb['filenames'].apply(lambda x: x.split('/')[outlevel])
    # Get the id from the basename
    dfb['id'] = dfb['filenames'].apply(lambda x: os.path.basename(x))
    # Get label as one hot encoded values
    dfb = dfb.set_index(['id', 'filenames'])
    #dfb['label1'] = dfb['label1'].astype('category')

    # Get disease categories only for the skin images
    df_diseases = pd.concat([df_skin_only, dfb],
                            axis=1,
                            sort=False,
                            join='inner').drop(['label0'], axis=1)

    # Join binary and categories labels
    df = pd.concat([dfa, df_diseases], axis=1, sort=False)
    df = df.fillna('AAA')
    df['label0'] = df['label0'].astype('category')
    df['label1'] = df['label1'].astype('category')
    mapping = {}
    mapping_binary = dict(enumerate(df['label0'].cat.categories))
    mapping_categories = dict(enumerate(df['label1'].cat.categories))
    df['label0'] = pd.Categorical(df['label0']).codes
    df['label1'] = pd.Categorical(df['label1']).codes

    mapping['mapping_binary'] = mapping_binary
    mapping['mapping_categories'] = mapping_categories
    #df = pd.get_dummies(df, prefix='', prefix_sep='')
    df = df.reset_index()

    # Resample the minority classes so they appear in both training and testing
    counts_df = pd.DataFrame(df['label1'].value_counts())
    labels_with_one_example = list(counts_df[counts_df['label1'] < 2].index)
    duplicates_df = df[df['label1'].isin(labels_with_one_example)]
    # Quintuplicate these rows: with 5 copies, a stratified 80/20 split
    # keeps at least one copy in train (4) and in test (1)
    df_copy = duplicates_df
    df = pd.concat([df, df_copy, df_copy, df_copy, df_copy])

    # Save the data as a .csv file
    df.to_csv(f'{data_dir}.csv', index=False)
    logging_data_process.info(f'Saved: {data_dir}.csv')

    # Save the mappings as a .json file
    with open('dicts/mapping.json', 'w') as f:
        f.write(json.dumps(mapping))
        logging_data_process.info('Saved: dicts/mapping.json')
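
A toy version of the resampling step above: classes with a single example are duplicated so a stratified 80/20 split can place a copy on both sides (written here with a plain value_counts Series to stay version-agnostic).

import pandas as pd

df = pd.DataFrame({'id': ['a', 'b', 'c', 'd'],
                   'label1': ['common', 'common', 'common', 'rare']})

# Classes with fewer than 2 examples
counts = df['label1'].value_counts()
labels_with_one_example = list(counts[counts < 2].index)
duplicates_df = df[df['label1'].isin(labels_with_one_example)]

# Five copies of the rare rows, so a stratified 80/20 split keeps
# at least one copy in both train (4 copies) and test (1 copy)
df = pd.concat([df, duplicates_df, duplicates_df, duplicates_df, duplicates_df])
print(df['label1'].value_counts().to_dict())  # {'rare': 5, 'common': 3}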
def data_split(data_dir, folds):
    """
    Function that takes a data_dir and a number of folds,
    and splits images in data_dir into
    training(80%) and testing(20%) data.

    For fit.py training data is further splitted into
    training and validation sets.

    If cross validation is needed, training data is also splitted into
    train and validation folds.
    """
    def rep_sample(df, col, n, *args, **kwargs):
        nu = df[col].nunique()
        m = len(df)
        mpb = n // nu
        mku = n - mpb * nu
        fills = np.zeros(nu)
        fills[:mku] = 1
        sample_sizes = (np.ones(nu) * mpb + fills).astype(int)
        gb = df.groupby(col)
        sample = lambda sub_df, i: sub_df.sample(
            sample_sizes[i], *args, **kwargs, replace=True)
        subs = [sample(sub_df, i) for i, (_, sub_df) in enumerate(gb)]
        return pd.concat(subs)

    # Reproducibility
    myutils.myseed(seed=42)
    seed = 42

    # Load the data with image paths and labels
    df = pd.read_csv(f'{data_dir}.csv')
    #df = df.sample(frac=1, random_state=seed).reset_index(drop=True)
    logging_data_process.info(f"all data size:{len(df)}")

    # Test
    y = list(df['label1'])
    train_val, test = train_test_split(df,
                                       test_size=0.2,
                                       random_state=seed,
                                       shuffle=True,
                                       stratify=y)
    train_val, test = train_val.reset_index(drop=True), test.reset_index(
        drop=True)

    # Remove any samples that also appear in train_val from the test set
    ids = list(train_val.id)
    test = test[~test.id.isin(ids)]

    print(f'test:{len(test.label1.value_counts())}')
    test.to_csv(os.path.join(data_dir, 'test.csv'), index=False)
    logging_data_process.info(f"test size:{len(test)}")
    logging_data_process.info(f"train_val size:{len(train_val)}")
    logging_data_process.info(f"Saved: {os.path.join(data_dir, 'test.csv')}")

    # fold is used below regardless of the number of folds
    fold = 1
    if folds == 1:
        logging_data_process.info(f'Folds: {folds}')
    # Train and validation
    y = list(train_val['label1'])
    #X = train_val[['id','filenames']]
    #y = train_val.iloc[:,2:].apply(lambda x: np.argmax(x), axis=1)  # argmax is necessary for stratification
    # categories did not allow stratify because some classes have just 1 example
    train, val = train_test_split(train_val,
                                  test_size=0.2,
                                  random_state=seed,
                                  shuffle=True,
                                  stratify=y)
    train, val = train.reset_index(drop=True), val.reset_index(drop=True)

    size = len(train['label1'].unique()) * 1000
    print(f'size: {size}')
    train = rep_sample(train, 'label1', size)
    train = sklearn.utils.shuffle(train)
    print(f'train{fold}:{len(train.label1.value_counts())}')
    print(f'val{fold}:{len(val.label1.value_counts())}')

    train.to_csv(os.path.join(data_dir, f'train{fold}.csv'), index=False)
    val.to_csv(os.path.join(data_dir, f'val{fold}.csv'), index=False)
    logging_data_process.info(f"train{fold} size:{len(train)}")
    logging_data_process.info(
        f"Saved: {os.path.join(data_dir, f'train{fold}.csv')}")
    logging_data_process.info(f"val{fold} size:{len(val)}")
    logging_data_process.info(
        f"Saved: {os.path.join(data_dir, f'val{fold}.csv')}")

    # Cross validation folds
    if folds > 1:
        print('WARNING: Karen has not implemented this yet!')
    # Previous version of data_split, kept commented out for reference:
    '''
def data_split(data_dir, folds):
    """
    Function that takes a data_dir and a number of folds,
    and splits images in data_dir into
    training(80%) and testing(20%) data.

    If cross validation is needed, training data is also splitted into
    train and validation folds.
    """
    def rep_sample(df, col, n, *args, **kwargs):
        nu = df[col].nunique()
        m = len(df)
        mpb = n // nu
        mku = n - mpb * nu
        fills = np.zeros(nu)
        fills[:mku] = 1
        sample_sizes = (np.ones(nu) * mpb + fills).astype(int)
        gb = df.groupby(col)
        sample = lambda sub_df, i: sub_df.sample(sample_sizes[i], *args, **kwargs, replace=True)
        subs = [sample(sub_df, i) for i, (_, sub_df) in enumerate(gb)]
        return pd.concat(subs)

    # Reproducibility
    myutils.myseed(seed=42)
    seed = 42

    # Load the data with image paths and labels
    df = pd.read_csv(f'{data_dir}.csv')
    logging_data_process.info(f"all data size:{len(df)}")

    # Test
    y = list(df['label'])
    train_val, test = train_test_split(df, test_size=0.2, random_state=seed, shuffle=True, stratify=y)
    train_val, test = train_val.reset_index(drop=True), test.reset_index(drop=True)

    # Remove samples used in training from test.csv
    ids = list(train_val.id)
    test = test[~test.id.isin(ids)]
    test

    print(f'test:{len(test.label.value_counts())}')
    test.to_csv(os.path.join(data_dir, 'test.csv'), index=False)
    logging_data_process.info(f"test size:{len(test)}")
    logging_data_process.info(f"train_val size:{len(train_val)}")
    logging_data_process.info(f"Saved: {os.path.join(data_dir, 'test.csv')}")

    # Cross validation folds
    if folds == 1:
        logging_data_process.info(f'Folds: {folds}')
        X = train_val[['id','filenames']]
        y = list(train_val['label'])
        n_splits = 2 # Put 2 and break when fold 1 finishes
        #skf = StratifiedKFold(n_splits, random_state=seed, shuffle=True)
        skf = StratifiedShuffleSplit(n_splits, random_state=seed, test_size=0.2)
        fold = 0
        for train_idx, val_idx in skf.split(X, y):
            fold += 1
            train_idx, val_idx = list(train_idx), list(val_idx)
            train, val = train_val.iloc[train_idx,:], train_val.iloc[val_idx,:]
            train, val = train.reset_index(drop=True), val.reset_index(drop=True)

            # To overfit to the first balanced batch
            #size = 304
            #train = rep_sample(train, 'label', size)
            #train =sklearn.utils.shuffle(train)
            print(f'train{fold}:{len(train.label.value_counts())}')
            train.to_csv(os.path.join(data_dir, f'train{fold}.csv'), index=False)
            val.to_csv(os.path.join(data_dir, f'val{fold}.csv'), index=False)
            logging_data_process.info(f"train{fold} size:{len(train)}")
            logging_data_process.info(f"Saved: {os.path.join(data_dir, f'train{fold}.csv')}")
            logging_data_process.info(f"val{fold} size:{len(val)}")
            logging_data_process.info(f"Saved: {os.path.join(data_dir, f'val{fold}.csv')}")
            break

    if folds > 1:
        logging_data_process.info(f'Folds: {folds}')
        X = train_val[['id','filenames']]
        y = list(train_val['label'])
        #skf = StratifiedKFold(n_splits=folds, random_state=seed, shuffle=True)
        skf = StratifiedShuffleSplit(n_splits=folds, random_state=seed, test_size=0.2)
        fold = 0
        for train_idx, val_idx in skf.split(X, y):
            fold += 1
            train_idx, val_idx = list(train_idx), list(val_idx)
            train, val = train_val.iloc[train_idx,:], train_val.iloc[val_idx,:]
            train, val = train.reset_index(drop=True), val.reset_index(drop=True)

            # To overfit to the first balanced batch
            #size = 304
            #train = rep_sample(train, 'label', size)
            #train =sklearn.utils.shuffle(train)
            print(f'train{fold}:{len(train.label.value_counts())}')
            train.to_csv(os.path.join(data_dir, f'train{fold}.csv'), index=False)
            val.to_csv(os.path.join(data_dir, f'val{fold}.csv'), index=False)
            logging_data_process.info(f"train{fold} size:{len(train)}")
            logging_data_process.info(f"Saved: {os.path.join(data_dir, f'train{fold}.csv')}")
            logging_data_process.info(f"val{fold} size:{len(val)}")
            logging_data_process.info(f"Saved: {os.path.join(data_dir, f'val{fold}.csv')}")