Example No. 1
def main():

    # Note that msms.py has many other imports
    import msms
    import torch
    print("Reading data")
    # Initialize a dataset
    example_set = msms.Dataset("data", "metadata/lookup.npy",
                               "metadata/ms_param.csv")
    print("Initializing dataloader")
    # Initialize a dataloader with the dataset
    # - Note: increasing `num_workers` currently does not work, possibly because the workers contend for GPU resources
    example_loader = torch.utils.data.DataLoader(example_set,
                                                 batch_size=2,
                                                 shuffle=True,
                                                 num_workers=0)

    # Set the network parameters
    # Max pooling size for all convolutional layers. Our low `num_indivs` generally
    # makes pooling a bad idea, but it can work with only a few convolutional layers.
    pool_size = 1
    # The number of channels per convolution layer (think colour channels in an
    # image); the input is assumed to have a single channel
    channels = [4, 6, 8]
    # The kernel size at each convolution step. This list must have the same length
    # as `channels`
    kernels = [5, 4, 3]
    # The nodes in each hidden fully connected layer. The last entry should be the
    # number of labels. The length of this list is independent of the other lists.
    nodes = [500, 100, 5]
    print("Creating neural network")
    # Create the network with the given parameters and send it to the gpu
    example_net = msms.Net(example_set.num_indivs, example_set.num_sites,
                           pool_size, channels, kernels, nodes).cuda()
    print("Training neural network")
    # Try running the network
    for snp, pos, label in example_loader:
        # The data loader reads in each file as though it were a training example
        # As each file is actually a collection of training examples (a chunk), we must reshape with `view()`
        # We also need to add a dimension to snp (with unsqueeze) to indicate there is only one colour channel
        snp = snp.view(-1, example_set.num_indivs,
                       example_set.num_sites).unsqueeze(1)
        pos = pos.view(-1, example_set.num_sites)
        label = label.view(-1)

        # Perform one forward pass
        out = example_net(snp, pos)

        # Print output and shape
        print(out)
        print(out.shape)
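
The reshape in the loop above is easier to follow with concrete shapes. The short sketch below uses dummy tensors; the chunk size and the values of `num_indivs` and `num_sites` are made-up assumptions, not values taken from `msms`.

import torch

# Assumed sizes: 2 files per batch, 10 simulations per file, 32 individuals, 100 sites
batch_size, sims_per_file, num_indivs, num_sites = 2, 10, 32, 100

# What the DataLoader yields: one "example" per file, where each file is a chunk of simulations
snp = torch.zeros(batch_size, sims_per_file, num_indivs, num_sites)
pos = torch.zeros(batch_size, sims_per_file, num_sites)
label = torch.zeros(batch_size, sims_per_file, dtype=torch.long)

# Flatten the file dimension into the batch dimension and add a single colour channel
snp = snp.view(-1, num_indivs, num_sites).unsqueeze(1)
pos = pos.view(-1, num_sites)
label = label.view(-1)

print(snp.shape)    # torch.Size([20, 1, 32, 100])
print(pos.shape)    # torch.Size([20, 100])
print(label.shape)  # torch.Size([20])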
Example No. 2
def main():

    # Imports used by this example (as in Example No. 1, msms.py has many other imports of its own)
    import time
    import msms
    import torch
    import torch.nn.functional as F

    # Start timing
    tic = time.perf_counter()

    # Initialize dataset
    ds = msms.Dataset("../pipeline/data", "../pipeline/metadata/lookup.npy",
                      "../pipeline/metadata/ms_param.csv")

    # Initialize data loader
    dataloader = torch.utils.data.DataLoader(
        ds,
        batch_size=20,
        shuffle=False,  # NO SHUFFLE
        num_workers=0)

    # Set network parameters
    pool_size = 1
    channels = [4, 6, 8]
    kernels = [5, 4, 3]
    nodes = [500, 100, 5]

    # Create the model and move it to the GPU
    net = msms.Net(ds.num_indivs, ds.num_sites, pool_size, channels, kernels,
                   nodes).cuda()

    # Define the optimizer (the loss is computed inline with F.mse_loss below)
    optimizer = torch.optim.SGD(net.parameters(), lr=1e-4)
    total_step = len(dataloader)

    # Extract info from dataloader and run network
    for epoch in range(2):
        count = 0
        for snp, pos, label in dataloader:
            count += 1

            # Reshape each chunk of training examples
            snp = snp.view(-1, ds.num_indivs, ds.num_sites).unsqueeze(1)
            pos = pos.view(-1, ds.num_sites)
            print(f"label shape before reshape: {label.shape}")
            label = label.view(-1)
            # One-hot encode the labels; pin num_classes to the network's output
            # width so the encoding does not depend on which labels appear in the batch
            label_ohe = F.one_hot(label, num_classes=nodes[-1])
            label_ohe = label_ohe.to(torch.float32)

            # Perform one forward pass
            out = net(snp, pos)
            out = out.to(torch.float32)
            print(f"net output shape: {out.shape}")
            print(f"label tensor shape: {label.shape}")
            loss = F.mse_loss(out, label_ohe).cuda()

            # Perform backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if count % 100 == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(
                    epoch + 1, 2, count, total_step, loss.item()))

    toc = time.perf_counter()
    print(f"Total time run: {(toc-tic)/60:0.4f} minutes")
Example No. 3
def train(gpu, args):

    # Imports used by this example (normally placed at module level)
    import time
    import numpy as np
    import torch
    import torch.nn as nn
    import torch.distributed as dist
    import msms

    # Start timing
    tic = time.perf_counter()

    # Calculate GPU rank
    rank = args.nr * args.gpus + gpu
    dist.init_process_group(backend='nccl',
                            init_method='env://',
                            world_size=args.world_size,
                            rank=rank)

    torch.cuda.set_device(gpu)

    # Decide on train / dev split
    lookup = np.load(args.metadata, allow_pickle=True).item()
    num_dev = int(min(np.floor(lookup['num_files'] * args.test_prop), 2**5))
    shuffled_ids = np.random.choice(range(lookup['num_files']),
                                    lookup['num_files'],
                                    replace=False)
    train_ids = shuffled_ids[:-num_dev]
    dev_ids = shuffled_ids[-num_dev:]

    # Initialize dataset
    train_set = msms.Dataset(args.data, args.metadata, args.params, train_ids)
    dev_set = msms.Dataset(args.data, args.metadata, args.params, dev_ids)

    # Create a training sampler for DDP
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_set, num_replicas=args.world_size, rank=rank)

    # Create a dev sampler for DDP
    dev_sampler = torch.utils.data.distributed.DistributedSampler(
        dev_set, num_replicas=args.world_size, rank=rank)

    # Initialize data loaders
    train_loader = torch.utils.data.DataLoader(
        train_set,
        batch_size=args.batch_size,
        shuffle=False,  # NO SHUFFLE
        num_workers=0,
        sampler=train_sampler)
    dev_loader = torch.utils.data.DataLoader(
        dev_set,
        batch_size=args.batch_size,
        shuffle=False,  # NO SHUFFLE
        num_workers=0,
        sampler=dev_sampler)

    # Set network parameters
    try:
        channels = list(map(int, args.channels.split(',')))
        kernels = list(map(int, args.kernels.split(',')))
        pools = list(map(int, args.pools.split(',')))
        nodes = list(map(int, args.hiddennodes.split(',')))
    except ValueError:
        raise Exception(
            "One of the network hyperparameters (channels, kernels, pools, or hidde nods) could not be interpreted as a comma-delimited string of integers."
        )

    # Create model and wrap in DDP
    net = msms.Net(train_set.num_indivs, train_set.num_sites, channels,
                   kernels, pools, nodes, train_set.num_labels).cuda()
    net = nn.parallel.DistributedDataParallel(net, device_ids=[gpu])

    # Define criterion functions
    criterion = torch.nn.CrossEntropyLoss()

    # Initialize overall loss accumulator
    losses = []
    report_every = args.report_every
    epoch_times = []

    # Extract info from dataloader and run network
    for epoch in range(args.epochs):

        epoch_start = time.time()
        lr = args.lr_0 * (args.lr_r**epoch)
        optimizer = torch.optim.Adam(net.parameters(),
                                     lr=lr,
                                     weight_decay=args.l2_lambda)

        # Init running loss accumulator
        running_loss = 0.
        running_correct = 0.

        for i, (snp, pos, label) in enumerate(train_loader):

            # Reshape each chunk of training examples
            snp = snp.view(-1, train_set.num_indivs,
                           train_set.num_sites).unsqueeze(1)
            pos = pos.view(-1, train_set.num_sites)
            label = label.view(-1)

            # Perform one forward pass
            out = net(snp, pos)
            loss = criterion(out, label)

            # Perform backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Calculate number correct (hardmax) for this minibatch
            # add to running total
            temp = torch.argmax(out, 1)
            running_correct += (temp == label).float().sum()

            # Print output and shape
            running_loss += loss.item()
            if i % report_every == (report_every - 1) and (len(
                    train_set.filelist) / args.batch_size) - i > report_every:
                # Print running loss and accuracy. Format is [epoch, minibatch]
                print(
                    f'[{epoch}, {i + 1}] Loss: {running_loss / report_every} Acc: {running_correct / (train_set.num_sims * args.batch_size * report_every) * 100}'
                )
                losses.append(running_loss)
                # reset accumulators
                running_loss = 0.
                running_correct = 0.

        epoch_end = time.time()
        epoch_times.append(epoch_end - epoch_start)
        print(f"Single epoch training time: {epoch_end - epoch_start}")

    avg_epoch_time = np.mean(np.array(epoch_times))
    print(f"Iteration time: {avg_epoch_time / 8}")
    print("Done Training")

    # Check testing accuracy
    running_correct = 0.
    with torch.no_grad():
        for snp, pos, label in dev_loader:
            # Reshape!
            snp = snp.view(-1, train_set.num_indivs,
                           train_set.num_sites).unsqueeze(1)
            pos = pos.view(-1, train_set.num_sites)
            label = label.view(-1)

            # Predict and count number of correct labels
            out = torch.argmax(net(snp, pos), 1)
            running_correct += (out == label).float().sum()

    toc = time.perf_counter()
    if gpu == 0:
        print(f"Total time run: {(toc-tic)/60:0.4f} minutes")

    # print(f'Dev Accuracy: {running_correct / (dev_set.num_sims * num_dev) * 100}')

    # Save recorded losses for plotting
    np.save("training_loss.npy", losses)
Example No. 4
# The nodes in each hidden fully connected layer. The length of the list is independent of the other lists.
nodes = [500, 100]
# The number of distinct label classes
num_labels = 4

# Setup hyperparameters and loss function
epochs = 4
batch_size = 4
report_every = 2 # How many minibatches to process before printing and recording loss and (hardmax) accuracy 
lr_0 = 0.00005 # Initial learning rate
lr_r = 0.8 # Learning rate decay rate
l2_lambd = 0.5 # Weight decay rate (L2 regularization)
criterion = torch.nn.CrossEntropyLoss()

# Initialize datasets
train_set = msms.Dataset("data", "metadata/lookup.npy", "metadata/ms_param.csv", train_ids)
dev_set = msms.Dataset("data", "metadata/lookup.npy", "metadata/ms_param.csv", dev_ids)

# Initialize dataloaders
# - Note: increasing `num_workers` currently does not work, possibly because the workers contend for GPU resources
train_loader = torch.utils.data.DataLoader(train_set, batch_size = batch_size, shuffle = True, num_workers = 0)
dev_loader = torch.utils.data.DataLoader(dev_set, batch_size = batch_size, shuffle = True, num_workers = 0)

# Create the network with the given parameters and send it to the gpu
net = msms.Net(train_set.num_indivs, train_set.num_sites, channels, kernels, pools, nodes, num_labels).cuda()

# Init overall loss accumulator
losses = []

# Try running the network
for epoch in range(epochs):
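    # --- The original listing is truncated here. ---
    # The epoch body below is an assumed sketch that mirrors the training loop in
    # Example No. 3; it is not part of the original example.
    lr = lr_0 * (lr_r ** epoch)  # decayed learning rate for this epoch
    optimizer = torch.optim.Adam(net.parameters(), lr=lr, weight_decay=l2_lambd)

    running_loss = 0.

    for i, (snp, pos, label) in enumerate(train_loader):
        # Reshape each chunk of training examples
        snp = snp.view(-1, train_set.num_indivs, train_set.num_sites).unsqueeze(1)
        pos = pos.view(-1, train_set.num_sites)
        label = label.view(-1)

        # Forward pass, loss, and backward pass
        out = net(snp, pos)
        loss = criterion(out, label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Record and report the running loss every `report_every` minibatches
        running_loss += loss.item()
        if i % report_every == report_every - 1:
            print(f'[{epoch}, {i + 1}] Loss: {running_loss / report_every}')
            losses.append(running_loss)
            running_loss = 0.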