Example #1
                drop_last=True,
                sampler=data_sampler)

# initialize DALL-E

dalle = DALLE(vae=vae, **dalle_params)
if args.fp16:
    dalle = dalle.half()
dalle = dalle.cuda()

if RESUME:
    dalle.load_state_dict(weights)

# optimizer

opt = Adam(dalle.parameters(), lr=LEARNING_RATE)

if LR_DECAY:
    scheduler = ReduceLROnPlateau(
        opt,
        mode="min",
        factor=0.5,
        patience=10,
        cooldown=10,
        min_lr=1e-6,
        verbose=True,
    )

if distr_backend.is_root_worker():
    # experiment tracker
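
The excerpt above only constructs the ReduceLROnPlateau scheduler; here is a minimal, self-contained sketch of how it behaves once it is later stepped on a plateauing loss (the toy linear model and the constant loss value are assumptions for illustration only):

import torch
from torch.optim import Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau

net = torch.nn.Linear(4, 4)                        # throwaway model
opt = Adam(net.parameters(), lr=3e-4)
scheduler = ReduceLROnPlateau(opt, mode="min", factor=0.5,
                              patience=10, cooldown=10, min_lr=1e-6)

for step in range(60):
    scheduler.step(1.0)                            # a flat loss never improves

print(opt.param_groups[0]["lr"])                   # halved from 3e-4 once `patience` ran out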
Example #2
assert len(ds) > 0, 'dataset is empty'
print(f'{len(ds)} image-text pairs found for training')

dl = DataLoader(ds, batch_size = BATCH_SIZE, shuffle = True, drop_last = True)

# initialize DALL-E

dalle = DALLE(**dalle_params).cuda()

if exists(args.dalle_path):
    dalle.load_state_dict(weights)

# optimizer

opt = Adam(dalle.parameters(), lr = LEARNING_RATE)

# experiment tracker

import wandb

wandb.init(project = 'dalle_train_transformer', resume = exists(args.dalle_path))

# config attributes must be set after wandb.init()
wandb.config.depth = DEPTH
wandb.config.heads = HEADS
wandb.config.dim_head = DIM_HEAD

# training

for epoch in range(EPOCHS):
    for i, (text, images, mask) in enumerate(dl):
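
The loop body is cut off here; a hedged reconstruction of the usual DALLE-pytorch training step follows (`return_loss=True` matches the library's API of this era, while the logging interval and the logged key are assumptions):

        text, images, mask = map(lambda t: t.cuda(), (text, images, mask))

        loss = dalle(text, images, mask=mask, return_loss=True)

        loss.backward()
        opt.step()
        opt.zero_grad()

        if i % 10 == 0:
            wandb.log({'loss': loss.item()})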
Example #3
torch.save(vae.state_dict(), "Vae-small.pth")

dalle = DALLE(
    dim=1024,
    vae=vae,  # automatically infer (1) image sequence length and (2) number of image tokens
    num_text_tokens=NUM_TOKENS,  # vocab size for text
    text_seq_len=TEXTSEQLEN,  # text sequence length
    depth=12,  # should aim to be 64
    heads=16,  # attention heads
    dim_head=64,  # attention head dimension
    attn_dropout=0.1,  # attention dropout
    ff_dropout=0.1  # feedforward dropout
).cuda()

optimizerDALLE = torch.optim.Adam(dalle.parameters(), lr=learning_rate)
DALLEloss = []

for epoch in range(EPOCHS):
    for i in range(DATASET_SIZE):
        #print(i,":",tokenDset.getRand(i),img.size())
        optimizerDALLE.zero_grad()
        img, strs = cap[i]
        #print(img.size())
        img = img.unsqueeze(0).cuda()
        if i % 10 == 0:
            print("DALLE epoch {} / {}".format(i + epoch * DATASET_SIZE,
                                               EPOCHS * DATASET_SIZE))
        try:
            textToken, mask = fixlen([tokenDset.getRand(i)])
        except KeyError:
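
`fixlen` is a user-defined helper that is not shown in the excerpt; below is a minimal sketch of what it plausibly does, inferred from how it is called above (pad each token list to TEXTSEQLEN and return a matching boolean mask; the body is an assumption, only the name and call shape come from the excerpt):

import torch

def fixlen(token_lists, seq_len=TEXTSEQLEN):
    # pad every caption to seq_len and mark real tokens (True) vs. padding (False)
    text = torch.zeros(len(token_lists), seq_len, dtype=torch.long)
    mask = torch.zeros(len(token_lists), seq_len, dtype=torch.bool)
    for row, tokens in enumerate(token_lists):
        tokens = tokens[:seq_len]
        text[row, :len(tokens)] = torch.tensor(tokens, dtype=torch.long)
        mask[row, :len(tokens)] = True
    return text.cuda(), mask.cuda()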
Example #4
dl = DataLoader(ds, batch_size = BATCH_SIZE, shuffle = True, drop_last = True)

# initialize DALL-E

dalle = DALLE(**dalle_params).cuda()

if RESUME:
    dalle.load_state_dict(weights)

# optimizer

opt = Adam(dalle.parameters(), lr = LEARNING_RATE)

# experiment tracker

import wandb

wandb.init(project = 'dalle_train_transformer', resume = RESUME)

# config attributes must be set after wandb.init()
wandb.config.depth = DEPTH
wandb.config.heads = HEADS
wandb.config.dim_head = DIM_HEAD

# training

for epoch in range(epoch_start, EPOCHS):
    for i, (text, images, mask) in enumerate(dl):
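
`weights` and `epoch_start` come from an earlier, unshown part of the script; a hedged sketch of the resume plumbing they imply (the checkpoint key names are assumptions, only `RESUME`, `weights`, and `epoch_start` appear in the excerpt):

import torch

if RESUME:
    ckpt = torch.load(args.dalle_path, map_location='cpu')
    dalle_params = ckpt['hparams']
    weights = ckpt['weights']
    epoch_start = ckpt.get('epoch', 0)
else:
    epoch_start = 0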
Example #5
                sampler=data_sampler)

# initialize DALL-E

dalle = DALLE(vae=vae, **dalle_params)
if not using_deepspeed:
    if args.fp16:
        dalle = dalle.half()
    dalle = dalle.cuda()

if RESUME:
    dalle.load_state_dict(weights)

# optimizer

opt = AdamW(dalle.parameters(),
            lr=LEARNING_RATE,
            betas=(0.9, 0.96),
            weight_decay=4.5e-2,
            amsgrad=True)

if LR_DECAY:
    scheduler = ReduceLROnPlateau(
        opt,
        mode="min",
        factor=0.5,
        patience=10,
        cooldown=10,
        min_lr=1e-6,
        verbose=True,
    )
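
One detail the fp16 branch implies but the excerpt does not reach: once `dalle.half()` casts the weights to float16, each image batch must be cast to match before the forward pass (hedged sketch; the loop variables mirror the other excerpts):

        if args.fp16:
            images = images.half()   # inputs must match the half-precision weights

        loss = dalle(text, images, mask=mask, return_loss=True)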
Example #6
        i_data = []
        c_data = []
        for i in range(0, self.batchsize):
            i_data.append(self.data[self.index][0])
            c_tokens = [0] * 256  # fill to match text_seq_len
            c_tokens_ = self.data[self.index][1]
            c_tokens[:len(c_tokens_)] = c_tokens_
            c_data.append(c_tokens)
            self.index += 1
            if self.index == self.len:
                self.end = True
                break
        return i_data, c_data


optimizer = optim.Adam(dalle.parameters(), lr=lr)

for epoch in range(start_epoch, start_epoch + n_epochs):
    batch_idx = 0
    train_loss = 0
    dset = ImageCaptions(data, batchsize=batchSize)  # initialize iterator

    for i, c in dset:  # loop through dataset by minibatch
        text = torch.LongTensor(c)  # a minibatch of text (numerical tokens)
        images = torch.zeros(len(i), 3, 256, 256)  # placeholder for images

        text = text.to(device)
        #print(text)

        # fetch images into tensor based on paths given in minibatch
        ix = 0
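
The excerpt ends just as the image paths in `i` are about to be read into the placeholder tensor; a hedged sketch of that loop (the PIL-based loading and the 256x256 resize are assumptions chosen to match the placeholder's shape):

        from PIL import Image
        import torchvision.transforms as T

        to_tensor = T.Compose([T.Resize((256, 256)), T.ToTensor()])
        for path in i:                  # `i` holds this minibatch's image paths
            images[ix] = to_tensor(Image.open(path).convert('RGB'))
            ix += 1
        images = images.to(device)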