                drop_last=True, sampler=data_sampler)

# initialize DALL-E

dalle = DALLE(vae=vae, **dalle_params)
if args.fp16:
    dalle = dalle.half()
dalle = dalle.cuda()

if RESUME:
    dalle.load_state_dict(weights)

# optimizer

opt = Adam(dalle.parameters(), lr=LEARNING_RATE)

if LR_DECAY:
    scheduler = ReduceLROnPlateau(
        opt,
        mode="min",
        factor=0.5,
        patience=10,
        cooldown=10,
        min_lr=1e-6,
        verbose=True,
    )

if distr_backend.is_root_worker():
    # experiment tracker
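# NOTE: hedged sketch, not from the original script. ReduceLROnPlateau is
# metric-driven, so the training loop must feed it the monitored value,
# e.g. once per epoch; `avg_loss` here is a hypothetical mean-training-loss
# variable, not a name defined above:
#
#     if LR_DECAY:
#         scheduler.step(avg_loss)  # decays only after `patience` readings without improvement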
assert len(ds) > 0, 'dataset is empty'
print(f'{len(ds)} image-text pairs found for training')

dl = DataLoader(ds, batch_size = BATCH_SIZE, shuffle = True, drop_last = True)

# initialize DALL-E

dalle = DALLE(**dalle_params).cuda()

if exists(args.dalle_path):
    dalle.load_state_dict(weights)

# optimizer

opt = Adam(dalle.parameters(), lr = LEARNING_RATE)

# experiment tracker

import wandb

wandb.init(project = 'dalle_train_transformer', resume = exists(args.dalle_path))

# log model hyperparameters (wandb.config must be set after wandb.init)
wandb.config.depth = DEPTH
wandb.config.heads = HEADS
wandb.config.dim_head = DIM_HEAD

# training

for epoch in range(EPOCHS):
    for i, (text, images, mask) in enumerate(dl):
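        # NOTE: hedged sketch; the snippet above is truncated at the loop
        # header. The body below is an assumption based on the DALLE-pytorch
        # training API of this era (forward takes an attention mask and can
        # return the autoregressive loss directly), not the original code.
        text, images, mask = map(lambda t: t.cuda(), (text, images, mask))
        loss = dalle(text, images, mask = mask, return_loss = True)
        loss.backward()
        opt.step()
        opt.zero_grad()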
torch.save(vae.state_dict(), "Vae-small.pth")

dalle = DALLE(
    dim = 1024,
    vae = vae,                     # automatically infer (1) image sequence length and (2) number of image tokens
    num_text_tokens = NUM_TOKENS,  # vocab size for text
    text_seq_len = TEXTSEQLEN,     # text sequence length
    depth = 12,                    # should aim to be 64
    heads = 16,                    # attention heads
    dim_head = 64,                 # attention head dimension
    attn_dropout = 0.1,            # attention dropout
    ff_dropout = 0.1               # feedforward dropout
).cuda()

optimizerDALLE = torch.optim.Adam(dalle.parameters(), lr=learning_rate)

DALLEloss = []

for epoch in range(EPOCHS):
    for i in range(DATASET_SIZE):
        optimizerDALLE.zero_grad()
        img, strs = cap[i]
        img = img.unsqueeze(0).cuda()  # add batch dimension
        if i % 10 == 0:
            # this counter tracks individual steps, not epochs
            print("DALLE step {} / {}".format(i + epoch * DATASET_SIZE, EPOCHS * DATASET_SIZE))
        try:
            textToken, mask = fixlen([tokenDset.getRand(i)])
        except KeyError:
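            # NOTE: hedged sketch; the snippet above is truncated inside the
            # try/except. A plausible continuation (assumed, not the original
            # code) skips captions containing out-of-vocabulary tokens, then
            # runs a standard forward/backward step on the single-image batch.
            continue  # caption produced an unknown token; skip this sample
        loss = dalle(textToken.cuda(), img, mask = mask.cuda(), return_loss = True)
        loss.backward()
        optimizerDALLE.step()
        DALLEloss.append(loss.item())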
dl = DataLoader(ds, batch_size = BATCH_SIZE, shuffle = True, drop_last = True)

# initialize DALL-E

dalle = DALLE(**dalle_params).cuda()

if RESUME:
    dalle.load_state_dict(weights)

# optimizer

opt = Adam(dalle.parameters(), lr = LEARNING_RATE)

# experiment tracker

import wandb

wandb.init(project = 'dalle_train_transformer', resume = RESUME)

# log model hyperparameters (wandb.config must be set after wandb.init)
wandb.config.depth = DEPTH
wandb.config.heads = HEADS
wandb.config.dim_head = DIM_HEAD

# training

for epoch in range(epoch_start, EPOCHS):
    for i, (text, images, mask) in enumerate(dl):
                sampler=data_sampler)

# initialize DALL-E

dalle = DALLE(vae=vae, **dalle_params)
if not using_deepspeed:
    if args.fp16:
        dalle = dalle.half()
    dalle = dalle.cuda()

    if RESUME:
        dalle.load_state_dict(weights)

# optimizer
# (betas and weight decay follow the values reported in the DALL-E paper)

opt = AdamW(
    dalle.parameters(),
    lr=LEARNING_RATE,
    betas=(0.9, 0.96),
    weight_decay=4.5e-2,
    amsgrad=True,
)

if LR_DECAY:
    scheduler = ReduceLROnPlateau(
        opt,
        mode="min",
        factor=0.5,
        patience=10,
        cooldown=10,
        min_lr=1e-6,
        verbose=True,
    )
        i_data = []
        c_data = []
        for i in range(0, self.batchsize):
            i_data.append(self.data[self.index][0])
            c_tokens = [0] * 256                    # pad with zeros to match text_seq_len
            c_tokens_ = self.data[self.index][1]
            c_tokens[:len(c_tokens_)] = c_tokens_
            c_data.append(c_tokens)
            self.index += 1
            if self.index == self.len:
                self.end = True
                break
        return i_data, c_data

optimizer = optim.Adam(dalle.parameters(), lr=lr)

for epoch in range(start_epoch, start_epoch + n_epochs):
    batch_idx = 0
    train_loss = 0
    dset = ImageCaptions(data, batchsize=batchSize)  # initialize iterator
    for i, c in dset:                                # loop through dataset by minibatch
        text = torch.LongTensor(c)                   # a minibatch of text (numerical tokens)
        images = torch.zeros(len(i), 3, 256, 256)    # placeholder for images
        text = text.to(device)
        # fetch images into tensor based on paths given in minibatch
        ix = 0
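        # NOTE: hedged sketch; the snippet above is truncated here. A plausible
        # continuation (assumed, not the original code) loads each image path
        # in the minibatch into the placeholder tensor and runs one training
        # step. `transform` (a resize/to-tensor pipeline) and
        # `from PIL import Image` are assumptions, not names from the snippet.
        for path in i:
            images[ix] = transform(Image.open(path).convert('RGB'))
            ix += 1
        images = images.to(device)
        loss = dalle(text, images, return_loss = True)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        batch_idx += 1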