def get_vae(args):
    """Build a DiscreteVAE from command-line args and optionally load weights.

    Args:
        args: argparse-style namespace providing ``size``, ``vae_layers``,
            ``codebook_dims``, ``temperature``, ``device``, and ``vae``
            (path to a state-dict checkpoint, or None to skip loading).

    Returns:
        The DiscreteVAE instance, moved to ``args.device``.
    """
    vae = DiscreteVAE(
        image_size=args.size,
        num_layers=args.vae_layers,
        num_tokens=8192,
        codebook_dim=args.codebook_dims,
        num_resnet_blocks=9,
        hidden_dim=128,
        temperature=args.temperature,
    )
    if args.vae is not None and os.path.isfile(args.vae):
        print(f"loading state dict from {args.vae}")
        # map_location remaps tensors saved on another device (e.g. a
        # CUDA-trained checkpoint) so loading works on CPU-only hosts too.
        vae.load_state_dict(torch.load(args.vae, map_location=args.device))
    vae.to(args.device)
    return vae
transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) #(0.267, 0.233, 0.234)) ]) vae = DiscreteVAE(image_size=256, num_layers=3, num_tokens=2048, codebook_dim=256, hidden_dim=128, temperature=0.9) # load pretrained vae vae_dict = torch.load("./models/" + vaename + "-" + str(load_epoch) + ".pth") vae.load_state_dict(vae_dict) vae.to(device) dalle = DALLE( dim=256, #512, vae= vae, # automatically infer (1) image sequence length and (2) number of image tokens num_text_tokens=10000, # vocab size for text text_seq_len=256, # text sequence length depth=6, # should be 64 heads=8, # attention heads dim_head=64, # attention head dimension attn_dropout=0.1, # attention dropout ff_dropout=0.1 # feedforward dropout ) # load pretrained dalle if continuing training