def __init__(self, logger, learning_rate, input_dim, z_dim, ae_h_dims, *args, **kwargs):
    self.scope = "AE"
    self.logger = logger
    self.learning_rate = learning_rate
    self.input_dim = input_dim
    self.z_dim = z_dim
    self.enc_layer_dims = [input_dim, *ae_h_dims, z_dim]
    self.dec_layer_dims = [z_dim, *list(reversed(ae_h_dims)), input_dim]  # todo: just reverse enc_layer_dims

    self.logger.info("[*] Building AE model")

    with tf.variable_scope(self.scope):
        self.input = tf.placeholder(tf.float32, [None, self.input_dim])

        enc = Encoder(self.enc_layer_dims)
        dec = Decoder(self.dec_layer_dims)

        z_layer = enc.encode(self.input)
        # todo: how to handle output?
        _, _, self.output = dec.decode(z_layer)

        # todo: refactor the get_theta method --> get solver?
        enc_theta = enc.get_theta()
        dec_theta = dec.get_theta()
        self.theta = [*enc_theta, *dec_theta]

        # l2_loss = enc.get_l2_loss() +
        self.recon_loss = tf.reduce_mean(tf.square(self.input - self.output))
        self.solver = tf.train.AdamOptimizer(self.learning_rate).minimize(
            self.recon_loss, var_list=self.theta)
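A minimal training-loop sketch for the AE class above, written against the TF1 graph/session API it uses. The batching scheme, hyperparameter values, and the `train_x` array are illustrative assumptions, not part of the original code.

import numpy as np
import tensorflow as tf  # TF1.x graph/session API, as used by the AE class above

def train_ae(model, train_x, batch_size=128, epochs=10):
    # model: an instance of the AE class above; train_x: array of shape (num_samples, input_dim)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        num_batches = len(train_x) // batch_size
        for epoch in range(epochs):
            np.random.shuffle(train_x)
            epoch_loss = 0.0
            for b in range(num_batches):
                batch = train_x[b * batch_size:(b + 1) * batch_size]
                # one Adam step on the mean-squared reconstruction loss
                _, loss = sess.run([model.solver, model.recon_loss],
                                   feed_dict={model.input: batch})
                epoch_loss += loss
            model.logger.info("epoch %d, mean recon loss %.6f",
                              epoch, epoch_loss / max(num_batches, 1))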
def run():
    # paths to data
    content_paths = []
    for c in FLAGS.contents:
        p = Path(c)
        if not p.exists():
            raise ValueError('The content image or directory does not exist: {}'.format(p))
        if p.is_dir():
            for f in p.glob('**/*.*'):
                content_paths.append(f)
        else:
            content_paths.append(p)
    style_path = Path(FLAGS.style)
    if not style_path.exists():
        raise ValueError('The style image does not exist: {}'.format(style_path))

    # output directory
    output_dir = Path(FLAGS.output) / style_path.stem
    if output_dir.exists():
        logging.warning('The folder will be deleted: {}'.format(output_dir))
        rm_path(output_dir)
    output_dir.mkdir(exist_ok=True, parents=True)

    # create model
    if not Path(FLAGS.decoder).exists():
        raise ValueError('The decoder model is not found: {}'.format(FLAGS.decoder))
    encoder = Encoder(input_shape=(None, None, 3), pretrained=True)
    content_feature_input = Input(shape=encoder.output_shape[-1][1:])
    style_feature_input = Input(shape=encoder.output_shape[-1][1:])
    adain = AdaIN(alpha=FLAGS.alpha)
    adain = Model(inputs=[content_feature_input, style_feature_input],
                  outputs=[adain([content_feature_input, style_feature_input])])
    decoder = Decoder(input_shape=encoder.output_shape[-1][1:])
    decoder.load_weights(FLAGS.decoder)

    # load and encode style image
    style = np.expand_dims(load_image(style_path, image_shape=(FLAGS.style_size, FLAGS.style_size)), axis=0)
    style_feature = encoder.predict(style)[-1]

    for content_path in tqdm(content_paths):
        # load and encode content image
        content = load_image(content_path)
        content = np.expand_dims(content, axis=0)
        content_feature = encoder.predict(content)[-1]
        # normalize the feature
        normalized_feature = adain.predict([content_feature, style_feature])
        # generate image
        generated = decoder.predict(normalized_feature)
        # save image
        img_path = output_dir / '{}.{}'.format(content_path.stem, FLAGS.ext)
        img = array_to_img(generated[0])
        img.save(img_path)
def __init__(self, speaker_emb_reduction=3):
    super(VQVC, self).__init__()
    self.name = 'VQVC'
    self.speaker_emb_reduction = speaker_emb_reduction
    self.encoder = Encoder(mel_channels=args.n_mels, z_dim=args.z_dim)
    self.codebook = VQEmbeddingEMA(args.n_embeddings, args.z_dim)
    self.decoder = Decoder(in_channels=args.z_dim, mel_channels=args.n_mels)
def __init__(self, config, name='model'):
    super(Model, self).__init__(name=name)
    self._normal_invvar = 1 / pow(config['normal_scale'], 2)
    self._normal_const = math.log(2 * math.pi / self._normal_invvar)
    self._seg_overlap = config['seg_overlap']
    with self._enter_variable_scope(check_same_graph=False):
        self._init = Initializer(config)
        self._upd = Updater(config)
        self._dec = Decoder(config)
        self._ln_grad_apc = snt.LayerNorm(axis=[-3, -2, -1], offset=False, scale=False, name='ln_grad_apc')
        self._ln_grad_mask = snt.LayerNorm(axis=[-3, -2, -1], offset=False, scale=False, name='ln_grad_mask')
        self._ln_pixel_ll = snt.LayerNorm(axis=[-3, -2, -1], offset=False, scale=False, name='ln_ll')
        self._ln_pixel_ll_excl = snt.LayerNorm(axis=[-3, -2, -1], offset=False, scale=False, name='ln_ll_exclude')
        self._ln_grad_post_param = snt.LayerNorm(axis=[-1], offset=False, scale=False, name='ln_grad_post_param')
def __init__(self, config):
    super(ModelBase, self).__init__()
    # Hyperparameters
    self.noise_prob = config['noise_prob']
    self.seg_overlap = config['seg_overlap']
    # Neural networks
    self.upd = Updater(config)
    self.dec = Decoder(config)
def infer(args):
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load the vocabulary
    with open(args['vocab_path'], 'rb') as f:
        vocab = pickle.load(f)
    with open(args['data_path'], 'rb') as f:
        Data = pickle.load(f)

    # Use model.eval() at test time so BatchNorm and Dropout use their trained statistics
    encoder = Encoder(args['embed_size'], args['pooling_kernel']).eval().cuda()
    decoder = Decoder(args['embed_size'], args['hidden_size'], len(vocab), args['num_layers']).cuda()

    # Load the trained parameters
    encoder.load_state_dict(torch.load(args['encoder_path']))
    decoder.load_state_dict(torch.load(args['decoder_path']))

    # Load the image
    image = load_image(args['val_img_path'], transform, (args['resize'], args['resize']))
    image_tensor = image.cuda()

    # Run the model and produce a caption
    feature = encoder(image_tensor)
    index = decoder.sample(feature)
    index = index[0].cpu().numpy()

    # Convert indices to words
    words = []
    for ind in index:
        word = vocab.idx2word[ind]
        words.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(words[1:-1])  # drop the special <start> and <end> tokens
    print(sentence)

    image = Image.open(args['val_img_path'])
    plt.imshow(np.asarray(image))
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])

    # Load vocabulary
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = Encoder(args.embed_size).eval()
    decoder = Decoder(args.embed_size, args.hidden_size, len(vocab), args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare an image
    image = load_image(args.image, transform)
    image_tensor = image.to(device)

    # Generate a caption from the image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()  # (1, max_seq_length) -> (max_seq_length)

    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    # Print out the image and the generated caption
    print(sentence)
    image = Image.open(args.image)
    plt.imshow(np.asarray(image))
class VQVC(nn.Module):
    """ VQVC

    Args:
        mels: (N, T, C)

    Returns:
        encode:
            z_enc: (N, T, z_dim)
            z_quan: (N, T, z_dim)
            c: (N, T, c_dim)
            indices: (N,
        forward:
            z_enc: (N, T, z_dim)
            z_quan: (N, T, z_dim)
            c: (N, T, c_dim)
            loss: (1, )
            perplexity: (1, )
    """

    def __init__(self, speaker_emb_reduction=3):
        super(VQVC, self).__init__()
        self.name = 'VQVC'
        self.speaker_emb_reduction = speaker_emb_reduction
        self.encoder = Encoder(mel_channels=args.n_mels, z_dim=args.z_dim)
        self.codebook = VQEmbeddingEMA(args.n_embeddings, args.z_dim)
        self.decoder = Decoder(in_channels=args.z_dim, mel_channels=args.n_mels)

    def average_through_time(self, x, dim):
        x = torch.mean(x, dim=dim, keepdim=True)
        return x

    def forward(self, mels):
        # encoder
        z_enc = self.encoder(mels)
        # quantization
        z_quan, commitment_loss, perplexity = self.codebook(z_enc)
        # speaker emb
        speaker_emb_ = z_enc - z_quan
        speaker_emb = self.average_through_time(speaker_emb_, dim=1)
        # decoder
        mels_hat = self.decoder(z_quan, speaker_emb)
        return mels_hat, commitment_loss, perplexity

    def evaluate(self, mels):
        # encoder
        z_enc = self.encoder(mels)
        # contents emb
        z_quan, commitment_loss, perplexity = self.codebook(z_enc)
        # speaker emb
        speaker_emb_ = z_enc - z_quan
        speaker_emb = self.average_through_time(speaker_emb_, dim=1)
        # decoder
        mels_hat, mels_code, mels_style = self.decoder.evaluate(z_quan, speaker_emb, speaker_emb_)
        return mels_hat, mels_code, mels_style, commitment_loss, perplexity

    def convert(self, src_mel, ref_mel):
        # source z_enc
        z_src_enc = self.encoder(src_mel)
        # source contents
        src_contents, _, _ = self.codebook(z_src_enc)
        # source style emb
        src_style_emb_ = z_src_enc - src_contents
        # ref z_enc
        ref_enc = self.encoder(ref_mel)
        # ref contents
        ref_contents, _, _ = self.codebook(ref_enc)
        # ref speaker emb
        ref_speaker_emb_ = ref_enc - ref_contents
        ref_speaker_emb = self.average_through_time(ref_speaker_emb_, dim=1)
        # decoder to generate mel
        mel_converted, mel_src_code, mel_src_style, mel_ref_code, mel_ref_style = self.decoder.convert(
            src_contents, src_style_emb_, ref_contents, ref_speaker_emb, ref_speaker_emb_)
        return mel_converted, mel_src_code, mel_src_style, mel_ref_code, mel_ref_style
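A brief usage sketch for the VQVC model above. The `args` namespace (in the original code this is a module-level config), the hyperparameter values, and the random tensors standing in for mel-spectrogram batches are all illustrative assumptions; only the call signatures follow the class definition.

import torch
from types import SimpleNamespace

# Stand-in for the module-level `args` the class reads; values are illustrative.
args = SimpleNamespace(n_mels=80, n_embeddings=256, z_dim=64)

model = VQVC()
model.eval()

src_mel = torch.randn(2, 128, args.n_mels)  # (N, T, C) source utterances
ref_mel = torch.randn(2, 128, args.n_mels)  # (N, T, C) reference-speaker utterances

with torch.no_grad():
    # training-style reconstruction pass
    mels_hat, commitment_loss, perplexity = model(src_mel)
    # voice conversion: source contents combined with the reference speaker embedding
    mel_converted, *_ = model.convert(src_mel, ref_mel)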
ndf = int(opt.ndf)
imageSize = int(opt.imageSize)
lr = opt.lr
gamma = opt.gamma


def weights_init(m):
    classname = m.__class__.__name__
    if classname.find('Conv') != -1:
        m.weight.data.normal_(0.0, 0.02)
    elif classname.find('BatchNorm') != -1:
        m.weight.data.normal_(1.0, 0.02)
        m.bias.data.fill_(0)


NetG = Decoder(nc, ngf, nz).to(device)
NetD = Discriminator(imageSize, nc, ndf, nz).to(device)
NetE = Encoder(imageSize, nc, ngf, nz).to(device)
Sampler = Sampler().to(device)

NetE.apply(weights_init)
NetG.apply(weights_init)
NetD.apply(weights_init)

# load weights
if opt.netE != '':
    NetE.load_state_dict(torch.load(opt.netE))
if opt.netG != '':
    NetG.load_state_dict(torch.load(opt.netG))
if opt.netD != '':
    NetD.load_state_dict(torch.load(opt.netD))
"network": torch.device(0), "images": torch.device(1), "test": torch.device(2) } csv_path = "../VOC2012/" train_path = csv_path + "train_v1.csv" test_path = csv_path + "test_v1.csv" os.makedirs(arg.save_dir, exist_ok=True) tensorboard = utils.TensorboardLogger("%s/tb" % (arg.save_dir)) E = nn.DataParallel(Encoder(), output_device=device["images"]).to(device["network"]) D = nn.DataParallel(Decoder(), output_device=device["images"]).to(device["network"]) loss = TotalLoss(device, (arg.batch_train, 3, *arg.resl)) optim = opt.Adam(list(E.parameters()) + list(D.parameters()), lr=arg.lr, betas=arg.betas) scheduler = opt.lr_scheduler.LambdaLR(optim, lr_lambda=lambda epoch: 0.965**epoch) train_loader = Loader(train_path, arg.batch_train, num_workers=arg.cpus, shuffle=True, drop_last=True)
def run():
    # create directories
    save_dir = Path(FLAGS.save_dir)
    if save_dir.exists():
        logging.warning('The directory can be overwritten: {}'.format(FLAGS.save_dir))
    save_dir.mkdir(exist_ok=True, parents=True)
    log_dir = Path(FLAGS.tensorboard)
    if log_dir.exists():
        logging.warning('The directory will be removed: {}'.format(FLAGS.tensorboard))
        rm_path(log_dir)
    log_dir.mkdir(exist_ok=True, parents=True)

    # to handle errors while loading images
    Image.MAX_IMAGE_PIXELS = None
    ImageFile.LOAD_TRUNCATED_IMAGES = True

    # image generator
    dataset = ContentStyleLoader(content_root=FLAGS.content_dir,
                                 content_image_shape=(FLAGS.image_size, FLAGS.image_size),
                                 content_crop='random',
                                 content_crop_size=FLAGS.crop_size,
                                 style_root=FLAGS.style_dir,
                                 style_image_shape=(FLAGS.image_size, FLAGS.image_size),
                                 style_crop='random',
                                 style_crop_size=FLAGS.crop_size,
                                 n_per_epoch=FLAGS.dataset_size,
                                 batch_size=FLAGS.batch_size)

    # create model
    encoder = Encoder(input_shape=(FLAGS.crop_size, FLAGS.crop_size, 3), pretrained=True, name='encoder')
    # freeze the encoder
    for l in encoder.layers:
        l.trainable = False
    adain = AdaIN(alpha=1.0, name='adain')
    decoder = Decoder(input_shape=encoder.output_shape[-1][1:], name='decoder')

    # placeholders for inputs
    content_input = Input(shape=(FLAGS.crop_size, FLAGS.crop_size, 3), name='content_input')
    style_input = Input(shape=(FLAGS.crop_size, FLAGS.crop_size, 3), name='style_input')

    # forwarding
    content_features = encoder(content_input)
    style_features = encoder(style_input)
    normalized_feature = adain([content_features[-1], style_features[-1]])
    generated = decoder(normalized_feature)

    # loss calculation
    generated_features = encoder(generated)
    content_loss = Lambda(calculate_content_loss, name='content_loss')(
        [normalized_feature, generated_features[-1]])
    style_loss = Lambda(calculate_style_loss, name='style_loss')(
        [style_features, generated_features])
    loss = Lambda(
        lambda x: FLAGS.content_weight * x[0] + FLAGS.style_weight * x[1],
        name='loss')([content_loss, style_loss])

    # trainer
    trainer = Model(inputs=[content_input, style_input], outputs=[loss])
    optim = optimizers.Adam(learning_rate=FLAGS.learning_rate)
    trainer.compile(optimizer=optim, loss=lambda _, y_pred: y_pred)
    trainer.summary()

    # callbacks
    callbacks = [
        # learning rate scheduler
        LearningRateScheduler(lambda epoch, _: FLAGS.learning_rate / (
            1.0 + FLAGS.learning_rate_decay * FLAGS.dataset_size * epoch)),
        # TensorBoard
        TensorBoard(str(log_dir), write_graph=False, update_freq='batch'),
        # save model
        SubmodelCheckpoint(
            str(save_dir / 'decoder.epoch-{epoch:d}.h5'),
            submodel_name='decoder',
            save_weights_only=True,
            save_best_only=FLAGS.save_best_only,
            save_freq=FLAGS.save_every if FLAGS.save_every else 'epoch')
    ]

    # train
    trainer.fit_generator(dataset,
                          epochs=FLAGS.epochs,
                          workers=FLAGS.workers,
                          callbacks=callbacks)
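For reference, the core computation behind the AdaIN layer used above can be sketched in a few lines of NumPy: the content feature map is normalized per channel and rescaled to the style feature map's channel-wise statistics. The function name, epsilon, and NHWC layout are assumptions for illustration; the repo's layer may differ in details.

import numpy as np

def adain_reference(content_feat, style_feat, alpha=1.0, eps=1e-5):
    # NHWC feature maps: match channel-wise mean/std of content to those of style
    c_mean = content_feat.mean(axis=(1, 2), keepdims=True)
    c_std = content_feat.std(axis=(1, 2), keepdims=True)
    s_mean = style_feat.mean(axis=(1, 2), keepdims=True)
    s_std = style_feat.std(axis=(1, 2), keepdims=True)
    stylized = s_std * (content_feat - c_mean) / (c_std + eps) + s_mean
    # alpha blends the re-stylized feature with the original content feature
    return alpha * stylized + (1.0 - alpha) * content_feat

content = np.random.rand(1, 32, 32, 512).astype(np.float32)
style = np.random.rand(1, 32, 32, 512).astype(np.float32)
print(adain_reference(content, style).shape)  # (1, 32, 32, 512)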
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])

    # Load vocabulary
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = Encoder(args.embed_size).eval()
    decoder = Decoder(args.embed_size, args.hidden_size, len(vocab), args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Load the validation image set
    lis = os.listdir(args.image_dir)
    num = len(lis)
    captions = []
    for i in range(num):
        im_pth = os.path.join(args.image_dir, lis[i])
        image = load_image(im_pth, transform)
        image_tensor = image.to(device)

        # Generate a caption from the image
        feature = encoder(image_tensor)
        sampled_ids = decoder.sample(feature)
        sampled_ids = sampled_ids[0].cpu().numpy()  # (1, max_seq_length) -> (max_seq_length)

        # Convert word_ids to words
        sampled_caption = []
        for word_id in sampled_ids:
            word = vocab.idx2word[word_id]
            if word == '<start>':
                continue
            if word == '<end>':
                break
            sampled_caption.append(word)
        sentence = ' '.join(sampled_caption)

        cap = {}
        id = int(lis[i][14:-4])  # extract image id
        cap['image_id'] = id
        cap['caption'] = sentence
        captions.append(cap)

    # save results
    with open('captions_res.json', 'w') as f:
        json.dump(captions, f)

    # evaluation with coco-caption evaluation tools
    coco = COCO(args.caption_path)
    cocoRes = coco.loadRes('captions_res.json')
    cocoEval = COCOEvalCap(coco, cocoRes)
    cocoEval.params['image_id'] = cocoRes.getImgIds()
    cocoEval.evaluate()
def main(args):
    # Create model directory for saving trained models
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing: augmentation and normalization for the pretrained resnet
    transform = transforms.Compose([
        transforms.RandomCrop(args.im_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab, transform,
                             args.batch_size, shuffle=True, num_workers=args.num_workers)

    # Configure the network
    encoder = Encoder(args.embed_size).to(device)
    decoder = Decoder(args.embed_size, args.hidden_size, len(vocab), args.num_layers).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            # mini-batch
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

            # Forward, backward and optimize
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            # Log info
            if i % args.log_step == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(
                    epoch, args.num_epochs, i, total_step, loss.item()))

            # Save the model checkpoints
            if (i + 1) % args.save_step == 0:
                torch.save(decoder.state_dict(), os.path.join(args.model_path, 'decoder.ckpt'))
                torch.save(encoder.state_dict(), os.path.join(args.model_path, 'encoder.ckpt'))
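The training loop above compares the decoder outputs against pack_padded_sequence(captions, lengths, batch_first=True)[0]. A tiny standalone example of what that [0] (the packed .data tensor) contains, using made-up token indices:

import torch
from torch.nn.utils.rnn import pack_padded_sequence

# Toy padded captions (batch_first) and their true lengths, sorted descending
captions = torch.tensor([[1, 4, 7, 2, 0],   # <start> w w <end> <pad>
                         [1, 5, 2, 0, 0]])  # <start> w <end> <pad> <pad>
lengths = [4, 3]

targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]
print(targets)        # tensor([1, 1, 4, 5, 7, 2, 2]) -- padding dropped, time-major order
print(targets.shape)  # torch.Size([7]) == sum(lengths), matching the flattened decoder outputs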
def look_network(device: str):
    pos_encoding = PositionalEncoding(10000, 512)(torch.zeros(1, 64, 512))
    plt.pcolormesh(pos_encoding[0].numpy(), cmap="RdBu")
    plt.xlabel("Depth")
    plt.xlim((0, 512))
    plt.ylabel("Position")
    plt.colorbar()
    plt.show()

    y = torch.rand(1, 60, 512)
    out = ScaledDotProductAttention()(y, y, y)
    print("Dot Attention Shape", out[0].shape, out[1].shape)

    temp_mha = MultiHeadAttention(features=512, num_heads=8)
    out, attn = temp_mha(q=torch.rand(1, 45, 512), k=y, v=y, mask=None)
    print("Multi Attention Shape", out.shape, attn.shape)

    sample_ffn = FeedForwardNetwork(512, 2048)
    print("Feed Forward Shape", sample_ffn(torch.rand(64, 50, 512)).shape)

    sample_encoder_layer = EncoderLayer(512, 8, 2048)
    sample_encoder_layer_output = sample_encoder_layer(torch.rand(64, 43, 512), None)
    print("Encoder Shape", sample_encoder_layer_output.shape)  # (batch_size, input_seq_len, d_model)

    sample_encoder_layer = EncoderLayer(512, 8, 2048)
    sample_encoder_layer_output = sample_encoder_layer(torch.rand(64, 50, 512), None)
    print("Encoder Shape", sample_encoder_layer_output.shape)  # (batch_size, input_seq_len, d_model)

    sample_encoder = Encoder(
        num_layers=2,
        features=512,
        num_heads=8,
        fffeatures=2048,
        input_vocab_size=8500,
        maximum_position_encoding=10000,
    ).to(device)
    temp_input = torch.rand(64, 62).type(torch.LongTensor).to(device)
    sample_encoder_output = sample_encoder(temp_input, mask=None)
    print("Encoder Shape", sample_encoder_output.shape)  # (batch_size, input_seq_len, d_model)

    sample_decoder = Decoder(
        num_layers=2,
        features=512,
        num_heads=8,
        fffeatures=2048,
        target_vocab_size=8500,
        maximum_position_encoding=10000,
    ).to(device)
    temp_input = torch.rand(64, 26).type(torch.LongTensor).to(device)
    output, attn = sample_decoder(
        temp_input,
        enc_output=sample_encoder_output,
        look_ahead_mask=None,
        padding_mask=None,
    )
    print("Decoder Shape", output.shape, attn["decoder_layer2_block2"].shape)
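The sanity-check calls above pass mask=None everywhere. As a hedged sketch (not taken from this repo), padding and look-ahead masks for this kind of Transformer are commonly built as below, with 1 marking a position to be masked out; the function names and the exact broadcast shape expected by the attention layers here are assumptions.

import torch

def create_padding_mask(seq: torch.Tensor, pad_id: int = 0) -> torch.Tensor:
    # (batch, seq_len) -> (batch, 1, 1, seq_len); 1.0 where the token is padding
    return (seq == pad_id).float()[:, None, None, :]

def create_look_ahead_mask(size: int) -> torch.Tensor:
    # (size, size) strictly upper-triangular matrix; 1.0 blocks attention to future positions
    return torch.triu(torch.ones(size, size), diagonal=1)

tokens = torch.tensor([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0]])
print(create_padding_mask(tokens).shape)  # torch.Size([2, 1, 1, 5])
print(create_look_ahead_mask(4))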
x_train = LoadIndexDataset('./index_dataset/index_train_source_8000.txt', src_i2wDict)
y_train = LoadIndexDataset('./index_dataset/index_train_target_8000.txt', src_i2wDict)
x_train = x_train[:100]
y_train = y_train[:100]

hidden_dim = 256
BATCH_SIZE = 1
EPOCH_NUM = 10
embed_dim = 50

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder = Encoder(src_vocab_size, embed_dim, hidden_dim)
decoder = Decoder(tag_vocab_size, embed_dim, hidden_dim)
network = Net(encoder, decoder, device, teacher_forcing_ratio=0.5)

loss_fn = nn.CrossEntropyLoss()  # cross-entropy loss
optimizer = torch.optim.Adam(network.parameters())  # Adam optimizer

for epoch in range(EPOCH_NUM):
    print('*********************************')
    print('epoch: ', epoch + 1, 'of', EPOCH_NUM)
    i = 0
    while i * BATCH_SIZE < len(x_train):
        if (i + 1) * BATCH_SIZE < len(x_train):
            inputs = x_train[i * BATCH_SIZE:(i + 1) * BATCH_SIZE]
            target = y_train[i * BATCH_SIZE:(i + 1) * BATCH_SIZE]
        else:
            inputs = x_train[i * BATCH_SIZE:]
ngf = int(opt.ngf)
ndf = int(opt.ndf)
imageSize = int(opt.imageSize)


def weights_init(m):
    classname = m.__class__.__name__
    if classname.find('Conv') != -1:
        m.weight.data.normal_(0.0, 0.02)
    elif classname.find('BatchNorm') != -1:
        m.weight.data.normal_(1.0, 0.02)
        m.bias.data.fill_(0)


NetE = Encoder(imageSize, nc, ngf, nz).to(device)
NetG = Decoder(nc, ngf, nz).to(device)
Sampler = Sampler().to(device)

NetE.apply(weights_init)
NetG.apply(weights_init)

# load weights
NetE.load_state_dict(torch.load(opt.netE, map_location=opt.cuda))
NetG.load_state_dict(torch.load(opt.netG, map_location=opt.cuda))

NetE.eval()
NetG.eval()

# 21 attributes
attributes = [
def __init__(self, opt):
    super(Generator, self).__init__()
    self.encoder1 = Encoder(opt.ngpu, opt, opt.nz)
    self.decoder = Decoder(opt.ngpu, opt)
lr = opt.lr
gamma = opt.gamma


def weights_init(m):
    classname = m.__class__.__name__
    if classname.find('Conv') != -1:
        m.weight.data.normal_(0.0, 0.02)
    elif classname.find('BatchNorm') != -1:
        m.weight.data.normal_(1.0, 0.02)
        m.bias.data.fill_(0)


NetE = Encoder(imageSize, nc, ngf, nz).to(device)
Sampler = Sampler().to(device)
NetG = Decoder(nc, ngf, nz).to(device)

NetE.apply(weights_init)
NetG.apply(weights_init)

# load weights
if opt.netE != '':
    NetE.load_state_dict(torch.load(opt.netE))
if opt.netG != '':
    NetG.load_state_dict(torch.load(opt.netG))

optimizer_encorder = optim.RMSprop(params=NetE.parameters(), lr=lr, alpha=0.9, eps=1e-8,
                                   weight_decay=0, momentum=0, centered=False)
optimizer_decoder = optim.RMSprop(params=NetG.parameters(), lr=lr, alpha=0.9, eps=1e-8,
                                  weight_decay=0, momentum=0, centered=False)
latent_list = []
for i in range(5):
    latent_list.append(
        UniformLatent(in_dim=1, out_dim=1, low=-1.0, high=1.0, apply_reg=True))
latent_list.append(
    UniformLatent(in_dim=5, out_dim=5, low=-1.0, high=1.0, apply_reg=False))
latent = JointLatent(latent_list=latent_list)

decoder = Decoder(output_width=width, output_height=height, output_depth=depth)
infoGANDiscriminator = InfoGANDiscriminator(output_length=latent.reg_out_dim)
crDiscriminator = CrDiscriminator(output_length=latent.num_reg_latent)

checkpoint_dir = "./test/checkpoint"
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)
sample_dir = "./test/sample"
if not os.path.exists(sample_dir):
    os.makedirs(sample_dir)
time_path = "./test/time.txt"
metric_path = "./test/metric.csv"
epoch = 40
def train(args):
    # Preprocess the data: build the vocabulary and the data file
    preprocess(args['cap_path'], args['vocab_path'], args['data_path'])

    if not os.path.exists(args['model_path']):
        os.mkdir(args['model_path'])

    # Image preprocessing with data augmentation
    transform = transforms.Compose([
        transforms.Resize((args['resize'], args['resize'])),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    with open(args['vocab_path'], 'rb') as f:
        vocab = pickle.load(f)
    with open(args['data_path'], 'rb') as f:
        Data = pickle.load(f)

    data_loader = get_loader(args['train_img_path'], Data, vocab, transform,
                             args['batch_size'], shuffle=True, num_workers=args['num_workers'])

    encoder = Encoder(args['embed_size'], args['pooling_kernel']).cuda()
    decoder = Decoder(args['embed_size'], args['hidden_size'], len(vocab), args['num_layers']).cuda()

    criterion = nn.CrossEntropyLoss().cuda()
    params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args['learning_rate'])

    total_step = len(data_loader)
    for epoch in range(args['num_epochs']):
        for i, (images, captions, lengths) in enumerate(data_loader):
            images = images.cuda()
            captions = captions.cuda()
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            # Print training info
            if i % args['log_step'] == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                      .format(epoch, args['num_epochs'], i, total_step,
                              loss.item(), np.exp(loss.item())))

            # Save model checkpoints
            if (i + 1) % args['save_step'] == 0:
                torch.save(decoder.state_dict(),
                           os.path.join(args['model_path'], 'decoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
                torch.save(encoder.state_dict(),
                           os.path.join(args['model_path'], 'encoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))

        # Also save the model at the end of every epoch
        torch.save(decoder.state_dict(),
                   os.path.join(args['model_path'], 'decoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
        torch.save(encoder.state_dict(),
                   os.path.join(args['model_path'], 'encoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))