def get_latent(args):
    """Encode the full multimodal dataset and save the latent vectors as CSV.

    Loads the pickled word-embedding matrix and the word-index JSON for
    ``args.target_dataset``, rebuilds the multimodal encoder, restores its
    weights from ``args.checkpoint`` and runs every batch of the full
    dataset through it. The result is one row per sample, indexed by
    ``short_code`` and sorted by that index, written to ``CONFIG.CSV_PATH``
    as ``latent_<target_dataset>[_normalize][_add_latent][_no_decode].csv``.

    Args:
        args: Run options. Reads ``gpu``, ``target_dataset``, ``batch_size``,
            ``filter_shape``, ``filter_size``, ``encode_latent``,
            ``image_embedding_dim``, ``num_layer``, ``latent_size``,
            ``normalize``, ``add_latent``, ``no_decode`` and ``checkpoint``;
            sets ``args.t3``.
    """
    device = torch.device(args.gpu)
    dataset_dir = os.path.join(CONFIG.DATASET_PATH, args.target_dataset)

    print("Loading embedding model...")
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(os.path.join(dataset_dir, 'word_embedding.p'), "rb") as f:
        text_embedding_model = cPickle.load(f)
    with open(os.path.join(dataset_dir, 'word_idx.json'), "r",
              encoding='utf-8') as f:
        word_idx = json.load(f)
    print("Loading embedding model completed")

    print("Loading dataset...")
    full_dataset = load_fullmultimodal_data(args, CONFIG, word2idx=word_idx[1])
    print("Loading dataset completed")
    full_loader = DataLoader(full_dataset,
                             batch_size=args.batch_size,
                             shuffle=False)

    # Presumably the sequence length left after each of the text encoder's
    # two convolutions; the "2" is the stride size.
    t1 = CONFIG.MAX_SENTENCE_LEN
    t2 = int(math.floor((t1 - args.filter_shape) / 2) + 1)
    t3 = int(math.floor((t2 - args.filter_shape) / 2) + 1)
    args.t3 = t3

    text_embedding = nn.Embedding.from_pretrained(
        torch.FloatTensor(text_embedding_model))
    text_encoder = text_model.ConvolutionEncoder(text_embedding, t3,
                                                 args.filter_size,
                                                 args.filter_shape,
                                                 args.encode_latent)
    imgseq_encoder = imgseq_model.RNNEncoder(args.image_embedding_dim,
                                             args.num_layer,
                                             args.encode_latent,
                                             bidirectional=True)
    multimodal_encoder = multimodal_model.MultimodalEncoder(
        text_encoder, imgseq_encoder, args.latent_size, args.normalize,
        args.add_latent)
    # Load onto CPU first; the module is moved to the target device below.
    checkpoint = torch.load(os.path.join(CONFIG.CHECKPOINT_PATH,
                                         args.checkpoint),
                            map_location=lambda storage, loc: storage)
    multimodal_encoder.load_state_dict(checkpoint['multimodal_encoder'])
    multimodal_encoder.to(device)
    multimodal_encoder.eval()

    csv_name = 'latent_' + args.target_dataset
    if args.normalize:
        csv_name = csv_name + "_normalize"
    if args.add_latent:
        csv_name = csv_name + "_add_latent"
    if args.no_decode:
        csv_name = csv_name + "_no_decode"
    csv_name = csv_name + '.csv'

    short_code_list = []
    row_list = []
    for text_batch, imgseq_batch, short_code in tqdm(full_loader):
        torch.cuda.empty_cache()
        # The forward pass must sit inside no_grad as well, otherwise an
        # autograd graph is built (and kept alive) for every batch.
        with torch.no_grad():
            text_feature = text_batch.to(device)
            imgseq_feature = imgseq_batch.to(device)
            h = multimodal_encoder(text_feature, imgseq_feature)
        # One device-to-host copy per batch instead of one per sample.
        short_code_list.extend(short_code)
        row_list.extend(h.detach().cpu().numpy().tolist())
        del text_feature, imgseq_feature, h

    result_df = pd.DataFrame(data=row_list,
                             index=short_code_list,
                             columns=list(range(args.latent_size)))
    result_df.index.name = "short_code"
    result_df.sort_index(inplace=True)
    result_df.to_csv(os.path.join(CONFIG.CSV_PATH, csv_name),
                     encoding='utf-8-sig')
    print("Finish!!!")
# Example #2
# 0
def get_latent(args):
    """Encode the full dataset and write one latent vector per sample to CSV.

    Loads a torchvision model (only to read its ``fc.in_features``) and the
    FastText vectors for ``args.target_dataset``, rebuilds the multimodal
    encoder, restores its weights from ``args.checkpoint`` and writes
    ``latent_features.csv`` under ``CONFIG.CSV_PATH``. Each CSV row is the
    sample's short code followed by its latent values.

    Args:
        args: Run options. Reads ``gpu``, ``arch``, ``target_dataset``,
            ``batch_size``, ``filter_shape``, ``filter_size``,
            ``num_layer``, ``latent_size`` and ``checkpoint``; sets
            ``image_embedding_dim``, ``text_embedding_dim`` and ``t3``.
    """
    device = torch.device(args.gpu)
    print("Loading embedding model...")
    image_embedding_model = models.__dict__[args.arch](pretrained=True)
    image_embedding_dim = image_embedding_model.fc.in_features
    args.image_embedding_dim = image_embedding_dim
    model_name = 'FASTTEXT_' + args.target_dataset + '.model'
    text_embedding_model = FastTextKeyedVectors.load(
        os.path.join(CONFIG.EMBEDDING_PATH, model_name))
    text_embedding_dim = text_embedding_model.vector_size
    args.text_embedding_dim = text_embedding_dim
    print("Building index...")
    # NOTE(review): `indexer` is not used below. It is kept because building
    # it may touch the embedding model's internal state - confirm before
    # removing.
    indexer = AnnoyIndexer(text_embedding_model, 10)
    print("Loading embedding model completed")
    print("Loading dataset...")
    full_dataset = load_full_data(args,
                                  CONFIG,
                                  text_embedding_model,
                                  total=True)
    print("Loading dataset completed")
    full_loader = DataLoader(full_dataset,
                             batch_size=args.batch_size,
                             shuffle=False)

    # Presumably the sequence length left after each of the text encoder's
    # two convolutions; the "2" is the stride size.
    t1 = CONFIG.MAX_SENTENCE_LEN
    t2 = int(math.floor((t1 - args.filter_shape) / 2) + 1)
    t3 = int(math.floor((t2 - args.filter_shape) / 2) + 1)
    args.t3 = t3

    # Only the encoders are needed to produce latents; the decoders that
    # used to be built here were never used.
    text_encoder = text_model.ConvolutionEncoder(text_embedding_dim, t3,
                                                 args.filter_size,
                                                 args.filter_shape,
                                                 args.latent_size)
    imgseq_encoder = imgseq_model.RNNEncoder(image_embedding_dim,
                                             args.num_layer,
                                             args.latent_size,
                                             bidirectional=True)
    # Load onto CPU first; the module is moved to the target device below.
    checkpoint = torch.load(os.path.join(CONFIG.CHECKPOINT_PATH,
                                         args.checkpoint),
                            map_location=lambda storage, loc: storage)
    multimodal_encoder = multimodal_model.MultimodalEncoder(
        text_encoder, imgseq_encoder, args.latent_size)
    multimodal_encoder.load_state_dict(checkpoint['multimodal_encoder'])
    multimodal_encoder.to(device)
    multimodal_encoder.eval()

    csv_path = os.path.join(CONFIG.CSV_PATH, 'latent_features.csv')
    # `with` closes the file even if encoding fails part-way; newline='' is
    # what the csv module expects so it controls line endings itself.
    with open(csv_path, 'w', encoding='utf-8', newline='') as f_csv:
        wr = csv.writer(f_csv)
        for text_batch, imgseq_batch, short_code in full_loader:
            torch.cuda.empty_cache()
            # Keep the forward pass inside no_grad so no autograd graph is
            # built during inference.
            with torch.no_grad():
                text_feature = text_batch.to(device)
                imgseq_feature = imgseq_batch.to(device)
                h = multimodal_encoder(text_feature, imgseq_feature)
            latent_rows = h.detach().cpu().numpy().tolist()
            # One CSV row per sample. Previously the whole batch was written
            # as a single row whose cells were Python list reprs.
            for _short_code, _latent in zip(short_code, latent_rows):
                wr.writerow([_short_code] + _latent)
            del text_feature, imgseq_feature, h
    print("Finish!!!")
def get_latent(args):
    """Encode the full multimodal dataset and write the latents to CSV.

    Loads the pickled word-embedding matrix and the word-index JSON for
    ``args.target_dataset``, rebuilds the multimodal encoder, restores its
    weights from ``args.checkpoint`` and writes
    ``latent_<target_dataset>.csv`` under ``CONFIG.CSV_PATH``. Each CSV row
    is the sample's short code followed by its latent values.

    Args:
        args: Run options. Reads ``gpu``, ``target_dataset``, ``batch_size``,
            ``filter_shape``, ``filter_size``, ``image_embedding_dim``,
            ``num_layer``, ``latent_size`` and ``checkpoint``; sets
            ``args.t3``.
    """
    device = torch.device(args.gpu)
    dataset_dir = os.path.join(CONFIG.DATASET_PATH, args.target_dataset)

    print("Loading embedding model...")
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(os.path.join(dataset_dir, 'word_embedding.p'), "rb") as f:
        text_embedding_model = cPickle.load(f)
    with open(os.path.join(dataset_dir, 'word_idx.json'), "r",
              encoding='utf-8') as f:
        word_idx = json.load(f)
    print("Loading embedding model completed")

    print("Loading dataset...")
    full_dataset = load_fullmultimodal_data(args, CONFIG, word2idx=word_idx[1])
    print("Loading dataset completed")
    full_loader = DataLoader(full_dataset,
                             batch_size=args.batch_size,
                             shuffle=False)

    # Presumably the sequence length left after each of the text encoder's
    # two convolutions; the "2" is the stride size.
    t1 = CONFIG.MAX_SENTENCE_LEN
    t2 = int(math.floor((t1 - args.filter_shape) / 2) + 1)
    t3 = int(math.floor((t2 - args.filter_shape) / 2) + 1)
    args.t3 = t3

    # Only the encoders are needed to produce latents; the decoders that
    # used to be built here were never used.
    text_embedding = nn.Embedding.from_pretrained(
        torch.FloatTensor(text_embedding_model))
    text_encoder = text_model.ConvolutionEncoder(text_embedding, t3,
                                                 args.filter_size,
                                                 args.filter_shape,
                                                 args.latent_size)
    imgseq_encoder = imgseq_model.RNNEncoder(args.image_embedding_dim,
                                             args.num_layer,
                                             args.latent_size,
                                             bidirectional=True)
    multimodal_encoder = multimodal_model.MultimodalEncoder(
        text_encoder, imgseq_encoder, args.latent_size)
    # Load onto CPU first; the module is moved to the target device below.
    checkpoint = torch.load(os.path.join(CONFIG.CHECKPOINT_PATH,
                                         args.checkpoint),
                            map_location=lambda storage, loc: storage)
    multimodal_encoder.load_state_dict(checkpoint['multimodal_encoder'])
    multimodal_encoder.to(device)
    multimodal_encoder.eval()

    csv_path = os.path.join(CONFIG.CSV_PATH,
                            'latent_' + args.target_dataset + '.csv')
    # `with` closes the file even if encoding fails part-way; newline='' is
    # what the csv module expects so it controls line endings itself.
    with open(csv_path, 'w', encoding='utf-8-sig', newline='') as f_csv:
        wr = csv.writer(f_csv)
        for text_batch, imgseq_batch, short_code in tqdm(full_loader):
            torch.cuda.empty_cache()
            # Keep the forward pass inside no_grad so no autograd graph is
            # built during inference.
            with torch.no_grad():
                text_feature = text_batch.to(device)
                imgseq_feature = imgseq_batch.to(device)
                h = multimodal_encoder(text_feature, imgseq_feature)
            # One device-to-host copy per batch instead of one per sample.
            latent_rows = h.detach().cpu().numpy().tolist()
            for _short_code, _latent in zip(short_code, latent_rows):
                wr.writerow([_short_code] + _latent)
            del text_feature, imgseq_feature, h
    print("Finish!!!")
def train_reconstruction(args):
    """Train the multimodal (text + image-sequence) autoencoder.

    Builds the text and image-sequence encoders/decoders, optionally
    initialises them from the single-modality ``*_epoch_100.pt``
    checkpoints (``args.pretrained``) or resumes a previous run
    (``args.resume``), then trains with AdamW under a cyclical learning
    rate. The loss is the sum of an NLL text loss and an MSE image-sequence
    loss. After every epoch the model is evaluated on the validation split
    and a checkpoint is saved via ``util.save_models``.

    Args:
        args: Run options. Reads ``gpu``, ``target_dataset``, ``batch_size``,
            ``shuffle``, ``filter_shape``, ``filter_size``, ``encode_latent``,
            ``decode_latent``, ``latent_size``, ``image_embedding_dim``,
            ``num_layer``, ``tau``, ``pretrained``, ``resume``,
            ``normalize``, ``add_latent``, ``no_decode``, ``weight_decay``,
            ``half_cycle_interval``, ``lr``, ``lr_factor``, ``epochs`` and
            ``log_interval``; sets ``args.t3``.
    """
    device = torch.device(args.gpu)
    dataset_dir = os.path.join(CONFIG.DATASET_PATH, args.target_dataset)

    print("Loading embedding model...")
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(os.path.join(dataset_dir, 'word_embedding.p'), "rb") as f:
        text_embedding_model = cPickle.load(f)
    with open(os.path.join(dataset_dir, 'word_idx.json'), "r",
              encoding='utf-8') as f:
        word_idx = json.load(f)
    print("Loading embedding model completed")

    print("Loading dataset...")
    train_dataset, val_dataset = load_multimodal_data(args,
                                                      CONFIG,
                                                      word2idx=word_idx[1])
    print("Loading dataset completed")
    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              shuffle=args.shuffle)
    val_loader = DataLoader(val_dataset,
                            batch_size=args.batch_size,
                            shuffle=False)

    # Presumably the sequence length left after each of the text encoder's
    # two convolutions; the "2" is the stride size.
    t1 = CONFIG.MAX_SENTENCE_LEN
    t2 = int(math.floor((t1 - args.filter_shape) / 2) + 1)
    t3 = int(math.floor((t2 - args.filter_shape) / 2) + 1)
    args.t3 = t3
    text_embedding = nn.Embedding.from_pretrained(
        torch.FloatTensor(text_embedding_model))

    # Construction order is kept as-is so that random weight initialisation
    # consumes the RNG in the same sequence as before.
    text_encoder = text_model.ConvolutionEncoder(text_embedding, t3,
                                                 args.filter_size,
                                                 args.filter_shape,
                                                 args.encode_latent)
    imgseq_encoder = imgseq_model.RNNEncoder(args.image_embedding_dim,
                                             args.num_layer,
                                             args.encode_latent,
                                             bidirectional=True)
    text_decoder = text_model.DeconvolutionDecoder(text_embedding, args.tau,
                                                   t3, args.filter_size,
                                                   args.filter_shape,
                                                   args.decode_latent, device)
    imgseq_decoder = imgseq_model.RNNDecoder(CONFIG.MAX_SEQUENCE_LEN,
                                             args.image_embedding_dim,
                                             args.num_layer,
                                             args.decode_latent,
                                             bidirectional=True)

    if args.pretrained:
        # (module, checkpoint file prefix, latent size in the file name,
        #  key of the state dict inside the checkpoint)
        pretrained_parts = (
            (text_encoder, "text_autoencoder_", args.encode_latent,
             'text_encoder'),
            (text_decoder, "text_autoencoder_", args.decode_latent,
             'text_decoder'),
            (imgseq_encoder, "imgseq_autoencoder_", args.encode_latent,
             'imgseq_encoder'),
            (imgseq_decoder, "imgseq_autoencoder_", args.decode_latent,
             'imgseq_decoder'),
        )
        for module, prefix, latent, state_key in pretrained_parts:
            part_checkpoint = torch.load(
                os.path.join(CONFIG.CHECKPOINT_PATH,
                             prefix + str(latent) + "_epoch_100.pt"),
                map_location=lambda storage, loc: storage)
            module.load_state_dict(part_checkpoint[state_key])
            # Drop each checkpoint before loading the next one.
            del part_checkpoint

    multimodal_encoder = multimodal_model.MultimodalEncoder(
        text_encoder, imgseq_encoder, args.latent_size, args.normalize,
        args.add_latent)
    multimodal_decoder = multimodal_model.MultimodalDecoder(
        text_decoder, imgseq_decoder, args.latent_size,
        CONFIG.MAX_SEQUENCE_LEN, args.no_decode)

    if args.resume:
        print("Restart from checkpoint")
        checkpoint = torch.load(os.path.join(CONFIG.CHECKPOINT_PATH,
                                             args.resume),
                                map_location=lambda storage, loc: storage)
        start_epoch = checkpoint['epoch']
        multimodal_encoder.load_state_dict(checkpoint['multimodal_encoder'])
        multimodal_decoder.load_state_dict(checkpoint['multimodal_decoder'])
    else:
        print("Start from initial")
        start_epoch = 0

    multimodal_autoencoder = multimodal_model.MultimodalAutoEncoder(
        multimodal_encoder, multimodal_decoder)
    text_criterion = nn.NLLLoss().to(device)
    imgseq_criterion = nn.MSELoss().to(device)
    multimodal_autoencoder.to(device)

    # Base lr is 1.0 because LambdaLR multiplies it by the value returned
    # from `clr`.
    optimizer = AdamW(multimodal_autoencoder.parameters(),
                      lr=1.,
                      weight_decay=args.weight_decay,
                      amsgrad=True)
    step_size = args.half_cycle_interval * len(train_loader)
    clr = cyclical_lr(step_size,
                      min_lr=args.lr,
                      max_lr=args.lr * args.lr_factor)
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, [clr])
    if args.resume:
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])
        # Everything needed has been copied out of the checkpoint.
        del checkpoint

    exp_name = "Multimodal autoencoder"
    if args.normalize:
        exp_name = exp_name + "normalize"
    if args.add_latent:
        exp_name = exp_name + "add_latent"
    if args.no_decode:
        exp_name = exp_name + "no_decode"

    # The checkpoint name does not change between epochs, so build it once.
    save_name = "multimodal_autoencoder"
    if args.normalize:
        save_name = save_name + "_normalize"
    if args.add_latent:
        save_name = save_name + "_add_latent"
    if args.no_decode:
        save_name = save_name + "_no_decode"

    exp = Experiment(exp_name, capture_io=False)
    try:
        # Inside the try so the experiment is closed even if logging a
        # parameter fails.
        for arg, value in vars(args).items():
            exp.param(arg, value)

        for epoch in range(start_epoch, args.epochs):
            # Re-enter training mode every epoch: the evaluation helper
            # called at the end of each epoch may leave the model in eval
            # mode.
            multimodal_autoencoder.train()
            print("Epoch: {}".format(epoch))
            for steps, (text_batch, imgseq_batch) in enumerate(train_loader):
                torch.cuda.empty_cache()
                text_feature = text_batch.to(device)
                imgseq_feature = imgseq_batch.to(device)
                optimizer.zero_grad()
                text_prob, imgseq_feature_hat = multimodal_autoencoder(
                    text_feature, imgseq_feature)
                text_loss = text_criterion(text_prob.transpose(1, 2),
                                           text_feature)
                imgseq_loss = imgseq_criterion(imgseq_feature_hat,
                                               imgseq_feature)
                loss = text_loss + imgseq_loss
                del text_loss, imgseq_loss
                loss.backward()
                optimizer.step()
                scheduler.step()

                if (steps * args.batch_size) % args.log_interval == 0:
                    # Show the first sample of the batch next to its
                    # reconstruction as a quick sanity check.
                    input_data = text_feature[0]
                    single_data = text_prob[0]
                    predict_index = torch.max(single_data, 1)[1]
                    input_sentence = util.transform_idx2word(
                        input_data.detach().cpu().numpy(),
                        idx2word=word_idx[0])
                    predict_sentence = util.transform_idx2word(
                        predict_index.detach().cpu().numpy(),
                        idx2word=word_idx[0])
                    print("Epoch: {} at {} lr: {}".format(
                        epoch, str(datetime.datetime.now()),
                        str(scheduler.get_lr())))
                    print("Steps: {}".format(steps))
                    print("Loss: {}".format(loss.detach().item()))
                    print("Input Sentence:")
                    print(input_sentence)
                    print("Output Sentence:")
                    print(predict_sentence)
                    del input_data, single_data, predict_index
                del text_feature, text_prob, imgseq_feature
                del imgseq_feature_hat, loss

            exp.log("\nEpoch: {} at {} lr: {}".format(
                epoch, str(datetime.datetime.now()), str(scheduler.get_lr())))
            (_avg_text_loss, _avg_imgseq_loss, _avg_loss, _rouge_1,
             _rouge_2) = eval_reconstruction_with_rouge(
                 multimodal_autoencoder, word_idx[0], text_criterion,
                 imgseq_criterion, val_loader, device)
            exp.log(
                "\nEvaluation - text_loss: {} imgseq_loss: {} loss: {}  Rouge1: {} Rouge2: {}"
                .format(_avg_text_loss, _avg_imgseq_loss, _avg_loss, _rouge_1,
                        _rouge_2))

            util.save_models(
                {
                    # Stored as epoch + 1 so a resumed run starts at the
                    # next epoch.
                    'epoch': epoch + 1,
                    'multimodal_encoder': multimodal_encoder.state_dict(),
                    'multimodal_decoder': multimodal_decoder.state_dict(),
                    'avg_loss': _avg_loss,
                    # NOTE(review): the trailing colon in 'Rouge1:' looks
                    # like a typo (compare 'Rouge2'); kept so existing
                    # checkpoint readers keep working.
                    'Rouge1:': _rouge_1,
                    'Rouge2': _rouge_2,
                    'optimizer': optimizer.state_dict(),
                    'scheduler': scheduler.state_dict()
                }, CONFIG.CHECKPOINT_PATH, save_name)

        print("Finish!!!")

    finally:
        exp.end()