Example #1
    def __init__(self, fname, mode=None):
        self.fname = fname
        self.data = exh.read_file(fname)
        if mode == 'train':
            # Cap the training split at the first 50,000 sentences
            self.data = self.data[:50000]
        self.lens = [len(sentence) for sentence in self.data]

        self.size = len(self.data)
Example #2
def run(args):
    vocab = {}
    # Variant with reserved special tokens (used in Example #3):
    # vocab = {"<pad>": {"id": 0, "freq": float('inf')},
    #          "<bos>": {"id": 1, "freq": float('inf')},
    #          "<eos>": {"id": 2, "freq": float('inf')},
    #          "<unk>": {"id": 3, "freq": float('inf')}}
    # tokens = ["<pad>", "<bos>", "<eos>", "<unk>"]
    captions = exh.read_file(args.INPUT)

    n_captions = len(captions)

    # Ids 0-3 are reserved for the special tokens in the commented variant above
    id_token = 4

    # Count word frequencies and record which captions contain each word
    for i, caption in enumerate(captions):
        for word in caption.split():
            if word in vocab:
                vocab[word]["freq"] += 1
                vocab[word]["captions"].add(i)
            else:
                vocab[word] = {"freq": 1, "id": id_token, "captions": {i}}
                id_token += 1
                # tokens.append(word)

    # Sort words from most to least frequent
    top = sorted(vocab.items(), key=lambda x: x[1]['freq'], reverse=True)

    n_vocab = len(top)

    tots = []    # captions containing at least one of the i most frequent words
    ratios = []  # the same, as a fraction of all captions
    x_ticks = []
    covered = set()

    # Grow the vocabulary one word at a time and track cumulative caption coverage
    for i in range(n_vocab):
        x_ticks.append(i + 1)
        covered |= top[i][1]['captions']
        tots.append(len(covered))
        ratios.append(len(covered) / n_captions)

    plot_lines(x_ticks, [tots], ['Number of captions covered'],
               'voc_tot.png',
               'Number of words',
               'Number of captions covered',
               has_legend=False,
               step=100)
    plot_lines(x_ticks, [ratios], ['Ratio of captions covered'],
               'voc_ratio.png',
               'Number of words',
               'Ratio of captions covered',
               has_legend=False,
               step=100)
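
A tiny worked illustration of the cumulative-coverage loop above (the captions and the word order are hypothetical):

# Three hypothetical captions: "a" occurs in captions 0 and 1, "dog" in 0 and 2.
captions = ["a dog runs", "a cat sleeps", "the dog sleeps"]
covered = set()
covered |= {0, 1}                      # add the first word, "a"
print(len(covered) / len(captions))    # ~0.67: two of three captions covered
covered |= {0, 2}                      # add the next word, "dog"
print(len(covered) / len(captions))    # 1.0: every caption is now covered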
Example #3
def run(args):
    vocab = {
        "<pad>": {
            "id": 0,
            "freq": float('inf')
        },
        "<bos>": {
            "id": 1,
            "freq": float('inf')
        },
        "<eos>": {
            "id": 2,
            "freq": float('inf')
        },
        "<unk>": {
            "id": 3,
            "freq": float('inf')
        }
    }
    # tokens = ["<pad>", "<bos>", "<eos>", "<unk>"]
    captions = exh.read_file(args.INPUT)

    # Ids 0-3 are taken by the special tokens; regular words start at 4
    id_token = 4

    for caption in captions:
        for word in caption.split():
            if word in vocab:
                vocab[word]["freq"] += 1
            else:
                vocab[word] = {"freq": 1, "id": id_token}
                id_token += 1

    # Keep the 4 special tokens plus the args.MAX most frequent words
    top = sorted(vocab.items(), key=lambda x: x[1]['freq'], reverse=True)
    top = top[:args.MAX + 4]

    tokens = [None] * (args.MAX + 4)
    vocab = dict()

    id_token = 4
    for word, info in top:
        if info['id'] < 4:
            # Special tokens keep their reserved ids
            vocab[word] = info
            tokens[info['id']] = word
        else:
            # Regular words are renumbered contiguously after the specials
            vocab[word] = {'id': id_token, 'freq': info['freq']}
            tokens[id_token] = word
            id_token += 1

    vocab["token_list"] = tokens
    exh.write_json(vocab, args.OUTPUT)
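
For reference, a minimal sketch of how the vocabulary written above can be read back, assuming exh.write_json/exh.load_json are thin wrappers around the standard json module (the key names come from the code above; the word "dog" and the file path are hypothetical):

vocab = exh.load_json("vocab.json")   # hypothetical path
assert vocab["<pad>"]["id"] == 0      # special tokens keep ids 0-3
tokens = vocab["token_list"]          # id -> word table of length args.MAX + 4
entry = vocab["dog"]                  # {"id": ..., "freq": ...}
assert tokens[entry["id"]] == "dog"   # ids and token_list stay consistent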
Example #4
def run(args):
    captions = exh.read_file(args.INPUT)
    norm_captions = []
    for caption in captions:
        # Strip punctuation, lowercase, and collapse runs of whitespace
        # (joining on split also removes leading/trailing spaces, and avoids
        # the crash the old [-1] check hit on empty captions)
        norm_caption = "".join(ch for ch in caption if ch not in string.punctuation)
        norm_caption = " ".join(norm_caption.lower().split())
        norm_captions.append(norm_caption)

    # Write the result next to the input file, prefixed with "norm_"
    filename = args.INPUT.split('/')[-1]
    folder = args.INPUT[:len(args.INPUT) - len(filename)]
    output_file = "{0}norm_{1}".format(folder, filename)
    exh.write_text("\n".join(norm_captions), output_file)
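
A quick illustration of what this normalization produces (hypothetical input):

import string

caption = "A man, riding a horse!"
norm = "".join(ch for ch in caption if ch not in string.punctuation)
print(" ".join(norm.lower().split()))  # a man riding a horse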
Example #5
    def __init__(self, fname, vocab, bos=True, key=None, mode=None):
        self.fname = fname
        captions = exh.read_file(fname)
        if mode == 'train':
            # Cap the training split at the first 50,000 captions
            captions = captions[:50000]
        self.data = []
        self.lengths = []
        self.key = key

        for caption in captions:
            # Convert each caption into a sequence of token ids
            tokens = uvoc.words2tokens(caption, vocab, bos)
            self.data.append(tokens)
            self.lengths.append(len(tokens))

        self.size = len(self.data)
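
uvoc.words2tokens is not included in these excerpts; below is a minimal sketch of what it plausibly does given the vocabulary layout from Example #3 (the <unk> fallback, the optional <bos> prefix, and the trailing <eos> are assumptions, not confirmed by the source):

def words2tokens(caption, vocab, bos=True):
    # Assumed behavior: map each word to its id, falling back to <unk>
    # for out-of-vocabulary words.
    unk = vocab["<unk>"]["id"]
    tokens = [vocab[w]["id"] if w in vocab else unk for w in caption.split()]
    if bos:
        tokens.insert(0, vocab["<bos>"]["id"])  # assumed optional <bos> prefix
    tokens.append(vocab["<eos>"]["id"])         # assumed sentence terminator
    return tokens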
Example #6
def run(args):
    # Get configuration
    config = exh.load_json(args.CONFIG)

    # Prepare folders for logging and checkpoints; "output" is reused below
    # when saving the best model, so bind it even when logging is disabled
    # (the original only defined it inside the if, crashing later otherwise).
    logging = config['logging']['activate']
    exh.create_directory("output")
    output = "output"
    if logging:
        output = os.path.join("output", config['logging']['output_folder'])
        exh.create_directory(output)

    # Global initialization
    torch.cuda.init()
    device = torch.device(config['cuda']['device'] if (
        torch.cuda.is_available() and config['cuda']['ngpu'] > 0) else "cpu")
    seed = fix_seed(config['seed'])

    # Load vocabulary
    vocab = exh.load_json(config['data']['vocab'])

    # Prepare references
    references = exh.read_file(config['data']['beam']['captions'])
    references = prepare_references(references)

    # Prepare datasets and dataloaders
    training_dataset = CaptioningDataset(config['data']['train'], "train",
                                         vocab, config['sampler']['train'])
    train_iterator = DataLoader(
        training_dataset,
        batch_sampler=training_dataset.sampler,
        collate_fn=training_dataset.collate_fn,
        pin_memory=config['iterator']['train']['pin_memory'],
        num_workers=config['iterator']['train']['num_workers'])
    beam_dataset = CaptioningDataset(config['data']['beam'], "beam", vocab,
                                     config['sampler']['beam'])
    beam_iterator = DataLoader(
        beam_dataset,
        batch_sampler=beam_dataset.sampler,
        collate_fn=beam_dataset.collate_fn,
        pin_memory=config['iterator']['beam']['pin_memory'],
        num_workers=config['iterator']['beam']['num_workers'])

    # Prepare model
    weights = None
    if len(config['model']['embeddings']) > 0:
        weights = uvoc.init_weights(vocab, config['model']['emb_dim'])
        uvoc.glove_weights(weights, config['model']['embeddings'], vocab)

    model = WGAN(len(vocab['token_list']), config['model'], weights)
    # model = WGANBase(len(vocab['token_list']), config['model'], weights)
    # model = WGANBaseGP(len(vocab['token_list']), config['model'], weights)
    # model = WGANBaseLip(len(vocab['token_list']), config['model'], weights)
    # model = RelativisticGAN(len(vocab['token_list']), config['model'], weights)
    model.reset_parameters()

    lr = config['model']['optimizers']['lr']
    betas = (config['model']['optimizers']['betas']['min'],
             config['model']['optimizers']['betas']['max'])
    weight_decay = config['model']['optimizers']['weight_decay']

    optim_D = optim.Adam(model.D.parameters(),
                         lr=lr,
                         betas=betas,
                         weight_decay=weight_decay)
    optim_G = optim.Adam(model.G.parameters(),
                         lr=lr,
                         betas=betas,
                         weight_decay=weight_decay)

    model.to(device)

    fix_seed(config['seed'] + 1)

    generator_trained = config['model']['generator']['train_iteration']

    scores = {"BLEU": [], "G_loss_train": [], "D_loss_train": []}
    max_bleu = config['BLEU']['max_bleu']
    bleus = [[]] * max_bleu
    best_bleu = (0, 1)

    # torch.autograd.set_detect_anomaly(True)
    model.train(True)
    torch.set_grad_enabled(True)

    # Train until BLEU stops improving (see the early-stopping check below)
    epoch = 1
    while True:
        secs = time.time()
        print("Starting Epoch {}".format(epoch))

        iteration = 1

        d_batch = 0
        g_batch = 0
        d_loss = 0
        g_loss = 0
        for batch in train_iterator:
            batch.device(device)

            out = model(batch, optim_G, optim_D, epoch, iteration)

            d_loss += out['D_loss']
            d_batch += 1
            g_loss += out['G_loss']
            g_batch += 1

            iteration += 1

        print(
            "Training : Mean G loss : {} / Mean D loss : {} ({} seconds elapsed)"
            .format(g_loss / g_batch, d_loss / d_batch,
                    time.time() - secs))
        scores['G_loss_train'].append((g_loss / g_batch))
        scores['D_loss_train'].append((d_loss / d_batch))

        # Validation
        model.train(False)
        torch.set_grad_enabled(False)

        # Beam search
        print("Beam search...")
        # generated_sentences = beam_search(model.G, beam_iterator, vocab, config['beam_search'], device)
        # generated_sentences = beam_search([model], beam_iterator, vocab, beam_size=config['beam_search']['beam_size'], max_len=config['beam_search']['max_len'], device=device)
        generated_sentences = max_search(
            model,
            beam_iterator,
            vocab,
            max_len=config['beam_search']['max_len'],
            device=device)

        # BLEU score
        # for n in range(3,max_bleu):
        #     score = bleu_score(references, generated_sentences, n+1)
        #     bleus[n].append(score)
        #     print("BLEU-{} score : {}".format(n+1, score))
        score = bleu_score(references, generated_sentences, max_bleu)
        bleus[max_bleu - 1].append(score)
        print("BLEU-{} score : {}".format(max_bleu, score))

        if score > best_bleu[0]:
            best_bleu = (score, epoch)
            filename = 'output_epoch{}_bleu{}'.format(epoch, score)
            out_file = os.path.join(output, filename)
            torch.save(model.state_dict(), out_file)

        print("Best BLEU so far : {} (Epoch {})".format(
            best_bleu[0], best_bleu[1]))

        if logging:
            output_file = 'output_{}'.format(epoch)
            output_sentences = os.path.join(output, output_file)
            exh.write_text('\n'.join(generated_sentences), output_sentences)

        model.train(True)
        torch.set_grad_enabled(True)
        print("Epoch finished in {} seconds".format(time.time() - secs))

        # Early stopping: no BLEU improvement for 3 consecutive epochs
        if epoch - best_bleu[1] == 3:
            break

        epoch += 1

    if logging:
        scores['BLEU'] = bleus
        output_scores = os.path.join(output, 'scores.json')
        exh.write_json(scores, output_scores)
        print("Scores saved in {}".format(output_scores))
Example #7
def run(args):
    # Make cuDNN deterministic for reproducible evaluation
    torch.backends.cudnn.deterministic = True
    # Get configuration
    config = exh.load_json(args.CONFIG)

    # Global initialization
    torch.cuda.init()
    device = torch.device(config['cuda']['device'] if (
        torch.cuda.is_available() and config['cuda']['ngpu'] > 0) else "cpu")
    seed = fix_seed(config['seed'])

    # Load vocabulary
    vocab = exh.load_json(config['data']['vocab'])

    # Prepare references
    references = exh.read_file(config['data']['test']['captions'])
    references = prepare_references(references)

    beam_dataset = CaptioningDataset(config['data']['test'], "beam", vocab,
                                     config['sampler']['test'])
    beam_iterator = DataLoader(
        beam_dataset,
        batch_sampler=beam_dataset.sampler,
        collate_fn=beam_dataset.collate_fn,
        pin_memory=config['iterator']['test']['pin_memory'],
        num_workers=config['iterator']['test']['num_workers'])

    # Prepare model
    weights = None
    if len(config['model']['embeddings']) > 0:
        weights = uvoc.init_weights(vocab, config['model']['emb_dim'])
        uvoc.glove_weights(weights, config['model']['embeddings'], vocab)

    model = WGAN(len(vocab['token_list']), config['model'], weights)

    model.reset_parameters()

    # Load the trained weights and freeze the model for evaluation.
    # (load_state_dict already copies every tensor, so the manual
    # parameter-by-parameter re-copy the original did afterwards is redundant.)
    model.load_state_dict(torch.load(config['load_dict']))
    for param in model.parameters():
        param.requires_grad = False

    model.to(device)

    fix_seed(config['seed'] + 1)

    model.eval()
    torch.set_grad_enabled(False)

    generated_sentences = max_search(model,
                                     beam_iterator,
                                     vocab,
                                     max_len=config['beam_search']['max_len'],
                                     device=device)
    output_sentences = 'output_argmax'
    exh.write_text('\n'.join(generated_sentences), output_sentences)
    score = bleu_score(references, generated_sentences, 4)
    print("BLEU-4 (argmax search): {}".format(score))
    generated_sentences = beam_search(
        [model],
        beam_iterator,
        vocab,
        beam_size=config['beam_search']['beam_size'],
        max_len=config['beam_search']['max_len'],
        device=device)
    output_sentences = 'output_beam'
    exh.write_text('\n'.join(generated_sentences), output_sentences)
    score = bleu_score(references, generated_sentences, 4)
    print("BLEU-4 (beam search): {}".format(score))