Example #1
from framework import lib
from framework import data
from framework import config

full_img_max_side = 500
thumb_img_max_side = 100

lib.create_dir(config.results_dir + '/whereimage')
lib.create_dir(config.results_dir + '/whereimage/_sample')

for dataset_name in ['mscoco']:
    print(dataset_name)

    lib.create_dir(config.results_dir + '/whereimage/_sample/' + dataset_name)
    lib.create_dir(config.results_dir + '/whereimage/_sample/' + dataset_name +
                   '/full')
    lib.create_dir(config.results_dir + '/whereimage/_sample/' + dataset_name +
                   '/thumb')

    datasources = data.load_datasources(dataset_name)

    images = datasources['test'].get_filenames()

    caps = dict()
    caps['human'] = [[
        ' '.join(sent) for sent in group
    ] for group in datasources['test'].tokenize_sents().get_text_sent_groups()]
    for architecture in ['init', 'pre', 'par', 'merge']:
        caps[architecture] = [list() for _ in range(len(images))]
        for run in range(1, config.num_runs + 1):
            dir_name = '{}_{}_{}'.format(architecture, dataset_name, run)
            with open(config.results_dir + '/whereimage/' + architecture +
                      '/' + dir_name + '/shuffled_test_indexes.txt',
                      'r',
                      encoding='utf-8') as f:
                # [Excerpt truncated here: presumably the shuffled index order
                # (one index per line) is read back so that the per-line
                # result files can be reordered.]
                shuffled_test_indexes = [int(line) for line in f]
Example #2

# In langmodtrans_experiment.py line 244, I mistakenly shuffled all the test
# sets, forgetting that shuffle is a mutating operation. As a result, the
# result files in which each line corresponds to an image (wmd.txt, sents.txt,
# etc.) are out of sync with the actual image order in the dataset. This
# script regenerates the shuffled index order so that those files can be
# reordered.
from framework import lib
from framework import data
from framework import config

for run in range(1, config.num_runs + 1):
    # Reload the data sources on every run so that the shuffle below is
    # applied to a fresh copy, then reproduce the experiment's shuffle
    # (seeded by the run number).
    datasources = data.load_datasources(config.langmodtrans_capgen_dataset)
    datasources['test'].shuffle(run)
    for corpus in ['lm1b', 'mscoco', 'flickr8k']:
        for frozen_prefix in [True, False]:
            for corpus_size_factor_exponent in (
                    config.langmodtrans_corpus_size_factor_exponents
                    if corpus != config.langmodtrans_capgen_dataset else
                    config.langmodtrans_corpus_size_factor_minor_exponents):
                dir_name = '{}_{}_{}'.format(frozen_prefix,
                                             corpus_size_factor_exponent, run)
                with open(config.results_dir + '/langmodtrans/' + corpus +
                          '/' + dir_name + '/shuffled_test_indexes.txt',
                          'w',
                          encoding='utf-8') as f:
                    # Record, one per line, the group index of each test item
                    # in the shuffled order used by the experiment.
                    for index in datasources['test'].take(
                            one_per_group=True).group_indexes:
                        print(index, file=f)
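
Not part of the original scripts: a minimal sketch of how the emitted
shuffled_test_indexes.txt could then be used to put a per-line results file
back into dataset order. The one-index-per-line format comes from the script
above; the concrete file names here (wmd.txt, wmd_reordered.txt) are
placeholders.

# Sketch: restore dataset order in a per-line results file, assuming line i
# of the results file describes the item whose group index is on line i of
# shuffled_test_indexes.txt.
with open('shuffled_test_indexes.txt', encoding='utf-8') as f:
    shuffled_indexes = [int(line) for line in f]

with open('wmd.txt', encoding='utf-8') as f:
    shuffled_lines = f.read().splitlines()

# Line i of the shuffled file belongs at dataset position shuffled_indexes[i].
reordered = [None] * len(shuffled_lines)
for i, index in enumerate(shuffled_indexes):
    reordered[index] = shuffled_lines[i]

with open('wmd_reordered.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(reordered) + '\n')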
Example #3
import sys

import numpy as np

from framework import lib
from framework import data
from framework import config


########################################################################################
def prepare_hyperpar_for_fit(hp):
    # [Header reconstructed: this excerpt starts mid-function and the original
    # function name was truncated, so the name here is an assumption. The body
    # converts numpy int64 hyperparameter values back into plain Python ints.]
    new_hp = [(x.tolist() if type(x) is np.int64 else x) for x in hp]
    return new_hp


########################################################################################
def prepare_hyperpar_for_tell(hp):
    return hp


########################################################################################
if len(sys.argv) == 1:
    corpora = 'lm1b,mscoco,flickr8k'.split(',')
else:
    corpora = sys.argv[1].split(',')

datasources = data.load_datasources(config.langmodtrans_capgen_dataset)
capgen_size = datasources['train'].size
# MSCOCO test is never used in the langmodtrans experiments, so we can
# validate on it.
capgen_test = data.load_datasources('mscoco')['test'].shuffle(0).take(
    datasources['test'].num_groups, whole_groups=True)
del datasources

lib.create_dir(config.hyperpar_dir + '/langmodtrans')

for corpus in corpora:
    lib.create_dir(config.hyperpar_dir + '/langmodtrans/' + corpus)

    print('=' * 100)
    print(lib.formatted_clock())
    print(corpus, '1 (language model)')
    print()
Example #4
import sys

from framework import data
from framework import config


########################################################################################
def prepare_hyperpar_for_tell(hp, architecture):
    # The hyperparameter at index 4 only applies to the merge, par, and
    # merge-ext architectures; it is blanked out for the rest.
    new_hp = list(hp)
    if architecture not in ['merge', 'par', 'merge-ext']:
        new_hp[4] = None
    return new_hp


########################################################################################
if len(sys.argv) == 1:
    # merge-ext (a two-layered softmax in the merge architecture) could also
    # be added, but it was found to work badly with the current hyperparameter
    # search resources.
    architectures = 'merge,par,pre,init'.split(',')
else:
    architectures = sys.argv[1].split(',')

datasources = data.load_datasources('flickr8k')

vocab = datasources['train'].tokenize_sents().text_sents.get_vocab(
    config.min_token_freq)
dataset = data.Dataset(
    vocab=vocab,
    train_datasource=datasources['train'],
    val_datasource=datasources['val'],
    test_datasource=data.load_datasources('mscoco')['val'].shuffle(0).take(
        datasources['test'].num_groups, whole_groups=True),
)
dataset.compile_sents()

test_images = dataset.test.get_images()
test_sents = dataset.test.get_text_sent_groups()