Example #1
          'rnn_dropout_prob',
          'max_gradient_norm',
          'minibatch_size',
          'beam_width',
          'geomeanpplx',
          'duration',
          sep='\t')

    # Load the corpus and keep a fixed shuffled subset of the caption
    # generation training data.
    datasources = data.load_datasources(corpus)
    datasources['train'] = datasources['train'].shuffle(0).take(capgen_size)

    vocab = datasources['train'].tokenize_sents().text_sents.get_vocab(
        config.min_token_freq)
    dataset = data.Dataset(
        vocab=vocab,
        train_datasource=datasources['train'],
        val_datasource=datasources['val'],
        test_datasource=capgen_test,
    )
    dataset.compile_sents()

    test_index_sents = dataset.test.index_sents

    # Create the hyperparameter search results file with its header row if it
    # does not exist yet.
    if not lib.file_exists(config.hyperpar_dir + '/langmodtrans/' + corpus +
                           '/1_search.txt'):
        with open(config.hyperpar_dir + '/langmodtrans/' + corpus +
                  '/1_search.txt',
                  'w',
                  encoding='utf-8') as f:
            print('#',
                  'init_method',
                  'max_init_weight',
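
The snippet above creates the hyperparameter search results file only when it is not already present, then writes a header row to it. A minimal, self-contained sketch of that pattern using only the standard library (the path below is a placeholder, not the project's config value; the column names are taken from the print calls above):

    import os

    results_path = 'results/1_search.txt'  # hypothetical path
    if not os.path.isfile(results_path):
        os.makedirs(os.path.dirname(results_path), exist_ok=True)
        with open(results_path, 'w', encoding='utf-8') as f:
            # One tab-separated header row, mirroring the print(..., sep='\t') calls above.
            print('#', 'init_method', 'max_init_weight', 'geomeanpplx', 'duration',
                  sep='\t', file=f)
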
Example #2
        corpus_size = round(10**corpus_size_factor_exponent * capgen_size)

        full_timer = lib.Timer()

        # Language model corpus: drop the images, shuffle per run, and truncate
        # to the requested size.
        datasources = data.load_datasources(corpus)
        datasources['train'] = datasources['train'].without_images().shuffle(run).take(corpus_size)
        langmod_vocab = datasources['train'].tokenize_sents().text_sents.get_vocab(config.min_token_freq)

        # Caption vocabulary restricted to tokens shared with the language
        # model vocabulary, plus the unrestricted vocabulary for counting
        # out-of-vocabulary tokens.
        capgen_vocab = capgen_test.tokenize_sents().text_sents.get_vocab(config.min_token_freq).intersection(langmod_vocab)
        capgen_full_vocab = capgen_test.tokenize_sents().text_sents.get_vocab()

        capgen_num_out_of_vocab_tokens = capgen_full_vocab.size - capgen_vocab.size

        dataset = data.Dataset(
                vocab            = langmod_vocab,
                train_datasource = datasources['train'],
                val_datasource   = datasources['val'],
            )
        dataset.compile_sents()

        # Number of unknown-token positions in each validation sentence.
        capgen_num_unknowns_per_sent = np.sum(dataset.val.index_sents.targets == data.Vocab.UNKNOWN_INDEX, axis=1).tolist()

        with model_neural_trad.TradNeuralModel(
                vocab_size              = langmod_vocab.size,
                init_method             = langmod_init_method,
                max_init_weight         = langmod_max_init_weight,
                embed_size              = langmod_embed_size,
                rnn_size                = langmod_rnn_size,
                post_image_size         = langmod_post_image_size,
                pre_output_size         = langmod_pre_output_size,
                post_image_activation   = langmod_post_image_activation,
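
This snippet scales the language model corpus to a power-of-ten multiple of the caption training set, restricts the caption vocabulary to tokens shared with the language model vocabulary, and counts how many unknown-token positions each validation sentence contains. A toy illustration of that unknown-counting step with plain NumPy (the shapes, values, and reserved index below are assumptions, not taken from the project's data module):

    import numpy as np

    UNKNOWN_INDEX = 0                    # assumed reserved index for unknown tokens
    targets = np.array([[5, 2, 0, 7],    # one row per (padded) sentence of token indexes
                        [0, 0, 3, 1]])

    # Boolean mask of unknown positions, summed along each row.
    num_unknowns_per_sent = np.sum(targets == UNKNOWN_INDEX, axis=1).tolist()
    print(num_unknowns_per_sent)         # [1, 2]
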
Example #3
                                             corpus_size_factor_exponent, run)
                lib.create_dir(config.results_dir + '/langmodtrans/' + corpus +
                               '/' + dir_name)

                corpus_size = round(10**corpus_size_factor_exponent *
                                    capgen_size)

                datasources = data.load_datasources(corpus)
                datasources['train'] = datasources['train'].without_images(
                ).shuffle(run).take(corpus_size)
                langmod_vocab = datasources['train'].tokenize_sents(
                ).text_sents.get_vocab(config.min_token_freq)

                dataset = data.Dataset(
                    vocab=langmod_vocab,
                    train_datasource=datasources['train'],
                    val_datasource=datasources['val'],
                    test_datasource=capgen_test,
                )
                dataset.compile_sents()

                # Keep one randomly selected test sentence per group and index
                # it with the language model vocabulary.
                selected_test_sents = dataset.test.shuffle(run).take(
                    one_per_group=True).tokenize_sents().compile_sents(
                        langmod_vocab)
                selected_index_sents = selected_test_sents.index_sents

                with open(config.results_dir + '/langmodtrans/' + corpus +
                          '/' + dir_name + '/1_corpus_indexes.txt',
                          'w',
                          encoding='utf-8') as f:
                    print(*dataset.train.individual_indexes, sep='\n', file=f)
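
Here the language model corpus size is again a power-of-ten multiple of the caption training set, and the indexes of the sampled corpus are dumped one per line. A small sketch of how 10**corpus_size_factor_exponent scales the base size (the capgen_size value below is illustrative only):

    capgen_size = 30000  # illustrative value only
    for corpus_size_factor_exponent in (-1.0, 0.0, 1.0):
        corpus_size = round(10**corpus_size_factor_exponent * capgen_size)
        print(corpus_size_factor_exponent, corpus_size)
    # -1.0 3000
    # 0.0 30000
    # 1.0 300000
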
Example #4
                print()
                continue

            full_timer = lib.Timer()

            dir_name = '{}_{}_{}'.format(architecture, dataset_name, run)
            lib.create_dir(config.results_dir + '/whereimage/' + architecture +
                           '/' + dir_name)

            datasources = data.load_datasources(dataset_name)
            vocab = datasources['train'].tokenize_sents().text_sents.get_vocab(
                config.min_token_freq)

            dataset = data.Dataset(
                vocab=vocab,
                train_datasource=datasources['train'],
                val_datasource=datasources['val'],
                test_datasource=datasources['test'],
            )
            dataset.compile_sents()

            selected_test_sents = dataset.test.shuffle(run).take(
                one_per_group=True).tokenize_sents().compile_sents(vocab)
            selected_index_sents = selected_test_sents.index_sents

            with open(config.results_dir + '/whereimage/' + architecture +
                      '/' + dir_name + '/selected_test.txt',
                      'w',
                      encoding='utf-8') as f:
                print(*selected_test_sents.individual_indexes,
                      sep='\n',
                      file=f)
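
The final lines write one selected test sentence index per line. A self-contained sketch of that one-item-per-line dump (the file name and values are placeholders rather than the project's selected_test indexes):

    selected_indexes = [3, 17, 42]  # hypothetical values
    with open('selected_test.txt', 'w', encoding='utf-8') as f:
        print(*selected_indexes, sep='\n', file=f)
    # selected_test.txt now contains:
    # 3
    # 17
    # 42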