Example No. 1
from dictlearn.config_registry import ConfigRegistry

lm_config_registry = ConfigRegistry()
lm_config_registry.set_root_config({
    # data
    'data_path': 'onebillionword/',
    'dict_path': "",
    'vocab_path': "",
    'dict_vocab_path': "",
    'layout': 'standard',
    'num_input_words': 10000,
    'def_num_input_words': 0,  # 0 => defaults to num_input_words
    'num_output_words': 10000,
    'max_length': 100,
    'batch_size': 64,
    'batch_size_valid': 64,
    'max_def_length': 100,
    'max_def_per_word': 1000,
    'exclude_top_k': 0,

    # model
    'emb_dim': 500,
    'emb_def_dim': 500,
    'dim': 500,
    'compose_type': 'sum',
    'disregard_word_embeddings': False,
    'learning_rate': 0.001,
    'momentum': 0.9,
    'grad_clip_threshold': 5.0,

    # embeddings
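
A typical use of the registry above is to derive named variants from the root config. This is a minimal sketch assuming ConfigRegistry also supports dict-style get/set access; only set_root_config appears in the snippet, so treat the item access as an assumption:

c = lm_config_registry['root']        # assumed: returns a copy of the root config
c['num_input_words'] = 30000          # enlarge the input vocabulary
c['dim'] = 1000                       # wider recurrent state
lm_config_registry['lm_large'] = c    # assumed: registers the variant under a name
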
Example No. 2
from dictlearn.config_registry import ConfigRegistry

nli_esim_config_registry = ConfigRegistry()

# Each epoch has ~500k examples
# Params copied from https://github.com/NYU-MLL/multiNLI/blob/master/python/util/parameters.py
nli_esim_config_registry.set_root_config({
    'data_path': 'snli',
    'layout': 'snli',

    # Lookup params
    'max_def_per_word': 100000,
    'emb_dim': 300,
    'bn': 0,
    'dim': 300,
    'dict_path': '',
    'vocab': '',
    'vocab_text': '',  # Defaults to vocab. Use when original vocab cannot be used for frequency in dict
    'encoder': 'bilstm',

    # Also used in NYU-MLL multiNLI
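
Several string-valued keys above use the empty string to mean "fall back to a default"; e.g. the comment on vocab_text says it defaults to vocab. A hypothetical helper sketching how such a fallback might be resolved when the config is read (resolve_vocab_text is not part of dictlearn):

def resolve_vocab_text(config):
    # Hypothetical helper: vocab_text falls back to vocab when left empty,
    # mirroring the comment in the root config above.
    resolved = dict(config)
    if not resolved.get('vocab_text'):
        resolved['vocab_text'] = resolved.get('vocab', '')
    return resolved
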
Example No. 3
from dictlearn.config_registry import ConfigRegistry

configs_ae = ConfigRegistry() 

configs_ae.set_root_config({
    # data_path: not very useful to set here; it's better to use FUEL_DATA_PATH
    # so that we can keep identical configs for different dictionaries
    'data_path': '',
    # the following param was useful to run a baseline without an encoder,
    # which would be similar to word2vec with only one target word (the defined word);
    # this is NOT the baseline in the paper, and it is weaker than word2vec
    'vocab_keys_path': '',
    'layout' : 'dict', # don't change. TODO remove this option
    # num_input_words can be set lower than the number of lines in vocab.txt;
    # this allows rare words to be replaced with UNK (for example, if it is set
    # to 10000, all words from line 10000 onward are replaced by the UNK token)
    'num_input_words' : 10000,
    # same for num_output_words: the loss will ignore words that are ranked
    # above this value
    'num_output_words': 10000,
    # max definition length
    'max_length' : 100,
    'batch_size' : 32,
    'batch_size_valid' : 32,

    # model
    'encoder': 'lstm', # experimental code with bilstm variants (see seq2seq.py)
    'decoder': 'skip-gram', # do not change?
    # You should use emb_dim = dim unless you're playing with more experimental
    # code.
    'emb_dim' : 300, 
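
The num_input_words / num_output_words comments describe rank-based pruning: the vocabulary file is ordered by frequency, and any word ranked at or beyond the cutoff is treated as UNK. A standalone illustration of that idea; the function and the UNK id are illustrative, not dictlearn's actual code:

UNK_ID = 0  # illustrative; the real UNK id depends on the vocabulary file

def prune_word_id(word_id, num_input_words):
    # Keep frequent words, map everything ranked at or beyond the cutoff to UNK.
    return word_id if word_id < num_input_words else UNK_ID

# With num_input_words = 10000, id 9999 is kept and id 10000 becomes UNK:
assert prune_word_id(9999, 10000) == 9999
assert prune_word_id(10000, 10000) == UNK_ID
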
Example No. 4
from dictlearn.config_registry import ConfigRegistry

qa_config_registry = ConfigRegistry()
qa_config_registry.set_root_config({
    # data
    'data_path': "",
    'dict_path': "",
    'vocab_path': "",
    'dict_vocab_path': "",
    'embedding_path': "",
    'layout': 'standard',
    'num_input_words': 10000,
    'def_num_input_words': 0,
    'max_length': 100,
    'batch_size': 32,
    'batch_size_valid': 32,

    # retrieval hacks
    'max_def_length': 1000,
    'with_too_long_defs': 'drop',
    'max_def_per_word': 1000,
    'with_too_many_defs': 'random',
    'exclude_top_k': 0,

    # model
    'def_reader': 'LSTMReadDefinitions',
    'dim': 128,
    'emb_dim': 0,
    'readout_dims': [],
    'coattention': True,
    'learning_rate': 0.001,
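
The "retrieval hacks" block above pairs each limit with a policy: definitions longer than max_def_length are handled according to with_too_long_defs ('drop' here), and words with more than max_def_per_word definitions according to with_too_many_defs ('random' subsampling here). An illustrative sketch of those two policies, not the actual dictlearn retrieval code:

import random

def filter_definitions(defs, max_def_length, max_def_per_word,
                       with_too_long_defs='drop', with_too_many_defs='random'):
    # Illustrative sketch of the retrieval limits configured above.
    if with_too_long_defs == 'drop':
        defs = [d for d in defs if len(d) <= max_def_length]
    if with_too_many_defs == 'random' and len(defs) > max_def_per_word:
        defs = random.sample(defs, max_def_per_word)
    return defs
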
Example No. 5
from dictlearn.config_registry import ConfigRegistry

snli_config_registry = ConfigRegistry()

snli_config_registry.set_root_config({
    'data_path': 'snli/',
    'layout': 'snli',

    # Lookup params
    'translate_dim': 300,
    'max_def_per_word': 100000,
    'bn': True,
    'mlp_dim': 600,
    'emb_dim': 300,  # Used for def and word lookup
    'dict_path': '',

    # Embeddings are removed by default. Our goal ATM is to beat random init
    'embedding_path': '',  # e.g. /data/lisa/exp/jastrzes/dict_based_learning/data/snli/glove.840B.300d.npy
    'vocab_def': '',
    'vocab_text': '',  # If passed, will be used for exclude_top_k in Retrieval only
    'vocab': '',
    'def_dim': 300,  # LSTM reader hidden state or translate in MeanPool
    'def_emb_dim': -1,  # Dimensionality of vectors used in definitions
    'compose_type': '',
    'disregard_word_embeddings': False,
    'exclude_top_k': -1,
    'max_def_length': 50,
    'with_too_long_defs': 'drop',
    'train_emb':
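
The vocab_text comment says that, if passed, it is only used for exclude_top_k in Retrieval. One reading is that the exclude_top_k most frequent words simply skip dictionary lookup, with non-positive values (-1 here, 0 in other configs) disabling the exclusion. A sketch of that interpretation, not dictlearn's actual retrieval logic:

def should_retrieve_definition(word_rank, exclude_top_k):
    # word_rank is the 0-based frequency rank of the word in the vocabulary.
    # Interpretation only: skip lookup for the exclude_top_k most frequent words;
    # a non-positive value means no exclusion.
    if exclude_top_k <= 0:
        return True
    return word_rank >= exclude_top_k
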
Example No. 6
from dictlearn.config_registry import ConfigRegistry

lm_config_registry = ConfigRegistry()
lm_config_registry.set_root_config({
    # data
    'data_path': "",
    'dict_path': "",
    'layout': 'standard',
    'num_input_words': 10000,
    'num_output_words': 10000,
    'max_length': 100,
    'batch_size': 32,
    'batch_size_valid': 32,
    'max_def_length': 1000,
    'exclude_top_k': -1,

    # model
    'dim': 128,
    'compose_type': 'sum',
    'standalone_def_rnn': True,
    'disregard_word_embeddings': False,
    'learning_rate': 0.001,
    'momentum': 0.9,
    'grad_clip_threshold': 5.0,

    # monitoring and checkpointing
    'mon_freq_train': 10,
    'mon_freq_valid': 1000,
    'save_freq_batches': 1000,
    'n_batches': 0,
    'monitor_parameters': False
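
The monitoring block expresses everything in batches: training metrics every mon_freq_train batches, validation every mon_freq_valid, a checkpoint every save_freq_batches, and n_batches as an overall limit (0 presumably meaning no fixed limit). A schematic loop showing how such a cadence is commonly wired up; this is not dictlearn's actual main loop:

def run_training(config, train_batches, train_step, log, validate, save_checkpoint):
    # Schematic cadence for the monitoring/checkpointing keys above (illustrative only).
    for step, batch in enumerate(train_batches, start=1):
        loss = train_step(batch)
        if step % config['mon_freq_train'] == 0:
            log(step, loss)
        if step % config['mon_freq_valid'] == 0:
            validate(step)
        if step % config['save_freq_batches'] == 0:
            save_checkpoint(step)
        if config['n_batches'] and step >= config['n_batches']:
            break  # n_batches == 0 is read here as "no fixed limit"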