示例#1
0
def main(*_fold_info, **kwargs):
    args = rasengan.Namespace()
    args = args_creation_part1(args)
    args = update_args(args, kwargs)
    data = transducer_data.main(args)
    args = args_creation_part2(args, data)
    args = update_args(args, kwargs)
    args.diff_kwargs = kwargs

    # Start each run in a fresh directory to avoid interference
    # with other running processes and avoiding overwriting results of
    # previous processes.
    import os
    idx = 0
    while os.path.exists(args.folder + '_' + str(idx)):
        idx += 1
    args.folder = args.folder + '_' + str(idx)
    print 'Set args.folder to', args.folder
    if __name__ != '__main__':
        # We are probably running from the hpolibrary.
        # In non-interactive batch runs, it is also important to
        # disable debug_support so that we dont call post-mortem in
        # case of an exception.
        rasengan.disable_debug_support()
    else:
        #rasengan.warn('NOTE: I am using pretrained pkl')
        # args.pretrained_param_pklfile = (args.folder + r'/' + args.pkl_name)
        pass
    with lstm_seqlabel_circuit_compilation.make(args, force=True):
        error = lstm_seqlabel_circuit_compilation.perform_training_and_testing(
            "", args, data)
    return error
def get_train_test_namespace(args):
    if args.perform_training:
        print 'Compiling train_model'
        train_model = compile_args(args)

    set_dropout_to_zero(args)
    print 'Compiling test_model'
    test_model = compile_args(args)
    # Prepare the `ttns` namespace by adding train and test prefixes.
    ttns = rasengan.Namespace('ttns')
    if args.perform_training:
        ttns = ttns.update_and_append_prefix(train_model, 'train_')
    ttns = ttns.update_and_append_prefix(test_model, 'test_')
    return ttns
示例#3
0
'''
| Filename    : test_conjunctivemixture.py
| Description : Test ConjunctiveMixture Chip
| Author      : Pushpendre Rastogi
| Created     : Mon Nov 16 01:00:45 2015 (-0500)
| Last-Updated: Mon Nov 16 01:08:20 2015 (-0500)
|           By: Pushpendre Rastogi
|     Update #: 3
'''
import rasengan
import lstm_seqlabel_circuit
import lstm_seqlabel_circuit_compilation
import util_lstm_seqlabel

# Build a minimal configuration namespace for the ConjunctiveMixture chip;
# both options carry the 'conjmix' prefix.
args = rasengan.Namespace()
args.conjmix_embed_BOS = 1
args.conjmix_clip_gradient = 0
# A single (chip class, name prefix) pair.
# NOTE(review): `chips` is not referenced again in the visible code and is
# never attached to `args` -- presumably it was meant to be passed along
# with the configuration; confirm.
chips = [(lstm_seqlabel_circuit.ConjunctiveMixture, 'conjmix')]
# Compile inside the debug_support context manager and keep the resulting
# train/test namespace in `ttns`.
with util_lstm_seqlabel.debug_support():
    ttns = lstm_seqlabel_circuit_compilation.get_train_test_namespace(args)

# Test value of ttns
def main(args):
    '''Read the train/dev/test corpora and package them into a namespace.

    Params
    ------
    args : Configuration namespace. The attributes read here are
      `train_fn`, `dev_fn`, `test_fn` (inputs to `read_data`),
      `partition_dev_into_train` and `partition_dev_into_test` (split
      indices into the dev data; a value <= 0 disables the split),
      `limit_corpus` (if > 0, keep only that many training examples) and
      `win` (forwarded to `numerize`).

    Returns
    -------
    A `rasengan.Namespace` with
      train_data / val_data / test_data : the data after `add_bos`,
      train_set / dev_set / test_set    : the same data after `numerize`,
      label2idx / idx2label             : symbol <-> integer id maps,
      vocsize                           : `len(label2idx) + 1`,
      train_lex / valid_lex / test_lex  : element 0 of every numerized item,
      train_y                           : element 1 of every numerized
                                          training item (integer ids),
      valid_y / test_y                  : element 1 of every numerized
                                          item, passed through
                                          `convert_id_to_word`,
      words_train / words_valid / words_test : empty lists.
    '''
    with rasengan.debug_support():
        with rasengan.tictoc("Loading Data"):
            data_list = rasengan.namespacer(read_data(args.train_fn))
            val_data_list = rasengan.namespacer(read_data(args.dev_fn))
            if args.partition_dev_into_train > 0:
                # Everything in dev from index `lim` onwards is moved to
                # the training data. The split index must come from
                # `partition_dev_into_train`; reading it from
                # `partition_dev_into_test` (as was done before) emptied
                # the validation set whenever only this option was set.
                lim = args.partition_dev_into_train
                data_list.extend(val_data_list[lim:])
                val_data_list = val_data_list[:lim]

            if args.partition_dev_into_test > 0:
                # Carve the test set out of what is left of dev instead of
                # reading `args.test_fn`.
                lim = args.partition_dev_into_test
                test_data_list = val_data_list[lim:]
                val_data_list = val_data_list[:lim]
            else:
                test_data_list = rasengan.namespacer(read_data(args.test_fn))

            # The symbol inventory is collected over all three splits, and
            # before `limit_corpus` truncates the training data below.
            lst_char = get_lst_char(data_list + val_data_list + test_data_list)
            data_list = add_bos(data_list)
            val_data_list = add_bos(val_data_list)
            test_data_list = add_bos(test_data_list)
            warnings.warn('''
            NOTE: While preparing sigma, we add 1 to the index
            returned by enumerate because the transducer unit that
            Ryan wrote uses index 0 as the index for the epsilon
            symbol. So essentially the epsilon symbol and the
            integer 0 are reserved symbols that cannot appear in the
            vocabulary.

            ALSO, we need to add 1 to the vocsize because of that.
            ''')
            # sigma :: char -> int   (ids start at 1; 0 is reserved)
            sigma = dict((b, a + 1) for (a, b) in enumerate(lst_char))

            # sigma_inv :: int -> char
            sigma_inv = dict((a + 1, b) for (a, b) in enumerate(lst_char))

            if args.limit_corpus > 0:
                data_list = data_list[:args.limit_corpus]

            train_data = numerize(data_list, sigma, args.win)
            val_data = numerize(val_data_list, sigma, args.win)
            test_data = numerize(test_data_list, sigma, args.win)

            data = rasengan.Namespace()

            #-------------------------------------------------------------#
            # Add sets that would be used by the tensorflow seq2seq       #
            # model. See~$PY/tensorflow/models/rnn/translate/translate.py #
            #-------------------------------------------------------------#
            data.train_data = data_list
            data.val_data = val_data_list
            data.test_data = test_data_list

            data.train_set = train_data
            data.dev_set = val_data
            data.test_set = test_data

            # +1 accounts for the reserved id 0 (see the warning above).
            data.vocsize = len(sigma) + 1
            data.idx2label = sigma_inv
            data.label2idx = sigma

            # Training targets stay as integer ids ...
            data.train_lex = [e[0] for e in train_data]
            data.train_y = [e[1] for e in train_data]

            # ... whereas validation and test targets are passed through
            # `convert_id_to_word` together with the id -> symbol map.
            data.valid_lex = [e[0] for e in val_data]
            data.valid_y = util_lstm_seqlabel.convert_id_to_word(
                [e[1] for e in val_data], data.idx2label)

            data.test_lex = [e[0] for e in test_data]
            data.test_y = util_lstm_seqlabel.convert_id_to_word(
                [e[1] for e in test_data], data.idx2label)

            data.words_train = []
            data.words_valid = []
            data.words_test = []
    return data