def perform_training_and_testing(training_stage, args, data):
    '''Optionally train and/or test a model for one experiment stage.

    Returns
    -------
    The validation error — a quantity that we want to minimize.
    100 when neither phase produced statistics.
    '''
    stats = None
    with rasengan.tictoc(training_stage):
        with rasengan.debug_support():
            if args.perform_training or args.perform_testing:
                # Compile the circuit and restore parameters only when a
                # phase that actually needs them is going to run.
                with rasengan.tictoc("Circuit Compilation"):
                    ttns = get_train_test_namespace(args)
                with rasengan.tictoc("Loading Parameters"):
                    load_params_from_pklfile(ttns, args)
            rasengan.decrease_print_indent()
            print_pklfn_performance(args)
            rasengan.increase_print_indent()
            if args.perform_training:
                with rasengan.tictoc("Training"):
                    stats = lstm_seqlabel_training.training(args, data, ttns)
            if args.perform_testing:
                with rasengan.tictoc("Testing"):
                    # Testing yields a score; convert to an error and
                    # return immediately, skipping the epoch summary.
                    stats = lstm_seqlabel_validation.testing(args, data, ttns)
                    return 100 - stats
    if stats is None:
        return 100
    best = stats['best_epoch_id']
    return 100 - stats['validation_result'][best]['f1']
def perform_training_and_testing(training_stage, args, data):
    '''Optionally train and/or test a model for one experiment stage.

    Parameters
    ----------
    training_stage : label passed to `rasengan.tictoc` for timing output.
    args : namespace carrying at least the `perform_training` and
        `perform_testing` flags; other attributes are consumed by the
        helpers called below.
    data : dataset object forwarded to the training/testing routines.

    Returns
    -------
    The validation error. A quantity that we want to minimize.
    '''
    stats = None
    with rasengan.tictoc(training_stage):
        with rasengan.debug_support():
            if args.perform_training or args.perform_testing:
                # Compile the circuit and load parameters only when a
                # phase that actually needs `ttns` will run.
                with rasengan.tictoc("Circuit Compilation"):
                    ttns = get_train_test_namespace(args)
                with rasengan.tictoc("Loading Parameters"):
                    load_params_from_pklfile(ttns, args)
                pass
            rasengan.decrease_print_indent()
            print_pklfn_performance(args)
            rasengan.increase_print_indent()
            # Train
            if args.perform_training:
                with rasengan.tictoc("Training"):
                    stats = lstm_seqlabel_training.training(args, data, ttns)
            # Test (IF asked)
            if args.perform_testing:
                with rasengan.tictoc("Testing"):
                    # Testing returns a score; report 100 - score and skip
                    # the epoch-based summary below.
                    stats = lstm_seqlabel_validation.testing(args, data, ttns)
                    return (100 - stats)
    if stats is None:
        # Neither phase ran (or training yielded nothing): worst error.
        return 100
    else:
        best_epoch_id = stats['best_epoch_id']
        return (100 - stats['validation_result'][best_epoch_id]['f1'])
Exemplo n.º 3
0
def main():
    '''Ensemble driver: compile several pretrained transducer models and
    evaluate the aggregated decision rule on the validation set.

    Reads the model list from the module-level `pkl_to_combine` (pairs of
    pickle filename and per-model arg overrides) and the base
    configuration/data from the `transducer_score` module.
    '''
    import transducer_score
    args = transducer_score.args
    # Dropout is a train-time regularizer; disable it for evaluation.
    set_dropout_to_zero(args)
    data = transducer_score.data
    #--------------------------#
    # Compile disparate models #
    #--------------------------#
    models = []
    for pkl_fn, changes in pkl_to_combine:
        # Clone so per-model overrides do not leak into the shared `args`.
        args_clone = rasengan.Namespace(**args)
        #--------------------#
        # Update args_clone. #
        #--------------------#
        rasengan.warn('NOTE: Seting pretrained_param_pklfile')
        args_clone.pretrained_param_pklfile = pkl_fn
        for (k,v) in changes.items():
            setattr(args_clone, k, v)
            print 'Setting args_clone.%s=%s'%(k,str(v))
        #---------------------#
        # Compile args_clone. #
        #---------------------#
        # Compiled test-time entry points live under the 'test_' prefix.
        ttns_i = rasengan.Namespace('ttns').update_and_append_prefix(
            compile_args(args_clone), 'test_')
        load_params_from_pklfile_to_stack_config(
            pkl_fn, ttns_i.test_stack_config)
        models.append(ttns_i)

    #----------------------------#
    # Aggregate disparate model. #
    #----------------------------#
    ttns = Aggregator(models, data)
    #-----------------------------------------------#
    # Test performance of Aggregated decision rule. #
    #-----------------------------------------------#
    with rasengan.debug_support():
        stats_valid = args.validate_predictions_f(
            data.valid_lex,
            data.idx2label,
            args,
            ttns.test_f_classify,
            data.valid_y,
            data.words_valid,
            fn='/combined.valid.txt')
        print 'stats_valid', stats_valid
Exemplo n.º 4
0
 def test_entity_descriptors(self):
     '''Exercise `catpeople_preprocessor.entity_descriptors` on one
     hand-annotated dependency parse and on the parses imported from
     `test_identify_governors`.

     NOTE(review): the assertion method is `expectEqual` — presumably a
     custom harness method (unittest's is `assertEqual`); confirm.
     '''
     with rasengan.debug_support():
         entity_descriptors = catpeople_preprocessor.entity_descriptors
         from test_identify_governors import l
         # Map a list of token ids `idi` back to surface strings via TM.
         def decode(s, idi):
             return TM[[s[_] for _ in idi]]
         # Hand-built fixture: tokens, head indices, dependency labels,
         # and coarse POS tags for one sentence (columns aligned by the
         # index comment line below).
         sentence, parent, label, ctags = (
             # 0,            1,        2,       3,     4,      5,      6,      7,      8,       9,       10,    11,     12,        13,      14,   15,       16,    17,     18,     19,      20,       21,      22,   23,     24,
             TM(['the',      'musical','august','30th',',',    '2009', '|',    'author',':', 'operator','shrek','the',  'musical','is',     'a',  'musical','with','music','by',  'jeanine','tesori','and',   'a',   'book', 'and', 'lyrics','by',  'david','lindsay-abaire','.']),
             (4,             4,        4,       16,     4,      4,      4,     4,       4,     11,       4,     13,     16,        16,      16,   0,        16,    17,     18,     21,      19,       18,      24,    18,     24,    24,      24,    29,     27,             16),
             LABELMAP(['det','amod',   'amod',  'dep', 'punct','amod','punct','appos','punct', 'nn',    'dep',  'det',  'nsubj',   'cop',   'det','ROOT',   'prep','pobj', 'prep', 'nn',    'pobj',   'cc',    'det','conj', 'cc',  'conj',  'prep', 'nn',  'pobj',          'punct']),
             CTMAP(['DET',   'ADJ',    'ADJ',   'NOUN','.',    'NUM', '.',    'NOUN',  '.',    'NOUN',  'VERB', 'DET',  'NOUN',    'VERB',  'DET','ADJ',    'ADP', 'NOUN', 'ADP',  'NOUN',  'NOUN',   'CONJ',  'DET','NOUN', 'CONJ','NOUN',  'ADP',  'NOUN','NOUN',          '.']))
         # Referent spans index into the sentence; descriptors are the
         # decoded tokens the function selects for those referents.
         referents = [27, 28]
         self.expectEqual(decode(sentence, entity_descriptors(sentence, parent, label, ctags, referents)), ['book'])
         referents = [19,20]
         self.expectEqual(decode(sentence, entity_descriptors(sentence, parent, label, ctags, referents)), ['music'])
         referents = [10,11,12]
         self.expectEqual(decode(sentence, entity_descriptors(sentence, parent, label, ctags, referents)), ['music', 'jeanine', 'tesori', 'musical'])
         # Expected descriptor lists for the parses in `l`, in order.
         expected_output = [['executive'],
                            ['family', 'book'],
                            ['lied'],
                            ['missing'],
                            ['66th', 'united', 'states', 'secretary', 'state'],
                            ['left'],
                            ['efforts', '750-page', 'book'],
                            ['ousted', 'board', 'directors', 'hp'],
                            [],
                            ['stressed', 'points', 'book'],
                            ['dabbled', 'politics', 'dismissed', 'board'],
                            [],
                            ['attended', 'dinner'],
                            ['told', 'ceo', 'troops', 'hewlett', 'packard'],
                            ['looks', 'beautiful'],
                            ['knew'],
                            ['writes', 'career'],]
         # `l` interleaves referent lists (even slots) and parses (odd
         # slots); check each pair against its expected descriptors.
         for referents, parse, ep in zip(l[0::2], l[1::2], expected_output):
             sentence, parent, label, ctags = convert(parse)
             self.expectEqual(decode(sentence, entity_descriptors(sentence, parent, label, ctags, referents)), ep)
     return
def main():
    '''Scatter-plot AUPR vs MRR for several preprocessing configurations
    and save the figure to `args.out_fn` (a PDF name derived from the
    config titles when not given explicitly).
    '''
    if args.out_fn is None:
        basename = ''.join(
            [translate(get_ppcfg_title(e)[1]) for e in args.pptitle])
        args.out_fn = 'figures/%s.pdf' % (basename)

    with debug_support():
        fig = plt.figure(
            figsize=(args.figsize_x,
                     args.figsize_y))  # give plots a rectangular frame
        ax = fig.add_subplot(111)
        label_to_artists = {}
        cm = plt.get_cmap(args.cmap)
        for pptitle_idx, pptitle in enumerate(args.pptitle):
            ppcfg, title = get_ppcfg_title(pptitle)
            aupr, mrr, _, C, shape = get_stats(ppcfg,
                                               expcfg_str=args.expcfg_str)
            # One marker per (aupr, mrr) point: color encodes the config,
            # shape encodes test (circle) vs train (rectangle).
            for (a, m, c_, s) in zip(aupr, mrr, C, shape):
                c = cm(.1 * pptitle_idx)
                print a, m, c, c_, s
                if s == 'circle':
                    label = 'Test %s' % title
                    label_to_artists[label] = ax.add_artist(
                        plt.Circle((a, m),
                                   .005,
                                   color=c,
                                   alpha=0.7,
                                   label=label))
                else:
                    label = 'Train %s' % title
                    label_to_artists[label] = ax.add_artist(
                        plt.Rectangle((a - .005, m - .005),
                                      0.01,
                                      0.01,
                                      color=c,
                                      alpha=0.7,
                                      label=label))
                # Annotate each point with its title (sans 'Hinge ') and C.
                plt.text(
                    a + .01,
                    m + .000 * round(rand()),
                    '%s %.1f' % (title.replace('Hinge ', ''), c_),
                    fontsize=2,
                    # verticalalignment='top',
                    alpha=0.7)
            plt.xlim(xmin=min(aupr) - 0.05 if args.xmin is None else args.xmin)
            plt.ylim(ymin=min(mrr) - 0.05 if args.ymin is None else args.ymin)
            plt.xlabel('AUPR')
            plt.ylabel('MRR')
            plt.title('Various Feature Sets at Different C')
            plt.grid(True)
            continue
        # Legend entries sorted by label text for a stable ordering.
        label, handle = zip(
            *sorted(label_to_artists.items(), key=lambda x: x[0]))
        plt.legend([getline2d(e) for e in handle],
                   label,
                   loc='lower right',
                   numpoints=1)
        pass
    print 'Saving file', args.out_fn
    plt.savefig(args.out_fn)
    plt.close()
    return
Exemplo n.º 6
0
        dcr2emb[e] = scale_to_unit(dcr2emb[e])
    cat2mode = get_cat2mode()
    # Candidate count-transform functions.  Each maps (x, t) -> weight;
    # presumably x is a raw count and t a total count — TODO confirm.
    CONSTANT = (lambda x, t: 1)
    COUNT = (lambda x, t: x)
    LOG_COUNT = (lambda x, t: math.log(1 + x))
    SQRT_COUNT = (lambda x, t: math.sqrt(x))
    # Add-one smoothed relative frequency and its square / square root.
    FREQ = (lambda x, t: float(x + 1) / (t + 1))
    SQ_FREQ = (lambda x, t: (float(x + 1) / (t + 1))**2)
    SQRT_FREQ = (lambda x, t: math.sqrt(float(x + 1) / (t + 1)))
    PROD_SQRT_FREQ_SQRT_COUNT = (
        lambda x, t: SQRT_COUNT(x, t) * SQRT_FREQ(x, t))
    GM_SQRT_FREQ_SQRT_COUNT = (
        lambda x, t: math.sqrt(SQRT_COUNT(x, t) * SQRT_FREQ(x, t)))
    # HACK: selects one of the transforms above by name from the command
    # line.  `eval` on a CLI string is unsafe for untrusted input.
    cnt_transform = eval(args.cnt_transform)

with debug_support():

    def intervene_modes_hook(cat, modes):
        '''Apply hand-curated mode blacklists for a few noisy categories.

        Returns `modes` with the blacklisted positions removed;
        categories not listed below pass through unchanged.
        '''
        blacklist = {
            '20th-century_women_writers': (1, 4),
            'American_television_reporters_and_correspondents': (1, 4),
            'Recipients_of_the_Purple_Heart_medal': (2, 3, 4),
            'United_States_Army_soldiers': (2, 4),
        }
        drop = set(blacklist.get(cat, ()))
        return [mode for pos, mode in enumerate(modes) if pos not in drop]

    idi_list = []
Exemplo n.º 7
0
                # Drop into an interactive console whose local namespace
                # is the loaded pkl; importing readline enables history
                # and line editing in that console.
                import readline, code
                print pkl.keys()
                code.InteractiveConsole(pkl).interact()


if __name__ == '__main__':
    # Command-line interface; each flag's default is shown in its help
    # string.  The parsed options are handed to `main` under rasengan's
    # debug harness.
    parser = argparse.ArgumentParser(
        description='Tabulate performance of saved model files.')
    parser.add_argument(
        '--path', nargs='*', default=[],
        help='A glob of the paths to the pkls')
    parser.add_argument(
        '--interact', default=0, type=int, help='Default={0}')
    parser.add_argument(
        '--keys', nargs='*', default=[], help='Default={0}')
    parser.add_argument(
        '--server', default=0, type=int, help='Default={0}')
    parser.add_argument(
        '--client', default=0, type=int, help='Default={0}')
    with rasengan.debug_support():
        main(args=parser.parse_args())
Exemplo n.º 8
0
def main():
    '''Iteratively extract up to `cfg.mode_count` "modes": in each mode,
    pick one descriptor tag per entity so that the chosen tags are
    maximally similar to each other (under the selected optimization
    method), then blacklist the chosen tags so the next mode finds a
    different set.
    '''
    import argparse
    arg_parser = argparse.ArgumentParser(description='')
    arg_parser.add_argument('--seed', default=0, type=int, help='Default={0}')
    arg_parser.add_argument('--emb_pkl_fn',
                            default='data/demonstrate_similarity_idea.emb.pkl',
                            type=str)
    arg_parser.add_argument(
        '--feat_file',
        default='data/random/details/89c0c894.American_women_writers',
        type=str)
    arg_parser.add_argument('--ctag', default=None, type=int)
    arg_parser.add_argument('--mode_count', default=5, type=int)
    arg_parser.add_argument('--method',
                            default='fast_relax',
                            type=str,
                            choices=[
                                'brute_force', 'fast_relax', 'annealed_gibbs',
                                'maxproduct-bp', 'variational_inference',
                                'dc_programming'
                            ])
    args = arg_parser.parse_args()
    import random
    # Seed both RNGs so runs are reproducible.
    random.seed(args.seed)
    numpy.random.seed(args.seed)
    cfg.mode_count = args.mode_count
    # tags_to_remove[entity] accumulates tags chosen in earlier modes so
    # later modes are forced to pick different ones.
    tags_to_remove = defaultdict(list)
    with rasengan.tictoc('Loading pkl'):
        embeddings = pkl.load(open(args.emb_pkl_fn))
        if cfg.introduce_NULL_embedding:
            # The NULL key gets a zero vector of the same dimensionality
            # as the existing embeddings.
            embeddings[cfg.NULL_KEY] = numpy.zeros(
                next(embeddings.itervalues()).shape)
    with rasengan.debug_support():
        for mode_idx in range(cfg.mode_count):
            print 'mode_idx=', mode_idx
            entity_tags = {}
            entities = []
            # Each row: "entity ||| tag:... tag:..."; keep only tags that
            # have an embedding.
            for row in open(args.feat_file):
                _e, _tags = [e.strip() for e in row.strip().split('|||')]
                entities.append(_e)
                entity_tags[_e] = set([
                    t.lower()
                    for t in (e.strip().split(':')[0] for e in _tags.split())
                    if t.lower() in embeddings
                ])
            total_tags = set(
                rasengan.flatten([list(e) for e in entity_tags.values()]))
            assert all(e in embeddings for e in total_tags)
            print(
                'For each of these people our goal is to select one word.'
                ' That word should be as similar to other words picked for other'
                ' entities as possible')

            problem = rasengan.OrderedDict_Indexable_By_StringKey_Or_Index()
            for (a, b) in entity_tags.items():
                b = list(b)
                print 'Entity: ', a, 'tags to remove: ', tags_to_remove[a]
                for ttr in tags_to_remove[a]:
                    tolerant_remove(b, ttr)
                if cfg.introduce_NULL_embedding and cfg.NULL_KEY not in b:
                    b.append(cfg.NULL_KEY)
                # print '%-25s' % a, '|||', ', '.join(b)
                # One DataFrame per entity: rows are the (optionally
                # unit-scaled) embeddings of its candidate tags.
                problem[a] = DataFrame(data=numpy.concatenate(
                    [(scale_to_unit(embeddings[e])
                      if cfg.scale_to_unit else embeddings[e])[None, :]
                     for e in b],
                    axis=0),
                                       index=b)
            if args.ctag is None:
                # Default start: every entity begins at its first tag.
                initial_assignment = dict(
                    (__a, 0) for __b, __a in enumerate(entities))
            else:
                # NOTE(review): 'war'.split() has a single element, so any
                # --ctag other than 0 raises IndexError — confirm intent.
                ctag = 'war'.split()[args.ctag]
                initial_assignment = dict(
                    (__e,
                     (cfg.NULL_KEY if ctag not in entity_tags[__e] else ctag))
                    for __e in entities)
            print 'Initial chosen tags::', chosen_tags(problem,
                                                       initial_assignment)
            initial_objective = dp_objective_efficient_impl(
                problem, initial_assignment)
            print 'initial_objective=', initial_objective
            # Sanity check: the fast objective agrees with the naive one.
            assert numpy.isclose(
                dp_objective_naive_impl(problem, initial_assignment),
                initial_objective)
            final_assignment = optimize_assignment(problem,
                                                   initial_assignment,
                                                   method=args.method)
            final_objective = dp_objective_efficient_impl(
                problem, final_assignment)
            # Ban this mode's chosen tags for all subsequent modes.
            for (fa_entity, fa_tag_idx) in final_assignment.iteritems():
                tags_to_remove[fa_entity].append(
                    liloc(problem[fa_entity], fa_tag_idx).name)
            print 'mode_idx=', mode_idx,
            print 'initial_objective=', initial_objective,
            print 'final_objective=', final_objective,
            print 'Final chosen tags=', chosen_tags(problem, final_assignment)
    return
Exemplo n.º 9
0
def main(args):
    '''Load train/dev/test transduction data, build the character
    vocabulary and numerized datasets, and return them in a namespace.

    Parameters
    ----------
    args : namespace with file names (`train_fn`, `dev_fn`, `test_fn`),
        dev-partitioning limits, `limit_corpus`, and window size `win`.

    Returns
    -------
    A `rasengan.Namespace` holding raw and numerized splits, the
    vocabulary maps `label2idx`/`idx2label`, and `vocsize`.
    '''
    with rasengan.debug_support():
        with rasengan.tictoc("Loading Data"):
            data_list = rasengan.namespacer(
                read_data(args.train_fn))
            val_data_list = rasengan.namespacer(
                read_data(args.dev_fn))
            if args.partition_dev_into_train > 0:
                # NOTE(review): this branch reads
                # `partition_dev_into_test`, not
                # `partition_dev_into_train` — looks like a copy-paste
                # slip; confirm which limit is intended.
                lim = args.partition_dev_into_test
                data_list.extend(val_data_list[lim:])
                val_data_list = val_data_list[:lim]

            if args.partition_dev_into_test > 0:
                # Carve the test split out of the tail of the dev split.
                lim = args.partition_dev_into_test
                test_data_list = val_data_list[lim:]
                val_data_list = val_data_list[:lim]
            else:
                test_data_list = rasengan.namespacer(
                    read_data(args.test_fn))

            # data_list = val_data_list = [(u'jason', u'eisner')]
            # The character inventory must cover all three splits.
            lst_char = get_lst_char(data_list
                                    + val_data_list
                                    + test_data_list)
            data_list = add_bos(data_list)
            val_data_list = add_bos(val_data_list)
            test_data_list = add_bos(test_data_list)
            warnings.warn('''
            NOTE: While preparing sigma, we add 1 to the index
            returned by enumerate because the transducer unit that
            Ryan wrote uses index 0 as the index for the epsilon
            symbol. So essentially the epsilon symbol and the
            integer 0 are reserved symbols that cannot appear in the
            vocabulary.

            ALSO, we need to add 1 to the vocsize because of that.
            ''')
            # sigma :: char -> int
            sigma = dict((b, a+1) for (a,b) in enumerate(lst_char))

            # sigma_inv :: int -> char
            sigma_inv = dict((a+1, b) for (a,b) in enumerate(lst_char))

            if args.limit_corpus > 0:
                # Optionally truncate training data for quick experiments.
                data_list = data_list[:args.limit_corpus]

            train_data = numerize(data_list, sigma, args.win)
            val_data = numerize(val_data_list, sigma, args.win)
            test_data = numerize(test_data_list, sigma, args.win)

            data = rasengan.Namespace()

            #-------------------------------------------------------------#
            # Add sets that would be used by the tensorflow seq2seq       #
            # model. See~$PY/tensorflow/models/rnn/translate/translate.py #
            #-------------------------------------------------------------#
            data.train_data = data_list
            data.val_data = val_data_list
            data.test_data = test_data_list

            data.train_set = train_data
            data.dev_set = val_data
            data.test_set = test_data

            # +1 reserves index 0 for the epsilon symbol (see warning).
            data.vocsize = len(sigma) + 1
            data.idx2label = sigma_inv
            data.label2idx = sigma

            data.train_lex = [e[0] for e in train_data]
            data.train_y = [e[1] for e in train_data]

            data.valid_lex = [e[0] for e in val_data]
            data.valid_y = util_lstm_seqlabel.convert_id_to_word(
                [e[1] for e in val_data], data.idx2label)

            data.test_lex = [e[0] for e in test_data]
            data.test_y = util_lstm_seqlabel.convert_id_to_word(
                [e[1] for e in test_data], data.idx2label)

            data.words_train = []
            data.words_valid = []
            data.words_test = []
    return data
def main(args):
    '''Load train/dev/test transduction data, build the character
    vocabulary and numerized datasets, and return them in a namespace.

    Parameters
    ----------
    args : namespace with file names (`train_fn`, `dev_fn`, `test_fn`),
        dev-partitioning limits, `limit_corpus`, and window size `win`.

    Returns
    -------
    A `rasengan.Namespace` holding raw and numerized splits, the
    vocabulary maps `label2idx`/`idx2label`, and `vocsize`.
    '''
    with rasengan.debug_support():
        with rasengan.tictoc("Loading Data"):
            data_list = rasengan.namespacer(read_data(args.train_fn))
            val_data_list = rasengan.namespacer(read_data(args.dev_fn))
            if args.partition_dev_into_train > 0:
                # NOTE(review): this branch reads
                # `partition_dev_into_test`, not
                # `partition_dev_into_train` — looks like a copy-paste
                # slip; confirm which limit is intended.
                lim = args.partition_dev_into_test
                data_list.extend(val_data_list[lim:])
                val_data_list = val_data_list[:lim]

            if args.partition_dev_into_test > 0:
                # Carve the test split out of the tail of the dev split.
                lim = args.partition_dev_into_test
                test_data_list = val_data_list[lim:]
                val_data_list = val_data_list[:lim]
            else:
                test_data_list = rasengan.namespacer(read_data(args.test_fn))

            # data_list = val_data_list = [(u'jason', u'eisner')]
            # The character inventory must cover all three splits.
            lst_char = get_lst_char(data_list + val_data_list + test_data_list)
            data_list = add_bos(data_list)
            val_data_list = add_bos(val_data_list)
            test_data_list = add_bos(test_data_list)
            warnings.warn('''
            NOTE: While preparing sigma, we add 1 to the index
            returned by enumerate because the transducer unit that
            Ryan wrote uses index 0 as the index for the epsilon
            symbol. So essentially the epsilon symbol and the
            integer 0 are reserved symbols that cannot appear in the
            vocabulary.

            ALSO, we need to add 1 to the vocsize because of that.
            ''')
            # sigma :: char -> int
            sigma = dict((b, a + 1) for (a, b) in enumerate(lst_char))

            # sigma_inv :: int -> char
            sigma_inv = dict((a + 1, b) for (a, b) in enumerate(lst_char))

            if args.limit_corpus > 0:
                # Optionally truncate training data for quick experiments.
                data_list = data_list[:args.limit_corpus]

            train_data = numerize(data_list, sigma, args.win)
            val_data = numerize(val_data_list, sigma, args.win)
            test_data = numerize(test_data_list, sigma, args.win)

            data = rasengan.Namespace()

            #-------------------------------------------------------------#
            # Add sets that would be used by the tensorflow seq2seq       #
            # model. See~$PY/tensorflow/models/rnn/translate/translate.py #
            #-------------------------------------------------------------#
            data.train_data = data_list
            data.val_data = val_data_list
            data.test_data = test_data_list

            data.train_set = train_data
            data.dev_set = val_data
            data.test_set = test_data

            # +1 reserves index 0 for the epsilon symbol (see warning).
            data.vocsize = len(sigma) + 1
            data.idx2label = sigma_inv
            data.label2idx = sigma

            data.train_lex = [e[0] for e in train_data]
            data.train_y = [e[1] for e in train_data]

            data.valid_lex = [e[0] for e in val_data]
            data.valid_y = util_lstm_seqlabel.convert_id_to_word(
                [e[1] for e in val_data], data.idx2label)

            data.test_lex = [e[0] for e in test_data]
            data.test_y = util_lstm_seqlabel.convert_id_to_word(
                [e[1] for e in test_data], data.idx2label)

            data.words_train = []
            data.words_valid = []
            data.words_test = []
    return data
            #----------------------------------------------------------------------#
            # Print `keys` from pkl file that were specially mentioned on cmdline. #
            #----------------------------------------------------------------------#
            # `pkl` is presumably the dict loaded from the pickle file
            # above this fragment — confirm against the enclosing code.
            for k in args.keys:
                print k, pkl[k]
            #---------------------------------------------------------#
            # In case we want to interact with the pkl after loading. #
            #---------------------------------------------------------#
            if args.interact:
                # readline enables history/editing in the console below.
                import readline, code
                print pkl.keys()
                code.InteractiveConsole(pkl).interact()


if __name__ == '__main__':
    # Build the CLI parser and hand the parsed options to `main` under
    # rasengan's debug harness.  Defaults are shown in the help strings.
    arg_parser = argparse.ArgumentParser(
        description='Tabulate performance of saved model files.')
    arg_parser.add_argument('--path',
                            nargs='*',
                            default=[],
                            help='A glob of the paths to the pkls')
    arg_parser.add_argument('--interact',
                            default=0,
                            type=int,
                            help='Default={0}')
    arg_parser.add_argument('--keys',
                            nargs='*',
                            default=[],
                            help='Default={0}')
    arg_parser.add_argument('--server',
                            default=0,
                            type=int,
                            help='Default={0}')
    arg_parser.add_argument('--client',
                            default=0,
                            type=int,
                            help='Default={0}')
    with rasengan.debug_support():
        main(args=arg_parser.parse_args())