Пример #1
0
    def start(self):
        """Run every cross-validation trial sequentially and report the results.

        Reads the options from ``self.config``, loads the dataset, and then,
        for each ``(train_index, test_index)`` pair produced by
        ``cross_validation_data``, samples the train/test data, builds a
        learner and an expert, fits the expert on the training sample and
        runs ``main_loop``.  The per-trial results are collected in a list
        that is finally handed to ``report_results``.
        """
        # The parenthesised single-argument form prints the same text under
        # the Python 2 print statement and the Python 3 print function.
        print(self.get_name())
        trial = []
        self._setup_options(self.config)
        t0 = time()
        self.data = datautil.load_dataset(self.dataname, self.data_path, categories=self.data_cat, rnd=self.seed,
                                          shuffle=True, percent=self.split, keep_subject=True)
        self.print_lap("Loaded", t0)
        cv = self.cross_validation_data(self.data, folds=self.folds, trials=self.trials, split=self.split)

        # ``t`` is the zero-based trial number; it also drives the learner seed.
        for t, (train_index, test_index) in enumerate(cv):
            # get the data of this cv iteration
            train, test = self._sample_data(self.data, train_index, test_index)
            self.print_lap("\nSampled", t0)

            # get the expert and student; learner seeds are 10, 20, 30, ...
            learner = exputil.get_learner(cfgutil.get_section_options(self.config, 'learner'),
                                          vct=self.vct, sent_tk=self.sent_tokenizer, seed=(t * 10 + 10),
                                          cost_model=self.cost_model)

            expert = exputil.get_expert(cfgutil.get_section_options(self.config, 'expert'), size=len(train.data))

            expert.fit(train.data, y=train.target, vct=self.vct)

            # do active learning
            results = self.main_loop(learner, expert, self.budget, self.bootstrap_size, train, test)
            self.print_lap("\nTrial %s" % t, t0)

            # save the results
            trial.append(results)
        self.report_results(trial)
Пример #2
0
    def _setup_options(self, config_obj):
        """Copy the 'experiment', 'data' and 'expert' settings of *config_obj* onto ``self``.

        :param config_obj: configuration object readable through
            ``cfgutil.get_section_options``
        """
        # --- experiment section ---
        exp_opts = cfgutil.get_section_options(config_obj, 'experiment')
        self.trials = exp_opts['trials']
        self.folds = exp_opts['folds']
        self.max_iteration = exp_opts['maxiter']
        self.step = exp_opts['stepsize']
        self.budget = exp_opts['budget']
        self.prefix = exp_opts['fileprefix']
        self.output = exp_opts['outputdir']
        self.seed = exp_opts['seed']
        self.bootstrap_size, self.bootstrap_method = exputil.get_bootstrap(exp_opts)
        self.costfn = exputil.get_costfn(exp_opts['costfunction'])
        # The cost model is optional; when it is present 'cost_base' is read too.
        if 'cost_model' in exp_opts:
            self.cost_model = exp_opts['cost_model']
            self.cost_base = exp_opts['cost_base']

        # --- data section ---
        data_opts = cfgutil.get_section_options(config_obj, 'data')
        self.split = data_opts['split']
        self.data_cat = data_opts['categories']
        self.limit = data_opts['limit']
        self.data_path = data_opts['path']

        # --- expert section ---
        expert_opts = cfgutil.get_section_options(config_obj, 'expert')
        # 'snip_size' is forwarded to the tokenizer factory only when configured.
        tokenizer_args = {}
        if 'snip_size' in expert_opts:
            tokenizer_args['snip_size'] = expert_opts['snip_size']
        self.sent_tokenizer = exputil.get_tokenizer(expert_opts['sent_tokenizer'], **tokenizer_args)
Пример #3
0
    def _setup_options(self, config_obj):
        """Load the parent class options, then the 'validation_set' experiment option.

        :param config_obj: configuration object readable through
            ``cfgutil.get_section_options``
        """
        super(ExperimentJobs, self)._setup_options(config_obj)

        # Extra experiment option read by this class on top of the parent's.
        self.validation_set = cfgutil.get_section_options(config_obj, 'experiment')['validation_set']
Пример #4
0
    def __init__(self, dataname, config, verbose=False, debug=False):
        """Store the constructor arguments and create placeholder attributes.

        :param dataname: name of the dataset, stored as ``self.dataname``
        :param config: configuration object; its 'data' section is used here
            to build the vectorizer
        :param verbose: stored as ``self.verbose``
        :param debug: stored as ``self.debug``
        """
        super(Experiment, self).__init__()
        self.dataname = dataname
        self.config = config
        self.verbose = verbose
        self.debug = debug

        # Dataset placeholders.
        self.data_cat = self.data = self.data_path = None

        # Experiment setting placeholders; only cost_base starts with a value.
        # NOTE(review): presumably populated later from the configuration - confirm.
        self.trials = self.folds = self.split = None
        self.costfn = self.cost_model = None
        self.cost_base = 25
        self.budget = self.max_iteration = self.step = None
        self.bootstrap_size = self.seed = self.output = None

        # Fixed-seed random state, plus the vectorizer built from the 'data' section.
        self.rnd_state = np.random.RandomState(32564)
        self.remaining = None
        self.vct = exputil.get_vectorizer(cfgutil.get_section_options(config, 'data'))
        self.sent_tokenizer = None
Пример #5
0
    def start(self, n_jobs=1, pre_dispatch='2*n_jobs'):
        """Run all cross-validation trials through ``Parallel`` and report the scores.

        The dataset is loaded and vectorized once, a single expert is fitted
        on ``self.data.train.bow``, and one learner is created per entry of
        the cross-validation list (seeds 10, 20, 30, ...).  Each trial is then
        dispatched as a ``main_loop_jobs`` call and the collected scores are
        passed to ``report_results``.

        :param n_jobs: forwarded to ``Parallel`` as ``n_jobs``
        :param pre_dispatch: forwarded to ``Parallel`` as ``pre_dispatch``
        """
        self._setup_options(self.config)
        # The parenthesised single-argument form prints the same text under
        # the Python 2 print statement and the Python 3 print function.
        print(self.get_name())
        t0 = time()
        self.data = datautil.load_dataset(self.dataname, self.data_path, categories=self.data_cat, rnd=self.seed,
                                          shuffle=True, percent=self.split, keep_subject=True)
        self.print_lap("Loaded", t0)

        self.data = self.vectorize(self.data)

        # ``cv`` must support len() and indexing-free iteration of (train, test) pairs.
        cv = self.cross_validation_data(self.data, folds=self.folds, trials=self.trials, split=self.split)

        # One learner seed per trial: 10, 20, 30, ...
        seeds = np.arange(len(cv)) * 10 + 10

        expert = exputil.get_expert(cfgutil.get_section_options(self.config, 'expert'),
                                    size=(len(self.data.train.target), self.data.train.sizes.max()))

        expert.fit(self.data.train.bow, y=self.data.train.target, vct=self.vct)

        lrnr_setup = {'vct': self.vct, "sent_tk": self.sent_tokenizer, "cost_model": self.cost_model,
                      'validation_set': self.validation_set}

        lrnr_type = cfgutil.get_section_option(self.config, 'learner', 'type')

        neu_threshold = cfgutil.get_section_option(self.config, 'expert', 'threshold')

        # These learner types additionally receive the expert's oracle and the
        # configured expert threshold.
        if lrnr_type in ['utility-cheat', 'const-cheat', 'const-cheat-noisy']:
            lrnr_setup.update({'snip_model': expert.oracle, 'threshold': neu_threshold})

        learners = [exputil.get_learner(cfgutil.get_section_options(self.config, 'learner'),
                                        seed=s, **lrnr_setup) for s in seeds]
        self.print_lap("\nPreprocessed", t0)
        # ===================================
        parallel = Parallel(n_jobs=n_jobs, verbose=True,
                            pre_dispatch=pre_dispatch)
        # Trial ``t`` uses learner ``t`` and the t-th (train, test) index pair.
        scores = parallel(delayed(self.main_loop_jobs, check_pickle=False)(learners[t], expert, self.budget,
                                                                           self.bootstrap_size,
                                                                           self.data, tr[0], tr[1], t)
                          for t, tr in enumerate(cv))
        # ===================================

        self.print_lap("\nDone trials", t0)

        # save the results
        self.report_results(scores)
Пример #6
0
    def set_options(self, config_obj):
        """Populate this object from the 'data', 'expert' and 'experiment' sections.

        :param config_obj: configuration object readable through
            ``cfgutil.get_section_options``
        """
        self.rnd_state = np.random.RandomState(32564)

        # 'data' section: dataset location, categories, split and the vectorizer.
        data_opts = cfgutil.get_section_options(config_obj, 'data')
        self.data_cat = data_opts['categories']
        self.data_path = data_opts['path']
        self.split = data_opts['split']
        self.vct = exputil.get_vectorizer(data_opts)

        # 'expert' section: only the sentence tokenizer is read here.
        expert_opts = cfgutil.get_section_options(config_obj, 'expert')
        self.sent_tokenizer = exputil.get_tokenizer(expert_opts['sent_tokenizer'])

        # 'experiment' section: run parameters.
        run_opts = cfgutil.get_section_options(config_obj, 'experiment')
        self.seed = run_opts['seed']
        self.budget = run_opts['budget']
        self.step = run_opts['stepsize']
        self.output = run_opts['outputdir']
        self.bootstrap_size = run_opts['bootstrap']
Пример #7
0
    def get_expert(self, config, target_names):
        """Build the expert described by the 'expert' section of *config*.

        Only the ``'human'`` expert type is handled.  The prompt passed to
        ``HumanExpert`` lists every target name with its index, followed by
        an extra ``'neutral'`` choice, e.g. ``"0=a, 1=b, 2=neutral ? > "``.

        :param config: configuration object readable through
            ``cfgutil.get_section_options``
        :param target_names: iterable of class names (list, tuple, array, ...)
        :return: a ``HumanExpert`` instance
        :raises ValueError: if the configured expert type is not ``'human'``
            (``ValueError`` derives from ``Exception``, which was raised before)
        """
        expert_type = cfgutil.get_section_options(config, 'expert')['type']
        if expert_type != 'human':
            # Report the offending type itself rather than the whole options mapping.
            raise ValueError("Oops, cannot handle an %s expert" % (expert_type,))

        from expert.human_expert import HumanExpert

        # list() lets any iterable of names be extended with the extra choice.
        choices = list(target_names) + ['neutral']
        names = ", ".join("{}={}".format(a, b) for a, b in enumerate(choices)) + " ? > "
        return HumanExpert(None, names)
Пример #8
0
    def get_student(self, config, pool, sequence):
        """Create the two learner bundles from the 'learner1' and 'learner2' sections.

        Each bundle is a ``bunch.Bunch`` holding the student, a
        ``"<utility>-<snippet>"`` name, its pool, an empty training list, a zero
        budget and the sequence it follows.  The first learner follows
        *sequence* as given; the second follows a copy shuffled with a
        fixed-seed random state.

        :param config: configuration object readable through
            ``cfgutil.get_section_options``
        :param pool: indexable pair of pools; each gets its ``remaining``
            attribute replaced by a deque of its learner's sequence
        :param sequence: iterable giving the order for the first learner
        :return: tuple ``(self.learner1, self.learner2)``
        """
        from collections import deque

        # --- first learner: original order ---
        first_cfg = cfgutil.get_section_options(config, 'learner1')
        pool[0].remaining = deque(sequence)
        first_student = exputil.get_learner(first_cfg, vct=self.vct, sent_tk=self.sent_tokenizer, seed=self.seed)
        first_name = "{}-{}".format(first_cfg['utility'], first_cfg['snippet'])
        self.learner1 = bunch.Bunch(student=first_student, name=first_name,
                                    pool=pool[0], train=[], budget=0, sequence=sequence)

        # --- second learner: reshuffled copy of the sequence ---
        second_cfg = cfgutil.get_section_options(config, 'learner2')
        second_student = exputil.get_learner(second_cfg, vct=self.vct, sent_tk=self.sent_tokenizer, seed=self.seed)

        shuffled = list(sequence)
        np.random.RandomState(9187465).shuffle(shuffled)

        # update the pool
        pool[1].remaining = deque(shuffled)
        second_name = "{}-{}".format(second_cfg['utility'], second_cfg['snippet'])
        self.learner2 = bunch.Bunch(student=second_student, name=second_name,
                                    pool=pool[1], train=[], budget=0, sequence=shuffled)
        return self.learner1, self.learner2
Пример #9
0
 def get_name(self):
     """Return the experiment name built from the dataset name and learner options.

     The name combines ``self.dataname``, the learner's 'type', 'utility',
     'snippet' and 'calibration' options, and the experiment 'fileprefix'
     option appended at the end.
     """
     learner_opts = cfgutil.get_section_options(self.config, 'learner')
     suffix = cfgutil.get_section_option(self.config, 'experiment', 'fileprefix')
     fields = (self.dataname, learner_opts['type'], learner_opts['utility'],
               learner_opts['snippet'], learner_opts['calibration'], suffix)
     return "data-{}-lrn-{}-ut-{}-snip-{}-cal-{}{}".format(*fields)
Пример #10
0
 def get_name(self):
     """Return the experiment name built from the dataset name and learner options.

     The name combines ``self.dataname``, the learner's 'type' and
     'loss_function' options, and the experiment 'fileprefix' option.
     """
     learner_opts = cfgutil.get_section_options(self.config, 'learner')
     suffix = cfgutil.get_section_option(self.config, 'experiment', 'fileprefix')
     fields = (self.dataname, learner_opts['type'], learner_opts['loss_function'], suffix)
     return "data-{}-lrn-{}-ut-{}-{}".format(*fields)