# Assumed imports, inferred from usage across the snippets below; the
# exact module paths are assumptions, not given by the source.
import os
import numpy as np
import datasets as ds  # assumed to provide ds.loader.load_dataset, etc.
from models import LogisticRegression  # assumed location
from task_queue import TaskQueue  # assumed location

Example #1

def __init__(self, config, out_dir=None):
        super(DataValuation, self).__init__(config, out_dir)
        self.datasets = ds.loader.load_dataset(**self.config['dataset_config'])
        self.dataset_id = self.config['dataset_config']['dataset_id']
        self.data_dir = self.config['dataset_config']['data_dir']
        self.train = self.datasets.train
        self.test = self.datasets.test
        self.validation = self.datasets.validation
        if self.config['sample_weights']:
            self.sample_weights = ds.loader.load_supplemental_info(
                self.dataset_id + '_weights', data_dir=self.data_dir)
        else:
            # Default to uniform weights for train, validation, and test.
            self.sample_weights = [np.ones(self.train.x.shape[0]),
                                   np.ones(self.validation.x.shape[0]),
                                   np.ones(self.test.x.shape[0])]
        self.nonfires = ds.loader.load_supplemental_info(
            self.dataset_id + '_nonfires', data_dir=self.data_dir)

        model_dir = os.path.join(self.base_dir, 'models')
        model_config = LogisticRegression.default_config()
        model_config['arch'] = LogisticRegression.infer_arch(self.train)
        model_config['arch']['fit_intercept'] = True

        # Heuristic for determining maximum batch evaluation sizes without OOM
        D = model_config['arch']['input_dim'] * model_config['arch']['num_classes']
        model_config['grad_batch_size'] = max(1, self.config['max_memory'] // D)
        model_config['hessian_batch_size'] = max(1, self.config['max_memory'] // (D * D))
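        # Worked example (hypothetical numbers): with input_dim = 784 and
        # num_classes = 10, D = 7840; for max_memory = int(1e7) this gives
        # grad_batch_size = 1e7 // 7840 = 1275, while D * D is about 6.1e7,
        # so hessian_batch_size bottoms out at the floor of 1.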

        self.model_dir = model_dir
        self.model_config = model_config

        # Convenience member variables
        self.num_train = self.train.num_examples
        self.num_classes = self.model_config['arch']['num_classes']

Example #2

def get_model(self, dataset_id=None):
    if not hasattr(self, 'model'):
        dataset = self.get_dataset(dataset_id)
        model_config = LogisticRegression.default_config()
        model_config['arch'] = LogisticRegression.infer_arch(dataset.train)
        model_dir = os.path.join(self.base_dir, 'models')
        self.model = LogisticRegression(model_config, model_dir)
    return self.model

Example #3

def get_model(self):
    if not hasattr(self, 'model'):
        self.model = LogisticRegression(
            self.model_config,
            self.model_dir,
            random_state=np.random.RandomState(2))
    return self.model
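
Both get_model variants above use the same hasattr-guarded lazy initialization. As a design note, the standard library (Python 3.8+) offers the same idiom via functools.cached_property; a minimal self-contained sketch, where _Model and Experiment are illustrative names, not from the source:

from functools import cached_property

class _Model:
    # Stand-in for LogisticRegression; illustrative only.
    def __init__(self, config, model_dir):
        self.config = config
        self.model_dir = model_dir

class Experiment:
    def __init__(self, model_config, model_dir):
        self.model_config = model_config
        self.model_dir = model_dir

    @cached_property
    def model(self):
        # Built once on first attribute access, then cached on the instance.
        return _Model(self.model_config, self.model_dir)

One practical difference is small: with the hasattr guard, `del self.model` forces a rebuild on the next call, and cached_property behaves the same way because its cache lives in the instance __dict__.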

Example #4

    def __init__(self, config, out_dir=None):
        super(TestLogreg, self).__init__(config, out_dir)
        self.datasets = ds.loader.load_dataset(**self.config['dataset_config'])

        model_dir = os.path.join(self.base_dir, 'models')
        model_config = LogisticRegression.default_config()
        model_config['arch'] = LogisticRegression.infer_arch(
            self.datasets.train)
        model_config['arch']['fit_intercept'] = self.config['fit_intercept']
        self.model_dir = model_dir
        self.model_config = model_config

        MAX_MEMORY = int(1e7)
        D = model_config['arch']['input_dim'] * model_config['arch']['num_classes']
        self.eval_args = {
            'grad_batch_size': max(1, MAX_MEMORY // D),
            'hess_batch_size': max(1, MAX_MEMORY // (D * D)),
        }
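        # Note (illustrative): with MAX_MEMORY = 1e7, 'hess_batch_size'
        # bottoms out at 1 whenever D exceeds sqrt(1e7) ~ 3162, i.e. for any
        # architecture where input_dim * num_classes is above that.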

Example #5

    def __init__(self, config, out_dir=None):
        super(SubsetInfluenceLogreg, self).__init__(config, out_dir)
        self.datasets = ds.loader.load_dataset(**self.config['dataset_config'])
        self.train = self.datasets.train
        self.test = self.datasets.test
        self.validation = self.datasets.validation

        model_dir = os.path.join(self.base_dir, 'models')
        model_config = LogisticRegression.default_config()
        model_config['arch'] = LogisticRegression.infer_arch(self.datasets.train)
        model_config['arch']['fit_intercept'] = True

        # Heuristic for determining maximum batch evaluation sizes without OOM
        D = model_config['arch']['input_dim'] * model_config['arch']['num_classes']
        model_config['grad_batch_size'] = max(1, self.config['max_memory'] // D)
        model_config['hessian_batch_size'] = max(1, self.config['max_memory'] // (D * D))

        # Set the method for computing inverse HVP
        model_config['inverse_hvp_method'] = self.config['inverse_hvp_method']

        self.model_dir = model_dir
        self.model_config = model_config

        # Convenience member variables
        self.dataset_id = self.config['dataset_config']['dataset_id']
        self.num_train = self.datasets.train.num_examples
        self.num_classes = self.model_config['arch']['num_classes']
        self.num_subsets = self.config['num_subsets']
        # Assumption: the original snippet reads subset_choice_type without
        # ever assigning it; presumably it comes from the config like the
        # other settings above.
        self.subset_choice_type = self.config['subset_choice_type']
        if self.subset_choice_type == "types":
            self.subset_size = int(self.num_train * self.config['subset_rel_size'])
        elif self.subset_choice_type == "range":
            self.subset_min_size = int(self.num_train * self.config['subset_min_rel_size'])
            self.subset_max_size = int(self.num_train * self.config['subset_max_rel_size'])
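        # Worked example (hypothetical numbers): with num_train = 10000 and
        # subset_rel_size = 0.05, subset_size = int(10000 * 0.05) = 500.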

        tasks_dir = os.path.join(self.base_dir, 'tasks')
        self.task_queue = TaskQueue(tasks_dir, master_only=self.config['master_only'])
        self.task_queue.define_task('retrain_subsets', self.retrain_subsets)
        self.task_queue.define_task('self_pred_infl', self.self_pred_infl)
        self.task_queue.define_task('newton_batch', self.newton_batch)
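
TaskQueue is the repo's own helper rather than a public library; as a rough mental model of the define-then-dispatch pattern it is used with here, a minimal self-contained sketch (all names below are hypothetical, not the repo's API):

class MiniTaskQueue:
    """Toy registry mapping task names to callables."""
    def __init__(self):
        self._tasks = {}

    def define_task(self, name, fn):
        # Register a callable under a string name.
        self._tasks[name] = fn

    def run(self, name, *args, **kwargs):
        # Dispatch by name; raises KeyError for unknown tasks.
        return self._tasks[name](*args, **kwargs)

queue = MiniTaskQueue()
queue.define_task('square', lambda x: x * x)
assert queue.run('square', 3) == 9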

Example #6

    def __init__(self, config, out_dir=None):
        super(CreditAssignment, self).__init__(config, out_dir)
        self.datasets = ds.loader.load_dataset(**self.config['dataset_config'])
        self.dataset_id = self.config['dataset_config']['dataset_id']
        self.data_dir = self.config['dataset_config']['data_dir']

        self.train = self.datasets.train
        print("Shape of training set: {}".format(self.train.x.shape))
        self.test = self.datasets.test
        self.validation = self.datasets.validation
        if self.config['sample_weights']:
            self.sample_weights = ds.loader.load_supplemental_info(
                self.dataset_id + '_weights', data_dir=self.data_dir)
        else:
            # Default to uniform weights for train, validation, and test.
            self.sample_weights = [np.ones(self.train.x.shape[0]),
                                   np.ones(self.validation.x.shape[0]),
                                   np.ones(self.test.x.shape[0])]

        self.num_train = self.train.num_examples

        model_dir = os.path.join(self.base_dir, 'models')
        model_config = LogisticRegression.default_config()
        model_config['arch'] = LogisticRegression.infer_arch(self.train)
        model_config['arch']['fit_intercept'] = True

        # Heuristic for determining maximum batch evaluation sizes without OOM
        D = model_config['arch']['input_dim'] * model_config['arch']['num_classes']
        if 'grad_batch_size' in self.config and self.config['grad_batch_size'] is not None:
            model_config['grad_batch_size'] = self.config['grad_batch_size']
        else:
            model_config['grad_batch_size'] = max(1, self.config['max_memory'] // D)
        if 'hessian_batch_size' in self.config and self.config['hessian_batch_size'] is not None:
            model_config['hessian_batch_size'] = self.config['hessian_batch_size']
        else:
            model_config['hessian_batch_size'] = max(1, self.config['max_memory'] // (D * D))
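        # For instance (hypothetical config), {'grad_batch_size': 512,
        # 'hessian_batch_size': None, 'max_memory': int(1e7)} uses 512 for
        # gradients and falls back to the OOM heuristic for the Hessian.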

        self.model_dir = model_dir
        self.model_config = model_config

        # Convenience member variables
        self.num_classes = self.model_config['arch']['num_classes']
        self.nonfires = ds.loader.load_supplemental_info(
            self.dataset_id + '_nonfires', data_dir=self.data_dir)

        def print_class_balance(dataset, name):
            # Print each class's share of the labels. (The parameter avoids
            # shadowing the module-level 'ds' alias.)
            print("Dataset {}:".format(name))
            for i, val in enumerate(np.bincount(dataset.labels) / dataset.labels.shape[0]):
                print("Class {} is {} of the dataset.".format(i, val))

        print_class_balance(self.train, 'train')
        print_class_balance(self.test, 'test')
        print_class_balance(self.nonfires, 'nonfires')

        self.task_queue = TaskQueue(os.path.join(self.base_dir, 'tasks'),
                                    master_only=self.config['master_only'])
        self.task_queue.define_task('compute_all_and_fixed_test_and_nonfire_influence',
                                    self.compute_all_and_fixed_test_and_nonfire_influence)
        self.task_queue.define_task('retrain_subsets', self.retrain_subsets)
        self.task_queue.define_task('self_pred_infl', self.self_pred_infl)

Example #7

def get_model(self):
    if not hasattr(self, 'model'):
        self.model = LogisticRegression(self.model_config, self.model_dir)
    return self.model