def __init__(self, config, out_dir=None):
    super(DataValuation, self).__init__(config, out_dir)

    self.datasets = ds.loader.load_dataset(**self.config['dataset_config'])
    self.dataset_id = self.config['dataset_config']['dataset_id']
    self.data_dir = self.config['dataset_config']['data_dir']

    self.train = self.datasets.train
    self.test = self.datasets.test
    self.validation = self.datasets.validation

    if self.config['sample_weights']:
        self.sample_weights = ds.loader.load_supplemental_info(
            self.dataset_id + '_weights', data_dir=self.data_dir)
    else:
        self.sample_weights = [np.ones(self.train.x.shape[0]),
                               np.ones(self.validation.x.shape[0]),
                               np.ones(self.test.x.shape[0])]

    self.nonfires = ds.loader.load_supplemental_info(
        self.dataset_id + '_nonfires', data_dir=self.data_dir)

    model_dir = os.path.join(self.base_dir, 'models')
    model_config = LogisticRegression.default_config()
    model_config['arch'] = LogisticRegression.infer_arch(self.train)
    model_config['arch']['fit_intercept'] = True

    # Heuristic for determining maximum batch evaluation sizes without OOM
    D = model_config['arch']['input_dim'] * model_config['arch']['num_classes']
    model_config['grad_batch_size'] = max(1, self.config['max_memory'] // D)
    model_config['hessian_batch_size'] = max(1, self.config['max_memory'] // (D * D))

    self.model_dir = model_dir
    self.model_config = model_config

    # Convenience member variables
    self.num_train = self.train.num_examples
    self.num_classes = self.model_config['arch']['num_classes']
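# A worked sketch of the OOM heuristic above, using assumed dimensions (this
# helper is illustrative only, not part of the class): a batch of per-example
# gradients occupies batch_size * D floats and a batch of per-example Hessians
# batch_size * D * D floats, so capping the batch sizes at max_memory // D and
# max_memory // (D * D) keeps either buffer within the memory budget.
def _illustrate_batch_size_heuristic(max_memory=int(1e7)):
    input_dim, num_classes = 784, 10   # assumed MNIST-sized architecture
    D = input_dim * num_classes        # gradient floats per example
    grad_batch_size = max(1, max_memory // D)           # 1275 examples per batch
    hessian_batch_size = max(1, max_memory // (D * D))  # D*D > max_memory, so 1
    return grad_batch_size, hessian_batch_size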
def get_model(self, dataset_id=None):
    if not hasattr(self, 'model'):
        dataset = self.get_dataset(dataset_id)
        model_config = LogisticRegression.default_config()
        model_config['arch'] = LogisticRegression.infer_arch(dataset.train)
        model_dir = os.path.join(self.base_dir, 'models')
        self.model = LogisticRegression(model_config, model_dir)
    return self.model
def get_model(self):
    if not hasattr(self, 'model'):
        self.model = LogisticRegression(
            self.model_config, self.model_dir,
            random_state=np.random.RandomState(2))
    return self.model
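# The hasattr guard above builds the model lazily on first call and reuses the
# same instance afterwards; the fixed RandomState(2) seed presumably keeps
# refitting reproducible across runs. A minimal standalone sketch of the same
# memoization pattern using functools.cached_property (Python 3.8+; the class
# below is hypothetical, not project code):
import functools

class _LazyModelExample(object):
    @functools.cached_property
    def model(self):
        # Constructed once on first access; later accesses hit the cache.
        return object()

# _LazyModelExample().model returns the same object on every access.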
def __init__(self, config, out_dir=None):
    super(TestLogreg, self).__init__(config, out_dir)

    self.datasets = ds.loader.load_dataset(**self.config['dataset_config'])

    model_dir = os.path.join(self.base_dir, 'models')
    model_config = LogisticRegression.default_config()
    model_config['arch'] = LogisticRegression.infer_arch(self.datasets.train)
    model_config['arch']['fit_intercept'] = self.config['fit_intercept']

    self.model_dir = model_dir
    self.model_config = model_config

    # Heuristic for determining maximum batch evaluation sizes without OOM
    MAX_MEMORY = int(1e7)
    D = model_config['arch']['input_dim'] * model_config['arch']['num_classes']
    self.eval_args = {
        'grad_batch_size': max(1, MAX_MEMORY // D),
        'hess_batch_size': max(1, MAX_MEMORY // (D * D)),
    }
def __init__(self, config, out_dir=None):
    super(SubsetInfluenceLogreg, self).__init__(config, out_dir)

    self.datasets = ds.loader.load_dataset(**self.config['dataset_config'])
    self.train = self.datasets.train
    self.test = self.datasets.test
    self.validation = self.datasets.validation

    model_dir = os.path.join(self.base_dir, 'models')
    model_config = LogisticRegression.default_config()
    model_config['arch'] = LogisticRegression.infer_arch(self.datasets.train)
    model_config['arch']['fit_intercept'] = True

    # Heuristic for determining maximum batch evaluation sizes without OOM
    D = model_config['arch']['input_dim'] * model_config['arch']['num_classes']
    model_config['grad_batch_size'] = max(1, self.config['max_memory'] // D)
    model_config['hessian_batch_size'] = max(1, self.config['max_memory'] // (D * D))

    # Set the method for computing inverse HVPs
    model_config['inverse_hvp_method'] = self.config['inverse_hvp_method']

    self.model_dir = model_dir
    self.model_config = model_config

    # Convenience member variables
    self.dataset_id = self.config['dataset_config']['dataset_id']
    self.num_train = self.datasets.train.num_examples
    self.num_classes = self.model_config['arch']['num_classes']
    self.num_subsets = self.config['num_subsets']
    if self.subset_choice_type == "types":
        self.subset_size = int(self.num_train * self.config['subset_rel_size'])
    elif self.subset_choice_type == "range":
        self.subset_min_size = int(self.num_train * self.config['subset_min_rel_size'])
        self.subset_max_size = int(self.num_train * self.config['subset_max_rel_size'])

    tasks_dir = os.path.join(self.base_dir, 'tasks')
    self.task_queue = TaskQueue(tasks_dir, master_only=self.config['master_only'])
    self.task_queue.define_task('retrain_subsets', self.retrain_subsets)
    self.task_queue.define_task('self_pred_infl', self.self_pred_infl)
    self.task_queue.define_task('newton_batch', self.newton_batch)
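# Sketch of the name-to-callable registry behind the define_task calls above:
# tasks are registered under string names so workers can dispatch them later.
# _MiniTaskQueue and its run_task method are hypothetical stand-ins for the
# project's TaskQueue, shown only to illustrate the pattern.
class _MiniTaskQueue(object):
    def __init__(self):
        self._tasks = {}

    def define_task(self, name, fn):
        self._tasks[name] = fn

    def run_task(self, name, *args, **kwargs):
        return self._tasks[name](*args, **kwargs)

# Usage:
#   queue = _MiniTaskQueue()
#   queue.define_task('retrain_subsets', lambda ids: len(ids))
#   queue.run_task('retrain_subsets', [1, 2, 3])  # -> 3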
def __init__(self, config, out_dir=None):
    super(CreditAssignment, self).__init__(config, out_dir)

    self.datasets = ds.loader.load_dataset(**self.config['dataset_config'])
    self.dataset_id = self.config['dataset_config']['dataset_id']
    self.data_dir = self.config['dataset_config']['data_dir']

    self.train = self.datasets.train
    print("Shape of training set: {}".format(self.train.x.shape))
    self.test = self.datasets.test
    self.validation = self.datasets.validation

    if self.config['sample_weights']:
        self.sample_weights = ds.loader.load_supplemental_info(
            self.dataset_id + '_weights', data_dir=self.data_dir)
    else:
        self.sample_weights = [np.ones(self.train.x.shape[0]),
                               np.ones(self.validation.x.shape[0]),
                               np.ones(self.test.x.shape[0])]

    self.num_train = self.train.num_examples

    model_dir = os.path.join(self.base_dir, 'models')
    model_config = LogisticRegression.default_config()
    model_config['arch'] = LogisticRegression.infer_arch(self.train)
    model_config['arch']['fit_intercept'] = True

    # Heuristic for determining maximum batch evaluation sizes without OOM,
    # unless the config specifies the batch sizes explicitly
    D = model_config['arch']['input_dim'] * model_config['arch']['num_classes']
    if self.config.get('grad_batch_size') is not None:
        model_config['grad_batch_size'] = self.config['grad_batch_size']
    else:
        model_config['grad_batch_size'] = max(1, self.config['max_memory'] // D)
    if self.config.get('hessian_batch_size') is not None:
        model_config['hessian_batch_size'] = self.config['hessian_batch_size']
    else:
        model_config['hessian_batch_size'] = max(1, self.config['max_memory'] // (D * D))

    self.model_dir = model_dir
    self.model_config = model_config

    # Convenience member variables
    self.num_classes = self.model_config['arch']['num_classes']

    self.nonfires = ds.loader.load_supplemental_info(
        self.dataset_id + '_nonfires', data_dir=self.data_dir)

    def print_class_balance(dataset, name):
        print("Dataset {}:".format(name))
        for i, val in enumerate(np.bincount(dataset.labels) / dataset.labels.shape[0]):
            print("Class {} is {} of the dataset.".format(i, val))

    print_class_balance(self.train, 'train')
    print_class_balance(self.test, 'test')
    print_class_balance(self.nonfires, 'nonfires')

    self.task_queue = TaskQueue(os.path.join(self.base_dir, 'tasks'),
                                master_only=self.config['master_only'])
    self.task_queue.define_task('compute_all_and_fixed_test_and_nonfire_influence',
                                self.compute_all_and_fixed_test_and_nonfire_influence)
    self.task_queue.define_task('retrain_subsets', self.retrain_subsets)
    self.task_queue.define_task('self_pred_infl', self.self_pred_infl)
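# Standalone demo of the bincount-based class balance computed above; the toy
# labels here are made up for illustration:
import numpy as np

def _class_balance_demo():
    labels = np.array([0, 0, 1, 1, 1, 2])
    for i, val in enumerate(np.bincount(labels) / labels.shape[0]):
        print("Class {} is {} of the dataset.".format(i, val))
    # -> Class 0 is 0.333..., Class 1 is 0.5, Class 2 is 0.166...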
def get_model(self):
    if not hasattr(self, 'model'):
        self.model = LogisticRegression(self.model_config, self.model_dir)
    return self.model