def __init__(self, root_dir='data', download=False, split_scheme='official'):
    """Initialize the Yelp reviews dataset.

    Args:
        root_dir (str): Directory under which the data lives (or is downloaded to).
        download (bool): Whether to download the data if it is missing.
        split_scheme (str): Which split CSV to load; 'official' is an alias
            for 'time'. The name selects `splits/<scheme>.csv`.
    """
    # set variables
    self._dataset_name = 'yelp'
    self._version = '1.0'
    # The official split is the time split.
    if split_scheme=='official':
        split_scheme = 'time'
    self._split_scheme = split_scheme
    self._y_type = 'long'
    self._y_size = 1
    self._n_classes = 5  # star ratings: labels 0-4
    # path
    self._data_dir = self.initialize_data_dir(root_dir, download)
    # Load data
    # keep_default_na=False / na_values=[] keep strings like 'NA' as literal
    # text instead of NaN; QUOTE_NONNUMERIC matches how the CSV was written.
    data_df = pd.read_csv(os.path.join(self.data_dir, 'reviews.csv'),
                          dtype={'review_id': str, 'user_id':str, 'business_id':str,
                                 'stars':int, 'useful':int, 'funny':int, 'cool':int,
                                 'text':str, 'date':str, 'year':int, 'city':str,
                                 'state':str, 'categories':str},
                          keep_default_na=False, na_values=[],
                          quoting=csv.QUOTE_NONNUMERIC)
    # The split CSV is row-aligned with reviews.csv: the same boolean mask is
    # applied to both frames to drop rows not in the dataset.
    split_df = pd.read_csv(os.path.join(self.data_dir, 'splits', f'{self.split_scheme}.csv'))
    is_in_dataset = split_df['split']!=NOT_IN_DATASET
    split_df = split_df[is_in_dataset]
    data_df = data_df[is_in_dataset]
    # Get arrays
    self._split_array = split_df['split'].values
    self._input_array = list(data_df['text'])
    # Get metadata
    self._metadata_fields, self._metadata_array, self._metadata_map = self.load_metadata(data_df, self.split_array)
    # Get y from metadata: cast the 'y' metadata column via the tensor
    # method named by self._y_type (here 'long').
    self._y_array = getattr(self.metadata_array[:,self.metadata_fields.index('y')], self._y_type)()
    # Set split info
    self.initialize_split_dicts()
    # eval
    self.initialize_eval_grouper()
    self._metric = Accuracy()
    super().__init__(root_dir, download, split_scheme)
def __init__(self, root_dir='data', download=False, split_scheme='official'):
    """Initialize the Amazon reviews dataset.

    Args:
        root_dir (str): Directory under which the data lives (or is downloaded to).
        download (bool): Whether to download the data if it is missing.
        split_scheme (str): Which split CSV to load; 'official' is an alias
            for 'user'. The name selects `splits/<scheme>.csv`.
    """
    # set variables
    self._dataset_name = 'amazon'
    self._version = '1.0'
    self._download_url = 'https://worksheets.codalab.org/rest/bundles/0x60237058e01749cda7b0701c2bd01420/contents/blob/'
    self._compressed_size = 4_066_541_568  # download size in bytes
    # the official split is the user split
    if split_scheme == 'official':
        split_scheme = 'user'
    self._split_scheme = split_scheme
    self._y_type = 'long'
    self._y_size = 1
    self._n_classes = 5  # star ratings: labels 0-4
    # path
    self._data_dir = self.initialize_data_dir(root_dir, download)
    # Load data
    # keep_default_na=False / na_values=[] keep strings like 'NA' as literal
    # text instead of NaN; QUOTE_NONNUMERIC matches how the CSV was written.
    data_df = pd.read_csv(os.path.join(self.data_dir, 'reviews.csv'),
                          dtype={
                              'reviewerID': str,
                              'asin': str,
                              'reviewTime': str,
                              'unixReviewTime': int,
                              'reviewText': str,
                              'summary': str,
                              'verified': bool,
                              'category': str,
                              'reviewYear': int
                          },
                          keep_default_na=False,
                          na_values=[],
                          quoting=csv.QUOTE_NONNUMERIC)
    # The split CSV is row-aligned with reviews.csv: the same boolean mask is
    # applied to both frames to drop rows not in the dataset.
    split_df = pd.read_csv(
        os.path.join(self.data_dir, 'splits', f'{self.split_scheme}.csv'))
    is_in_dataset = split_df['split'] != NOT_IN_DATASET
    split_df = split_df[is_in_dataset]
    data_df = data_df[is_in_dataset]
    # Get arrays
    self._split_array = split_df['split'].values
    self._input_array = list(data_df['reviewText'])
    # Get metadata
    self._metadata_fields, self._metadata_array, self._metadata_map = self.load_metadata(
        data_df, self.split_array)
    # Get y from metadata: cast the 'y' metadata column via the tensor
    # method named by self._y_type (here 'long').
    self._y_array = getattr(
        self.metadata_array[:, self.metadata_fields.index('y')],
        self._y_type)()
    # Set split info
    self.initialize_split_dicts()
    # eval
    self.initialize_eval_grouper()
    self._metric = Accuracy()
    super().__init__(root_dir, download, split_scheme)
def eval(self, y_pred, y_true, metadata, prediction_fn=None):
    """
    Computes all evaluation metrics.
    Args:
        - y_pred (Tensor): Predictions from a model. By default, they are
            predicted labels (LongTensor). But they can also be other model
            outputs such that prediction_fn(y_pred) are predicted labels.
        - y_true (LongTensor): Ground-truth labels
        - metadata (Tensor): Metadata
        - prediction_fn (function): A function that turns y_pred into predicted labels
    Output:
        - results (dictionary): Dictionary of evaluation metrics
        - results_str (str): String summarizing the evaluation metrics
    """
    metrics = [
        Accuracy(prediction_fn=prediction_fn),
        Recall(prediction_fn=prediction_fn, average='macro'),
        F1(prediction_fn=prediction_fn, average='macro'),
    ]
    results = {}
    # Iterate over the metric objects directly (instead of range(len(...)))
    # and merge each metric's result dict without the redundant {**...} copy.
    for metric in metrics:
        results.update(metric.compute(y_pred, y_true))
    results_str = (
        f"Average acc: {results[metrics[0].agg_metric_field]:.3f}\n"
        f"Recall macro: {results[metrics[1].agg_metric_field]:.3f}\n"
        f"F1 macro: {results[metrics[2].agg_metric_field]:.3f}\n")
    return results, results_str
def eval(self, y_pred, y_true, metadata, prediction_fn=None):
    """
    Computes all evaluation metrics.
    Args:
        - y_pred (Tensor): Predictions from a model. By default, they are
            predicted labels (LongTensor). But they can also be other model
            outputs such that prediction_fn(y_pred) are predicted labels.
        - y_true (LongTensor): Ground-truth labels
        - metadata (Tensor): Metadata
        - prediction_fn (function): A function that turns y_pred into predicted labels
    Output:
        - results (dictionary): Dictionary of evaluation metrics
        - results_str (str): String summarizing the evaluation metrics
    """
    metric = Accuracy(prediction_fn=prediction_fn)
    results = {
        **metric.compute(y_pred, y_true),
    }
    results_str = f"Average {metric.name}: {results[metric.agg_metric_field]:.3f}\n"
    # Each eval_grouper is over label + a single identity.
    # We only want to keep the groups where the identity is positive.
    # The groups are:
    #   Group 0: identity = 0, y = 0
    #   Group 1: identity = 1, y = 0
    #   Group 2: identity = 0, y = 1
    #   Group 3: identity = 1, y = 1
    # so this means we want only groups 1 and 3.
    worst_group_metric = None
    for identity_var, eval_grouper in zip(self._identity_vars, self._eval_groupers):
        g = eval_grouper.metadata_to_group(metadata)
        group_results = {
            **metric.compute_group_wise(y_pred, y_true, g, eval_grouper.n_groups)
        }
        results_str += f" {identity_var:20s}"
        for group_idx in range(eval_grouper.n_groups):
            group_str = eval_grouper.group_field_str(group_idx)
            # Keep only groups where the identity attribute is positive.
            if f'{identity_var}:1' in group_str:
                group_metric = group_results[metric.group_metric_field(group_idx)]
                group_counts = group_results[metric.group_count_field(group_idx)]
                results[f'{metric.name}_{group_str}'] = group_metric
                results[f'count_{group_str}'] = group_counts
                # 'y:0' in the group string marks the negative (non-toxic) label.
                label_str = 'non_toxic' if 'y:0' in group_str else 'toxic'
                # Reuse the already-bound group_counts local rather than
                # re-indexing results[f'count_{group_str}'] (same value).
                results_str += (
                    f" {metric.name} on {label_str}: {group_metric:.3f}"
                    f" (n = {group_counts:6.0f}) "
                )
                # Track the worst metric seen so far across all kept groups.
                if worst_group_metric is None:
                    worst_group_metric = group_metric
                else:
                    worst_group_metric = metric.worst(
                        [worst_group_metric, group_metric])
        results_str += "\n"
    results[metric.worst_group_metric_field] = worst_group_metric
    results_str += f"Worst-group {metric.name}: {worst_group_metric:.3f}\n"
    return results, results_str
def eval(self, y_pred, y_true, metadata, prediction_fn=multiclass_logits_to_pred, score_fn=binary_logits_to_score):
    """
    Computes all evaluation metrics: overall and group-wise accuracy, plus
    the precision achieved at a fixed global recall of 60%.
    Args:
        - y_pred (Tensor): Predictions from a model. By default, they are
            multi-class logits (FloatTensor). But they can also be other
            model outputs such that prediction_fn(y_pred) are predicted
            labels and score_fn(y_pred) are confidence scores.
        - y_true (LongTensor): Ground-truth labels
        - metadata (Tensor): Metadata
        - prediction_fn (function): A function that turns y_pred into predicted labels
        - score_fn (function): A function that turns y_pred into confidence scores
    Output:
        - results (dictionary): Dictionary of evaluation metrics
        - results_str (str): String summarizing the evaluation metrics
    """
    grouper = self._eval_grouper
    groups = grouper.metadata_to_group(metadata)
    # Pick the score threshold that achieves 60% recall globally.
    scores = score_fn(y_pred)
    recall60_threshold = threshold_at_recall(scores, y_true, global_recall=60)
    accuracy_metric = Accuracy(prediction_fn=prediction_fn)
    PAR_metric = PrecisionAtRecall(recall60_threshold, score_fn=score_fn)
    # Aggregate results first, then the group-wise breakdown, for both metrics.
    results = {}
    for m in (accuracy_metric, PAR_metric):
        results.update(m.compute(y_pred, y_true))
    for m in (accuracy_metric, PAR_metric):
        results.update(m.compute_group_wise(y_pred, y_true, groups, grouper.n_groups))
    results_str = (
        f"Average {PAR_metric.name}: {results[PAR_metric.agg_metric_field]:.3f}\n"
        f"Average {accuracy_metric.name}: {results[accuracy_metric.agg_metric_field]:.3f}\n"
    )
    return results, results_str
def eval(self, y_pred, y_true, metadata, prediction_fn=None):
    """
    Computes all evaluation metrics.
    Args:
        - y_pred (Tensor): Predictions from a model. By default, they are
            predicted labels (LongTensor). But they can also be other model
            outputs such that prediction_fn(y_pred) are predicted labels.
        - y_true (LongTensor): Ground-truth labels
        - metadata (Tensor): Metadata
        - prediction_fn (function): A function that turns y_pred into predicted labels
    Output:
        - results (dictionary): Dictionary of evaluation metrics
        - results_str (str): String summarizing the evaluation metrics
    """
    metric = Accuracy(prediction_fn=prediction_fn)
    # Overall evaluation + evaluate by year
    all_results, all_results_str = self.standard_group_eval(
        metric, self._eval_groupers['year'], y_pred, y_true, metadata)
    # Evaluate by region and ignore the "Other" region
    region_grouper = self._eval_groupers['region']
    region_results = metric.compute_group_wise(
        y_pred, y_true, region_grouper.metadata_to_group(metadata),
        region_grouper.n_groups)
    # Rename the year-wise worst-group entry so it doesn't collide with
    # the region-wise worst-group entry added below.
    all_results[f'{metric.name}_worst_year'] = all_results.pop(
        metric.worst_group_metric_field)
    region_metric_list = []
    for group_idx in range(region_grouper.n_groups):
        group_str = region_grouper.group_field_str(group_idx)
        group_metric = region_results[metric.group_metric_field(group_idx)]
        group_counts = region_results[metric.group_count_field(group_idx)]
        all_results[f'{metric.name}_{group_str}'] = group_metric
        all_results[f'count_{group_str}'] = group_counts
        # Reuse the locals bound above instead of re-indexing region_results;
        # empty regions and the catch-all "Other" region are excluded from
        # the summary string and the worst-region computation (but their
        # entries are still recorded in all_results).
        if group_counts == 0 or "Other" in group_str:
            continue
        all_results_str += (
            f' {region_grouper.group_str(group_idx)} '
            f"[n = {group_counts:6.0f}]:\t"
            f"{metric.name} = {group_metric:5.3f}\n")
        region_metric_list.append(group_metric)
    all_results[f'{metric.name}_worst_region'] = metric.worst(
        region_metric_list)
    all_results_str += f"Worst-group {metric.name}: {all_results[f'{metric.name}_worst_region']:.3f}\n"
    return all_results, all_results_str
def __init__(self, root_dir='data', download=False, split_scheme='official'):
    """Initialize the Waterbirds dataset.

    Args:
        root_dir (str): Directory under which the data lives.
        download (bool): Whether to download the data if it is missing.
        split_scheme (str): Only 'official' is supported.
    """
    self._dataset_name = 'waterbirds'
    self._version = '1.0'
    self._download_url = 'https://worksheets.codalab.org/rest/bundles/0x505056d5cdea4e4eaa0e242cbfe2daa4/contents/blob/'
    self._data_dir = self.initialize_data_dir(root_dir, download)
    # This dataset must be generated locally; fail early if it is absent.
    if not os.path.exists(self.data_dir):
        raise ValueError(
            f'{self.data_dir} does not exist yet. Please generate the dataset first.'
        )
    # Read in metadata
    # Note: metadata_df is one-indexed.
    metadata_df = pd.read_csv(os.path.join(self.data_dir, 'metadata.csv'))
    # Get the y values
    self._y_array = torch.LongTensor(metadata_df['y'].values)
    self._y_size = 1
    self._n_classes = 2  # landbird (0) vs waterbird (1), per metadata_map below
    # Metadata columns: [background place, label]
    self._metadata_array = torch.stack(
        (torch.LongTensor(metadata_df['place'].values), self._y_array),
        dim=1)
    self._metadata_fields = ['background', 'y']
    self._metadata_map = {
        'background': [' land', 'water'],  # Padding for str formatting
        'y': [' landbird', 'waterbird']
    }
    # Extract filenames
    self._input_array = metadata_df['img_filename'].values
    self._original_resolution = (224, 224)
    # Extract splits
    self._split_scheme = split_scheme
    if self._split_scheme != 'official':
        raise ValueError(
            f'Split scheme {self._split_scheme} not recognized')
    self._split_array = metadata_df['split'].values
    # Group by (background, label) pairs for worst-group evaluation.
    self._eval_grouper = CombinatorialGrouper(
        dataset=self, groupby_fields=(['background', 'y']))
    self._metric = Accuracy()
    super().__init__(root_dir, download, split_scheme)
def eval(self, y_pred, y_true, metadata, prediction_fn=None):
    """
    Computes all evaluation metrics.
    Args:
        - y_pred (Tensor): Predictions from a model. By default, they are
            predicted labels (LongTensor). But they can also be other model
            outputs such that prediction_fn(y_pred) are predicted labels.
        - y_true (LongTensor): Ground-truth labels
        - metadata (Tensor): Metadata
        - prediction_fn (function): A function that turns y_pred into predicted labels
    Output:
        - results (dictionary): Dictionary of evaluation metrics
        - results_str (str): String summarizing the evaluation metrics
    """
    # Delegate to the shared group-wise evaluation over this dataset's grouper.
    accuracy = Accuracy(prediction_fn=prediction_fn)
    return self.standard_group_eval(accuracy, self._eval_grouper,
                                    y_pred, y_true, metadata)
def eval(self, y_pred, y_true, metadata, prediction_fn=None):
    """
    Computes all evaluation metrics.
    Args:
        - y_pred (Tensor): Predictions from a model. By default, they are
            predicted labels (LongTensor). But they can also be other model
            outputs such that prediction_fn(y_pred) are predicted labels.
        - y_true (LongTensor): Ground-truth labels
        - metadata (Tensor): Metadata
        - prediction_fn (function): A function that turns y_pred into predicted labels
    Output:
        - results (dictionary): Dictionary of evaluation metrics
        - results_str (str): String summarizing the evaluation metrics
    """
    metric = Accuracy(prediction_fn=prediction_fn)
    # Non-user splits use the shared group-wise evaluation directly.
    if self.split_scheme != 'user':
        return self.standard_group_eval(
            metric, self._eval_grouper, y_pred, y_true, metadata)
    # User split: report per-user accuracies plus their 10th percentile
    # and worst-group values.
    grouper = self._eval_grouper
    g = grouper.metadata_to_group(metadata)
    results = {
        **metric.compute(y_pred, y_true),
        **metric.compute_group_wise(y_pred, y_true, g, grouper.n_groups)
    }
    per_group_accs = []
    for group_idx in range(grouper.n_groups):
        group_str = grouper.group_field_str(group_idx)
        # Move the per-group entries to human-readable keys.
        group_metric = results.pop(metric.group_metric_field(group_idx))
        group_counts = results.pop(metric.group_count_field(group_idx))
        results[f'{metric.name}_{group_str}'] = group_metric
        results[f'count_{group_str}'] = group_counts
        # Only groups that actually contain examples enter the summary stats.
        if group_counts > 0:
            per_group_accs.append(group_metric)
    accs = np.array(per_group_accs)
    results['10th_percentile_acc'] = np.percentile(accs, 10)
    results[f'{metric.worst_group_metric_field}'] = metric.worst(accs)
    results_str = (
        f"Average {metric.name}: {results[metric.agg_metric_field]:.3f}\n"
        f"10th percentile {metric.name}: {results['10th_percentile_acc']:.3f}\n"
        f"Worst-group {metric.name}: {results[metric.worst_group_metric_field]:.3f}\n"
    )
    return results, results_str
from wilds.common.metrics.all_metrics import Accuracy, MultiTaskAccuracy, MSE, multiclass_logits_to_pred, binary_logits_to_pred

# Loss registry. All classification losses use reduction='none' so that the
# wrapper (ElementwiseLoss / MultiTaskLoss) receives per-example losses.
losses = {
    'cross_entropy': ElementwiseLoss(loss_fn=nn.CrossEntropyLoss(reduction='none')),
    'lm_cross_entropy': MultiTaskLoss(loss_fn=nn.CrossEntropyLoss(reduction='none')),
    'mse': MSE(name='loss'),
    'multitask_bce': MultiTaskLoss(loss_fn=nn.BCEWithLogitsLoss(reduction='none')),
}

# Metrics logged by the algorithms during training; the None key maps to
# None so a missing/None config value disables logging.
algo_log_metrics = {
    'accuracy': Accuracy(prediction_fn=multiclass_logits_to_pred),
    'mse': MSE(),
    'multitask_accuracy': MultiTaskAccuracy(prediction_fn=multiclass_logits_to_pred),
    'multitask_binary_accuracy': MultiTaskAccuracy(prediction_fn=binary_logits_to_pred),
    None: None,
}

# Functions that turn raw model outputs (logits) into predicted labels;
# the None key again allows the option to be disabled.
process_outputs_functions = {
    'binary_logits_to_pred': binary_logits_to_pred,
    'multiclass_logits_to_pred': multiclass_logits_to_pred,
    None: None,
}
class YelpDataset(WILDSDataset):
    """
    Yelp dataset.
    This is a modified version of the Yelp Open Dataset.
    This dataset is not part of the official WILDS benchmark.
    We provide it for convenience and to reproduce observations discussed in the WILDS paper.
    Supported `split_scheme`:
        'official': official split, which is equivalent to 'time'
        'time': shifts from reviews written before 2013 to reviews written after 2013
        'user': shifts to unseen reviewers
        'time_baseline': oracle baseline splits for time shifts
    Input (x):
        Review text of maximum token length of 512.
    Label (y):
        y is the star rating (0,1,2,3,4 corresponding to 1-5 stars)
    Metadata:
        user: reviewer ID
        year: year in which the review was written
        business: business ID
        city: city of the business
        state: state of the business
    Website:
        https://www.yelp.com/dataset
    License:
        Because of the Dataset License provided by Yelp, we are unable to
        redistribute the data. Please download the data through the website
        (https://www.yelp.com/dataset/download) by agreeing to the Dataset License.
    """

    def __init__(self, root_dir='data', download=False, split_scheme='official'):
        # set variables
        self._dataset_name = 'yelp'
        self._version = '1.0'
        # 'official' is an alias for the time split.
        if split_scheme == 'official':
            split_scheme = 'time'
        self._split_scheme = split_scheme
        self._y_type = 'long'
        self._y_size = 1
        self._n_classes = 5  # star ratings: labels 0-4
        # path
        self._data_dir = self.initialize_data_dir(root_dir, download)
        # Load data
        # keep_default_na=False / na_values=[] keep strings like 'NA' as
        # literal text instead of NaN.
        data_df = pd.read_csv(os.path.join(self.data_dir, 'reviews.csv'),
                              dtype={
                                  'review_id': str,
                                  'user_id': str,
                                  'business_id': str,
                                  'stars': int,
                                  'useful': int,
                                  'funny': int,
                                  'cool': int,
                                  'text': str,
                                  'date': str,
                                  'year': int,
                                  'city': str,
                                  'state': str,
                                  'categories': str
                              },
                              keep_default_na=False,
                              na_values=[],
                              quoting=csv.QUOTE_NONNUMERIC)
        # The split CSV is row-aligned with reviews.csv: the same boolean
        # mask drops rows not in the dataset from both frames.
        split_df = pd.read_csv(
            os.path.join(self.data_dir, 'splits', f'{self.split_scheme}.csv'))
        is_in_dataset = split_df['split'] != NOT_IN_DATASET
        split_df = split_df[is_in_dataset]
        data_df = data_df[is_in_dataset]
        # Get arrays
        self._split_array = split_df['split'].values
        self._input_array = list(data_df['text'])
        # Get metadata
        self._metadata_fields, self._metadata_array, self._metadata_map = self.load_metadata(
            data_df, self.split_array)
        # Get y from metadata: cast the 'y' column via the tensor method
        # named by self._y_type (here 'long').
        self._y_array = getattr(
            self.metadata_array[:, self.metadata_fields.index('y')],
            self._y_type)()
        # Set split info
        self.initialize_split_dicts()
        # eval
        self.initialize_eval_grouper()
        self._metric = Accuracy()
        super().__init__(root_dir, download, split_scheme)

    def get_input(self, idx):
        # Raw review text for example idx.
        return self._input_array[idx]

    def eval(self, y_pred, y_true, metadata):
        # For the user split, report per-user accuracies plus their 10th
        # percentile and worst-group values; otherwise use the shared
        # group-wise evaluation.
        if self.split_scheme == 'user':
            # first compute groupwise accuracies
            g = self._eval_grouper.metadata_to_group(metadata)
            results = {
                **self._metric.compute(y_pred, y_true),
                **self._metric.compute_group_wise(y_pred, y_true, g,
                                                  self._eval_grouper.n_groups)
            }
            accs = []
            for group_idx in range(self._eval_grouper.n_groups):
                group_str = self._eval_grouper.group_field_str(group_idx)
                # Move per-group entries to human-readable keys.
                group_metric = results.pop(
                    self._metric.group_metric_field(group_idx))
                group_counts = results.pop(
                    self._metric.group_count_field(group_idx))
                results[f'{self._metric.name}_{group_str}'] = group_metric
                results[f'count_{group_str}'] = group_counts
                # Only non-empty groups enter the summary statistics.
                if group_counts > 0:
                    accs.append(group_metric)
            accs = np.array(accs)
            results['10th_percentile_acc'] = np.percentile(accs, 10)
            results[
                f'{self._metric.worst_group_metric_field}'] = self._metric.worst(
                    accs)
            results_str = (
                f"Average {self._metric.name}: {results[self._metric.agg_metric_field]:.3f}\n"
                f"10th percentile {self._metric.name}: {results['10th_percentile_acc']:.3f}\n"
                f"Worst-group {self._metric.name}: {results[self._metric.worst_group_metric_field]:.3f}\n"
            )
            return results, results_str
        else:
            return self.standard_group_eval(self._metric, self._eval_grouper,
                                            y_pred, y_true, metadata)

    def initialize_split_dicts(self):
        # 'user' and 'time' schemes add in-distribution val/test splits;
        # 'time_baseline' keeps the WILDSDataset defaults.
        if self.split_scheme in ('user', 'time'):
            self._split_dict = {
                'train': 0,
                'val': 1,
                'id_val': 2,
                'test': 3,
                'id_test': 4
            }
            self._split_names = {
                'train': 'Train',
                'val': 'Validation (OOD)',
                'id_val': 'Validation (ID)',
                'test': 'Test (OOD)',
                'id_test': 'Test (ID)'
            }
        elif self.split_scheme in ('time_baseline', ):
            # use defaults
            pass
        else:
            raise ValueError(
                f'Split scheme {self.split_scheme} not recognized')

    def load_metadata(self, data_df, split_array):
        # Get metadata
        columns = [
            'user_id',
            'business_id',
            'year',
            'city',
            'state',
            'stars',
        ]
        metadata_fields = ['user', 'business', 'year', 'city', 'state', 'y']
        metadata_df = data_df[columns].copy()
        metadata_df.columns = metadata_fields
        # Assign categorical IDs in split order so that IDs seen in earlier
        # splits come first.
        sort_idx = np.argsort(split_array)
        ordered_maps = {}
        for field in ['user', 'business', 'city', 'state']:
            # map to IDs in the order of split values
            ordered_maps[field] = pd.unique(metadata_df.iloc[sort_idx][field])
        ordered_maps['y'] = range(1, 6)
        ordered_maps['year'] = range(metadata_df['year'].min(),
                                     metadata_df['year'].max() + 1)
        metadata_map, metadata = map_to_id_array(metadata_df, ordered_maps)
        return metadata_fields, torch.from_numpy(
            metadata.astype('long')), metadata_map

    def initialize_eval_grouper(self):
        # Evaluation groups follow the split axis: per-user for the user
        # split, per-year for the time splits.
        if self.split_scheme == 'user':
            self._eval_grouper = CombinatorialGrouper(dataset=self,
                                                      groupby_fields=['user'])
        elif self.split_scheme in ('time', 'time_baseline'):
            self._eval_grouper = CombinatorialGrouper(dataset=self,
                                                      groupby_fields=['year'])
        else:
            raise ValueError(
                f'Split scheme {self.split_scheme} not recognized')
def __init__(self, root_dir='data', download=False, split_scheme='official'):
    """Initialize the CelebA dataset (Blond_Hair target, Male confounder).

    Args:
        root_dir (str): Directory under which the data lives.
        download (bool): Whether to download the data if it is missing.
        split_scheme (str): Only 'official' is supported.
    """
    self._dataset_name = 'celebA'
    self._version = '1.0'
    self._download_url = ''
    self._data_dir = self.initialize_data_dir(root_dir, download)
    target_name = 'Blond_Hair'
    confounder_names = ['Male']
    # Read in attributes
    attrs_df = pd.read_csv(
        os.path.join(self.data_dir, 'list_attr_celeba.csv'))
    # Split out filenames and attribute names
    # Note: idx and filenames are off by one.
    self._input_array = attrs_df['image_id'].values
    self._original_resolution = (178, 218)
    attrs_df = attrs_df.drop(labels='image_id', axis='columns')
    attr_names = attrs_df.columns.copy()

    # Column index of a named attribute in the attrs matrix below.
    def attr_idx(attr_name):
        return attr_names.get_loc(attr_name)

    # Then cast attributes to numpy array and set them to 0 and 1
    # (originally, they're -1 and 1)
    attrs_df = attrs_df.values
    attrs_df[attrs_df == -1] = 0
    # Get the y values
    target_idx = attr_idx(target_name)
    self._y_array = torch.LongTensor(attrs_df[:, target_idx])
    self._y_size = 1
    self._n_classes = 2  # not blond (0) vs blond (1), per metadata_map below
    # Get metadata: [confounder columns..., y]
    confounder_idx = [attr_idx(a) for a in confounder_names]
    confounders = attrs_df[:, confounder_idx]
    self._metadata_array = torch.cat(
        (torch.LongTensor(confounders), self._y_array.reshape((-1, 1))),
        dim=1)
    confounder_names = [s.lower() for s in confounder_names]
    self._metadata_fields = confounder_names + ['y']
    self._metadata_map = {
        'y': ['not blond', ' blond']  # Padding for str formatting
    }
    # Group by (confounder, label) pairs for worst-group evaluation.
    self._eval_grouper = CombinatorialGrouper(
        dataset=self, groupby_fields=(confounder_names + ['y']))
    self._metric = Accuracy()
    # Extract splits
    self._split_scheme = split_scheme
    if self._split_scheme != 'official':
        raise ValueError(
            f'Split scheme {self._split_scheme} not recognized')
    split_df = pd.read_csv(
        os.path.join(self.data_dir, 'list_eval_partition.csv'))
    self._split_array = split_df['partition'].values
    super().__init__(root_dir, download, split_scheme)
class FMoWDataset(WILDSDataset):
    """
    The Functional Map of the World land use / building classification dataset.
    This is a processed version of the Functional Map of the World dataset
    originally sourced from https://github.com/fMoW/dataset.
    Support `split_scheme`
        'official': official split, which is equivalent to 'time_after_2016'
        `time_after_{YEAR}` for YEAR between 2002--2018
    Input (x):
        224 x 224 x 3 RGB satellite image.
    Label (y):
        y is one of 62 land use / building classes
    Metadata:
        each image is annotated with a location coordinate, timestamp, country code.
        This dataset computes region as a derivative of country code.
    Website: https://github.com/fMoW/dataset
    Original publication:
    @inproceedings{fmow2018,
      title={Functional Map of the World},
      author={Christie, Gordon and Fendley, Neil and Wilson, James and Mukherjee, Ryan},
      booktitle={CVPR},
      year={2018}
    }
    License:
        Distributed under the FMoW Challenge Public License.
        https://github.com/fMoW/dataset/blob/master/LICENSE
    """
    _dataset_name = 'fmow'
    _download_url = 'https://worksheets.codalab.org/rest/bundles/0xc59ea8261dfe4d2baa3820866e33d781/contents/blob/'
    _version = '1.0'

    def __init__(self, root_dir='data', download=False, split_scheme='official',
                 oracle_training_set=False, seed=111, use_ood_val=False):
        self._compressed_size = 70_000_000_000  # download size in bytes
        self._data_dir = self.initialize_data_dir(root_dir, download)
        self._split_dict = {
            'train': 0,
            'id_val': 1,
            'id_test': 2,
            'val': 3,
            'test': 4
        }
        self._split_names = {
            'train': 'Train',
            'id_val': 'ID Val',
            'id_test': 'ID Test',
            'val': 'OOD Val',
            'test': 'OOD Test'
        }
        # The official split is a time split at 2016.
        if split_scheme == 'official':
            split_scheme = 'time_after_2016'
        self._split_scheme = split_scheme
        self.oracle_training_set = oracle_training_set
        self.root = Path(self._data_dir)
        self.seed = int(seed)
        self._original_resolution = (224, 224)
        self.category_to_idx = {cat: i for i, cat in enumerate(categories)}
        self.metadata = pd.read_csv(self.root / 'rgb_metadata.csv')
        # Map ISO alpha-3 country codes to regions; unknown codes fall back
        # to the catch-all 'Other' region.
        country_codes_df = pd.read_csv(self.root / 'country_code_mapping.csv')
        countrycode_to_region = {
            k: v
            for k, v in zip(country_codes_df['alpha-3'],
                            country_codes_df['region'])
        }
        regions = [
            countrycode_to_region.get(code, 'Other')
            for code in self.metadata['country_code'].to_list()
        ]
        self.metadata['region'] = regions
        all_countries = self.metadata['country_code']
        # Images are stored in .npy chunks; get_input() locates an image by
        # (chunk index, offset within chunk).
        self.num_chunks = 101
        self.chunk_size = len(self.metadata) // (self.num_chunks - 1)
        if self._split_scheme.startswith('time_after'):
            year = int(self._split_scheme.split('_')[2])
            year_dt = datetime.datetime(year, 1, 1, tzinfo=pytz.UTC)
            # OOD test: images taken on/after the cutoff year.
            self.test_ood_mask = np.asarray(
                pd.to_datetime(self.metadata['timestamp']) >= year_dt)
            # use 3 years of the training set as validation
            year_minus_3_dt = datetime.datetime(year - 3, 1, 1,
                                                tzinfo=pytz.UTC)
            self.val_ood_mask = np.asarray(
                pd.to_datetime(self.metadata['timestamp']) >= year_minus_3_dt
            ) & ~self.test_ood_mask
            self.ood_mask = self.test_ood_mask | self.val_ood_mask
        else:
            raise ValueError(
                f"Not supported: self._split_scheme = {self._split_scheme}")
        # Assign each example a split index; -1 means "not in any split".
        self._split_array = -1 * np.ones(len(self.metadata))
        for split in self._split_dict.keys():
            idxs = np.arange(len(self.metadata))
            if split == 'test':
                test_mask = np.asarray(self.metadata['split'] == 'test')
                idxs = idxs[self.test_ood_mask & test_mask]
            elif split == 'val':
                val_mask = np.asarray(self.metadata['split'] == 'val')
                idxs = idxs[self.val_ood_mask & val_mask]
            elif split == 'id_test':
                test_mask = np.asarray(self.metadata['split'] == 'test')
                idxs = idxs[~self.ood_mask & test_mask]
            elif split == 'id_val':
                val_mask = np.asarray(self.metadata['split'] == 'val')
                idxs = idxs[~self.ood_mask & val_mask]
            else:
                split_mask = np.asarray(self.metadata['split'] == split)
                idxs = idxs[~self.ood_mask & split_mask]
            # Oracle training set: replace half of the training examples
            # with unused OOD (non-test) examples.
            if self.oracle_training_set and split == 'train':
                test_mask = np.asarray(self.metadata['split'] == 'test')
                unused_ood_idxs = np.arange(len(
                    self.metadata))[self.ood_mask & ~test_mask]
                subsample_unused_ood_idxs = subsample_idxs(unused_ood_idxs,
                                                           num=len(idxs) // 2,
                                                           seed=self.seed + 2)
                subsample_train_idxs = subsample_idxs(idxs.copy(),
                                                      num=len(idxs) // 2,
                                                      seed=self.seed + 3)
                idxs = np.concatenate(
                    [subsample_unused_ood_idxs, subsample_train_idxs])
            self._split_array[idxs] = self._split_dict[split]
        # NOTE(review): when use_ood_val is False the split dict is relabeled
        # so that index 1 ('val') is the ID validation set and index 3
        # becomes 'ood_val' — the indices themselves are unchanged.
        if not use_ood_val:
            self._split_dict = {
                'train': 0,
                'val': 1,
                'id_test': 2,
                'ood_val': 3,
                'test': 4
            }
            self._split_names = {
                'train': 'Train',
                'val': 'ID Val',
                'id_test': 'ID Test',
                'ood_val': 'OOD Val',
                'test': 'OOD Test'
            }
        # filter out sequestered images from full dataset
        seq_mask = np.asarray(self.metadata['split'] == 'seq')
        # take out the sequestered images
        self._split_array = self._split_array[~seq_mask]
        self.full_idxs = np.arange(len(self.metadata))[~seq_mask]
        self._y_array = np.asarray(
            [self.category_to_idx[y] for y in list(self.metadata['category'])])
        self.metadata['y'] = self._y_array
        self._y_array = torch.from_numpy(self._y_array).long()[~seq_mask]
        self._y_size = 1
        self._n_classes = 62
        # convert region to idxs
        all_regions = list(self.metadata['region'].unique())
        region_to_region_idx = {
            region: i
            for i, region in enumerate(all_regions)
        }
        self._metadata_map = {'region': all_regions}
        region_idxs = [
            region_to_region_idx[region]
            for region in self.metadata['region'].tolist()
        ]
        self.metadata['region'] = region_idxs
        # make a year column in metadata: 0 for 2002, ..., 15 for 2017;
        # -1 for timestamps outside that range.
        year_array = -1 * np.ones(len(self.metadata))
        ts = pd.to_datetime(self.metadata['timestamp'])
        for year in range(2002, 2018):
            year_mask = np.asarray(ts >= datetime.datetime(year, 1, 1, tzinfo=pytz.UTC)) \
                & np.asarray(ts < datetime.datetime(year+1, 1, 1, tzinfo=pytz.UTC))
            year_array[year_mask] = year - 2002
        self.metadata['year'] = year_array
        self._metadata_map['year'] = list(range(2002, 2018))
        self._metadata_fields = ['region', 'year', 'y']
        self._metadata_array = torch.from_numpy(self.metadata[
            self._metadata_fields].astype(int).to_numpy()).long()[~seq_mask]
        # Two eval groupers: per-year and per-region breakdowns.
        self._eval_groupers = {
            'year': CombinatorialGrouper(dataset=self,
                                         groupby_fields=['year']),
            'region': CombinatorialGrouper(dataset=self,
                                           groupby_fields=['region']),
        }
        self._metric = Accuracy()
        super().__init__(root_dir, download, split_scheme)

    def get_input(self, idx):
        """
        Returns x for a given idx.
        """
        # Translate the post-filtering index back to the original row, then
        # locate the image inside its memory-mapped .npy chunk.
        idx = self.full_idxs[idx]
        batch_idx = idx // self.chunk_size
        within_batch_idx = idx % self.chunk_size
        img_batch = np.load(self.root / f'rgb_all_imgs_{batch_idx}.npy',
                            mmap_mode='r')
        return img_batch[within_batch_idx]

    def eval(self, y_pred, y_true, metadata):
        # Overall evaluation + evaluate by year
        all_results, all_results_str = self.standard_group_eval(
            self._metric, self._eval_groupers['year'], y_pred, y_true,
            metadata)
        # Evaluate by region and ignore the "Other" region
        region_grouper = self._eval_groupers['region']
        region_results = self._metric.compute_group_wise(
            y_pred, y_true, region_grouper.metadata_to_group(metadata),
            region_grouper.n_groups)
        # Rename the year-wise worst-group entry so it doesn't collide with
        # the region-wise worst-group entry added below.
        all_results[f'{self._metric.name}_worst_year'] = all_results.pop(
            self._metric.worst_group_metric_field)
        region_metric_list = []
        for group_idx in range(region_grouper.n_groups):
            group_str = region_grouper.group_field_str(group_idx)
            group_metric = region_results[self._metric.group_metric_field(
                group_idx)]
            group_counts = region_results[self._metric.group_count_field(
                group_idx)]
            all_results[f'{self._metric.name}_{group_str}'] = group_metric
            all_results[f'count_{group_str}'] = group_counts
            # Empty regions and the catch-all "Other" region are excluded
            # from the summary string and the worst-region computation.
            if region_results[self._metric.group_count_field(
                    group_idx)] == 0 or "Other" in group_str:
                continue
            all_results_str += (
                f' {region_grouper.group_str(group_idx)} '
                f"[n = {region_results[self._metric.group_count_field(group_idx)]:6.0f}]:\t"
                f"{self._metric.name} = {region_results[self._metric.group_metric_field(group_idx)]:5.3f}\n"
            )
            region_metric_list.append(
                region_results[self._metric.group_metric_field(group_idx)])
        all_results[f'{self._metric.name}_worst_region'] = self._metric.worst(
            region_metric_list)
        all_results_str += f"Worst-group {self._metric.name}: {all_results[f'{self._metric.name}_worst_region']:.3f}\n"
        return all_results, all_results_str
def __init__(self, root_dir='data', download=False, split_scheme='official'):
    """Initialize the Camelyon17 patch-classification dataset.

    Args:
        root_dir (str): Directory under which the data lives (or is downloaded to).
        download (bool): Whether to download the data if it is missing.
        split_scheme (str): 'official', or 'in-dist' for the in-distribution
            oracle variant.
    """
    self._dataset_name = 'camelyon17'
    self._version = '1.0'
    self._download_url = 'https://worksheets.codalab.org/rest/bundles/0xe45e15f39fb54e9d9e919556af67aabe/contents/blob/'
    self._compressed_size = 10_658_709_504  # download size in bytes
    self._data_dir = self.initialize_data_dir(root_dir, download)
    self._original_resolution = (96, 96)
    # Read in metadata
    self._metadata_df = pd.read_csv(os.path.join(self._data_dir,
                                                 'metadata.csv'),
                                    index_col=0,
                                    dtype={'patient': 'str'})
    # Get the y values
    self._y_array = torch.LongTensor(self._metadata_df['tumor'].values)
    self._y_size = 1
    self._n_classes = 2  # tumor vs non-tumor patch
    # Get filenames: patch paths are derived from patient/node/coords.
    self._input_array = [
        f'patches/patient_{patient}_node_{node}/patch_patient_{patient}_node_{node}_x_{x}_y_{y}.png'
        for patient, node, x, y in self._metadata_df.
        loc[:, ['patient', 'node', 'x_coord', 'y_coord']].itertuples(
            index=False, name=None)
    ]
    # Extract splits
    # Note that the hospital numbering here is different from what's in the paper,
    # where to avoid confusing readers we used a 1-indexed scheme and just labeled the test hospital as 5.
    # Here, the numbers are 0-indexed.
    test_center = 2
    val_center = 1
    self._split_dict = {'train': 0, 'id_val': 1, 'test': 2, 'val': 3}
    self._split_names = {
        'train': 'Train',
        'id_val': 'Validation (ID)',
        'test': 'Test',
        'val': 'Validation (OOD)',
    }
    centers = self._metadata_df['center'].values.astype('long')
    # NOTE(review): num_centers is computed but not used below.
    num_centers = int(np.max(centers)) + 1
    # Reassign all patches from the val/test hospitals to those splits.
    val_center_mask = (self._metadata_df['center'] == val_center)
    test_center_mask = (self._metadata_df['center'] == test_center)
    self._metadata_df.loc[val_center_mask,
                          'split'] = self.split_dict['val']
    self._metadata_df.loc[test_center_mask,
                          'split'] = self.split_dict['test']
    self._split_scheme = split_scheme
    if self._split_scheme == 'official':
        pass
    elif self._split_scheme == 'in-dist':
        # For the in-distribution oracle,
        # we move slide 23 (corresponding to patient 042, node 3 in the original dataset)
        # from the test set to the training set
        slide_mask = (self._metadata_df['slide'] == 23)
        self._metadata_df.loc[slide_mask,
                              'split'] = self.split_dict['train']
    else:
        raise ValueError(
            f'Split scheme {self._split_scheme} not recognized')
    self._split_array = self._metadata_df['split'].values
    # Metadata columns: [hospital, slide, label]
    self._metadata_array = torch.stack(
        (torch.LongTensor(centers), torch.LongTensor(
            self._metadata_df['slide'].values), self._y_array),
        dim=1)
    self._metadata_fields = ['hospital', 'slide', 'y']
    self._eval_grouper = CombinatorialGrouper(dataset=self,
                                              groupby_fields=['slide'])
    self._metric = Accuracy()
    super().__init__(root_dir, download, split_scheme)
def __init__(self, root_dir='data', download=False, split_scheme='official',
             oracle_training_set=False, seed=111, use_ood_val=False):
    """Satellite-image dataset with temporal OOD splits (FMoW-style).

    Args:
        root_dir: directory under which the dataset lives / is downloaded.
        download: whether to download the bundle if it is missing.
        split_scheme: 'official' (alias for 'time_after_2016') or any
            'time_after_<year>' string.
        oracle_training_set: if True, half of the training set is replaced with
            subsampled OOD (non-test) examples.
        seed: RNG seed for the oracle subsampling.
        use_ood_val: if False, the 'val' split key is remapped to the ID-val
            indices already written into the split array.

    NOTE(review): this constructor does not set _dataset_name/_version before
    calling initialize_data_dir — confirm they are provided elsewhere.
    (Change vs. original: removed the unused local `all_countries`.)
    """
    self._compressed_size = 70_000_000_000
    self._data_dir = self.initialize_data_dir(root_dir, download)

    self._split_dict = {
        'train': 0,
        'id_val': 1,
        'id_test': 2,
        'val': 3,
        'test': 4
    }
    self._split_names = {
        'train': 'Train',
        'id_val': 'ID Val',
        'id_test': 'ID Test',
        'val': 'OOD Val',
        'test': 'OOD Test'
    }
    if split_scheme == 'official':
        split_scheme = 'time_after_2016'
    self._split_scheme = split_scheme
    self.oracle_training_set = oracle_training_set

    self.root = Path(self._data_dir)
    self.seed = int(seed)
    self._original_resolution = (224, 224)

    # `categories` is a module-level list; map each name to a class index.
    self.category_to_idx = {cat: i for i, cat in enumerate(categories)}

    self.metadata = pd.read_csv(self.root / 'rgb_metadata.csv')
    # Map ISO alpha-3 country codes to regions; unknown codes become 'Other'.
    country_codes_df = pd.read_csv(self.root / 'country_code_mapping.csv')
    countrycode_to_region = {
        k: v for k, v in zip(country_codes_df['alpha-3'],
                             country_codes_df['region'])
    }
    regions = [
        countrycode_to_region.get(code, 'Other')
        for code in self.metadata['country_code'].to_list()
    ]
    self.metadata['region'] = regions

    # presumably used by chunked image loading elsewhere — kept as-is.
    self.num_chunks = 101
    self.chunk_size = len(self.metadata) // (self.num_chunks - 1)

    if self._split_scheme.startswith('time_after'):
        year = int(self._split_scheme.split('_')[2])
        year_dt = datetime.datetime(year, 1, 1, tzinfo=pytz.UTC)
        # OOD test: everything from `year` onwards.
        self.test_ood_mask = np.asarray(
            pd.to_datetime(self.metadata['timestamp']) >= year_dt)
        # use 3 years of the training set as validation
        year_minus_3_dt = datetime.datetime(year - 3, 1, 1, tzinfo=pytz.UTC)
        self.val_ood_mask = np.asarray(
            pd.to_datetime(self.metadata['timestamp']) >= year_minus_3_dt
        ) & ~self.test_ood_mask
        self.ood_mask = self.test_ood_mask | self.val_ood_mask
    else:
        raise ValueError(
            f"Not supported: self._split_scheme = {self._split_scheme}")

    # Assign split ids; iteration order over _split_dict matters because the
    # oracle branch below may write OOD indices into 'train' that a later
    # split iteration then overwrites.
    self._split_array = -1 * np.ones(len(self.metadata))
    for split in self._split_dict.keys():
        idxs = np.arange(len(self.metadata))
        if split == 'test':
            test_mask = np.asarray(self.metadata['split'] == 'test')
            idxs = idxs[self.test_ood_mask & test_mask]
        elif split == 'val':
            val_mask = np.asarray(self.metadata['split'] == 'val')
            idxs = idxs[self.val_ood_mask & val_mask]
        elif split == 'id_test':
            test_mask = np.asarray(self.metadata['split'] == 'test')
            idxs = idxs[~self.ood_mask & test_mask]
        elif split == 'id_val':
            val_mask = np.asarray(self.metadata['split'] == 'val')
            idxs = idxs[~self.ood_mask & val_mask]
        else:
            split_mask = np.asarray(self.metadata['split'] == split)
            idxs = idxs[~self.ood_mask & split_mask]

        if self.oracle_training_set and split == 'train':
            # Replace half of train with subsampled non-test OOD examples.
            test_mask = np.asarray(self.metadata['split'] == 'test')
            unused_ood_idxs = np.arange(len(self.metadata))[self.ood_mask
                                                            & ~test_mask]
            subsample_unused_ood_idxs = subsample_idxs(unused_ood_idxs,
                                                       num=len(idxs) // 2,
                                                       seed=self.seed + 2)
            subsample_train_idxs = subsample_idxs(idxs.copy(),
                                                  num=len(idxs) // 2,
                                                  seed=self.seed + 3)
            idxs = np.concatenate(
                [subsample_unused_ood_idxs, subsample_train_idxs])
        self._split_array[idxs] = self._split_dict[split]

    if not use_ood_val:
        # The split ids in _split_array are unchanged; only the key names move,
        # so 'val' (id 1) now refers to the ID-val indices.
        self._split_dict = {
            'train': 0,
            'val': 1,
            'id_test': 2,
            'ood_val': 3,
            'test': 4
        }
        self._split_names = {
            'train': 'Train',
            'val': 'ID Val',
            'id_test': 'ID Test',
            'ood_val': 'OOD Val',
            'test': 'OOD Test'
        }

    # filter out sequestered images from full dataset
    seq_mask = np.asarray(self.metadata['split'] == 'seq')
    # take out the sequestered images
    self._split_array = self._split_array[~seq_mask]
    self.full_idxs = np.arange(len(self.metadata))[~seq_mask]

    self._y_array = np.asarray(
        [self.category_to_idx[y] for y in list(self.metadata['category'])])
    self.metadata['y'] = self._y_array
    self._y_array = torch.from_numpy(self._y_array).long()[~seq_mask]
    self._y_size = 1
    self._n_classes = 62

    # convert region to idxs
    all_regions = list(self.metadata['region'].unique())
    region_to_region_idx = {
        region: i for i, region in enumerate(all_regions)
    }
    self._metadata_map = {'region': all_regions}
    region_idxs = [
        region_to_region_idx[region]
        for region in self.metadata['region'].tolist()
    ]
    self.metadata['region'] = region_idxs

    # make a year column in metadata (0 = 2002, ..., 15 = 2017; -1 = out of range)
    year_array = -1 * np.ones(len(self.metadata))
    ts = pd.to_datetime(self.metadata['timestamp'])
    for year in range(2002, 2018):
        year_mask = np.asarray(ts >= datetime.datetime(year, 1, 1, tzinfo=pytz.UTC)) \
            & np.asarray(ts < datetime.datetime(year + 1, 1, 1, tzinfo=pytz.UTC))
        year_array[year_mask] = year - 2002
    self.metadata['year'] = year_array
    self._metadata_map['year'] = list(range(2002, 2018))
    self._metadata_fields = ['region', 'year', 'y']
    self._metadata_array = torch.from_numpy(
        self.metadata[self._metadata_fields].astype(int).to_numpy()
    ).long()[~seq_mask]

    self._eval_groupers = {
        'year': CombinatorialGrouper(dataset=self, groupby_fields=['year']),
        'region': CombinatorialGrouper(dataset=self, groupby_fields=['region']),
    }

    self._metric = Accuracy()
    super().__init__(root_dir, download, split_scheme)
class AmazonDataset(WILDSDataset):
    """
    Amazon dataset.
    This is a modified version of the 2018 Amazon Reviews dataset.

    Supported `split_scheme`:
        'official': official split, which is equivalent to 'user'
        'user': shifts to unseen reviewers
        'time': shifts from reviews written before 2013 to reviews written after 2013
        'category_subpopulation': the training distribution is a random subset
            following the natural distribution, and the evaluation splits include
            each category uniformly (to the extent it is possible)
        '*_generalization': domain generalization setting where the domains are
            categories. train categories vary.
        '*_baseline': oracle baseline splits for user or time shifts

    Input (x):
        Review text of maximum token length of 512.

    Label (y):
        y is the star rating (0,1,2,3,4 corresponding to 1-5 stars)

    Metadata:
        reviewer: reviewer ID
        year: year in which the review was written
        category: product category
        product: product ID

    Website:
        https://nijianmo.github.io/amazon/index.html

    Original publication:
        @inproceedings{ni2019justifying,
          author = {J. Ni and J. Li and J. McAuley},
          booktitle = {Empirical Methods in Natural Language Processing (EMNLP)},
          pages = {188--197},
          title = {Justifying recommendations using distantly-labeled reviews and fine-grained aspects},
          year = {2019},
        }

    License:
        None. However, the original authors request that the data be used for
        research purposes only.
    """

    def __init__(self, root_dir='data', download=False, split_scheme='official'):
        # set variables
        self._dataset_name = 'amazon'
        self._version = '1.0'
        # Restored: this URL matches the amazon __init__ earlier in this file;
        # it had been blanked ("REMOVED TO KEEP ANONYMITY"), which breaks download=True.
        self._download_url = 'https://worksheets.codalab.org/rest/bundles/0x60237058e01749cda7b0701c2bd01420/contents/blob/'
        self._compressed_size = 4_066_541_568
        # the official split is the user split
        if split_scheme == 'official':
            split_scheme = 'user'
        self._split_scheme = split_scheme
        self._y_type = 'long'
        self._y_size = 1
        self._n_classes = 5
        # path
        self._data_dir = self.initialize_data_dir(root_dir, download)
        # Load data. keep_default_na/na_values=[] prevent strings like "NA"
        # from being parsed as missing values.
        data_df = pd.read_csv(
            os.path.join(self.data_dir, 'reviews.csv'),
            dtype={'reviewerID': str, 'asin': str, 'reviewTime': str,
                   'unixReviewTime': int, 'reviewText': str, 'summary': str,
                   'verified': bool, 'category': str, 'reviewYear': int},
            keep_default_na=False,
            na_values=[],
            quoting=csv.QUOTE_NONNUMERIC)
        split_df = pd.read_csv(
            os.path.join(self.data_dir, 'splits', f'{self.split_scheme}.csv'))
        # Drop rows excluded from this split scheme.
        is_in_dataset = split_df['split'] != NOT_IN_DATASET
        split_df = split_df[is_in_dataset]
        data_df = data_df[is_in_dataset]
        # Get arrays
        self._split_array = split_df['split'].values
        self._input_array = list(data_df['reviewText'])
        # Get metadata
        self._metadata_fields, self._metadata_array, self._metadata_map = \
            self.load_metadata(data_df, self.split_array)
        # Get y from metadata
        self._y_array = getattr(
            self.metadata_array[:, self.metadata_fields.index('y')],
            self._y_type)()
        # Set split info
        self.initialize_split_dicts()
        # eval
        self.initialize_eval_grouper()
        self._metric = Accuracy()
        super().__init__(root_dir, download, split_scheme)

    def get_input(self, idx):
        """Return the raw review text for example `idx`."""
        return self._input_array[idx]

    def eval(self, y_pred, y_true, metadata):
        """Evaluate predictions.

        Under the 'user' split, reports per-user accuracies plus the
        10th-percentile and worst-group accuracy; otherwise defers to the
        standard group-wise evaluation.
        Returns (results dict, formatted results string).
        """
        if self.split_scheme == 'user':
            # first compute groupwise accuracies
            g = self._eval_grouper.metadata_to_group(metadata)
            results = {
                **self._metric.compute(y_pred, y_true),
                **self._metric.compute_group_wise(y_pred, y_true, g,
                                                  self._eval_grouper.n_groups)
            }
            accs = []
            for group_idx in range(self._eval_grouper.n_groups):
                group_str = self._eval_grouper.group_field_str(group_idx)
                # Re-key the per-group fields under human-readable group names.
                group_metric = results.pop(
                    self._metric.group_metric_field(group_idx))
                group_counts = results.pop(
                    self._metric.group_count_field(group_idx))
                results[f'{self._metric.name}_{group_str}'] = group_metric
                results[f'count_{group_str}'] = group_counts
                # Only non-empty groups contribute to the percentile/worst stats.
                if group_counts > 0:
                    accs.append(group_metric)
            accs = np.array(accs)
            results['10th_percentile_acc'] = np.percentile(accs, 10)
            results[f'{self._metric.worst_group_metric_field}'] = \
                self._metric.worst(accs)
            results_str = (
                f"Average {self._metric.name}: {results[self._metric.agg_metric_field]:.3f}\n"
                f"10th percentile {self._metric.name}: {results['10th_percentile_acc']:.3f}\n"
                f"Worst-group {self._metric.name}: {results[self._metric.worst_group_metric_field]:.3f}\n"
            )
            return results, results_str
        else:
            return self.standard_group_eval(
                self._metric, self._eval_grouper, y_pred, y_true, metadata)

    def initialize_split_dicts(self):
        """Set split ids/names; user, time and *_generalization schemes add ID
        val/test splits, the remaining schemes keep the defaults."""
        if self.split_scheme in ('user', 'time') or \
                self.split_scheme.endswith('_generalization'):  # category generalization
            self._split_dict = {'train': 0, 'val': 1, 'id_val': 2,
                                'test': 3, 'id_test': 4}
            self._split_names = {'train': 'Train',
                                 'val': 'Validation (OOD)',
                                 'id_val': 'Validation (ID)',
                                 'test': 'Test (OOD)',
                                 'id_test': 'Test (ID)'}
        elif self.split_scheme in ('category_subpopulation', ):
            # use defaults
            pass
        elif self.split_scheme.endswith('_baseline'):
            # use defaults
            pass
        else:
            raise ValueError(f'Split scheme {self.split_scheme} not recognized')

    def load_metadata(self, data_df, split_array):
        """Build (metadata_fields, metadata_array, metadata_map) from the raw
        review columns. IDs are assigned in split order so that train-split
        values come first."""
        # Get metadata
        columns = ['reviewerID', 'asin', 'category', 'reviewYear', 'overall']
        metadata_fields = ['user', 'product', 'category', 'year', 'y']
        metadata_df = data_df[columns].copy()
        metadata_df.columns = metadata_fields

        sort_idx = np.argsort(split_array)
        ordered_maps = {}
        for field in ['user', 'product', 'category']:
            # map to IDs in the order of split values
            ordered_maps[field] = pd.unique(metadata_df.iloc[sort_idx][field])
        # Star ratings 1-5 map to labels 0-4.
        ordered_maps['y'] = range(1, 6)
        ordered_maps['year'] = range(metadata_df['year'].min(),
                                     metadata_df['year'].max() + 1)
        metadata_map, metadata = map_to_id_array(metadata_df, ordered_maps)
        return metadata_fields, torch.from_numpy(metadata.astype('long')), \
            metadata_map

    def initialize_eval_grouper(self):
        """Choose the grouping field used for evaluation based on the split scheme."""
        if self.split_scheme == 'user':
            self._eval_grouper = CombinatorialGrouper(
                dataset=self,
                groupby_fields=['user'])
        elif self.split_scheme.endswith('generalization') or \
                self.split_scheme == 'category_subpopulation':
            self._eval_grouper = CombinatorialGrouper(
                dataset=self,
                groupby_fields=['category'])
        elif self.split_scheme in ('time', 'time_baseline'):
            self._eval_grouper = CombinatorialGrouper(
                dataset=self,
                groupby_fields=['year'])
        elif self.split_scheme.endswith('_baseline'):  # user baselines
            self._eval_grouper = CombinatorialGrouper(
                dataset=self,
                groupby_fields=['user'])
        else:
            raise ValueError(f'Split scheme {self.split_scheme} not recognized')
def __init__(self, root_dir='data', download=False, split_scheme='official'): self._dataset_name = 'cmnist4' self._version = '1.0' self._data_dir = self.initialize_data_dir(root_dir, download) self._original_resolution = (28, 28) # Read in metadata self._metadata_df = pd.read_csv( os.path.join(self._data_dir, 'metadata.csv'), index_col=0, # dtype={'patient': 'str'} ) # Get the y values self._y_array = torch.LongTensor(self._metadata_df['digit'].values) self._y_array = (self._y_array == 6) + (self._y_array == 9) * 2 self._y_size = 3 self._n_classes = 3 # Get filenames self._input_array = [ f'images/env_{env}/digit_{digit}/{image}.pt' for image, digit, env in self._metadata_df. loc[:, ['image', 'digit', 'env']].itertuples(index=False, name=None) ] # Extract splits # Note that the hospital numbering here is different from what's in the paper, # where to avoid confusing readers we used a 1-indexed scheme and just labeled the test hospital as 5. # Here, the numbers are 0-indexed. test_env = 4 val_env = 3 self._split_dict = {'train': 0, 'id_val': 1, 'test': 2, 'val': 3} self._split_names = { 'train': 'Train', 'id_val': 'Validation (ID)', 'test': 'Test', 'val': 'Validation (OOD)', } envs = self._metadata_df['env'].values.astype('long') val_env_mask = (self._metadata_df['env'] == val_env) test_env_mask = (self._metadata_df['env'] == test_env) self._metadata_df.loc[val_env_mask, 'split'] = self.split_dict['val'] self._metadata_df.loc[test_env_mask, 'split'] = self.split_dict['test'] self._split_scheme = split_scheme if self._split_scheme != 'official': raise ValueError( f'Split scheme {self._split_scheme} not recognized') self._split_array = self._metadata_df['split'].values self._metadata_array = torch.stack( [torch.LongTensor(envs), self._y_array], dim=1) self._metadata_fields = ['env', 'y'] self._eval_grouper = CombinatorialGrouper(dataset=self, groupby_fields=['env']) self._metric = Accuracy() super().__init__(root_dir, download, split_scheme)
'ogb-molpcba': OGBPCBADataset, 'poverty': PovertyMapDataset, 'fmow': FMoWDataset, 'bdd100k': BDD100KDataset, } losses = { 'cross_entropy': ElementwiseLoss(loss_fn=nn.CrossEntropyLoss(reduction='none')), 'mse': MSE(name='loss'), 'multitask_bce': MultiTaskLoss(loss_fn=nn.BCEWithLogitsLoss(reduction='none')), } algo_log_metrics = { 'accuracy': Accuracy(), 'mse': MSE(), 'multitask_accuracy': MultiTaskAccuracy(), None: None, } # see initialize_*() functions for correspondence transforms = [ 'bert', 'image_base', 'image_resize_and_center_crop', 'poverty_train' ] models = [ 'resnet18_ms', 'resnet50', 'resnet34', 'wideresnet50', 'densenet121', 'bert-base-uncased', 'gin-virtual', 'logistic_regression' ] algorithms = ['ERM', 'groupDRO', 'deepCORAL', 'IRM'] optimizers = ['SGD', 'Adam', 'AdamW']
def __init__(self, root_dir='data', download=False, split_scheme='official'):
    """iWildCam camera-trap species-classification dataset.

    Args:
        root_dir: directory under which the dataset lives / is downloaded.
        download: whether to download the bundle if it is missing.
        split_scheme: only 'official' is supported.

    (Change vs. original: removed the unused locals `data` and
    `label_to_category`; built `category_to_label` with dict(zip(...)).)
    """
    self._dataset_name = 'iwildcam'
    self._version = '1.0'
    self._split_scheme = split_scheme
    if self._split_scheme != 'official':
        raise ValueError(
            f'Split scheme {self._split_scheme} not recognized')

    # path
    self._download_url = ''
    self._compressed_size = 90_094_666_806
    self._data_dir = Path(self.initialize_data_dir(root_dir, download))

    # Load splits
    train_df = pd.read_csv(self._data_dir / 'train.csv')
    val_trans_df = pd.read_csv(self._data_dir / 'val_trans.csv')
    test_trans_df = pd.read_csv(self._data_dir / 'test_trans.csv')
    val_cis_df = pd.read_csv(self._data_dir / 'val_cis.csv')
    test_cis_df = pd.read_csv(self._data_dir / 'test_cis.csv')

    # Merge all dfs; trans(fer) locations are OOD, cis locations are ID.
    train_df['split'] = 'train'
    val_trans_df['split'] = 'val'
    test_trans_df['split'] = 'test'
    val_cis_df['split'] = 'id_val'
    test_cis_df['split'] = 'id_test'
    df = pd.concat(
        [train_df, val_trans_df, test_trans_df, test_cis_df, val_cis_df])

    # Splits
    self._split_dict = {
        'train': 0,
        'val': 1,
        'test': 2,
        'id_val': 3,
        'id_test': 4
    }
    self._split_names = {
        'train': 'Train',
        'val': 'Validation (OOD/Trans)',
        'test': 'Test (OOD/Trans)',
        'id_val': 'Validation (ID/Cis)',
        'id_test': 'Test (ID/Cis)'
    }
    df['split_id'] = df['split'].apply(lambda x: self._split_dict[x])
    self._split_array = df['split_id'].values

    # Filenames
    self._input_array = df['filename'].values

    # Labels: remap raw category ids to contiguous [0, n_classes).
    unique_categories = np.unique(df['category_id'])
    self._n_classes = len(unique_categories)
    category_to_label = dict(
        zip(unique_categories, range(self._n_classes)))
    self._y_array = torch.tensor(
        df['category_id'].apply(lambda x: category_to_label[x]).values)
    self._y_size = 1

    # Location/group info: each camera-trap location is one group.
    location_ids = df['location']
    locations = np.unique(location_ids)
    n_groups = len(locations)
    location_to_group_id = {locations[i]: i for i in range(n_groups)}
    df['group_id'] = df['location'].apply(
        lambda x: location_to_group_id[x])
    self._n_groups = n_groups

    self._metadata_array = torch.tensor(
        np.stack([df['group_id'].values, self.y_array], axis=1))
    self._metadata_fields = ['location', 'y']

    # eval grouper
    self._eval_grouper = CombinatorialGrouper(dataset=self,
                                              groupby_fields=['location'])
    self._metrics = [
        Accuracy(),
        Recall(average='macro'),
        Recall(average='weighted'),
        F1(average='macro'),
        F1(average='weighted')
    ]
    super().__init__(root_dir, download, split_scheme)
def __init__(self, root_dir='data', download=False, split_scheme='official'): self._dataset_name = 'vlcs' self._version = '1.0' # self._download_url = 'https://drive.google.com/uc?id=1skwblH1_okBwxWxmRsp9_qi15hyPpxg8' self._data_dir = self.initialize_data_dir(root_dir, download) self._resolution = (224, 224) # Read in metadata self._metadata_df = pd.read_csv( os.path.join(self._data_dir, 'metadata.csv'), index_col=0 ) # Get the y values self._label_map = { 'bird': 0, 'car': 1, 'chair': 2, 'dog': 3, 'person': 4 } self._label_array = self._metadata_df['label'].values self._y_array = torch.LongTensor([self._label_map[y] for y in self._label_array]) self._y_size = 1 self._n_classes = 5 # Get filenames self._input_array = [ f'{env}/{label}/{image}' for image, label, env in self._metadata_df.loc[:, ['image', 'label', 'env']].itertuples(index=False, name=None)] test_env = '' #'VOC2007' val_env = 'VOC2007' self._split_dict = { 'train': 0, 'id_val': 1, 'test': 2, 'val': 3 } self._split_names = { 'train': 'Train', 'id_val': 'Validation (ID)', 'test': 'Test', 'val': 'Validation (OOD)', } env_map = { 'SUN09': 0, 'LabelMe': 1, 'Caltech101': 2, 'VOC2007': 3 } env_names = self._metadata_df['env'].values envs = [env_map[name] for name in env_names] val_env_mask = (self._metadata_df['env'] == val_env) test_env_mask = (self._metadata_df['env'] == test_env) self._metadata_df.loc[val_env_mask, 'split'] = self.split_dict['val'] self._metadata_df.loc[test_env_mask, 'split'] = self.split_dict['test'] self._split_scheme = split_scheme if self._split_scheme != 'official': raise ValueError(f'Split scheme {self._split_scheme} not recognized') self._split_array = self._metadata_df['split'].values self._metadata_array = torch.stack( (torch.LongTensor(envs), self._y_array), dim=1) self._metadata_fields = ['env', 'y'] self._eval_grouper = CombinatorialGrouper( dataset=self, groupby_fields=['env']) self._metric = Accuracy() super().__init__(root_dir, download, split_scheme)
class CivilCommentsDataset(WILDSDataset):
    """
    The CivilComments-wilds toxicity classification dataset.
    This is a modified version of the original CivilComments dataset.

    Supported `split_scheme`:
        'official'

    Input (x):
        A comment on an online article, comprising one or more sentences of text.

    Label (y):
        y is binary. It is 1 if the comment was been rated as toxic by a majority
        of the crowdworkers who saw that comment, and 0 otherwise.

    Metadata:
        Each comment is annotated with the following binary indicators:
            - male
            - female
            - LGBTQ
            - christian
            - muslim
            - other_religions
            - black
            - white
            - identity_any
            - severe_toxicity
            - obscene
            - threat
            - insult
            - identity_attack
            - sexual_explicit

    Website:
        https://www.kaggle.com/c/jigsaw-unintended-bias-in-toxicity-classification

    Original publication:
        @inproceedings{borkan2019nuanced,
          title={Nuanced metrics for measuring unintended bias with real data for text classification},
          author={Borkan, Daniel and Dixon, Lucas and Sorensen, Jeffrey and Thain, Nithum and Vasserman, Lucy},
          booktitle={Companion Proceedings of The 2019 World Wide Web Conference},
          pages={491--500},
          year={2019}
        }

    License:
        This dataset is in the public domain and is distributed under CC0.
        https://creativecommons.org/publicdomain/zero/1.0/
    """

    def __init__(self, root_dir='data', download=False, split_scheme='official'):
        self._dataset_name = 'civilcomments'
        self._version = '1.0'
        self._download_url = ''
        self._compressed_size = 90_644_480
        self._data_dir = self.initialize_data_dir(root_dir, download)

        # Read in metadata
        self._metadata_df = pd.read_csv(os.path.join(
            self._data_dir, 'all_data_with_identities.csv'),
                                        index_col=0)

        # Get the y values: binarize the crowdworker toxicity score at 0.5.
        self._y_array = torch.LongTensor(
            self._metadata_df['toxicity'].values >= 0.5)
        self._y_size = 1
        self._n_classes = 2

        # Extract text
        self._text_array = list(self._metadata_df['comment_text'])

        # Extract splits
        self._split_scheme = split_scheme
        if self._split_scheme != 'official':
            raise ValueError(
                f'Split scheme {self._split_scheme} not recognized')
        # metadata_df contains split names in strings, so convert them to ints
        # NOTE(review): self.split_dict is read before super().__init__ runs, so
        # this presumably falls back to the base class's default splits — confirm.
        for split in self.split_dict:
            split_indices = self._metadata_df['split'] == split
            self._metadata_df.loc[split_indices,
                                  'split'] = self.split_dict[split]
        self._split_array = self._metadata_df['split'].values

        # Extract metadata: identity indicators and auxiliary toxicity-type
        # indicators, each binarized at 0.5, followed by the label column.
        self._identity_vars = [
            'male', 'female', 'LGBTQ', 'christian', 'muslim',
            'other_religions', 'black', 'white'
        ]
        self._auxiliary_vars = [
            'identity_any', 'severe_toxicity', 'obscene', 'threat', 'insult',
            'identity_attack', 'sexual_explicit'
        ]
        self._metadata_array = torch.cat(
            (torch.LongTensor(
                (self._metadata_df.loc[:, self._identity_vars] >= 0.5).values),
             torch.LongTensor((self._metadata_df.loc[:, self._auxiliary_vars]
                               >= 0.5).values), self._y_array.reshape(
                                   (-1, 1))),
            dim=1)
        self._metadata_fields = self._identity_vars + self._auxiliary_vars + [
            'y'
        ]

        # One grouper per identity variable, crossed with the label.
        self._eval_groupers = [
            CombinatorialGrouper(dataset=self,
                                 groupby_fields=[identity_var, 'y'])
            for identity_var in self._identity_vars
        ]
        self._metric = Accuracy()
        super().__init__(root_dir, download, split_scheme)

    def get_input(self, idx):
        # Raw comment text for example `idx`.
        return self._text_array[idx]

    def eval(self, y_pred, y_true, metadata):
        """Compute overall accuracy plus per-identity group accuracies and the
        worst accuracy over all (identity-positive, label) groups.

        Returns (results dict, formatted results string).
        """
        results = {
            **self._metric.compute(y_pred, y_true),
        }
        results_str = f"Average {self._metric.name}: {results[self._metric.agg_metric_field]:.3f}\n"
        # Each eval_grouper is over label + a single identity
        # We only want to keep the groups where the identity is positive
        # The groups are:
        #   Group 0: identity = 0, y = 0
        #   Group 1: identity = 1, y = 0
        #   Group 2: identity = 0, y = 1
        #   Group 3: identity = 1, y = 1
        # so this means we want only groups 1 and 3.
        worst_group_metric = None
        for identity_var, eval_grouper in zip(self._identity_vars,
                                              self._eval_groupers):
            g = eval_grouper.metadata_to_group(metadata)
            group_results = {
                **self._metric.compute_group_wise(y_pred, y_true, g,
                                                  eval_grouper.n_groups)
            }
            results_str += f"  {identity_var:20s}"
            for group_idx in range(eval_grouper.n_groups):
                group_str = eval_grouper.group_field_str(group_idx)
                # Keep only identity-positive groups (groups 1 and 3 above).
                if f'{identity_var}:1' in group_str:
                    group_metric = group_results[
                        self._metric.group_metric_field(group_idx)]
                    group_counts = group_results[
                        self._metric.group_count_field(group_idx)]
                    results[f'{self._metric.name}_{group_str}'] = group_metric
                    results[f'count_{group_str}'] = group_counts
                    if f'y:0' in group_str:
                        label_str = 'non_toxic'
                    else:
                        label_str = 'toxic'
                    results_str += (
                        f"   {self._metric.name} on {label_str}: {group_metric:.3f}"
                        f" (n = {results[f'count_{group_str}']:6.0f}) ")
                    # Track the minimum (worst) accuracy over all kept groups.
                    if worst_group_metric is None:
                        worst_group_metric = group_metric
                    else:
                        worst_group_metric = self._metric.worst(
                            [worst_group_metric, group_metric])
            results_str += f"\n"
        results[
            f'{self._metric.worst_group_metric_field}'] = worst_group_metric
        results_str += f"Worst-group {self._metric.name}: {worst_group_metric:.3f}\n"

        return results, results_str
def __init__(self, root_dir='data', download=False, split_scheme='official'): self._dataset_name = 'civilcomments' self._version = '1.0' self._download_url = '' self._compressed_size = 90_644_480 self._data_dir = self.initialize_data_dir(root_dir, download) # Read in metadata self._metadata_df = pd.read_csv(os.path.join( self._data_dir, 'all_data_with_identities.csv'), index_col=0) # Get the y values self._y_array = torch.LongTensor( self._metadata_df['toxicity'].values >= 0.5) self._y_size = 1 self._n_classes = 2 # Extract text self._text_array = list(self._metadata_df['comment_text']) # Extract splits self._split_scheme = split_scheme if self._split_scheme != 'official': raise ValueError( f'Split scheme {self._split_scheme} not recognized') # metadata_df contains split names in strings, so convert them to ints for split in self.split_dict: split_indices = self._metadata_df['split'] == split self._metadata_df.loc[split_indices, 'split'] = self.split_dict[split] self._split_array = self._metadata_df['split'].values # Extract metadata self._identity_vars = [ 'male', 'female', 'LGBTQ', 'christian', 'muslim', 'other_religions', 'black', 'white' ] self._auxiliary_vars = [ 'identity_any', 'severe_toxicity', 'obscene', 'threat', 'insult', 'identity_attack', 'sexual_explicit' ] self._metadata_array = torch.cat( (torch.LongTensor( (self._metadata_df.loc[:, self._identity_vars] >= 0.5).values), torch.LongTensor((self._metadata_df.loc[:, self._auxiliary_vars] >= 0.5).values), self._y_array.reshape( (-1, 1))), dim=1) self._metadata_fields = self._identity_vars + self._auxiliary_vars + [ 'y' ] self._eval_groupers = [ CombinatorialGrouper(dataset=self, groupby_fields=[identity_var, 'y']) for identity_var in self._identity_vars ] self._metric = Accuracy() super().__init__(root_dir, download, split_scheme)