def __init__(self, base_dataset: Dataset, validation_dataset: Dataset,
             base_network: torch.nn.Module, device: torch.device,
             scale: float = 1.0):
    """Build a PU-learning view of a PHEME dataset.

    Negative (label == 0) rows are relabelled as positive and duplicated:
    each negative appears once in the original frame with its "stay negative"
    weight and once in a duplicated frame with its "treat as positive" weight,
    as estimated by ``get_negative_sample_weights``.

    :param base_dataset: dataset (or ``torch.utils.data.Subset`` of one)
        wrapping a pandas frame with at least a ``label`` column.
    :param validation_dataset: held-out data used when estimating weights.
    :param base_network: trained network used to score negatives.
    :param device: device the scoring network runs on.
    :param scale: loss-scaling factor stored on the instance.
    """
    super(PULearningPHEMEDataset, self).__init__()
    # A Subset wraps the real dataset; materialize just the selected rows
    # so the rest of the constructor can work on a plain frame.
    # (was `type(...) == torch.utils.data.Subset`; isinstance is idiomatic)
    if isinstance(base_dataset, torch.utils.data.Subset):
        self.tokenizer = base_dataset.dataset.tokenizer
        indices = base_dataset.indices
        orig_dataset = base_dataset.dataset.dataset.copy()
        base_dataset = base_dataset.dataset
        base_dataset.dataset = orig_dataset.iloc[indices]
        base_dataset.dataset = base_dataset.dataset.reset_index(drop=True)
    else:
        self.tokenizer = base_dataset.tokenizer

    # Only look at negative samples; keep a copy of the full frame so the
    # combined dataset can be assembled at the end.
    original_dataset = base_dataset.dataset.copy()
    base_dataset.dataset = base_dataset.dataset[
        base_dataset.dataset['label'] == 0]
    # Relabel every negative as positive for the weighting pass.
    base_dataset.dataset['label'] = [1] * base_dataset.dataset.shape[0]

    # Get negatives weight, combine into one dataset and duplicate the negatives
    train_dl = torch.utils.data.DataLoader(
        base_dataset, batch_size=8, collate_fn=collate_batch_transformer)
    val_dl = torch.utils.data.DataLoader(
        validation_dataset, batch_size=8, collate_fn=collate_batch_transformer)
    neg_weights = get_negative_sample_weights(
        train_dl, val_dl, base_network, device)
    # One (negative, positive) weight pair per negative row. The original
    # asserted neg_weights.shape == base_dataset.dataset.shape, which only
    # holds when the frame has exactly two columns; the message shows the
    # intent was a per-row check. Raise instead of assert (asserts vanish
    # under `python -O`).
    if neg_weights.shape[0] != base_dataset.dataset.shape[0]:
        raise ValueError(
            "Should have double the number of negative sample weights")

    # Original frame: positives keep weight 1, negatives get column 0.
    weights = np.zeros(original_dataset.shape[0])
    weights[original_dataset.index[original_dataset['label'] == 1].tolist()] = 1.
    weights[original_dataset.index[original_dataset['label'] == 0].tolist()] = neg_weights[:, 0]
    original_dataset['weight'] = weights

    # Duplicated negatives (now labelled positive) get column 1.
    duplicated_data = base_dataset.dataset.copy()
    duplicated_data['weight'] = neg_weights[:, 1]
    self.dataset = pd.concat([original_dataset, duplicated_data],
                             ignore_index=True)
    self.scale = scale
def __init__(self, base_dataset: Dataset, validation_dataset: Dataset,
             base_network: torch.nn.Module, device: torch.device,
             gamma: float = 1.0, scale: float = 1.0):
    """Build a PU-learning dataset that converts top-scoring negatives.

    Estimates the class prior p(y=1), then flips the negatives with the
    highest "positive" weight to hard positives until the positive fraction
    reaches that prior. Remaining negatives are kept twice (once per weight
    column), as in the plain PU dataset.

    :param base_dataset: dataset (or ``torch.utils.data.Subset`` of one)
        wrapping a pandas frame with at least a ``label`` column.
    :param validation_dataset: held-out data for prior/weight estimation.
    :param base_network: trained network used for scoring.
    :param device: device the scoring network runs on.
    :param gamma: kept for interface compatibility (not used here).
    :param scale: loss-scaling factor stored on the instance.
    """
    super(PULearningPriorBasedConversionPHEMEDataset, self).__init__()
    train_dl = torch.utils.data.DataLoader(
        base_dataset, batch_size=8, collate_fn=collate_batch_transformer)
    val_dl = torch.utils.data.DataLoader(
        validation_dataset, batch_size=8, collate_fn=collate_batch_transformer)
    prior = estimate_class_prior_probability(
        base_network, train_dl, val_dl, device)
    print(prior)

    # A Subset wraps the real dataset; materialize just the selected rows.
    # (was `type(...) == torch.utils.data.Subset`; isinstance is idiomatic)
    if isinstance(base_dataset, torch.utils.data.Subset):
        self.tokenizer = base_dataset.dataset.tokenizer
        indices = base_dataset.indices
        orig_dataset = base_dataset.dataset.dataset.copy()
        base_dataset = base_dataset.dataset
        base_dataset.dataset = orig_dataset.iloc[indices]
        base_dataset.dataset = base_dataset.dataset.reset_index(drop=True)
    else:
        self.tokenizer = base_dataset.tokenizer

    # Only look at negative samples
    original_dataset = base_dataset.dataset.copy()
    base_dataset.dataset = base_dataset.dataset[
        base_dataset.dataset['label'] == 0]

    # Get negatives weight, combine into one dataset and duplicate the negatives
    train_dl = torch.utils.data.DataLoader(
        base_dataset, batch_size=8, collate_fn=collate_batch_transformer)
    val_dl = torch.utils.data.DataLoader(
        validation_dataset, batch_size=8, collate_fn=collate_batch_transformer)
    neg_weights = get_negative_sample_weights(
        train_dl, val_dl, base_network, device)
    # One (negative, positive) weight pair per negative row. The original
    # asserted full-shape equality, which only holds for a two-column frame;
    # compare row counts, and raise instead of assert (asserts vanish
    # under `python -O`).
    if neg_weights.shape[0] != base_dataset.dataset.shape[0]:
        raise ValueError(
            "Should have double the number of negative sample weights")

    # .copy() avoids pandas chained-assignment: without it the 'weight'
    # assignment below targets a possible view and may not stick
    # (SettingWithCopyWarning).
    positives = original_dataset[original_dataset['label'] == 1].copy()
    positives['weight'] = [1.] * positives.shape[0]

    # Keep adding examples until p(y=1) equals our estimate: flip negatives
    # in descending order of their "positive" weight. Bound i so an
    # over-estimated prior cannot index past the end of ordered_idx
    # (the original loop could raise IndexError).
    keep_examples = np.asarray([True] * neg_weights.shape[0])
    ordered_idx = np.argsort(neg_weights[:, 1])[::-1]
    i = 0
    while (i < ordered_idx.shape[0]
           and (positives.shape[0] + sum(~keep_examples))
           / original_dataset.shape[0] < prior):
        keep_examples[ordered_idx[i]] = False
        i += 1

    # Surviving negatives appear twice: once as negatives (weight col 0)
    # and once relabelled positive (weight col 1).
    kept_negatives = base_dataset.dataset[keep_examples].copy()
    kept_negatives_plus = kept_negatives.copy()
    kept_negatives_plus['label'] = [1] * kept_negatives_plus.shape[0]
    kept_negatives['weight'] = neg_weights[keep_examples, 0]
    kept_negatives_plus['weight'] = neg_weights[keep_examples, 1]

    # Flipped negatives become full-weight positives.
    converted_positives = base_dataset.dataset[~keep_examples].copy()
    converted_positives['label'] = [1] * converted_positives.shape[0]
    converted_positives['weight'] = [1.] * converted_positives.shape[0]

    print(positives.shape)
    print(kept_negatives.shape)
    print(converted_positives.shape)
    self.dataset = pd.concat([
        positives,
        kept_negatives,
        kept_negatives_plus,
        converted_positives
    ], ignore_index=True)
    self.scale = scale