Example #1
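    # Assumed module-level context, inferred from usage and not shown in the
    # snippet: numpy as np, pandas as pd, torch, Dataset from
    # torch.utils.data, and the project helpers collate_batch_transformer
    # and get_negative_sample_weights.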
    def __init__(self,
                 base_dataset: Dataset,
                 validation_dataset: Dataset,
                 base_network: torch.nn.Module,
                 device: torch.device,
                 scale: float = 1.0):
        super().__init__()
        # If the dataset is wrapped in a Subset, unwrap it and keep only the
        # subset rows
        if isinstance(base_dataset, torch.utils.data.Subset):
            self.tokenizer = base_dataset.dataset.tokenizer
            indices = base_dataset.indices
            orig_dataset = base_dataset.dataset.dataset.copy()
            base_dataset = base_dataset.dataset
            base_dataset.dataset = orig_dataset.iloc[indices]
            base_dataset.dataset = base_dataset.dataset.reset_index(drop=True)
        else:
            self.tokenizer = base_dataset.tokenizer

        # Keep only the negative samples
        original_dataset = base_dataset.dataset.copy()
        base_dataset.dataset = base_dataset.dataset[
            base_dataset.dataset['label'] == 0].copy()
        # Relabel the negatives as positive; the original labels are
        # preserved in original_dataset
        base_dataset.dataset['label'] = 1

        # Get the negative sample weights, then combine everything into one
        # dataset with the negatives duplicated
        train_dl = torch.utils.data.DataLoader(
            base_dataset, batch_size=8, collate_fn=collate_batch_transformer)
        val_dl = torch.utils.data.DataLoader(
            validation_dataset,
            batch_size=8,
            collate_fn=collate_batch_transformer)
        neg_weights = get_negative_sample_weights(train_dl, val_dl,
                                                  base_network, device)
        assert neg_weights.shape[0] == base_dataset.dataset.shape[0], \
            "Should have one pair of weights for every negative sample"
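        # PU weighting: each negative sample will appear twice in the final
        # dataset, once with its original label 0 (weight neg_weights[:, 0])
        # and once relabeled as 1 (weight neg_weights[:, 1]); labeled
        # positives keep weight 1.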
        weights = np.zeros(original_dataset.shape[0])
        weights[original_dataset.index[original_dataset['label'] ==
                                       1].tolist()] = 1.
        weights[original_dataset.index[original_dataset['label'] ==
                                       0].tolist()] = neg_weights[:, 0]
        original_dataset['weight'] = weights
        duplicated_data = base_dataset.dataset.copy()
        duplicated_data['weight'] = neg_weights[:, 1]
        self.dataset = pd.concat([original_dataset, duplicated_data],
                                 ignore_index=True)
        self.scale = scale
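
A minimal usage sketch, assuming train_set and val_set are PHEME datasets that expose a tokenizer attribute and a pandas DataFrame in .dataset (as the attribute accesses above imply), and model is an already-trained base classifier; all three names are hypothetical:

    # Hypothetical names: train_set, val_set, model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    pu_train = PULearningPHEMEDataset(train_set, val_set, model, device)
    loader = torch.utils.data.DataLoader(
        pu_train, batch_size=8, collate_fn=collate_batch_transformer)
    # pu_train.dataset now holds the original rows plus duplicated
    # negatives, each carrying a 'weight' column.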
Example #2
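    # Same assumed module-level context as Example #1, plus the project
    # helper estimate_class_prior_probability.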
    def __init__(self,
                 base_dataset: Dataset,
                 validation_dataset: Dataset,
                 base_network: torch.nn.Module,
                 device: torch.device,
                 gamma: float = 1.0,
                 scale: float = 1.0):
        super().__init__()
        train_dl = torch.utils.data.DataLoader(
            base_dataset, batch_size=8, collate_fn=collate_batch_transformer)
        val_dl = torch.utils.data.DataLoader(
            validation_dataset,
            batch_size=8,
            collate_fn=collate_batch_transformer)
        prior = estimate_class_prior_probability(base_network, train_dl,
                                                 val_dl, device)
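        # prior: estimated p(y=1), used below to decide how many negatives
        # to convert into positives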
        print(f"Estimated class prior: {prior}")
        # If the dataset is wrapped in a Subset, unwrap it and keep only the
        # subset rows
        if isinstance(base_dataset, torch.utils.data.Subset):
            self.tokenizer = base_dataset.dataset.tokenizer
            indices = base_dataset.indices
            orig_dataset = base_dataset.dataset.dataset.copy()
            base_dataset = base_dataset.dataset
            base_dataset.dataset = orig_dataset.iloc[indices]
            base_dataset.dataset = base_dataset.dataset.reset_index(drop=True)
        else:
            self.tokenizer = base_dataset.tokenizer

        # Keep only the negative samples
        original_dataset = base_dataset.dataset.copy()
        base_dataset.dataset = base_dataset.dataset[
            base_dataset.dataset['label'] == 0].copy()

        # Get the negative sample weights
        train_dl = torch.utils.data.DataLoader(
            base_dataset, batch_size=8, collate_fn=collate_batch_transformer)
        val_dl = torch.utils.data.DataLoader(
            validation_dataset,
            batch_size=8,
            collate_fn=collate_batch_transformer)
        neg_weights = get_negative_sample_weights(train_dl, val_dl,
                                                  base_network, device)
        assert neg_weights.shape[0] == base_dataset.dataset.shape[0], \
            "Should have one pair of weights for every negative sample"

        positives = original_dataset[original_dataset['label'] == 1].copy()
        positives['weight'] = 1.
        # Convert the negatives most confidently positive until the positive
        # fraction p(y=1) reaches the estimated prior
        keep_examples = np.ones(neg_weights.shape[0], dtype=bool)
        ordered_idx = np.argsort(neg_weights[:, 1])[::-1]
        i = 0
        # Guard on i so the loop stops if the prior cannot be reached
        while (i < len(ordered_idx) and
               (positives.shape[0] + np.sum(~keep_examples)) /
               original_dataset.shape[0] < prior):
            keep_examples[ordered_idx[i]] = False
            i += 1
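        # Surviving negatives are duplicated with both labels under the PU
        # weights; converted examples become ordinary weight-1 positives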
        kept_negatives = base_dataset.dataset[keep_examples].copy()
        kept_negatives_plus = kept_negatives.copy()
        kept_negatives_plus['label'] = 1
        kept_negatives['weight'] = neg_weights[keep_examples, 0]
        kept_negatives_plus['weight'] = neg_weights[keep_examples, 1]
        converted_positives = base_dataset.dataset[~keep_examples].copy()
        converted_positives['label'] = 1
        converted_positives['weight'] = 1.

        print(f"Positives: {positives.shape}")
        print(f"Kept negatives: {kept_negatives.shape}")
        print(f"Converted positives: {converted_positives.shape}")
        self.dataset = pd.concat(
            [positives, kept_negatives, kept_negatives_plus,
             converted_positives],
            ignore_index=True)
        self.scale = scale
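
The prior-based variant is constructed the same way; a sketch under the same hypothetical names as in Example #1:

    pu_train = PULearningPriorBasedConversionPHEMEDataset(
        train_set, val_set, model, device, gamma=1.0, scale=1.0)
    # pu_train.dataset now mixes labeled positives, PU-weighted duplicated
    # negatives, and the top-scoring negatives converted to weight-1
    # positives so that the positive fraction matches the estimated prior.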