Python EmailDataset.split 예제들, data_reader.dataset.EmailDataset.split Python 예제들

예제 #1

0

파일 보기

def data():
    """Return a dict holding the train and test parts of the e-mail dataset.

    The dataset is read from ``100_instance_debug.csv`` and split with a
    60/40 train/test specification.  The returned dict has the keys
    ``'training_data'`` and ``'testing_data'``.
    """
    email_set = EmailDataset(
        raw=False, path='./data_reader/data/test/100_instance_debug.csv')
    # Fix the seed before splitting so repeated runs give the same output.
    seed(1)
    split_spec = {'train': 60, 'test': 40}
    train_part, test_part = email_set.split(split_spec)
    return dict(training_data=train_part, testing_data=test_part)

예제 #2

0

파일 보기

def data():
    """Return loaded train and test instances from the debug e-mail dataset.

    The dataset is split 60/40 and each part is passed through
    ``load_dataset``.  The returned dict has the keys ``'training_data'``
    and ``'testing_data'``.
    """
    email_set = EmailDataset(
        raw=False, path='./data_reader/data/test/100_instance_debug.csv')
    train_part, test_part = email_set.split({'train': 60, 'test': 40})
    loaded_train = load_dataset(train_part)
    loaded_test = load_dataset(test_part)
    return dict(training_data=loaded_train, testing_data=loaded_test)

예제 #3

0

파일 보기

def main(argv):
    """
    Demo driver: train a FeatureDeletion learner and inspect its output.

    Splits the debug e-mail dataset 60/40, trains the learner on the training
    part, prints the prediction and accuracy for the first test instance and
    plots the first row of the learner's decision-function output.

    :param argv: command-line arguments (not used by this function)
    """

    # pre-process data and partition it
    email_set = EmailDataset(
        raw=False, path='../../data_reader/data/test/100_instance_debug.csv')
    train_part, test_part = email_set.split({'train': 60, 'test': 40})
    training_data = load_dataset(train_part)
    testing_data = load_dataset(test_part)

    # initialize and train the learner
    deletion_params = {
        'hinge_loss_multiplier': 1,
        'max_feature_deletion': 30,
    }
    clf2 = learner.FeatureDeletion(training_data, deletion_params)
    clf2.train()

    # simple metrics, computed on the first test instance only
    first_instance = testing_data[0]
    y_predict = clf2.predict(first_instance)
    y_true = testing_data[0].label
    print(y_predict, y_true)

    score = metrics.accuracy_score([y_true], [y_predict])
    print("score = " + str(score))

    # plot the weights against their feature indices
    wgt = clf2.decision_function()[0].tolist()[0]
    print(wgt)
    feature_indices = list(range(clf2.num_features))
    plt.plot(feature_indices, wgt)
    plt.show()

예제 #4

0

파일 보기

파일: svm_restrained_test.py 프로젝트: xyvivian/adlib

def data():
    """Return a dict holding the train and test parts of the test-400 data.

    The dataset is built from the raw ``trec05p-1/test-400`` path and split
    with a 60/40 train/test specification.  The returned dict has the keys
    ``'training_data'`` and ``'testing_data'``.
    """
    email_set = EmailDataset(
        path='./data_reader/data/raw/trec05p-1/test-400',
        raw=True,
        binary=False,
    )
    # Fix the seed before splitting so repeated runs give the same output.
    seed(1)
    split_spec = {'train': 60, 'test': 40}
    train_part, test_part = email_set.split(split_spec)
    return dict(training_data=train_part, testing_data=test_part)

예제 #5

0

파일 보기

파일: cost_sensitive_test.py 프로젝트: vishalbelsare/adlib

def data():
    """Return loaded train and test instances from the ``full`` dataset.

    The dataset is built from the raw ``trec05p-1/full`` path, split 60/40,
    and each part is passed through ``load_dataset``.  The returned dict has
    the keys ``'training_data'`` and ``'testing_data'``.
    """
    email_set = EmailDataset(
        path='./data_reader/data/raw/trec05p-1/full',
        raw=True,
        binary=False,
    )
    train_part, test_part = email_set.split({'train': 60, 'test': 40})
    loaded_train = load_dataset(train_part)
    loaded_test = load_dataset(test_part)
    return dict(training_data=loaded_train, testing_data=loaded_test)

예제 #6

0

파일 보기

파일: good_word_400.py 프로젝트: xyvivian/adlib

    return "accuracy: {0} \n precision: {1} \n recall: {2}\n".format(
        acc, prec, rec)


def get_evasion_set(x_test, y_pred):
    """Select the malicious instances that were predicted as malicious.

    :param x_test: iterable of instances exposing a ``label`` attribute
    :param y_pred: predicted labels, aligned element-wise with ``x_test``
    :return: tuple ``(instances, labels)`` where ``instances`` are the
             elements of ``x_test`` whose true label and predicted label are
             both 1, and ``labels`` are the true labels of those instances
    """
    ls = [x for x, y in zip(x_test, y_pred) if x.label == 1 and y == 1]
    # The placeholder must actually be filled in; without .format() the
    # literal text "{0}" was printed instead of the count.
    print("{0} malicious instances are being detected initially".format(
        len(ls)))
    return ls, [x.label for x in ls]


# Script section: train a linear SVM on the test-400 e-mail data, print its
# initial prediction summary and create the GoodWord attacker.
# NOTE(review): the meaning of the binary/raw flags is defined by
# EmailDataset, which is not shown here.
dataset = EmailDataset(path='./data_reader/data/raw/trec05p-1/test-400',
                       binary=True,
                       raw=True)
# Split with a 60/40 train/test spec and pass each part through load_dataset.
training_, testing_ = dataset.split({'train': 60, 'test': 40})
training_data = load_dataset(training_)
testing_data = load_dataset(testing_)
# True labels of the test instances, compared against predictions below.
test_true_label = [x.label for x in testing_data]

# test simple learner svm
learning_model = svm.SVC(probability=True, kernel='linear')
learner1 = SimpleLearner(learning_model, training_data)
learner1.train()

# Predictions of the un-attacked learner on the test instances.
predictions = learner1.predict(testing_data)
print("======== initial prediction =========")
print(summary(predictions, test_true_label))

# Note: should not use only malicious data
attacker = GoodWord(n=500)

예제 #7

0

파일 보기

파일: dp_learner_test.py 프로젝트: vishalbelsare/adlib

    def __init__(self,
                 learner_names: List[str] or str,
                 attacker_name: str,
                 dataset: EmailDataset,
                 params: Dict = None,
                 verbose=True):
        """
        Test setup.
        :param learner_names: List of learner names or one string either 'trim', 'atrim', 'irl',
                              or 'outlier-removal'
        :param attacker_name: Either 'label-flipping', 'k-insertion', 'data-modification', or
                              'dummy'
        :param dataset: the dataset
        :param params: the params to pass to the learner - if None, defaults will be used
        :param verbose: if True, will print START and STOP and set learners and attackers to
                        verbose mode
        :raises ValueError: if any learner name or the attacker name is not one of the
                            supported values (comparison is case-insensitive)
        """

        # NOTE(review): the annotation ``List[str] or str`` evaluates to ``List[str]``;
        # it is kept unchanged so the signature stays byte-compatible.
        if isinstance(learner_names, str):
            learner_names = [learner_names]
        learner_names = [name.lower() for name in learner_names]

        # Human-readable name for every supported learner key.
        display_names = {
            'trim': 'TRIM Learner',
            'atrim': 'Alternating TRIM Learner',
            'irl': 'Iterative Retraining Learner',
            'outlier-removal': 'Outlier Removal Learner',
        }

        # Every requested name must be supported.  The previous check used a
        # proper-superset comparison (``>``), which let unknown names through.
        if not set(learner_names) <= set(display_names):
            raise ValueError(
                'Learner name not trim, atrim, irl, nor outlier-removal.')

        if attacker_name.lower() not in [
                'label-flipping', 'k-insertion', 'data-modification', 'dummy'
        ]:
            raise ValueError('Attacker name not label-flipping, k-insertion, '
                             'data-modification, nor dummy.')

        self.learner_names = [display_names[name] for name in learner_names]

        self.attacker_name = attacker_name.lower()
        self.params = params
        self.verbose = verbose

        # Split 50/50 and pass each part through load_dataset.
        training_data, testing_data = dataset.split({'train': 50, 'test': 50})
        self.training_instances = load_dataset(training_data)
        self.testing_instances = load_dataset(testing_data)

        self.learner = None  # SVM with clean dataset
        self.attack_learner = None  # SVM with attacked dataset
        self.dp_learner = None  # Learner we are testing
        self.attacker = None  # the attacker
        self.attack_instances = None  # the attacked instances

        # Before attack
        self.training_pred_labels = None  # the predicted labels of the training set for the SVM
        self.testing_pred_labels = None  # the predicted labels of the testing set for the SVM

        # After attack
        self.attack_training_pred_labels = None  # attacker predicted labels for training set SVM
        self.attack_testing_pred_labels = None  # attacker predicted labels for the testing set SVM
        self.dp_learner_training_pred_labels = None  # predicted labels for training set DP Learner
        self.dp_learner_testing_pred_labels = None  # predicted labels for the testing set DP L.

        # true labels, training instances first, then testing instances
        self.labels = [
            inst.get_label()
            for inst in self.training_instances + self.testing_instances
        ]

        self.results = []  # List of result tuples

예제 #8

0

파일 보기

def test_iterative_retraining_learner():
    """Attack an SVM's training data and print accuracy comparisons.

    The attack is selected by the single command-line argument
    ('label-flipping', 'k-insertion' or 'data-modification'); any other
    invocation falls back to 'label-flipping'.  A linear SVM is trained on
    the 20% training split, the training data is attacked, a second SVM is
    trained on the attacked data, and prediction statistics plus the total
    elapsed time are printed.

    NOTE(review): ``trim_learner`` is used in the last statistics section but
    is never created in this function (the section between the
    'START/END Iterative Retraining learner' banners is empty).  Unless the
    name exists at module level that section raises NameError -- confirm and
    restore the missing learner setup.
    """
    print()
    print(
        '###################################################################')
    print('START TRIM learner test.\n')

    begin = time.time()

    if len(sys.argv) == 2 and sys.argv[1] in [
            'label-flipping', 'k-insertion', 'data-modification'
    ]:
        attacker_name = sys.argv[1]
    else:
        attacker_name = 'label-flipping'

    # Data processing unit
    # The path is an index of 400 testing samples(raw email data).
    dataset = EmailDataset(path='./data_reader/data/raw/trec05p-1/test-400',
                           binary=True,
                           raw=True)

    training_data, testing_data = dataset.split({'train': 20, 'test': 80})
    training_data = load_dataset(training_data)
    testing_data = load_dataset(testing_data)

    print('Training sample size: ', len(training_data), '/400\n', sep='')

    # Setting the default learner
    learning_model = svm.SVC(probability=True, kernel='linear')
    learner = SimpleLearner(learning_model, training_data)
    learner.train()

    # Keep the pre-attack predictions and an untouched copy of the learner
    # for the comparisons printed later.
    original_pred_labels = learner.predict(training_data)
    orig_learner = deepcopy(learner)

    # Execute the attack
    if attacker_name == 'label-flipping':
        cost = list(np.random.binomial(2, 0.5, len(training_data)))
        total_cost = 0.3 * len(training_data)  # flip around ~30% of the labels
        attacker = LabelFlipping(learner, cost, total_cost, verbose=True)
    elif attacker_name == 'k-insertion':
        number_to_add = int(0.25 * len(training_data))
        attacker = KInsertion(learner,
                              training_data[0],
                              number_to_add=number_to_add,
                              verbose=True)
    else:  # attacker_name == 'data-modification'
        lnr = orig_learner.model.learner
        # Evaluating the decision function on the identity matrix and
        # subtracting the intercept yields one value per feature.
        eye = np.eye(training_data[0].get_feature_count(), dtype=int)
        orig_theta = lnr.decision_function(eye) - lnr.intercept_[0]
        target_theta = deepcopy(orig_theta)

        spam_instances = [
            inst for inst in training_data + testing_data
            if inst.get_label() == 1
        ]

        spam_features, ham_features = get_spam_features(spam_instances)

        # Set features to recognize spam as ham
        for index in spam_features:
            target_theta[index] = -10

        for index in ham_features:
            target_theta[index] = 0.01

        print('Features selected: ', np.array(spam_features))
        print('Number of features: ', len(spam_features))

        attacker = DataModification(orig_learner, target_theta, verbose=True)

    print(
        '###################################################################')
    print('START', attacker_name, 'attack.\n')

    attack_data = attacker.attack(training_data)

    print('\nEND', attacker_name, 'attack.')
    print(
        '###################################################################')
    print()

    # Retrain the model with poisoned data
    learning_model = svm.SVC(probability=True, kernel='linear')
    learner = SimpleLearner(learning_model, attack_data)
    learner.train()

    print(
        '###################################################################')
    print('START Iterative Retraining learner.\n')

    ###

    print('\nEND Iterative Retraining learner.')
    print(
        '###################################################################')
    print()

    ############################################################################
    # Calculate statistics with training data

    attack_pred_labels = learner.predict(
        training_data)  # predict w/ orig label

    (orig_precent_correct, attack_precent_correct,
     difference) = calculate_correct_percentages(original_pred_labels,
                                                 attack_pred_labels,
                                                 training_data)

    print(
        '###################################################################')
    print('Predictions with training dataset:')
    print('Original correct percentage: ', orig_precent_correct, '%')
    print('Attack correct percentage: ', attack_precent_correct, '%')
    print('Difference: ', difference, '%')

    ############################################################################
    # Calculate statistics with the testing data (the rest of the dataset)

    original_pred_labels = orig_learner.predict(testing_data)
    attack_pred_labels = learner.predict(testing_data)

    (orig_precent_correct, attack_precent_correct,
     difference) = calculate_correct_percentages(original_pred_labels,
                                                 attack_pred_labels,
                                                 testing_data)

    print(
        '###################################################################')
    print('Predictions with other half of dataset:')
    print('Original correct percentage: ', orig_precent_correct, '%')
    print('Attack correct percentage: ', attack_precent_correct, '%')
    print('Difference: ', difference, '%')

    ############################################################################
    # Calculate statistics with trim learner

    data = training_data + testing_data
    # NOTE(review): trim_learner is not defined anywhere in this function.
    trim_pred_labels = trim_learner.predict(data)
    normal_pred_labels = learner.predict(data)

    (trim_percent_correct, normal_percent_correct,
     difference) = calculate_correct_percentages(trim_pred_labels,
                                                 normal_pred_labels, data)

    print(
        '###################################################################')
    print('Predictions using TRIM learner:')
    print('TRIM learner percentage: ', trim_percent_correct, '%')
    print('Simple learner correct percentage: ', normal_percent_correct, '%')
    print('Difference: ', difference, '%')

    end = time.time()
    # Elapsed time is end - begin; the reversed operands printed a negative
    # duration.
    print('\nTotal time: ', round(end - begin, 2), 's', '\n', sep='')

    print('\nEND TRIM learner test.')
    print(
        '###################################################################')
    print()

예제 #9

0

파일 보기

from adversaries.adversarial_learning import AdversarialLearning
from data_reader.dataset import EmailDataset
from data_reader.operations import load_dataset
from learners.simple_learner import SimpleLearner
from sklearn.linear_model import LinearRegression

# Script section: train a SimpleLearner wrapping LinearRegression, run the
# AdversarialLearning attack on the test instances and print the predictions
# for the original and the attacked instances.
dataset = EmailDataset(path='./data_reader/data/raw/trec05p-1/test-400',
                       binary=False,
                       raw=True)
# Split with a 70/30 train/test spec and pass each part through load_dataset.
training_, testing_ = dataset.split({'train': 70, 'test': 30})
training_data = load_dataset(training_)
testing_data = load_dataset(testing_)

# Set up the learner and train it on the training instances.
learner_model = LinearRegression()
basic_learner = SimpleLearner(model=learner_model,
                              training_instances=training_data)
basic_learner.train()

# Configure the attacker with the trained learner and the training data,
# then attack the test instances.
attacker = AdversarialLearning(threshold=10, learner=basic_learner)
attacker.set_adversarial_params(learner=basic_learner,
                                training_instances=training_data)
attacked_instances = attacker.attack(testing_data)

# predictions1: original test instances; predictions2: attacked instances.
predictions1 = basic_learner.predict(testing_data)
predictions2 = basic_learner.predict(attacked_instances)

print(predictions1, predictions2)

예제 #10

0

파일 보기

파일: data_modification_test.py 프로젝트: xyvivian/adlib

def test_data_modification():
    """Run the data-modification attack against a linear SVM and report it.

    A linear SVM is trained on the 50% training split, a target theta is
    derived from the spam instances, the training data is attacked with
    ``DataModification`` and a second SVM is trained on the attacked data.
    Prediction statistics for both splits, the number of spam instances
    classified as ham afterwards, and the total elapsed time are printed.
    """
    print()
    print(
        '###################################################################')
    print('START data modification attack.\n')

    begin = time.time()

    # Data processing unit
    # The path is an index of 400 testing samples(raw email data).
    dataset = EmailDataset(path='./data_reader/data/raw/trec05p-1/test-400',
                           binary=False,
                           raw=True)

    training_data, predict_data = dataset.split({'train': 50, 'test': 50})
    training_data = load_dataset(training_data)
    predict_data = load_dataset(predict_data)

    print('Training sample size: ', len(training_data), '/400\n', sep='')

    # Setting the default learner
    # Test simple learner svm
    orig_learning_model = svm.SVC(probability=True, kernel='linear')
    orig_learner = SimpleLearner(orig_learning_model, training_data)
    orig_learner.train()

    ############################################################################
    # Calculate target theta, 1 -> spam, -1 -> ham; For the target theta
    # calculation, I am assuming I know which spam I want to be classified
    # as ham and the features I want to have a disproportionate effect on the
    # decision function calculation. For example, if feature #32 is something
    # that all of my spam has in common, I want to make the entry corresponding
    # to #32 (index 32 - 1 = 31) in target_theta to be disproportionately
    # negative so that when my spam is being classified, the 1 indicating that
    # feature #32 is present will be multiplied by a large negative number so as
    # to decrease the value of the decision function and hopefully make it
    # negative so as to classify my spam as ham.

    lnr = orig_learner.model.learner
    # Evaluating the decision function on the identity matrix and subtracting
    # the intercept yields one value per feature.
    eye = np.eye(training_data[0].get_feature_count(), dtype=int)
    orig_theta = lnr.decision_function(eye) - lnr.intercept_[0]
    target_theta = deepcopy(orig_theta)

    spam_instances = [
        inst for inst in training_data + predict_data
        if inst.get_label() == 1
    ]

    spam_features, ham_features = get_spam_features(spam_instances)

    # Set features to recognize spam as ham
    for index in spam_features:
        target_theta[index] = -10

    for index in ham_features:
        target_theta[index] = 0.01

    print('Features selected: ', np.array(spam_features))
    print('Number of features: ', len(spam_features))

    ############################################################################

    # Get original predictions
    original_pred_labels = orig_learner.predict(training_data)

    # Do the attack
    attacker = DataModification(orig_learner, target_theta, verbose=True)
    attack_data = attacker.attack(training_data)

    # Retrain the model with poisoned data
    learning_model = svm.SVC(probability=True, kernel='linear')
    learner = SimpleLearner(learning_model, attack_data)
    learner.train()

    ############################################################################
    # Calculate statistics with training data

    attack_pred_labels = learner.predict(
        training_data)  # predict w/ orig label

    (orig_precent_correct, attack_precent_correct,
     difference) = calculate_correct_percentages(original_pred_labels,
                                                 attack_pred_labels,
                                                 training_data)

    print(
        '###################################################################')
    print('Predictions with training dataset:')
    print('Original correct percentage: ', orig_precent_correct, '%')
    print('Attack correct percentage: ', attack_precent_correct, '%')
    print('Difference: ', difference, '%')

    ############################################################################
    # Calculate statistics with predict data (other half of dataset)

    original_pred_labels = orig_learner.predict(predict_data)
    attack_pred_labels = learner.predict(predict_data)

    (orig_precent_correct, attack_precent_correct,
     difference) = calculate_correct_percentages(original_pred_labels,
                                                 attack_pred_labels,
                                                 predict_data)

    print(
        '###################################################################')
    print('Predictions with other half of dataset:')
    print('Original correct percentage: ', orig_precent_correct, '%')
    print('Attack correct percentage: ', attack_precent_correct, '%')
    print('Difference: ', difference, '%')

    ############################################################################
    # Count the spam instances that the retrained learner labels as ham (-1)

    spam_pred_labels = learner.predict(spam_instances)
    spam_ham_count = sum(1 for label in spam_pred_labels if label == -1)
    print(
        '###################################################################')
    print('Number of spam instances in original training set that were \n',
          'classified as ham after the attack: ',
          spam_ham_count,
          '/',
          len(spam_instances),
          sep='')

    end = time.time()
    # Elapsed time is end - begin; the reversed operands printed a negative
    # duration.
    print('\nTotal time: ', round(end - begin, 2), 's', '\n', sep='')

    print('\nEND data modification attack.')
    print(
        '###################################################################')
    print()

예제 #11

0

파일 보기

    prec = metrics.precision_score(y_true, y_pred)
    rec = metrics.recall_score(y_true, y_pred)
    return "accuracy: {0} \n precision: {1} \n recall: {2}\n".format(
        acc, prec, rec)


def get_evasion_set(x_test, y_pred):
    """Select the malicious instances that were predicted as malicious.

    :param x_test: iterable of instances exposing a ``label`` attribute
    :param y_pred: predicted labels, aligned element-wise with ``x_test``
    :return: tuple ``(instances, labels)`` where ``instances`` are the
             elements of ``x_test`` whose true label and predicted label are
             both 1, and ``labels`` are the true labels of those instances
    """
    ls = [x for x, y in zip(x_test, y_pred) if x.label == 1 and y == 1]
    # The placeholder must actually be filled in; without .format() the
    # literal text "{0}" was printed instead of the count.
    print("{0} malicious instances are being detected initially".format(
        len(ls)))
    return ls, [x.label for x in ls]


# Script section: train an L_infSVM learner on the test-400 e-mail data and
# print a summary of its predictions on the test instances.
dataset = EmailDataset(path='./data_reader/data/raw/trec05p-1/test-400',
                       binary=False,
                       raw=True)
# Split with an 86/14 train/test spec and pass each part through load_dataset.
training_, testing_ = dataset.split({'train': 86, 'test': 14})
training_data = load_dataset(training_)
testing_data = load_dataset(testing_)
# True labels of the test instances, compared against predictions below.
test_true_label = [x.label for x in testing_data]

# NOTE(review): the message says "training data shape" but the value printed
# is the shape of the whole dataset object -- confirm which is intended.
print("training data shape:" + str(dataset.shape))
print("num of training data:" + str(len(training_data)))
# Train the L_infSVM learner (second constructor argument is 0; its meaning
# is defined by L_infSVM, which is not shown here).
learner1 = L_infSVM(training_data, 0)
learner1.train()

predictions = learner1.predict(testing_data)
print(summary(predictions, test_true_label))

예제 #12

0

파일 보기

def test_k_insertion():
    """
    Run the k-insertion attack against a linear SVM and print statistics.

    Use as follows:
    python3 adlib/tests/adversaries/k_insertion_test.py NUMBER-TO-ADD

    NUMBER-TO-ADD is optional; when it is missing or not an integer, 25% of
    the training sample size is used instead.
    """

    print()
    print('###################################################################')
    print('START k-insertion attack.\n')

    begin = time.time()

    # Data processing unit
    # The path is an index of 400 testing samples(raw email data).
    dataset = EmailDataset(path='./data_reader/data/raw/trec05p-1/test-400',
                           binary=False, raw=True)
    training_data, predict_data = dataset.split({'train': 20, 'test': 80})
    training_data = load_dataset(training_data)
    predict_data = load_dataset(predict_data)

    print('Training sample size: ', len(training_data), '/400\n', sep='')

    # The documented usage passes exactly one argument, i.e. sys.argv[1].
    # The previous ``len(sys.argv) > 2`` check ignored it.  A missing or
    # non-integer argument (e.g. a path passed by a test runner) falls back
    # to the default.
    try:
        number_to_add = int(sys.argv[1])
    except (IndexError, ValueError):
        number_to_add = int(0.25 * len(training_data))

    # Setting the default learner
    # Test simple learner svm
    learning_model = svm.SVC(probability=True, kernel='linear')
    learner = SimpleLearner(learning_model, training_data)
    learner.train()

    # Keep the pre-attack predictions and an untouched copy of the learner
    # for the comparisons printed later.
    original_pred_labels = learner.predict(training_data)
    before_attack_label = original_pred_labels[0]
    orig_learner = deepcopy(learner)

    # Do the attack; the first training instance is the selected instance
    attacker = KInsertion(learner,
                          training_data[0],
                          number_to_add=number_to_add,
                          verbose=True)

    attack_data = attacker.attack(training_data)

    # Retrain the model with poisoned data
    learning_model = svm.SVC(probability=True, kernel='linear')
    learner = SimpleLearner(learning_model, attack_data)
    learner.train()

    print('Number of added instances: ', len(attack_data) - len(training_data))

    ############################################################################
    # Calculate statistics with training data

    attack_pred_labels = learner.predict(training_data)  # predict w/ orig label
    after_attack_label = attack_pred_labels[0]

    (orig_precent_correct,
     attack_precent_correct,
     difference) = calculate_correct_percentages(original_pred_labels,
                                                 attack_pred_labels,
                                                 training_data)

    print('###################################################################')
    print('Predictions with training dataset:')
    print('Original correct percentage: ', orig_precent_correct, '%')
    print('Attack correct percentage: ', attack_precent_correct, '%')
    print('Difference: ', difference, '%')

    ############################################################################
    # Calculate statistics with predict data (other half of dataset)

    original_pred_labels = orig_learner.predict(predict_data)
    attack_pred_labels = learner.predict(predict_data)

    (orig_precent_correct,
     attack_precent_correct,
     difference) = calculate_correct_percentages(original_pred_labels,
                                                 attack_pred_labels,
                                                 predict_data)

    print('###################################################################')
    print('Predictions with other half of dataset:')
    print('Original correct percentage: ', orig_precent_correct, '%')
    print('Attack correct percentage: ', attack_precent_correct, '%')
    print('Difference: ', difference, '%')

    ############################################################################
    # Report the selected instance's label before and after the attack

    print('###################################################################')
    print('Selected instance true label: ', training_data[0].get_label())
    print('Selected instance predicted label BEFORE attack: ',
          before_attack_label)
    print('Selected instance predicted label AFTER attack: ',
          after_attack_label)

    ############################################################################
    # Output loss calculations

    print('###################################################################')
    print('poison_instance loss before attack: ',
          round(attacker.poison_loss_before, 4))
    print('poison_instance loss after attack: ',
          round(attacker.poison_loss_after, 4))
    print('poison_instance loss difference: ',
          round(attacker.poison_loss_after - attacker.poison_loss_before, 4))

    end = time.time()
    print('\nTotal time: ', round(end - begin, 2), 's', '\n', sep='')

    print('\nEND k-insertion attack.')
    print('###################################################################')
    print()

예제 #13

0

파일 보기

파일: label_flipping_test.py 프로젝트: vishalbelsare/adlib

def test_label_flipping():
    """Run the label-flipping attack against a linear SVM and report it.

    A linear SVM is trained on the 25% training split, the training data is
    attacked with ``LabelFlipping`` and a second SVM is trained on the
    attacked data.  The flip vector, prediction statistics for both splits
    and the total elapsed time are printed.
    """
    banner = '###################################################################'

    print()
    print(banner)
    print('START label flipping attack.\n')

    start_time = time.time()

    # Data processing unit
    # The path is an index of 400 testing samples(raw email data).
    dataset = EmailDataset(path='./data_reader/data/raw/trec05p-1/test-400',
                           raw=True,
                           binary=False)

    train_part, predict_part = dataset.split({'train': 25, 'test': 75})
    training_data = load_dataset(train_part)
    predict_data = load_dataset(predict_part)

    print('Training sample size: ', len(training_data), '/400\n', sep='')

    # Baseline learner: a linear SVM wrapped in SimpleLearner
    clean_model = svm.SVC(probability=True, kernel='linear')
    learner = SimpleLearner(clean_model, training_data)
    learner.train()
    orig_learner = deepcopy(learner)

    # Execute the attack
    sample_count = len(training_data)
    cost = list(np.random.binomial(2, 0.5, sample_count))
    total_cost = 0.3 * len(training_data)  # flip around ~30% of the labels
    attacker = LabelFlipping(learner, cost, total_cost, verbose=True)
    attack_data = attacker.attack(training_data)

    # 0 -> flipped, 1 -> not flipped
    flip_vector = [
        0 if attack_data[i].get_label() != training_data[i].get_label() else 1
        for i in range(len(attack_data))
    ]

    print('Flip vector with 0 -> flipped and 1 -> not flipped: \n',
          np.array(flip_vector), '\n')

    original_pred_labels = learner.predict(training_data)

    # Retrain the model with poisoned data
    poisoned_model = svm.SVC(probability=True, kernel='linear')
    learner = SimpleLearner(poisoned_model, attack_data)
    learner.train()

    ############################################################################
    # Statistics on the training data

    attack_pred_labels = learner.predict(training_data)  # predict w/ orig label

    training_stats = calculate_correct_percentages(original_pred_labels,
                                                   attack_pred_labels,
                                                   training_data)
    (orig_precent_correct, attack_precent_correct, difference) = training_stats

    print(banner)
    print('Predictions with training dataset:')
    print('Original correct percentage: ', orig_precent_correct, '%')
    print('Attack correct percentage: ', attack_precent_correct, '%')
    print('Difference: ', difference, '%')

    ############################################################################
    # Statistics on the predict data (the rest of the dataset)

    original_pred_labels = orig_learner.predict(predict_data)
    attack_pred_labels = learner.predict(predict_data)

    predict_stats = calculate_correct_percentages(original_pred_labels,
                                                  attack_pred_labels,
                                                  predict_data)
    (orig_precent_correct, attack_precent_correct, difference) = predict_stats

    print(banner)
    print('Predictions with other half of dataset:')
    print('Original correct percentage: ', orig_precent_correct, '%')
    print('Attack correct percentage: ', attack_precent_correct, '%')
    print('Difference: ', difference, '%')

    elapsed = time.time() - start_time
    print('\nTotal time: ', round(elapsed, 2), 's', '\n', sep='')

    print('\nEND label flipping attack.')
    print(banner)
    print()