from utils import split_matrix
from utils import transform_binary_matrix

if __name__ == '__main__':
    # --- Load and preprocess the ratings dataset ---
    ratings = load_ratings('ratings.csv')
    user_index, movie_index = get_user_movie_dictionary(ratings)
    print(f'# of user: {len(user_index)}\t# of movie: {len(movie_index)}')

    # Binarize the ratings into an interaction matrix plus pos/neg counts.
    interaction_matrix, feedback_stats = transform_binary_matrix(
        ratings, user_index, movie_index)
    print(f'Positive Feedback: {feedback_stats["pos"]} '
          f'\tNegative Feedback: {feedback_stats["neg"]}')

    # Hold out a validation split of the interaction matrix.
    train_matrix, val_matrix = split_matrix(
        interaction_matrix, user_index, movie_index)
    print(f'Train: {train_matrix.count_nonzero()}\t '
          f'Validation Size: {val_matrix.count_nonzero()}')

    # --- Train the Item2Vec model on the training split ---
    item2vec = Item2Vector(item_dim=len(movie_index), embedding_dim=100)
    item2vec.train(train_matrix)

    # --- Extract and persist the learned item embeddings ---
    embeddings = item2vec.get_embeddings()
    print(embeddings.shape)
    np.savez('./output/embedding.npz', embeddings)
# Example 2
def learning_BernoulliGenerator():
    """Adversarial retraining experiment with a BernoulliGenerator.

    Trains a discriminator on 1-gram API-call features, then for 50 rounds:
    trains the generator, samples adversarial malware feature vectors for
    the train/test malware rows, evaluates the discriminator on the mix of
    generated malware + original benign rows before and after retraining,
    and appends every score line to a per-experiment log file.

    Takes no arguments and returns nothing; all effects are on disk
    (../model/<tag>/ and its log.txt).
    """
    # Generator/training hyper-parameters; also encoded into the run tag.
    params = {
        'layers': [160, 256, 160],
        'batch size': 128,
        'learning rate': 0.001,
        'training set fraction': 0.75,
        'max epochs': 100,
        'max epochs no improvement': 25,
        'num trials': 10,
    }
    # Human-readable experiment tag built from the hyper-parameters above.
    tag = '20161025_pre_train_API_rand_1gram_layers_%s_batch_%d_lr_%g_epo_%d_%d_trials_%d' % (
        '_'.join([str(layer) for layer in params['layers']]),
        params['batch size'],
        params['learning rate'],
        params['max epochs'],
        params['max epochs no improvement'],
        params['num trials']
    )
    dir_path = '../model/' + tag
    if not os.path.exists(dir_path):
        os.mkdir(dir_path)  # NOTE(review): fails if ../model itself is missing; os.makedirs would be safer
    params['model path'] = dir_path + '/model'
    params['log path'] = dir_path + '/log.txt'
    # %-style template filled with the dict returned by D.evaluate().
    score_template = 'TPR %(TPR)f\tFPR %(FPR)f\tAccuracy %(Accuracy)f\tAUC %(AUC)f'
    # NOTE(review): 'RandomForrest' (double r) — confirm this matches the real
    # class name; learning_MalGAN below instantiates 'RandomForest'.
    D = RandomForrest()
    G = BernoulliGenerator(D, params)
    # Each CSV row is a feature vector with a 0/1 label in the last column.
    training_data = np.loadtxt('../data/API_truncation50_random_split_trainval_1gram_feature.csv',
                               delimiter=',', dtype=np.int32)
    test_data = np.loadtxt('../data/API_truncation50_random_split_test_1gram_feature.csv',
                           delimiter=',', dtype=np.int32)
    # Baseline: train and score D on the original (non-adversarial) data.
    log_message = str(datetime.now()) + '\tTraining discriminative model on original dataset\n'
    D.train(training_data[:, :-1], training_data[:, -1])
    log_message += str(datetime.now()) + '\tTraining set result\t'
    log_message += score_template % D.evaluate(training_data[:, :-1], training_data[:, -1])
    log_message += '\n' + str(datetime.now()) + '\tTest set result\t'
    log_message += score_template % D.evaluate(test_data[:, :-1], test_data[:, -1])
    with open(params.get('log path'), 'a') as f:
        f.write(log_message + '\n')
    # Presumably splits rows by label into (benign, malware) — TODO confirm
    # against utils.split_matrix (the top-of-file usage passes extra args).
    training_data_benign, training_data_malware = split_matrix(training_data)
    test_data_benign, test_data_malware = split_matrix(test_data)
    for i in range(50):
        log_message = str(datetime.now()) + '\tTraining generative model for the %d-th time\n' % (i,)
        #G.train(training_data_malware[:, :-1])
        G.train((training_data_malware[:, :-1], training_data_benign[:, :-1]))
        log_message += str(datetime.now()) + '\tGenerating examples\n'
        # Sample adversarial versions of the malware features; num_trials_*
        # holds the per-sample attempt counts (averaged for the log below).
        generated_training_malware, num_trials_training = G.sample(training_data_malware[:, :-1])
        # Re-attach the original label column to the generated features.
        generated_training_malware = np.concatenate((generated_training_malware,
                                                     training_data_malware[:, -1:]), axis=1)
        generated_training_data = np.concatenate((generated_training_malware, training_data_benign))
        generated_test_malware, num_trials_test = G.sample(test_data_malware[:, :-1])
        generated_test_malware = np.concatenate((generated_test_malware,
                                                 test_data_malware[:, -1:]), axis=1)
        generated_test_data = np.concatenate((generated_test_malware, test_data_benign))
        log_message += str(datetime.now()) + '\tMean number of trials for training and test set: %f, %f\n' % \
                                             (num_trials_training.mean(), num_trials_test.mean())
        # Score D on the adversarial mix before retraining...
        log_message += str(datetime.now()) + '\tTraining set result before re-training\t'
        log_message += score_template % D.evaluate(generated_training_data[:, :-1], generated_training_data[:, -1])
        log_message += '\n' + str(datetime.now()) + '\tTest set result before re-training\t'
        log_message += score_template % D.evaluate(generated_test_data[:, :-1], generated_test_data[:, -1])
        log_message += '\n' + str(datetime.now()) + '\tRe-training discriminative model\n'
        # ...then retrain on it and score again to measure recovery.
        D.train(generated_training_data[:, :-1], generated_training_data[:, -1])
        log_message += str(datetime.now()) + '\tTraining set result after re-training\t'
        log_message += score_template % D.evaluate(generated_training_data[:, :-1], generated_training_data[:, -1])
        log_message += '\n' + str(datetime.now()) + '\tTest set result after re-training\t'
        log_message += score_template % D.evaluate(generated_test_data[:, :-1], generated_test_data[:, -1])
        with open(params.get('log path'), 'a') as f:
            f.write(log_message + '\n\n')
# Example 3
def learning_MalGAN(D_name='RF', data_fraction='0.1', diff_data='0'):
    """Run one MalGAN adversarial-retraining experiment.

    Trains a black-box discriminator on 1-gram API-call features, trains
    MalGAN against it, samples adversarial malware feature vectors, and
    logs the discriminator's scores before and after retraining on the
    adversarial data.

    Args:
        D_name: which discriminator to use ('RF', 'GBDT', 'LR', 'DT', 'NB',
            'SVM', 'MLP', 'KNN'); any other value falls back to VOTE().
        data_fraction: kept for interface compatibility; unused in the body.
        diff_data: '0' -> D is trained on the full training set; any other
            value -> D is trained on the first half only and MalGAN sees
            just the second half (disjoint data for D and G).

    Side effects: creates ../result/<tag>/, snapshots the current code
    directory into <tag>/code, and appends results to <tag>/log.txt.
    Returns nothing.
    """
    params = {
        'G layers': [160, 256, 160],
        'noise dim': 10,
        'malware batch size': 128,
        'L2': 0.0,
        'learning rate': 0.001,
        'training set fraction': 0.75,
        'D layers': [160, 256, 1],
        'batch size': 256,
        'regularization D layers': [160, 256, 1],
        'regu coef': 0.0,
        'max epochs': 200,
        'max epochs no improvement': 25,
        'num trials': 1
    }
    # Fixed run tag (an older auto-generated tag built from the params was
    # removed as dead code).
    tag = '20171026_WGAN'
    dir_path = '../result/' + tag
    if not os.path.exists(dir_path):
        os.mkdir(dir_path)
    # Snapshot the code that produced this run, replacing any stale copy.
    if os.path.exists(os.path.join(dir_path, 'code')):
        shutil.rmtree(os.path.join(dir_path, 'code'))
    shutil.copytree('./', os.path.join(dir_path, 'code'))

    params['model path'] = dir_path + '/model'
    params['log path'] = dir_path + '/log.txt'
    # %-style template filled with the dict returned by D.evaluate().
    score_template = 'TPR %(TPR)f\tFPR %(FPR)f\tAccuracy %(Accuracy)f\tAUC %(AUC)f'
    # Dispatch table instead of an if/elif chain.  The original compared the
    # strings with 'is', which only succeeded when CPython happened to intern
    # both literals; '==' semantics (dict lookup) is what was intended.
    discriminators = {
        'RF': RandomForest,
        'GBDT': GBDT,
        'LR': LR,
        'DT': DT,
        'NB': NB,
        'SVM': SVM,
        'MLP': MLP,
        'KNN': KNN,
    }
    D = discriminators.get(D_name, VOTE)()

    G = MalGAN(D, params)
    # Each CSV row is a feature vector with a 0/1 label in the last column.
    training_data = np.loadtxt('../data/API_truncation50_random_split_trainval_1gram_feature.csv',
                               delimiter=',', dtype=np.int32)
    test_data = np.loadtxt('../data/API_truncation50_random_split_test_1gram_feature.csv',
                           delimiter=',', dtype=np.int32)
    log_message = str(datetime.now()) + '\tnow using ' + D_name + ' as Discrimibator\n'
    log_message += str(datetime.now()) + '\tTraining discriminative model on original dataset\n'
    if diff_data == '0':  # '==', not 'is': compare string values, not identity
        D.train(training_data[:, :-1], training_data[:, -1])
    else:
        # Train D on the first half and reserve the second half for MalGAN.
        # '//' keeps the index an int — the original '/' yields a float under
        # Python 3 and raises TypeError when used as a slice bound.
        half = len(training_data) // 2
        D.train(training_data[:half, :-1], training_data[:half, -1])
        training_data = training_data[half:, :]
    log_message += str(datetime.now()) + '\tTraining set result\t'
    log_message += score_template % D.evaluate(training_data[:, :-1], training_data[:, -1])
    log_message += '\n' + str(datetime.now()) + '\tTest set result\t'
    log_message += score_template % D.evaluate(test_data[:, :-1], test_data[:, -1])
    with open(params.get('log path'), 'a') as f:
        f.write(log_message + '\n')
    # Presumably splits rows by label into (benign, malware) — TODO confirm
    # against utils.split_matrix.
    training_data_benign, training_data_malware = split_matrix(training_data)
    test_data_benign, test_data_malware = split_matrix(test_data)
    for i in range(1):  # single round; kept as a loop to match the log format
        log_message = str(datetime.now()) + '\tTraining generative model for the %d-th time\n' % (i,)
        G.train((training_data_malware[:, :-1], training_data_benign[:, :-1]))
        log_message += str(datetime.now()) + '\tGenerating examples\n'
        # Sample adversarial malware features; num_trials_* holds per-sample
        # attempt counts (averaged for the log below).
        generated_training_malware, num_trials_training = G.sample(training_data_malware[:, :-1])
        # Re-attach the original label column to the generated features.
        generated_training_malware = np.concatenate((generated_training_malware,
                                                     training_data_malware[:, -1:]), axis=1)
        generated_training_data = np.concatenate((generated_training_malware, training_data_benign))
        generated_test_malware, num_trials_test = G.sample(test_data_malware[:, :-1])
        generated_test_malware = np.concatenate((generated_test_malware,
                                                 test_data_malware[:, -1:]), axis=1)
        generated_test_data = np.concatenate((generated_test_malware, test_data_benign))
        log_message += str(datetime.now()) + '\tMean number of trials for training and test set: %f, %f\n' % \
                                             (num_trials_training.mean(), num_trials_test.mean())
        # Score D on the adversarial mix before retraining...
        log_message += str(datetime.now()) + '\tTraining set result before re-training\t'
        log_message += score_template % D.evaluate(generated_training_data[:, :-1], generated_training_data[:, -1])
        log_message += '\n' + str(datetime.now()) + '\tTest set result before re-training\t'
        log_message += score_template % D.evaluate(generated_test_data[:, :-1], generated_test_data[:, -1])
        log_message += '\n' + str(datetime.now()) + '\tRe-training discriminative model\n'
        # ...then retrain on it and score again to measure recovery.
        D.train(generated_training_data[:, :-1], generated_training_data[:, -1])
        log_message += str(datetime.now()) + '\tTraining set result after re-training\t'
        log_message += score_template % D.evaluate(generated_training_data[:, :-1], generated_training_data[:, -1])
        log_message += '\n' + str(datetime.now()) + '\tTest set result after re-training\t'
        log_message += score_template % D.evaluate(generated_test_data[:, :-1], generated_test_data[:, -1])
        with open(params.get('log path'), 'a') as f:
            f.write(log_message + '\n\n')
# Example 4
def genearting_adversarial_examples():
    """Adversarial-example experiment on the drebin dataset.

    Trains a discriminator on drebin features, trains an AdversarialExamples
    generator against it, samples adversarial malware feature vectors, and
    logs the discriminator's scores before and after retraining on the
    adversarial data.

    Takes no arguments and returns nothing; all effects are on disk
    (../model/<tag>/ and its log.txt).

    NOTE(review): function name is misspelled ('genearting'); renaming
    would break existing callers, so it is only flagged here.
    """
    # Discriminator/training hyper-parameters; also encoded into the run tag.
    params = {
        'learning rate': 0.001,
        'training set fraction': 0.75,
        'D layers': [44942, 200, 200, 1],
        'batch size': 256,
        'max epochs': 1000,
        'max epochs no improvement': 10,
        'num trials': 1000,
    }
    # Human-readable experiment tag built from the hyper-parameters above.
    tag = '20170909_AdvExam_drebin_D_layers_%s_batch_%d_lr_%g_epoch_%d_%d_trials_%d' % (
        '_'.join([str(layer) for layer in params['D layers']]),
        params['batch size'],
        params['learning rate'],
        params['max epochs'],
        params['max epochs no improvement'],
        params['num trials']
    )
    dir_path = '../model/' + tag
    if not os.path.exists(dir_path):
        os.mkdir(dir_path)  # NOTE(review): fails if ../model itself is missing
    # Snapshot the code that produced this run, replacing any stale copy.
    if os.path.exists(os.path.join(dir_path, 'code')):
        shutil.rmtree(os.path.join(dir_path, 'code'))
    shutil.copytree('.', os.path.join(dir_path, 'code'))

    params['model path'] = dir_path + '/model'
    params['log path'] = dir_path + '/log.txt'
    # %-style template filled with the dict returned by D.evaluate().
    score_template = 'TPR %(TPR)f\tFPR %(FPR)f\tAccuracy %(Accuracy)f\tAUC %(AUC)f'
    # NOTE(review): 'RandomForrest' (double r) — confirm this matches the real
    # class name; learning_MalGAN above instantiates 'RandomForest'.
    D = RandomForrest()
    G = AdversarialExamples(D, params)
    # Each CSV row is a feature vector with a 0/1 label in the last column.
    training_data = np.loadtxt('../data/drebin/drebin_train_0.5.csv',
                               delimiter=',', dtype=np.int32)
    test_data = np.loadtxt('../data/drebin/drebin_test_0.5.csv',
                           delimiter=',', dtype=np.int32)
    # Baseline: train and score D on the original (non-adversarial) data.
    log_message = str(datetime.now()) + '\tTraining discriminative model on original dataset\n'
    D.train(training_data[:, :-1], training_data[:, -1])
    log_message += str(datetime.now()) + '\tTraining set result\t'
    log_message += score_template % D.evaluate(training_data[:, :-1], training_data[:, -1])
    log_message += '\n' + str(datetime.now()) + '\tTest set result\t'
    log_message += score_template % D.evaluate(test_data[:, :-1], test_data[:, -1])
    with open(params.get('log path'), 'a') as f:
        f.write(log_message + '\n')
    # Presumably splits rows by label into (benign, malware) — TODO confirm
    # against utils.split_matrix.
    training_data_benign, training_data_malware = split_matrix(training_data)
    test_data_benign, test_data_malware = split_matrix(test_data)
    for i in range(1):
        log_message = str(datetime.now()) + '\tTraining generative model for the %d-th time\n' % (i,)
        G.train((training_data_malware[:, :-1], training_data_benign[:, :-1]))
        log_message += str(datetime.now()) + '\tGenerating examples\n'
        # Sample adversarial malware features; num_trials_* holds per-sample
        # attempt counts (averaged for the log below).
        generated_training_malware, num_trials_training = G.sample(training_data_malware[:, :-1])
        # Re-attach the original label column to the generated features.
        generated_training_malware = np.concatenate((generated_training_malware,
                                                     training_data_malware[:, -1:]), axis=1)
        generated_training_data = np.concatenate((generated_training_malware, training_data_benign))
        generated_test_malware, num_trials_test = G.sample(test_data_malware[:, :-1])
        generated_test_malware = np.concatenate((generated_test_malware,
                                                 test_data_malware[:, -1:]), axis=1)
        generated_test_data = np.concatenate((generated_test_malware, test_data_benign))
        log_message += str(datetime.now()) + '\tMean number of trials for training and test set: %f, %f\n' % \
                                             (num_trials_training.mean(), num_trials_test.mean())
        # Score D on the adversarial mix before retraining...
        log_message += str(datetime.now()) + '\tTraining set result before re-training\t'
        log_message += score_template % D.evaluate(generated_training_data[:, :-1], generated_training_data[:, -1])
        log_message += '\n' + str(datetime.now()) + '\tTest set result before re-training\t'
        log_message += score_template % D.evaluate(generated_test_data[:, :-1], generated_test_data[:, -1])
        log_message += '\n' + str(datetime.now()) + '\tRe-training discriminative model\n'
        # ...then retrain on it and score again to measure recovery.
        D.train(generated_training_data[:, :-1], generated_training_data[:, -1])
        log_message += str(datetime.now()) + '\tTraining set result after re-training\t'
        log_message += score_template % D.evaluate(generated_training_data[:, :-1], generated_training_data[:, -1])
        log_message += '\n' + str(datetime.now()) + '\tTest set result after re-training\t'
        log_message += score_template % D.evaluate(generated_test_data[:, :-1], generated_test_data[:, -1])
        with open(params.get('log path'), 'a') as f:
            f.write(log_message + '\n\n')