def main():
    """Run NUM_TESTS train/predict/evaluate rounds on the eviction dataset,
    print the average of each metric, and write a one-line CSV summary
    ('nn', loss, accuracy, precision, recall) to FILENAME.
    """
    dataset = data.load_eviction(dataset_type=DATASET_TYPE)
    losses = []
    accs = []
    precs = []
    recs = []
    # 'w' instead of 'w+': the file is only written here, never read back.
    with open(FILENAME, 'w') as f:
        for i in range(NUM_TESTS):
            print('*' * 80)
            print('Round %s' % i)
            # Re-split each round so every test sees a fresh random
            # train/test partition.
            dataset.split()
            train(dataset)
            predictions = predict(dataset)
            loss, acc, prec, rec = evaluate(dataset.y_test, predictions)
            losses.append(loss)
            accs.append(acc)
            precs.append(prec)
            recs.append(rec)
        # np.mean accepts plain lists; no need for the np.array(...).mean()
        # round-trip the original used.
        loss_avg = np.mean(losses)
        accs_avg = np.mean(accs)
        precs_avg = np.mean(precs)
        recs_avg = np.mean(recs)
        print('Loss average: %s' % loss_avg)
        print('Accuracy average: %s' % accs_avg)
        print('Precision average: %s' % precs_avg)
        print('Recall average: %s' % recs_avg)
        output = '%s, %s, %s, %s, %s\n' % ('nn', loss_avg, accs_avg,
                                           precs_avg, recs_avg)
        f.write(output)
"""Preprocessing script to find the most stable features, i.e. those that
are selective across many randomized trials.
"""
import pandas as pd
from sklearn.linear_model import RandomizedLogisticRegression

import data

if __name__ == '__main__':
    dataset = data.load_eviction()
    # Maps feature codes to human-readable names (used later in the script).
    descriptions = pd.read_pickle('data/private/feature_codes_to_names.pck')
    print('Data loaded.')
    # NOTE(review): RandomizedLogisticRegression was deprecated in
    # scikit-learn 0.19 and removed in 0.21 — this script requires an
    # older scikit-learn to run.
    rlogistic = RandomizedLogisticRegression(normalize=True)
    rlogistic.fit(dataset.X_train, dataset.y_train)
    print('Model fitted.')
    # Pair each feature's stability score (rounded to 4 decimal places)
    # with its column code, sorted ascending by score.
    features = sorted(
        zip(map(lambda x: round(x, 4), rlogistic.scores_),
            dataset.X_train.columns))
    print('Number of features:\t\t%s' % len(features))
    nonzero_features = [(score, code) for score, code in features
                        if score > 0]
    print('Number of nonzero features:\t%s' % len(nonzero_features))
    # Keep only the codes of features with nonzero stability scores.
    # (Comprehension replaces the original for/append loop; the score
    # element was unused.)
    columns = [code for _, code in nonzero_features]
    print('-' * 80)
"""Predict eviction response variable.

Fits two one-class SVMs — one on evicted-only rows, one on non-evicted-only
rows — then scores the full training set with both and relabels the raw
+1/-1 SVM outputs into 1 (evicted) / 0 (not evicted).
"""
import data
from predict import utils
from sklearn.svm import OneClassSVM

if __name__ == '__main__':
    dataset_pos = data.load_pos_eviction()
    dataset_neg = data.load_neg_eviction()
    dataset_all = data.load_eviction()
    # nu: The proportion of outliers we expect in our data.
    model_pos = OneClassSVM(kernel='linear', nu=0.9)
    model_pos.fit(dataset_pos.X_train)
    model_neg = OneClassSVM(kernel='linear', nu=0.1)
    model_neg.fit(dataset_neg.X_train)
    predictions_pos = model_pos.predict(dataset_all.X_train)
    predictions_neg = model_neg.predict(dataset_all.X_train)
    # +1 is inlier, -1 is outlier. We want those who are evicted, to be +1
    # and those who are not evicted to be 0.
    # Outliers, those evicted, to be 1.
    predictions_neg = (predictions_neg == -1).astype(int)
    # Inliers, those evicted, to be 1.
    predictions_pos = (predictions_pos == 1).astype(int)
def handcrafted():
    """Run the model suite on the handcrafted-feature eviction dataset."""
    ds = data.load_eviction(dataset_type='handcrafted')
    out_path = '%s/results_HANDCRAFTED_dataset_OVER.csv' % DIR
    test_runner.predict(MODELS, ds, out_path, N_TESTS, OVERSAMPLE)
def rlogistic():
    """Run the model suite on the randomized-logistic-selected dataset."""
    ds = data.load_eviction(dataset_type='rlogistic')
    out_path = '%s/results_RLOGISTIC_dataset_OVER.csv' % DIR
    test_runner.predict(MODELS, ds, out_path, N_TESTS, OVERSAMPLE)
def pca():
    """Run the model suite on the PCA-reduced eviction dataset."""
    ds = data.load_eviction(dataset_type='pca')
    out_path = '%s/results_PCA_dataset_OVER.csv' % DIR
    test_runner.predict(MODELS, ds, out_path, N_TESTS, OVERSAMPLE)
def full():
    """Run the model suite on the full (unreduced) eviction dataset."""
    ds = data.load_eviction()
    out_path = '%s/results_FULL_dataset_OVER.csv' % DIR
    test_runner.predict(MODELS, ds, out_path, N_TESTS, OVERSAMPLE)