def file_stats():
    mls = ['dt', 'rf']  # , 'svm', 'nb']
    for window in windows:
        writer = pytablewriter.MarkdownTableWriter()
        writer.table_name = 'File Accuracy for {}s'.format(window)
        writer.header_list = ['File', 'Decision Tree', 'Random Forest',
                              'Tensorflow']
        value_matrix = []
        for name in binet_files:
            values = [name]
            feature, label = get_feature_labels(
                get_saved_data(window, name, v2=True), v2=True)
            # feature = mask_features(feature)
            feat_train, feat_test, label_train, label_test = train_test_split(
                feature, label, test_size=0.3, random_state=42)
            for ml in mls:
                r = train_and_test_with(feat_train, label_train, ml,
                                        feat_test, label_test)
                values.append('{0:.4f}, {1:.4f}, {2:.4f}'.format(
                    r['accuracy'], r['precision'], r['recall']))
            print(values)
            correctness, precision, recall = keras_train_and_test(
                feat_train, label_train, feat_test, label_test, dimension=22)
            values.append('{0:.4f}, {1:.4f}, {2:.4f}'.format(
                correctness, precision, recall))
            print(values)
            value_matrix.append(values)
        writer.value_matrix = value_matrix
        writer.write_table()
def get_balance():
    for binet in binet_files:
        summary = get_saved_data(0.15, binet)
        _, label = get_feature_labels(summary)
        attacks = sum(label)
        nonattacks = len(label) - attacks
        print("{} | {}".format(attacks, nonattacks))
def shuffle_data_test():
    binet = binet_files[-1]
    feature, label = get_feature_labels(get_saved_data(0.15, binet))
    scores = []
    precs = []
    rec = []
    # do normal scoring
    # TODO: do same analysis with dt and rf
    acc, p, r = keras_train_and_test(feature, label)
    scores.append(acc)
    precs.append(p)
    rec.append(r)
    mstd = list(get_mean_std(feature))
    for i in range(1, 5):
        # pick i*10% of the rows at random and overwrite every feature value
        # in those rows with noise drawn from that column's (mean, std)
        indices = [random.randrange(len(feature))
                   for _ in range(int(len(feature) * ((i * 10) / 100)))]
        f = feature[:]
        for index in indices:
            f[index] = [np.random.normal(*mstd[j])
                        for j in range(len(f[index]))]
        acc, p, r = keras_train_and_test(f, label)
        scores.append(acc)
        precs.append(p)
        rec.append(r)
    # x-axis index i corresponds to i*10% of the samples randomized
    plt.figure()
    plt.plot(scores, color='lightblue', label='Accuracy')
    plt.plot(precs, color='red', label='Precision')
    plt.plot(rec, color='green', label='Recall')
    plt.ylabel("Score")
    plt.xlabel("% of samples randomized")
    plt.title("Scores with increasing randomization")
    plt.legend(loc='best')
    plt.show()
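# shuffle_data_test() above unpacks each entry of get_mean_std(feature) into
# np.random.normal(mean, std), so the helper is assumed to yield one
# (mean, std) pair per feature column. A minimal sketch under that assumption
# (hypothetical; the project's own get_mean_std should be preferred):
def _get_mean_std_sketch(feature):
    arr = np.asarray(feature, dtype=float)
    for column in arr.T:
        yield column.mean(), column.std()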
def feature_plotting():
    feature, label = get_feature_labels(get_saved_data(0.15, binet_files[12]))
    plt.figure()
    # plot feature column 9 against sample index; attack samples in green
    zeroes = set(zip(range(len(feature)), feature[:, 9]))
    ones = set(z for z in zeroes if label[z[0]] == 1)
    del label
    del feature
    zeroes = zeroes.difference(ones)
    plt.scatter(*zip(*zeroes), s=1, c='gray')
    del zeroes
    plt.scatter(*zip(*ones), s=10, c='lightgreen')
    plt.show()
def stats_on_best():
    best = [8, 9, 12]
    summaries = []
    for b in best:
        summaries += get_saved_data(0.15, binet_files[b])
    feature, label = get_feature_labels(summaries)
    scores = []
    for i in range(1, 5):
        # replace every feature value with uniform random noise in [-10*i, 10*i)
        feature = [[random.randrange(-(i * 10), i * 10) for f in feat]
                   for feat in feature]
        acc, _, _ = keras_train_and_test(feature, label)
        scores.append(acc)
    print(scores)
def kfold_test():
    mls = ['dt', 'rf']
    for window in windows:
        writer = pytablewriter.MarkdownTableWriter()
        writer.table_name = 'KFold validation'
        writer.header_list = ['File', 'Decision Tree', 'Random Forest',
                              'Tensorflow']
        value_matrix = []
        for name in binet_files:
            values = [name]
            feature, label = get_feature_labels(get_saved_data(window, name))
            feature = feature[:int(len(feature) * 10)]
            label = label[:int(len(label) * 10)]
            kf = KFold(n_splits=10)
            # feature = mask_features(feature)
            for ml in mls:
                scores = []
                pr_scores = []
                for train, test in kf.split(feature):
                    clf = get_classifier(ml)
                    xtrain, ytrain = feature[train], label[train]
                    xtest, ytest = feature[test], label[test]
                    clf.fit(xtrain, ytrain)
                    test_predicts = clf.predict(xtest)
                    test_score = accuracy_score(ytest, test_predicts)
                    scores.append(test_score)
                    proba = clf.predict_proba(xtest)
                    precision, recall, pr_thresholds = precision_recall_curve(
                        ytest, proba[:, 1])
                    pr_scores.append(auc(recall, precision))
                values.append('{0:.4f}, {1:.4f}, {2:.4f}, {3:.4f}'.format(
                    np.mean(scores), np.std(scores),
                    np.mean(pr_scores), np.std(pr_scores)))
            kf = KFold(n_splits=10)
            accuracy = []  # , precision, recall = [], [], []
            for train_index, test_index in kf.split(feature):
                x_train, x_test = feature[train_index], feature[test_index]
                y_train, y_test = label[train_index], label[test_index]
                c, p, r = keras_train_and_test(x_train, y_train,
                                               x_test, y_test, dimension=12)
                accuracy.append(c)
                # precision.append(p)
                # recall.append(r)
            values.append('{0:.4f}, {1:.4f}'.format(np.mean(accuracy),
                                                    np.std(accuracy)))
            value_matrix.append(values)
        writer.value_matrix = value_matrix
        writer.write_table()
def window_shift(window):
    writer = pytablewriter.MarkdownTableWriter()
    writer.table_name = 'Window Shift Accuracy for {}s'.format(window)
    writer.header_list = ['File', 'Decision Tree', 'Random Forest',
                          'Tensorflow']
    value_matrix = []
    for file_name in binet_files:
        values = []
        feature, label = get_feature_labels(
            get_saved_data(window, file_name))
        feature = mask_features(feature)
        values += [
            file_name,
            '{0:.4f}'.format(train_and_test_step(feature, label, 'dt', 1000)),
            '{0:.4f}'.format(train_and_test_step(feature, label, 'rf', 1000))]
        values.append(
            '{0:.4f}'.format(train_and_test_step(feature, label, 'tf', 1000)))
        value_matrix.append(values)
    writer.value_matrix = value_matrix
    writer.write_table()
def run_analysis_with(interval, file_name, start_time=None, use_pickle=True):
    if start_time is None:
        start_time = get_start_time_for(file_name)
    start = datetime.strptime(start_time, TIME_FORMAT)
    file_num = get_file_num(file_name)
    directory = 'runs_of_%ss/' % interval
    if not os.path.exists(directory):
        os.makedirs(directory)
    mls = ['dt', 'rf']
    print('starting %s %s' % (interval, file_name))
    if use_pickle:
        print('loading pickle')
        summaries = get_saved_data(interval, file_name)
        if summaries is None:
            print('failed to load pickle. Aggregating data')
            summaries = aggregate_file(interval, file_name, start)
            print('finished aggregating, pickling data...')
            pickle_summarized_data(interval, start_time, file_name, summaries)
            print('data pickled')
        else:
            print('loaded pickle')
    else:
        print('aggregating data')
        summaries = aggregate_file(interval, file_name, start)
        print('finished aggregating, pickling data...')
        pickle_summarized_data(interval, start_time, file_name, summaries)
        print('data pickled')
    features, labels = get_feature_labels(summaries)
    for ml in mls:
        print('testing with %s' % ml)
        result = train_and_test_with(features, labels, ml)
        path = '%srun_%s_%s.txt' % (directory, file_num, ml)
        save_results(path, file_name, start_time, interval, result)
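# Hypothetical entry point, not part of the original module: assuming the
# module-level `windows` and `binet_files` collections used throughout are
# defined, this would run the per-file analysis for every window size.
if __name__ == '__main__':
    for interval in windows:
        for file_name in binet_files:
            run_analysis_with(interval, file_name)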