def sometrials(options):
    ''' Runs a single machine learning trial.

    options -- parsed command-line options; reads .database, .exportdb,
               .seed, .numsamples, .malfrac, and .beta
    Prints the trial parameters and a results header, then scores the data
    splits on several performance measures via score_on_splits.
    '''
    # Load the data
    data = mldata.load_data(options.database)

    # Preprocess data
    # TODO: fill in this part

    # If specified, output the current database
    if options.exportdb is not None:
        mldata.save_data(data, options.exportdb)

    # Extract the basic data from the data
    features, labels, _, featnames = mldata.data_components(data)

    # Get the seeds for the splits.
    numsplits = 30  # Make this an option later, if need be.
    seeds = mldata.gen_seeds(options.seed, numsplits)

    # Generate the splits.
    # For now, training data will compose 90% of each split.
    # Possibly make this an option later.
    # Compute the training size once so the two sizes cannot disagree.
    trainsize = int(round(0.9 * options.numsamples))
    tr_te_sizes = [trainsize, options.numsamples - trainsize]
    splits = mldata.gen_splits(seeds, labels, tr_te_sizes, options.malfrac)

    # Start printing the results
    printparams(options)
    mlstat.print_results_header()

    # Make the fbeta scoring object
    scorer = make_scorer(fbeta_score, beta=options.beta)

    # Fit and score based on the various performance measures
    perfmeasures = ['accuracy', 'precision', 'recall', scorer]
    for perfmeasure in perfmeasures:
        score_on_splits(perfmeasure, options, features, labels, featnames,
                        splits)

    return
''' This script will assist with feature isolation by making new databases
with just the isolated feature, the filename, and the label.'''
# Author: Zane Markel
# Created: 16 SEP 2014

import mldata
import numpy as np
import argparse

clargs = argparse.ArgumentParser()
# argparse.FileType('r') replaces the Python-2-only `file` builtin and
# works on both Python 2 and 3.
clargs.add_argument('db', type=argparse.FileType('r'),
                    help='The database file that you want to split by feature.')
clargs.add_argument('outdir', help='directory to output to.')
args = clargs.parse_args()
db = args.db
outdir = args.outdir

# Load the data
data = mldata.load_data(db)

# Get the feature names
_, _, _, featurenames = mldata.data_components(data)

# For each feature, save a new database with it, the label, and the fname
nonfeats = ['isMalware', 'Name']
for feat in featurenames:
    thisdata = mldata.only_features(data, nonfeats + [feat])
    # NOTE(review): outdir is concatenated directly, so it must end with a
    # path separator — preserved from the original behavior.
    fname = '{}{}.csv'.format(outdir, feat)
    print(fname)
    # Context manager guarantees the output file is closed even if
    # save_data raises (the original leaked the handle).
    with open(fname, 'w') as outfile:
        mldata.save_data(thisdata, outfile)
def oldtrial(options):
    ''' Run a single machine learning trial.

    options -- parsed command-line options; reads .database, .numsamples,
               .malfrac, .seed, .exportdb, .acc, .algorithm, and .graphfile
    Returns (perfmeasures, avgs): the list of performance-measure names and
    the corresponding average cross-validation scores. When options.acc is
    set, delegates to oldacc and returns its result instead.
    '''
    # TODO: make option for loading intermediate data to skip steps that have
    # been done in previous trials

    # Select data to read
    data = mldata.load_data(options.database)

    # Get a sample, if one was requested; otherwise use the full data set.
    if options.numsamples is not None:
        if options.malfrac is not None:
            # Only use a percent malware if one was specified
            sample = mldata.select_sample(int(options.seed), data,
                                          options.numsamples,
                                          options.malfrac[0])
        else:
            sample = mldata.select_sample(int(options.seed), data,
                                          options.numsamples)
    else:
        sample = data

    # If specified, output the current database
    if options.exportdb is not None:
        mldata.save_data(sample, options.exportdb)

    # Original way to run a trial... probably going to be deleted eventually
    if options.acc:
        return oldacc(options, sample)

    # Primary way to run a trial
    printparams(options)
    mlstat.print_results_header()

    perfmeasures = ['accuracy', 'precision', 'recall', 'f1']
    avgs = []

    # Extract the parts of the sample once — it does not change per measure.
    # Not yet using the filenames.
    features, labels, _, featnames = mldata.data_components(sample)

    for perfmeasure in perfmeasures:
        # Split the sample into 10 randomly stratified folds
        cvsplits = cross_validation.StratifiedShuffleSplit(
            labels, test_size=0.1, random_state=options.seed)

        # Score the folds
        est = mlalgos.get_estimator(options.algorithm)
        scores = cross_validation.cross_val_score(
            est, features, y=labels, scoring=perfmeasure, cv=cvsplits)

        # Print the average followed by each fold's score
        avgs.append(sum(scores) / len(scores))
        avgstr = '{:.4}'.format(avgs[-1]).rjust(7)
        resultstr = '{} {} '.format(perfmeasure.rjust(9), avgstr)
        for score in scores:
            resultstr += ' {:.3}'.format(score)
        print(resultstr)

        # Icing on the cake: draw a decision tree graph
        # based on the fold with the best f1 score
        if (perfmeasure == 'f1' and options.graphfile is not None and
                isinstance(est, tree.DecisionTreeClassifier)):
            mlalgos.dt_graph(est, cvsplits, scores, features, labels,
                             featnames, options.graphfile)

    return (perfmeasures, avgs)
def handle_idx(idx, data, args, name):
    '''Computes and prints the malicious prevalence of the rows selected by
    idx, then saves that subset of the data where specified.

    idx  -- index/mask selecting rows of data
    data -- indexable dataset with an 'isMalware' field on the selected rows
    args -- parsed arguments; args.outdir is the output directory prefix
            (assumed to end with a path separator — TODO confirm)
    name -- file name for the saved subset
    '''
    subset = data[idx]
    print('{} of {} are malicious'.format(
        sum(subset['isMalware']), len(subset)))
    outname = args.outdir + '{}'.format(name)  # name the file
    # Context manager guarantees the file is closed even if save_data
    # raises (the original leaked the handle).
    with open(outname, 'w') as outfile:
        mldata.save_data(subset, outfile)  # save the appropriate data