def sometrials(options):
    """Run a set of machine learning trials over multiple stratified splits.

    Loads the database, optionally exports it, builds 30 seeded
    train/test splits (90% training), and scores the estimator on each
    of several performance measures, printing results as it goes.

    Parameters:
        options: parsed command-line options; uses .database, .exportdb,
            .seed, .numsamples, .malfrac, and .beta.

    Returns:
        None. Results are printed to stdout.
    """
    # Load the data
    data = mldata.load_data(options.database)

    # Preprocess data
    # TODO: fill in this part

    # If specified, output the current database
    if options.exportdb is not None:
        mldata.save_data(data, options.exportdb)

    # Extract the basic data from the data
    features, labels, _, featnames = mldata.data_components(data)

    # Get the seeds for the splits.
    numsplits = 30  # Make this an option later, if need be.
    seeds = mldata.gen_seeds(options.seed, numsplits)

    # Generate the splits.
    # For now, training data will compose 90% of each split.
    # Possibly make this an option later.
    trainsize = int(round(0.9 * options.numsamples))
    tr_te_sizes = [trainsize, options.numsamples - trainsize]
    splits = mldata.gen_splits(seeds, labels, tr_te_sizes, options.malfrac)

    # Start printing the results
    printparams(options)
    mlstat.print_results_header()

    # Make the fbeta scoring object
    scorer = make_scorer(fbeta_score, beta=options.beta)

    # Fit and score based on the various performance measures
    perfmeasures = ['accuracy', 'precision', 'recall', scorer]
    for perfmeasure in perfmeasures:
        score_on_splits(perfmeasure, options, features, labels, featnames,
                        splits)
def oldtrial(options):
    """Run a single machine learning trial.

    Loads the database, optionally draws a stratified sample, optionally
    exports it, then either runs the legacy accuracy-only path
    (options.acc) or scores the chosen estimator over 10 stratified
    shuffle splits for each of several performance measures.

    Parameters:
        options: parsed command-line options; uses .database, .numsamples,
            .malfrac, .seed, .exportdb, .acc, .algorithm, and .graphfile.

    Returns:
        The result of oldacc() when options.acc is set; otherwise a
        (perfmeasures, avgs) tuple of the measure names and their mean
        cross-validation scores.
    """
    # TODO: make option for loading intermediate data to skip steps that have
    # been done in previous trials

    # Select data to read
    data = mldata.load_data(options.database)

    # Get a sample, if one was requested
    if options.numsamples is not None:
        if options.malfrac is not None:
            # Only use a percent malware if one was specified
            sample = mldata.select_sample(int(options.seed), data,
                                          options.numsamples,
                                          options.malfrac[0])
        else:
            sample = mldata.select_sample(int(options.seed), data,
                                          options.numsamples)
    else:
        sample = data

    # If specified, output the current database
    if options.exportdb is not None:
        mldata.save_data(sample, options.exportdb)

    # Original way to run a trial... probably going to be deleted eventually
    if options.acc:
        return oldacc(options, sample)

    # Primary way to run a trial
    printparams(options)
    mlstat.print_results_header()

    # Extract the parts of the sample once -- it does not change per measure.
    # Not yet using the filenames.
    features, labels, _, featnames = mldata.data_components(sample)

    perfmeasures = ['accuracy', 'precision', 'recall', 'f1']
    avgs = []
    for perfmeasure in perfmeasures:
        # Split the sample into 10 randomly stratified folds
        cvsplits = cross_validation.StratifiedShuffleSplit(
            labels, test_size=0.1, random_state=options.seed)

        # Score the folds
        est = mlalgos.get_estimator(options.algorithm)
        scores = cross_validation.cross_val_score(
            est, features, y=labels, scoring=perfmeasure, cv=cvsplits)

        # Print the results: mean first, then the per-fold scores
        avgs.append(sum(scores) / len(scores))
        avgstr = '{:.4}'.format(avgs[-1]).rjust(7)
        resultstr = '{} {} '.format(perfmeasure.rjust(9), avgstr)
        for score in scores:
            resultstr += ' {:.3}'.format(score)
        print(resultstr)

        # Icing on the cake: draw a decision tree graph
        # based on the fold with the best f1 score
        if (perfmeasure == 'f1' and options.graphfile is not None and
                isinstance(est, tree.DecisionTreeClassifier)):
            mlalgos.dt_graph(est, cvsplits, scores, features, labels,
                             featnames, options.graphfile)

    return (perfmeasures, avgs)