def main():
    """Test malprev splitting by loading a database and splitting it.

    Proof-of-concept driver: parses the command line, loads the .csv
    database, generates seeded train/test splits, and hands each split's
    index arrays to handle_idx() for output under ``outdir``.
    """
    clparse = argparse.ArgumentParser('Test malprev splitting functionality.')
    # NOTE: the original used `type=file`, which is the Python 2 `file`
    # builtin and raises NameError on Python 3.  argparse.FileType('r')
    # works on both and still passes an open file object to load_data.
    clparse.add_argument("db", type=argparse.FileType('r'),
                         help='.csv containing the database to test with')
    clparse.add_argument("outdir", help='directory to output to.')
    args = clparse.parse_args()

    data = mldata.load_data(args.db)
    # Only the labels are needed here; the other components are discarded.
    _, lab, _, _ = mldata.data_components(data)
    seeds = mldata.gen_seeds(42, 3)

    # Split the data twice. This is a proof of concept, so don't
    # worry that you're hardcoding the numsamples and the malprevs
    splits = mldata.gen_splits(seeds, lab, [9000, 1000], [0.5, 0.1])

    # This parallels how the iteration works in cross_validation.cross_val_score
    for cnt, (tr_idx, te_idx) in enumerate(splits):
        # Training data
        handle_idx(tr_idx, data, args, 'tr{}'.format(cnt))
        # Test data
        handle_idx(te_idx, data, args, 'te{}'.format(cnt))
def sometrials(options):
    """Run a batch of machine-learning trials over seeded data splits.

    Loads the database named by ``options.database``, optionally re-exports
    it, builds 30 seeded train/test splits (90% training), and scores the
    model on each of several performance measures via score_on_splits().

    Parameters
    ----------
    options : argparse.Namespace-like object with attributes
        database, exportdb, seed, numsamples, malfrac, beta.
    """
    # Load the data
    data = mldata.load_data(options.database)

    # Preprocess data
    # TODO: fill in this part

    # If specified, output the current database.
    # (`is not None`, not `!= None` — identity test is the correct idiom.)
    if options.exportdb is not None:
        mldata.save_data(data, options.exportdb)

    # Extract the basic data from the data
    features, labels, _, featnames = mldata.data_components(data)

    # Get the seeds for the splits.
    numsplits = 30  # Make this an option later, if need be.
    seeds = mldata.gen_seeds(options.seed, numsplits)

    # Generate the splits.
    # For now, training data will compose 90% of each split.
    # Possibly make that an option later.
    num_train = int(round(0.9 * options.numsamples))
    tr_te_sizes = [num_train, options.numsamples - num_train]
    splits = mldata.gen_splits(seeds, labels, tr_te_sizes, options.malfrac)

    # Start printing the results
    printparams(options)
    mlstat.print_results_header()

    # Make the fbeta scoring object.
    scorer = make_scorer(fbeta_score, beta=options.beta)

    # Fit and score based on the various performance measures.
    perfmeasures = ['accuracy', 'precision', 'recall', scorer]
    for perfmeasure in perfmeasures:
        score_on_splits(perfmeasure, options, features, labels,
                        featnames, splits)