Example No. 1
def sometrials(options):
    ''' Runs machine learning trials over the generated splits. '''
    
    # Load the data
    data = mldata.load_data(options.database)

    # Preprocess data
    # TODO: fill in this part

    # If specified, output the current database
    if options.exportdb is not None:
        mldata.save_data(data, options.exportdb)

    # Extract the basic data from the data
    features, labels, _, featnames = mldata.data_components(data)

    # Get the seeds for the splits.
    numsplits = 30 # Make this an option later, if need be.
    seeds = mldata.gen_seeds(options.seed, numsplits)

    # Generate the splits
    # For now, training data will make up 90% of each split.
    # Possibly make this an option later.
    trainsize = int(round(0.9 * options.numsamples))
    tr_te_sizes = [trainsize, options.numsamples - trainsize]
    splits = mldata.gen_splits(seeds, labels, tr_te_sizes, options.malfrac) 

    # Start printing the results
    printparams(options)
    mlstat.print_results_header()

    # make the fbeta scoring object
    scorer = make_scorer(fbeta_score, beta=options.beta)

    # Fit and score based on the various performance measures
    perfmeasures = ['accuracy', 'precision', 'recall', scorer]
    for perfmeasure in perfmeasures:
        score_on_splits(perfmeasure, options, features, labels, featnames, splits)

    return
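
For context on the scorer used above: make_scorer(fbeta_score, beta=...) is scikit-learn's standard way to wrap the F-beta metric in a scoring object that cross-validation helpers accept, and it is presumably what score_on_splits consumes here. A minimal, self-contained sketch on toy data, using the current sklearn.model_selection API rather than the project's mldata/mlstat helpers:

from sklearn.datasets import make_classification
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Toy data standing in for the malware feature matrix and labels.
X, y = make_classification(n_samples=200, n_features=10, random_state=0)

# beta > 1 weights recall more heavily than precision; beta=2 is illustrative.
f2_scorer = make_scorer(fbeta_score, beta=2)

# The scorer object can be passed anywhere a scoring string is accepted.
scores = cross_val_score(DecisionTreeClassifier(random_state=0), X, y,
                         scoring=f2_scorer, cv=5)
print(scores.mean())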
Example No. 2
''' This script will assist with feature isolation by making new databases with
just the isolated feature, the filename, and the label.'''
# Author: Zane Markel
# Created: 16 SEP 2014

import mldata
import numpy as np
import argparse

clargs = argparse.ArgumentParser()
clargs.add_argument('db', type=argparse.FileType('r'), \
    help='The database file that you want to split by feature.')
clargs.add_argument('outdir', help='directory to output to.')
args = clargs.parse_args()
db = args.db
outdir = args.outdir

# Load the data
data = mldata.load_data(db)

# Get the feature names
_, _, _, featurenames = mldata.data_components(data)

# For each feature, save a new database with it, the label, and the fname
nonfeats = ['isMalware', 'Name']
for feat in featurenames:
    thisdata = mldata.only_features(data, nonfeats+[feat])
    fname = '{}{}.csv'.format(outdir, feat)
    print(fname)
    mldata.save_data(thisdata, open(fname, 'w'))
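
One detail worth noting: '{}{}.csv'.format(outdir, feat) only lands inside outdir if the argument ends with a path separator. If that cannot be assumed, os.path.join inserts the separator as needed; a small sketch with hypothetical values:

import os

# Hypothetical stand-ins; the real values come from the command line above.
outdir = 'isolated_features'
feat = 'NumberOfSections'

# os.path.join adds the separator whether or not outdir has a trailing slash.
fname = os.path.join(outdir, '{}.csv'.format(feat))
print(fname)  # isolated_features/NumberOfSections.csv on POSIX systems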
Example No. 3
def oldtrial(options):
    ''' Run a single machine learning trial.''' 
    # TODO: make option for loading intermediate data to skip steps that have
    # been done in previous trials

    # Select data to read
    data = mldata.load_data(options.database)

    # Get a sample
    if options.numsamples is not None: # Check whether a sample was requested
        # Only pass a malware fraction if one was specified
        if options.malfrac is not None:
            sample = mldata.select_sample(int(options.seed), data, \
                options.numsamples, options.malfrac[0])
        else:
            sample = mldata.select_sample(int(options.seed), data, \
                options.numsamples)
    else:
        sample = data

    # If specified, output the current database
    if options.exportdb is not None:
        mldata.save_data(sample, options.exportdb)

    # Original way to run a trial... probably going to be deleted eventually
    if options.acc:
        return oldacc(options, sample)

    # Primary way to run a trial
    else:
        printparams(options)
        mlstat.print_results_header()
        perfmeasures = ['accuracy', 'precision', 'recall', 'f1']
        avgs = []
        for perfmeasure in perfmeasures:
            # Extract the parts of the samples
            # Not yet using the filenames and feature names
            features, labels, _, featnames = mldata.data_components(sample)

            # Split the sample into 10 randomly stratified folds
            cvsplits = cross_validation.StratifiedShuffleSplit(labels, \
                        test_size=0.1, random_state=options.seed)

            # Score the folds
            est = mlalgos.get_estimator(options.algorithm)
            scores = cross_validation.cross_val_score(est, features, y=labels, \
                        scoring=perfmeasure, cv=cvsplits)

            # Print the results
            avgs.append(sum(scores)/len(scores))
            avgstr = '{:.4}'.format(avgs[-1]).rjust(7)
            resultstr = '{}  {} '.format(perfmeasure.rjust(9), avgstr)
            for score in scores:
                resultstr += ' {:.3}'.format(score)
            print(resultstr)

            # Icing on the cake: draw a decision tree graph
            # based on the fold with the best f1 score
            if perfmeasure == 'f1' and options.graphfile is not None and \
                isinstance(est, tree.DecisionTreeClassifier):
                mlalgos.dt_graph(est, cvsplits, scores, features, labels, \
                                featnames, options.graphfile)

        return (perfmeasures, avgs)
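
Note that the cross_validation module used here was deprecated and later removed from scikit-learn; its replacements live in sklearn.model_selection, where StratifiedShuffleSplit takes n_splits and receives the labels through split() (or implicitly via cross_val_score) rather than at construction time. A hedged sketch of the equivalent loop on toy data, without the project's mldata/mlalgos helpers:

from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Toy stand-ins for the features/labels extracted by mldata.data_components.
features, labels = make_classification(n_samples=200, n_features=10,
                                        weights=[0.8, 0.2], random_state=0)

# The splitter is configured here and applied to the labels inside
# cross_val_score, instead of being built from `labels` directly.
cvsplits = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)

est = DecisionTreeClassifier(random_state=0)
for perfmeasure in ['accuracy', 'precision', 'recall', 'f1']:
    scores = cross_val_score(est, features, labels,
                             scoring=perfmeasure, cv=cvsplits)
    print('{:>9}  {:.4f}'.format(perfmeasure, scores.mean()))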
Example No. 4
def handle_idx(idx, data, args, name):
    '''Computes and prints the malware prevalence, then saves the data where specified.'''
    print('{} of {} are malicious'.format( \
        sum(data[idx]['isMalware']), len(data[idx])))
    outname = args.outdir + name # name the output file
    mldata.save_data(data[idx], open(outname, 'w')) # save the appropriate data
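
For reference, data[idx]['isMalware'] works because the databases appear to be NumPy structured arrays indexed by field name, so summing the 0/1 isMalware column over an index gives the malware count. A minimal sketch with made-up rows; only the field names mirror the code above:

import numpy as np

# Made-up database rows using the field names referenced above.
data = np.array([('a.exe', 1), ('b.exe', 0), ('c.exe', 1), ('d.exe', 0)],
                dtype=[('Name', 'U16'), ('isMalware', 'i4')])

# idx can be an integer index array or a boolean mask; here, the first three rows.
idx = np.array([0, 1, 2])

print('{} of {} are malicious'.format(
    sum(data[idx]['isMalware']), len(data[idx])))  # 2 of 3 are malicious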