Example #1
def main():
    ''' Test the malprev-splitting functionality by loading a database and
    writing each generated train/test split out to its own file. '''

    clparse = argparse.ArgumentParser(
        description='Test malprev splitting functionality.')
    clparse.add_argument("db", type=argparse.FileType('r'), \
        help='.csv containing the database to test with')
    clparse.add_argument("outdir", help='directory to output to.')
    args = clparse.parse_args()

    data = mldata.load_data(args.db)
    feat, lab, _, _ = mldata.data_components(data)

    seeds = mldata.gen_seeds(42, 3)

    # Split the data twice. This is a proof of concept, so hardcoding
    # the sample sizes (numsamples) and the malware prevalences
    # (malprevs) is fine for now.
    splits = mldata.gen_splits(seeds, lab, [9000,1000], [0.5, 0.1])

    # This parallels how the iteration works in cross_validation.cross_val_score
    for cnt, (tr_idx, te_idx) in enumerate(splits):
        # Training data
        handle_idx(tr_idx, data, args, 'tr{}'.format(cnt))
        # Test data
        handle_idx(te_idx, data, args, 'te{}'.format(cnt))
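
The helper handle_idx() used above is not shown in this excerpt. As a point of reference, here is a minimal sketch of what it might look like, assuming it writes the rows selected by an index array to <outdir>/<tag>.csv and that mldata.save_data() takes a writable file object, as in Example #4 (both are assumptions):

import os

def handle_idx(idx, data, args, tag):
    ''' Hypothetical sketch: save the rows of `data` selected by `idx` to a
    .csv named after `tag` in the output directory. '''
    outpath = os.path.join(args.outdir, tag + '.csv')
    with open(outpath, 'w') as outfile:
        mldata.save_data(data[idx], outfile) # assumed (data, file) signature
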
Example #2
def oldacc(options, sample):
    ''' The simpleAcc way of running a trial ''' 
    # Get the final components from the data
    print("Building train and test data...")

    # Just partition it 80-20 training-testing
    trainsize = int(0.8 * len(sample))
    train = sample[:trainsize]
    test = sample[trainsize:]
    trfeat, trlab, _, _ = mldata.data_components(train)
    tefeat, telab, _, _ = mldata.data_components(test)

    # Check for valid learning algorithm
    if(not mlalgos.validate_algo(options.algorithm)):
        print("Invalid learning algorithm %s" % (options.algorithm))
        sys.exit(1)

    # Train
    print("Training...")
    model = mlalgos.learn(options.algorithm, trfeat, trlab, options.seed)

    # Test
    print("Testing...")
    preds = mlalgos.predict(model, tefeat)

    # Analyze -- FScore, acc, learning curves
    print("Results:")
    printparams(options) # CL options

    # Model parameters
    pprint(vars(model))

    accuracy = mlstat.acc(preds, telab)
    print("Accuracy: %f" % (accuracy))

    return (['accuracy'], [accuracy])
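
printparams() is used by several of these trial runners but never defined in the excerpts. A plausible sketch, assuming it simply echoes the parsed command-line options so that every result printout records its configuration (the exact format is an assumption):

from pprint import pprint

def printparams(options):
    ''' Hypothetical sketch: print the command-line options for this trial
    so that the results can be reproduced later. '''
    print("Parameters:")
    pprint(vars(options)) # argparse Namespace -> dict of option names/values
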
Example #3
def sometrials(options):
    ''' Run a series of machine learning trials over multiple splits of the
    data. '''
    
    # Load the data
    data = mldata.load_data(options.database)

    # Preprocess data
    # TODO: fill in this part

    # If specified, output the current database
    if(options.exportdb is not None):
        mldata.save_data(data, options.exportdb)

    # Extract the basic data from the data
    features, labels, _, featnames = mldata.data_components(data)

    # Get the seeds for the splits.
    numsplits = 30 # Make this an option later, if need be.
    seeds = mldata.gen_seeds(options.seed, numsplits)

    # Generate the splits
    # For now, training data will compose 90% of each split.
    # Possibly make this an option later.
    trainsize = int(round(0.9 * options.numsamples))
    tr_te_sizes = [trainsize, options.numsamples - trainsize]
    splits = mldata.gen_splits(seeds, labels, tr_te_sizes, options.malfrac)

    # Start printing the results
    printparams(options)
    mlstat.print_results_header()

    # make the fbeta scoring object
    scorer = make_scorer(fbeta_score, beta=options.beta)

    # Fit and score based on the various performance measures
    perfmeasures = ['accuracy', 'precision', 'recall', scorer]
    for perfmeasure in perfmeasures:
        score_on_splits(perfmeasure, options, features, labels, featnames, splits)

    return
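
score_on_splits() is called above but not shown. A minimal sketch, assuming it mirrors the cross_val_score pattern used by oldtrial() in Example #5, with the pre-generated splits passed as the cv argument (the output format is an assumption):

from sklearn import cross_validation # pre-0.18 sklearn layout, as used elsewhere here

def score_on_splits(perfmeasure, options, features, labels, featnames, splits):
    ''' Hypothetical sketch: fit the chosen estimator on each split and print
    one row of results for the given performance measure. featnames is
    unused here but kept to match the call above. '''
    est = mlalgos.get_estimator(options.algorithm)
    scores = cross_validation.cross_val_score(est, features, y=labels, \
                scoring=perfmeasure, cv=splits)
    name = perfmeasure if isinstance(perfmeasure, str) else 'fbeta'
    print('{}  {:.4f}'.format(name.rjust(9), sum(scores) / len(scores)))
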
Example #4
''' This script will assist with feature isolation by making new databases with
just the isolated feature, the filename, and the label.'''
# Author: Zane Markel
# Created: 16 SEP 2014

import os
import argparse

import mldata

clargs = argparse.ArgumentParser()
clargs.add_argument('db', type=argparse.FileType('r'), \
    help='The database file that you want to split by feature.')
clargs.add_argument('outdir', help='directory to output to.')
args = clargs.parse_args()
db = args.db
outdir = args.outdir

# Load the data
data = mldata.load_data(db)

# Get the feature names
_, _, _, featurenames = mldata.data_components(data)

# For each feature, save a new database with it, the label, and the fname
nonfeats = ['isMalware', 'Name']
for feat in featurenames:
    thisdata = mldata.only_features(data, nonfeats + [feat])
    fname = os.path.join(outdir, feat + '.csv')
    print(fname)
    mldata.save_data(thisdata, open(fname, 'w'))
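
mldata.only_features() belongs to the author's mldata module and is not shown. If the database is loaded as a numpy structured array whose dtype names are the column names (an assumption consistent with how featurenames is used above), a sketch could look like this:

def only_features(data, keep):
    ''' Hypothetical sketch: return `data` restricted to the named columns,
    preserving the original column order. Assumes `data` is a numpy
    structured array. '''
    names = [n for n in data.dtype.names if n in keep]
    return data[names] # multi-field selection on a structured array
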
Example #5
def oldtrial(options):
    ''' Run a single machine learning trial.''' 
    # TODO: make option for loading intermediate data to skip steps that have
    # been done in previous trials

    # Select data to read
    data = mldata.load_data(options.database)

    # Get a sample
    if(options.numsamples is not None): # Check to see if a sample was requested
        if(options.malfrac is not None): # Only use a percent malware if one was specified
            sample = mldata.select_sample(int(options.seed), data, \
                options.numsamples, options.malfrac[0])
        else:
            sample = mldata.select_sample(int(options.seed), data, \
                options.numsamples)
    else:
        sample = data

    # If specified, output the current database
    if(options.exportdb is not None):
        mldata.save_data(sample, options.exportdb)

    # Original way to run a trial... probably going to be deleted eventually
    if(options.acc):
        return oldacc(options, sample)

    # Primary way to run a trial
    else:
        printparams(options)
        mlstat.print_results_header()
        perfmeasures = ['accuracy', 'precision', 'recall', 'f1']
        avgs = []
        for perfmeasure in perfmeasures:
            # Extract the parts of the samples
            # Not yet using the filenames and feature names
            features, labels, _, featnames = mldata.data_components(sample)

            # Split the sample into 10 stratified shuffle splits,
            # holding out 10% for testing each time
            cvsplits = cross_validation.StratifiedShuffleSplit(labels, \
                        n_iter=10, test_size=0.1, random_state=options.seed)

            # Score the folds
            est = mlalgos.get_estimator(options.algorithm)
            scores = cross_validation.cross_val_score(est, features, y=labels, \
                        scoring=perfmeasure, cv=cvsplits)

            # Print the results
            avgs.append(sum(scores)/len(scores))
            avgstr = '{:.4}'.format(avgs[-1]).rjust(7)
            resultstr = '{}  {} '.format(perfmeasure.rjust(9), avgstr)
            for score in scores:
                resultstr += ' {:.3}'.format(score)
            print(resultstr)

            # Icing on the cake: draw a decision tree graph
            # based on the fold with the best f1 score
            if(perfmeasure == 'f1' and options.graphfile is not None and \
                isinstance(est, tree.DecisionTreeClassifier)):
                mlalgos.dt_graph(est, cvsplits, scores, features, labels, \
                                featnames, options.graphfile)

        return (perfmeasures, avgs)
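
For reference, the cross-validation pattern that both sometrials() and oldtrial() rely on can be exercised stand-alone. Below is a self-contained sketch on toy data, using the same pre-0.18 sklearn cross_validation API these scripts target (the estimator choice is arbitrary):

import numpy as np
from sklearn import cross_validation
from sklearn.tree import DecisionTreeClassifier

# Toy data: 100 samples, 5 features, binary labels
rng = np.random.RandomState(42)
X = rng.rand(100, 5)
y = rng.randint(0, 2, size=100)

# 10 stratified shuffle splits, each holding out 10% for testing
cv = cross_validation.StratifiedShuffleSplit(y, n_iter=10, test_size=0.1, \
        random_state=42)
scores = cross_validation.cross_val_score(DecisionTreeClassifier(), X, y=y, \
            scoring='f1', cv=cv)
print('mean f1: {:.3f}'.format(scores.mean()))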