def mainVisualize(params={}):
    withhold = 0

    #default value for params
    params = test.defParams(params)

    train_dir = "train"
    test_dir = "test"

    # TODO put the names of the feature functions you've defined above in this list
    ffs = [system_call_2gram_feats]  #[system_call_count_feats]
    #ffs = [first_last_system_call_feats, system_call_count_feats]

    # extract features
    print "extracting training features..."
    time1 = time.clock()
    X_train, t_train, train_ids, X_test, t_test, test_ids = test.loadData(
        params, withhold, ffs)
    time2 = time.clock()
    print "done extracting %d training features, time: %.4f s" % (
        X_train.shape[1], time2 - time1)
    print

    import matplotlib.pyplot as plt
    import matplotlib.cm as cm

    plt.matshow(X_train.T.toarray(), cmap=cm.get_cmap('Reds'))
    plt.colorbar()
    plt.show()
def mainTest(withhold=0, params={}):
    #default value for params
    params = test.defParams(params)

    train_dir = "train"
    test_dir = "test"

    # TODO put the names of the feature functions you've defined above in this list
    ffs = [system_call_count_feats, system_call_2gram_feats]
    #ffs = [first_last_system_call_feats, system_call_count_feats]

    # extract features
    print "extracting training features..."
    time1 = time.clock()
    X_train, t_train, train_ids, X_test, y_test, test_ids = test.loadData(
        params, withhold, ffs)
    time2 = time.clock()
    print "done extracting %d training features, time: %.4f s" % (
        X_train.shape[1], time2 - time1)
    print

    #preds = methods.logRegress(X_train,t_train,X_test)
    #preds = methods.decisionTree(X_train,t_train,X_test)
    #preds = methods.randomForest(X_train,t_train,X_test)
    preds = methods.extraTrees(X_train, t_train, X_test)

    if withhold != 0:
        print testCatAcc(preds, y_test)

    if params['writePredict'] == True:
        print "writing predictions..."
        util.write_predictions(preds, test_ids, params['outputFile'])
        print "done!"
# Example #3
def mainExamine(params=None):
    """Load the training data and show a histogram of the target values."""
    import matplotlib.pyplot as plt

    # feature functions used for extraction
    feature_fns = [metadata_feats, unigram_noStop]

    # withhold nothing (0); only the training targets are inspected here
    X, y, ids, _, _, _ = test.loadData(params, 0, feature_fns)

    plt.hist(y, bins=20)
    plt.show()
# Example #4
def mainTest(withhold=0, params=None):

    #default value for params
    if params==None:
        params = {'withhold': 0,
          'load': None,
          'extractFile': None,
          'trainFile': None,
          'testFile': None,
          'writePredict': False,
          'outputFile': 'predictions.csv'
          }

    trainfile = "train.xml"
    testfile = "testcases.xml"

    # TODO put the names of the feature functions you've defined above in this list
    #ffs = [metadata_feats, unigram_feats]
    ffs = [metadata_feats, unigram_noStop]
    #ffs = [metadata_feats, bigram_feats_noStop]
    #ffs = [metadata_feats, bigram_feats_noStop, unigram_noStop]
    #totRevLen, revLens
    #ffs = [metadata_feats, unigram_noStop, revLens]

    print "extracting training/testing features..."
    time1 = time.clock()
    X_train, y_train, train_ids,X_test,y_test,test_ids = test.loadData(params, withhold, ffs)
    time2 = time.clock()
    print "done extracting training/testing features", time2-time1, "s"
    print

    # TODO train here, and return regression parameters
    print "learning..."
    time1 = time.clock()
    #learned_w = splinalg.lsqr(X_train,y_train)[0]
    learned_w = splinalg.lsmr(X_train,y_train,damp=5000)[0]
    time2 = time.clock()
    print "done learning, ", time2-time1, "s"
    print

    # get rid of training data and load test data
    del X_train
    del y_train
    del train_ids

    # TODO make predictions on text data and write them out
    print "making predictions..."
    preds = X_test.dot(learned_w)
    print "done making predictions"
    print

    if withhold > 0:
        print "MAE on withheld data:", testMAE(preds, y_test)

    if params['writePredict']==True:
        print "writing predictions..."
        util.write_predictions(preds, test_ids, params['outputFile'])
        print "done!"
def mainTestIter(withhold=0, params=None):
    from sklearn import cross_validation
    import classification_methods as classif

    #default value for params
    if params == None:
        params = {}

    params = dict(
        {
            'withhold': 0,
            'load': None,
            'extractFile': None,
            'loadTest': False,

            # arguments to `learn`
            'options': {},

            # the option to cycle through
            'option': None,

            # range of values to cycle through
            'range': [],

            # k-fold cross-validation
            'n_folds': 10,

            # names of feature functions to use
            'ffs': ['system_call_count_feats', 'system_call_2gram_feats']
        },
        **params)

    train_dir = "train"
    test_dir = "test"

    # TODO put the names of the feature functions you've defined above in this list
    ffs = [feature_functions[f] for f in params['ffs']]

    print
    print "extracting training/testing features..."
    time1 = time.clock()
    # X_train, y_train, train_ids, X_test, y_test, test_ids = test.loadData(params, withhold, ffs)
    X, y, ids, _, _, _ = test.loadData(params, withhold, ffs)
    time2 = time.clock()
    print "done extracting training/testing features", time2 - time1, "s"
    print "%d data, %d features" % X.shape
    print

    # options for the learning engine
    options = params['options']

    # array to store errors for various values of learning options
    errors = []

    # iterate through each value of `params['option']` in `params['range']`
    # and calculate the error for that value
    print "iterating over values of %s from %s ... %s" % (
        params['option'], params['range'][0], params['range'][-1])
    print "================================================================================"
    for (i, value) in enumerate(params['range']):
        print "%s = %s" % (params['option'], str(value))
        op = dict(options)
        op[params['option']] = value

        # generate k cross-validation folds
        kf = cross_validation.KFold(len(y),
                                    n_folds=params['n_folds'],
                                    shuffle=True)
        print "k-fold cross-validation with %d folds" % params['n_folds']
        cv_err = []

        # for each cv fold
        for train, tests in kf:

            # generate partition
            X_train, y_train, X_test, y_test = X[train], y[train], X[tests], y[
                tests]

            # train and predict
            print "learning and predicting..."
            time1 = time.clock()

            preds = classif.classify(X_train, y_train, X_test, **op)
            time2 = time.clock()
            print "done learning and predicting, ", time2 - time1, "s"
            print

            # cross-validate
            cv_err.append(testCatErr(preds, y_test))
            print "Err on withheld data: %f" % cv_err[-1]
            print

        # calculate mean, std. across folds
        cv_err_mean, cv_err_std = np.mean(cv_err), np.std(cv_err)

        print
        print "Avg. Err: %f" % cv_err_mean
        print "Std. Err: %f" % cv_err_std
        errors.append((cv_err_mean, cv_err_std))

        print "--------------------------------------------------------------------------------"

    print "================================================================================"

    # tabulate results
    results = dict()

    print "Features:"
    print params['ffs']
    print
    print "Options:"
    print options
    print

    print "Results:"
    print "%18s \t Err \t std" % params['option']
    for (i, value) in enumerate(params['range']):
        print "%18s \t %f \t %f" % (value, errors[i][0], errors[i][1])
        if (isinstance(value, list)):
            value = tuple(value)

        results[value] = errors[i]

    return results
# Example #6
def mainTestIter(withhold=0, params=None):
    from sklearn import cross_validation
    import learn

    #default value for params
    if params==None:
        params = {}

    params = dict({'withhold': 0,
      'load': None,
      'extractFile': None,
      'trainFile': 'train.xml',
      'testFile': 'testcases.xml',
      'writePredict': False,
      'outputFile': 'predictions.csv',

      # arguments to `learn`
      'options': {},

      # the option to cycle through
      'option': None,

      # range of values to cycle through
      'range': [],

      # k-fold cross-validation
      'n_folds': 10
    }, **params)

    trainfile = "train.xml"
    testfile = "testcases.xml"

    # TODO put the names of the feature functions you've defined above in this list
    ffs = [metadata_feats, unigram_noStop]

    print
    print "extracting training/testing features..."
    time1 = time.clock()
    # X_train, y_train, train_ids, X_test, y_test, test_ids = test.loadData(params, withhold, ffs)
    X, y, ids, _, _, _ = test.loadData(params, withhold, ffs)
    time2 = time.clock()
    print "done extracting training/testing features", time2-time1, "s"
    print "%d data, %d features" % X.shape
    print



    # options for the learning engine
    options = params['options']

    # array to store MAEs for various values of learning options
    MAEs = []

    print "iterating over values of %s from %s ... %s" % (params['option'], params['range'][0], params['range'][-1])
    print "================================================================================"
    # iterate through each value of `params['option']` in `params['range']`
    # and calculate the MAE for that value
    for (i, value) in enumerate(params['range']):
        print "%s = %s" % (params['option'], str(value))
        op = dict(options)
        op[params['option']] = value
        decomp = None

        # generate k cross-validation folds
        kf = cross_validation.KFold(len(y),n_folds=params['n_folds'],shuffle=True)
        print "k-fold cross-validation with %d folds" % params['n_folds']
        cv_mae = []

        # for each cv fold
        for train,tests in kf:

            # generate partition
            X_train, y_train, X_test, y_test = X[train], y[train], X[tests], y[tests]

            # train here, and return regression parameters
            print "learning..."
            time1 = time.clock()
            if 'reduction' in op and op['reduction'] != None:
                ((learned_w0, learned_w), decomp) = learn.learn(X_train, y_train, **op)
            else:
                (learned_w0, learned_w) = learn.learn(X_train, y_train, **op)

            time2 = time.clock()
            print "done learning, ", time2-time1, "s"
            print


            # make predictions
            print "making predictions..."
            if decomp is None:
                preds = X_test.dot(learned_w) + learned_w0
            else:
                preds = decomp(X_test).dot(learned_w) + learned_w0
            print "done making predictions"

            # cross-validate
            cv_mae.append(testMAE(preds, y_test))
            print "MAE on withheld data: ", cv_mae[-1]
            print


        cv_mae_mean, cv_mae_std = np.mean(cv_mae), np.std(cv_mae)

        print
        print "Avg. MAE: %f" % cv_mae_mean
        print "Std. MAE: %f" % cv_mae_std
        MAEs.append((cv_mae_mean, cv_mae_std))

        print "--------------------------------------------------------------------------------"

    print "================================================================================"

    # tabulate results
    results = dict()
    print "Options:"
    print options
    print

    print "Results:"
    print "%18s \t MAE \t std" % params['option']
    for (i, value) in enumerate(params['range']):
        print "%18s \t %d \t %d" % (value, MAEs[i][0], MAEs[i][1])
        if(isinstance(value, list)):
            value = tuple(value)

        results[value] = MAEs[i]

    return results