def mainVisualize(params=None):
    withhold = 0
    # default value for params (avoid a mutable default argument)
    if params is None:
        params = {}
    params = test.defParams(params)
    train_dir = "train"
    test_dir = "test"

    # TODO put the names of the feature functions you've defined above in this list
    ffs = [system_call_2gram_feats]
    #ffs = [system_call_count_feats]
    #ffs = [first_last_system_call_feats, system_call_count_feats]

    # extract features
    print "extracting training features..."
    time1 = time.clock()
    X_train, t_train, train_ids, X_test, t_test, test_ids = test.loadData(
        params, withhold, ffs)
    time2 = time.clock()
    print "done extracting %d training features, time: %.4f s" % (
        X_train.shape[1], time2 - time1)
    print

    # plot the (features x examples) matrix as a heat map
    import matplotlib.pyplot as plt
    import matplotlib.cm as cm
    plt.matshow(X_train.T.toarray(), cmap=cm.get_cmap('Reds'))
    plt.colorbar()
    plt.show()
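
def demoVisualize():
    # Minimal usage sketch for mainVisualize (illustrative only, not part of
    # the original pipeline). The 'load' key and the file name below are
    # assumptions about what test.defParams/test.loadData accept, based on
    # the params defaults used elsewhere in this file.
    mainVisualize({'load': 'features.pkl'})  # reuse a cached extraction
    #mainVisualize()                         # or extract from scratch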
def mainTest(withhold=0, params=None):
    # default value for params (avoid a mutable default argument)
    if params is None:
        params = {}
    params = test.defParams(params)
    train_dir = "train"
    test_dir = "test"

    # TODO put the names of the feature functions you've defined above in this list
    ffs = [system_call_count_feats, system_call_2gram_feats]
    #ffs = [first_last_system_call_feats, system_call_count_feats]

    # extract features
    print "extracting training features..."
    time1 = time.clock()
    X_train, y_train, train_ids, X_test, y_test, test_ids = test.loadData(
        params, withhold, ffs)
    time2 = time.clock()
    print "done extracting %d training features, time: %.4f s" % (
        X_train.shape[1], time2 - time1)
    print

    # train a classifier and predict labels for the test set
    #preds = methods.logRegress(X_train, y_train, X_test)
    #preds = methods.decisionTree(X_train, y_train, X_test)
    #preds = methods.randomForest(X_train, y_train, X_test)
    preds = methods.extraTrees(X_train, y_train, X_test)

    # if data was withheld from training, report categorical accuracy on it
    if withhold != 0:
        print testCatAcc(preds, y_test)

    if params['writePredict']:
        print "writing predictions..."
        util.write_predictions(preds, test_ids, params['outputFile'])
        print "done!"
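
def demoClassifyRun():
    # Minimal usage sketch (hypothetical values): withhold some training data
    # so the classification mainTest above reports testCatAcc on it, without
    # writing a predictions file. Whether `withhold` counts examples or a
    # fraction is determined by test.loadData; 1000 examples is assumed here.
    mainTest(withhold=1000, params={'writePredict': False})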
def mainExamine(params=None):
    import matplotlib.pyplot as plt
    # default value for params; previously None was passed straight to
    # test.loadData
    if params is None:
        params = {'withhold': 0, 'load': None, 'extractFile': None,
                  'trainFile': 'train.xml', 'testFile': 'testcases.xml',
                  'writePredict': False, 'outputFile': 'predictions.csv'}
    ffs = [metadata_feats, unigram_noStop]
    X, y, ids, _, _, _ = test.loadData(params, 0, ffs)
    # examine the distribution of target values in the training data
    plt.hist(y, bins=20)
    plt.show()
def mainTest(withhold=0, params=None):
    # default value for params; trainFile/testFile now default to the actual
    # data files rather than None
    if params is None:
        params = {'withhold': 0,
                  'load': None,
                  'extractFile': None,
                  'trainFile': 'train.xml',
                  'testFile': 'testcases.xml',
                  'writePredict': False,
                  'outputFile': 'predictions.csv'}

    # TODO put the names of the feature functions you've defined above in this list
    #ffs = [metadata_feats, unigram_feats]
    ffs = [metadata_feats, unigram_noStop]
    #ffs = [metadata_feats, bigram_feats_noStop]
    #ffs = [metadata_feats, bigram_feats_noStop, unigram_noStop]
    #ffs = [metadata_feats, unigram_noStop, revLens]  # totRevLen, revLens

    print "extracting training/testing features..."
    time1 = time.clock()
    X_train, y_train, train_ids, X_test, y_test, test_ids = test.loadData(
        params, withhold, ffs)
    time2 = time.clock()
    print "done extracting training/testing features", time2 - time1, "s"
    print

    # train here, and return regression parameters
    print "learning..."
    time1 = time.clock()
    # damped (ridge-regularized) least squares; lsqr is an alternative solver
    #learned_w = splinalg.lsqr(X_train, y_train)[0]
    learned_w = splinalg.lsmr(X_train, y_train, damp=5000)[0]
    time2 = time.clock()
    print "done learning, ", time2 - time1, "s"
    print

    # free the training data before predicting
    del X_train
    del y_train
    del train_ids

    # make predictions on the test data and write them out
    print "making predictions..."
    preds = X_test.dot(learned_w)
    print "done making predictions"
    print

    if withhold > 0:
        print "MAE on withheld data:", testMAE(preds, y_test)

    if params['writePredict']:
        print "writing predictions..."
        util.write_predictions(preds, test_ids, params['outputFile'])
        print "done!"
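
def demoDampedLSMR():
    # Self-contained sketch (synthetic data, not project data) of the damped
    # least-squares solve used in the regression mainTest above: lsmr with
    # damp=d minimizes ||X w - y||^2 + d^2 ||w||^2, i.e. ridge-regularized
    # regression on a sparse design matrix.
    import numpy as np
    import scipy.sparse as sp
    import scipy.sparse.linalg as splinalg
    rng = np.random.RandomState(0)
    X = sp.rand(200, 50, density=0.1, random_state=rng).tocsr()
    w_true = rng.randn(50)
    y = X.dot(w_true) + 0.01 * rng.randn(200)
    # small damp here; the project run above uses damp=5000 on much
    # higher-dimensional text features
    w = splinalg.lsmr(X, y, damp=1.0)[0]
    print "demo MAE:", np.mean(np.abs(X.dot(w) - y))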
def mainTestIter(withhold=0, params=None):
    from sklearn import cross_validation
    import classification_methods as classif

    # default values for params; caller-supplied keys override the defaults
    if params is None:
        params = {}
    params = dict({'withhold': 0,
                   'load': None,
                   'extractFile': None,
                   'loadTest': False,
                   # arguments to `classif.classify`
                   'options': {},
                   # the option to cycle through
                   'option': None,
                   # range of values to cycle through
                   'range': [],
                   # k-fold cross-validation
                   'n_folds': 10,
                   # names of feature functions to use
                   'ffs': ['system_call_count_feats',
                           'system_call_2gram_feats']},
                  **params)

    train_dir = "train"
    test_dir = "test"

    # look up the named feature functions
    ffs = [feature_functions[f] for f in params['ffs']]

    print
    print "extracting training/testing features..."
    time1 = time.clock()
    X, y, ids, _, _, _ = test.loadData(params, withhold, ffs)
    time2 = time.clock()
    print "done extracting training/testing features", time2 - time1, "s"
    print "%d data, %d features" % X.shape
    print

    # options for the learning engine
    options = params['options']
    # store (mean, std) of the error for each value of the swept option
    errors = []

    # iterate through each value of `params['option']` in `params['range']`
    # and estimate the cross-validated error for that value
    print "iterating over values of %s from %s ... %s" % (
        params['option'], params['range'][0], params['range'][-1])
    print "=" * 80
    for (i, value) in enumerate(params['range']):
        print "%s = %s" % (params['option'], str(value))
        op = dict(options)
        op[params['option']] = value

        # generate k cross-validation folds
        kf = cross_validation.KFold(len(y), n_folds=params['n_folds'],
                                    shuffle=True)
        print "k-fold cross-validation with %d folds" % params['n_folds']
        cv_err = []
        # for each cv fold
        for train, tests in kf:
            # generate partition
            X_train, y_train, X_test, y_test = (X[train], y[train],
                                                X[tests], y[tests])

            # train and predict
            print "learning and predicting..."
            time1 = time.clock()
            preds = classif.classify(X_train, y_train, X_test, **op)
            time2 = time.clock()
            print "done learning and predicting, ", time2 - time1, "s"
            print

            # cross-validate
            cv_err.append(testCatErr(preds, y_test))
            print "Err on withheld data: %f" % cv_err[-1]
            print

        # mean and std. across folds
        cv_err_mean, cv_err_std = np.mean(cv_err), np.std(cv_err)
        print
        print "Avg. Err: %f" % cv_err_mean
        print "Std. Err: %f" % cv_err_std
        errors.append((cv_err_mean, cv_err_std))
        print "-" * 80
    print "=" * 80

    # tabulate results
    results = dict()
    print "Features:"
    print params['ffs']
    print
    print "Options:"
    print options
    print
    print "Results:"
    print "%18s \t Err \t std" % params['option']
    for (i, value) in enumerate(params['range']):
        print "%18s \t %f \t %f" % (value, errors[i][0], errors[i][1])
        if isinstance(value, list):
            value = tuple(value)
        results[value] = errors[i]
    return results
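
def demoClassifSweep():
    # Usage sketch for the classification mainTestIter (hypothetical option
    # name): 'n_estimators' is assumed to be a keyword that
    # classification_methods.classify forwards to its underlying model;
    # substitute a real option. Assumes the classification variant of
    # mainTestIter defined above is the one in scope.
    results = mainTestIter(params={'option': 'n_estimators',
                                   'range': [10, 50, 100, 200],
                                   'n_folds': 5,
                                   'ffs': ['system_call_count_feats',
                                           'system_call_2gram_feats']})
    print results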
def mainTestIter(withhold=0, params=None):
    from sklearn import cross_validation
    import learn

    # default values for params; caller-supplied keys override the defaults
    if params is None:
        params = {}
    params = dict({'withhold': 0,
                   'load': None,
                   'extractFile': None,
                   'trainFile': 'train.xml',
                   'testFile': 'testcases.xml',
                   'writePredict': False,
                   'outputFile': 'predictions.csv',
                   # arguments to `learn`
                   'options': {},
                   # the option to cycle through
                   'option': None,
                   # range of values to cycle through
                   'range': [],
                   # k-fold cross-validation
                   'n_folds': 10},
                  **params)

    # TODO put the names of the feature functions you've defined above in this list
    ffs = [metadata_feats, unigram_noStop]

    print
    print "extracting training/testing features..."
    time1 = time.clock()
    X, y, ids, _, _, _ = test.loadData(params, withhold, ffs)
    time2 = time.clock()
    print "done extracting training/testing features", time2 - time1, "s"
    print "%d data, %d features" % X.shape
    print

    # options for the learning engine
    options = params['options']
    # store (mean, std) of the MAE for each value of the swept option
    MAEs = []

    # iterate through each value of `params['option']` in `params['range']`
    # and estimate the cross-validated MAE for that value
    print "iterating over values of %s from %s ... %s" % (
        params['option'], params['range'][0], params['range'][-1])
    print "=" * 80
    for (i, value) in enumerate(params['range']):
        print "%s = %s" % (params['option'], str(value))
        op = dict(options)
        op[params['option']] = value
        decomp = None

        # generate k cross-validation folds
        kf = cross_validation.KFold(len(y), n_folds=params['n_folds'],
                                    shuffle=True)
        print "k-fold cross-validation with %d folds" % params['n_folds']
        cv_mae = []
        # for each cv fold
        for train, tests in kf:
            # generate partition
            X_train, y_train, X_test, y_test = (X[train], y[train],
                                                X[tests], y[tests])

            # train here, and return regression parameters
            print "learning..."
            time1 = time.clock()
            # when a dimensionality reduction is requested, learn.learn also
            # returns the fitted decomposition to apply at test time
            if 'reduction' in op and op['reduction'] is not None:
                ((learned_w0, learned_w), decomp) = learn.learn(
                    X_train, y_train, **op)
            else:
                (learned_w0, learned_w) = learn.learn(X_train, y_train, **op)
            time2 = time.clock()
            print "done learning, ", time2 - time1, "s"
            print

            # make predictions
            print "making predictions..."
            if decomp is None:
                preds = X_test.dot(learned_w) + learned_w0
            else:
                preds = decomp(X_test).dot(learned_w) + learned_w0
            print "done making predictions"

            # cross-validate
            cv_mae.append(testMAE(preds, y_test))
            print "MAE on withheld data: ", cv_mae[-1]
            print

        # mean and std. across folds
        cv_mae_mean, cv_mae_std = np.mean(cv_mae), np.std(cv_mae)
        print
        print "Avg. MAE: %f" % cv_mae_mean
        print "Std. MAE: %f" % cv_mae_std
        MAEs.append((cv_mae_mean, cv_mae_std))
        print "-" * 80
    print "=" * 80

    # tabulate results
    results = dict()
    print "Options:"
    print options
    print
    print "Results:"
    print "%18s \t MAE \t std" % params['option']
    for (i, value) in enumerate(params['range']):
        # %f, not %d: the MAEs are floats (matches the classification
        # mainTestIter above, which printed %d and truncated the values)
        print "%18s \t %f \t %f" % (value, MAEs[i][0], MAEs[i][1])
        if isinstance(value, list):
            value = tuple(value)
        results[value] = MAEs[i]
    return results
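
def demoRegressSweep():
    # Usage sketch for the regression mainTestIter (hypothetical option
    # name): 'damp' is assumed to be a keyword accepted by learn.learn,
    # mirroring the lsmr damping used in the regression mainTest above;
    # substitute a real option that learn.learn understands.
    results = mainTestIter(params={'option': 'damp',
                                   'range': [10, 100, 1000, 5000, 10000],
                                   'n_folds': 5})
    print results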