def do_prediction(train_dir, test_dir, outputfile, ffs):
    # extract features
    print "extracting training features..."
    X_train, global_feat_dict, t_train, train_ids = extract_feats(ffs, train_dir)
    print X_train
    print "done extracting training features"
    print

    # TODO train here, and learn your classification parameters
    print "learning..."
    # learned_W = np.random.random((len(global_feat_dict), len(util.malware_classes)))
    learn.TRAINING_FUNCTION(X_train, global_feat_dict, t_train, train_ids)
    print "done learning"
    print

    # get rid of training data and load test data
    del X_train
    del t_train
    del train_ids
    print "extracting test features..."
    X_test, _, t_ignore, test_ids = extract_feats(ffs, test_dir, global_feat_dict=global_feat_dict)
    print "done extracting test features"
    print

    # TODO make predictions on test data and write them out
    print "making predictions..."
    # preds = np.argmax(X_test.dot(learned_W), axis=1)
    preds = learn.TESTING_FUNCTION(X_test, global_feat_dict, test_ids)
    # preds = learn.logistic_regression(X_train, X_test, global_feat_dict, t_train, train_ids, test_ids)
    print "done making predictions"
    print

    print "writing predictions..."
    util.write_predictions(preds, test_ids, outputfile)
    print "done!"
def mainTest(withhold=0, params={}):
    # default value for params
    params = test.defParams(params)
    train_dir = "train"
    test_dir = "test"

    # TODO put the names of the feature functions you've defined above in this list
    ffs = [system_call_count_feats, system_call_2gram_feats]
    # ffs = [first_last_system_call_feats, system_call_count_feats]

    # extract features
    print "extracting training features..."
    time1 = time.clock()
    X_train, t_train, train_ids, X_test, y_test, test_ids = test.loadData(params, withhold, ffs)
    time2 = time.clock()
    print "done extracting %d training features, time: %.4f s" % (X_train.shape[1], time2 - time1)
    print

    # preds = methods.logRegress(X_train, t_train, X_test)
    # preds = methods.decisionTree(X_train, t_train, X_test)
    # preds = methods.randomForest(X_train, t_train, X_test)
    preds = methods.extraTrees(X_train, t_train, X_test)

    if withhold != 0:
        print testCatAcc(preds, y_test)

    if params['writePredict'] == True:
        print "writing predictions..."
        util.write_predictions(preds, test_ids, params['outputFile'])
    print "done!"
def syscall_count_by_type():
    mat, key, cats = pickle.load(open('matrix_train', 'rb'))
    test_mat, ids = pickle.load(open('matrix_test', 'rb'))
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(mat, cats)
    util.write_predictions(clf.predict(test_mat), ids, 'syscall_count_by_type-3.csv')
def main():
    train_dir = "train"
    test_dir = "test"
    outputfile = "sample_predictions.csv"  # feel free to change this or take it as an argument

    # TODO put the names of the feature functions you've defined above in this list
    # ffs = [first_last_system_call_feats, system_call_count_feats, frequency]
    # ffs = [quadgrams]
    ffs = [first_last_system_call_feats, quadgrams]

    # extract features
    print "extracting training features..."
    X_train, global_feat_dict, t_train, train_ids = extract_feats(ffs, train_dir)
    # print X_train  # Not currently a np.array, need to do .toarray()
    # print global_feat_dict
    print "done extracting training features"
    print

    # TODO train here, and learn your classification parameters
    print "learning..."
    # rf = RandomForestClassifier(max_features=2750, max_depth=28)
    # rf.fit(X_train.toarray(), t_train)
    nn = MLPClassifier(max_iter=10000, hidden_layer_sizes=(320,))
    nn.fit(X_train.toarray(), t_train)
    # rf = RandomForestClassifier(max_features=100, max_depth=90)
    # rf.fit(X_train.toarray(), t_train)
    print "done learning"
    print

    # get rid of training data and load test data
    del X_train
    del t_train
    del train_ids
    print "extracting test features..."
    X_test, _, t_ignore, test_ids = extract_feats(ffs, test_dir, global_feat_dict=global_feat_dict)
    print "done extracting test features"
    print

    # TODO make predictions on test data and write them out
    print "making predictions..."
    # preds = rf.predict(X_test.toarray())
    preds = nn.predict(X_test.toarray())
    print "done making predictions"
    print

    print "writing predictions..."
    util.write_predictions(preds, test_ids, outputfile)
    print "done!"
def nb_test_data(test_data, sent_model: NaiveBayesModel,
                 auth_model: NaiveBayesModel, output_filename: str):
    predictions = []
    for (review_id, review_text) in test_data:
        sent_class = nb_predict_sentiment(sent_model, review_text)
        auth_class = nb_predict_authenticity(auth_model, review_text)
        predictions.append((review_id, auth_class, sent_class))
    write_predictions(predictions, output_filename)
def mainTest(withhold=0, params=None):
    # default value for params
    if params is None:
        params = {'withhold': 0,
                  'load': None,
                  'extractFile': None,
                  'trainFile': None,
                  'testFile': None,
                  'writePredict': False,
                  'outputFile': 'predictions.csv'}
    trainfile = "train.xml"
    testfile = "testcases.xml"

    # TODO put the names of the feature functions you've defined above in this list
    # ffs = [metadata_feats, unigram_feats]
    ffs = [metadata_feats, unigram_noStop]
    # ffs = [metadata_feats, bigram_feats_noStop]
    # ffs = [metadata_feats, bigram_feats_noStop, unigram_noStop]  # totRevLen, revLens
    # ffs = [metadata_feats, unigram_noStop, revLens]

    print "extracting training/testing features..."
    time1 = time.clock()
    X_train, y_train, train_ids, X_test, y_test, test_ids = test.loadData(params, withhold, ffs)
    time2 = time.clock()
    print "done extracting training/testing features", time2 - time1, "s"
    print

    # TODO train here, and return regression parameters
    print "learning..."
    time1 = time.clock()
    # learned_w = splinalg.lsqr(X_train, y_train)[0]
    learned_w = splinalg.lsmr(X_train, y_train, damp=5000)[0]
    time2 = time.clock()
    print "done learning, ", time2 - time1, "s"
    print

    # get rid of training data and load test data
    del X_train
    del y_train
    del train_ids

    # TODO make predictions on test data and write them out
    print "making predictions..."
    preds = X_test.dot(learned_w)
    print "done making predictions"
    print

    if withhold > 0:
        print "MAE on withheld data:", testMAE(preds, y_test)

    if params['writePredict'] == True:
        print "writing predictions..."
        util.write_predictions(preds, test_ids, params['outputFile'])
    print "done!"
def mainTestPred(withhold=0, params=None):
    from sklearn import cross_validation
    import classification_methods as classif

    # default value for params
    if params is None:
        params = {}
    params = dict(
        {
            'withhold': 0,
            'load': None,
            'extractFile': None,
            # arguments to `learn`
            'options': {},
            # k-fold cross-validation
            'n_folds': 10,
            # feature functions
            'ffs': ['system_call_unigram_feats']
        },
        **params)
    op = dict(params['options'])

    train_dir = "train"
    test_dir = "test"
    outputfile = "mypredictions.csv"  # feel free to change this or take it as an argument

    # TODO put the names of the feature functions you've defined above in this list
    ffs = [feature_functions[f] for f in params['ffs']]

    # extract features
    print "extracting training features..."
    X_train, global_feat_dict, y_train, train_ids = extract_feats(ffs, train_dir)
    print "done extracting training features"
    print

    print "extracting test features..."
    X_test, _, y_ignore, test_ids = extract_feats(ffs, test_dir, global_feat_dict=global_feat_dict)
    print "done extracting test features"
    print

    # TODO make predictions on test data and write them out
    print "making predictions..."
    preds = classif.classify(X_train, y_train, X_test, **op)
    print "done making predictions"
    print

    print "writing predictions..."
    util.write_predictions(preds, test_ids, params['outputFile'])
    print "done!"
def predict(train, test, pred_file):
    y_hat, train_rss = run_model(train, test, 'prediction', 0)
    for i, yi in enumerate(y_hat):
        if yi < 0:
            y_hat[i] = 0
        if yi > 5:
            y_hat[i] = 5
    for i, entry in enumerate(test):
        entry['rating'] = float(y_hat[i])
    util.write_predictions(test, pred_file)
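# If run_model returns a NumPy array, the clipping loop above can be written
# in a single call. A minimal, self-contained sketch (the sample values are
# made up for illustration):
import numpy as np

y_hat = np.array([-0.7, 2.3, 6.1, 4.9])
y_hat = np.clip(y_hat, 0, 5)  # bounds each prediction to the rating range [0, 5]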
def screens_budget_summer_lglglg():
    mat, key, regy, _ = rs.extract_feats([rs.metadata_feats])
    screen_ind = key['number_of_screens']
    budget_ind = key['production_budget']
    summer_ind = key['summer_release']
    screens = mat.getcol(screen_ind).todense()
    budget = mat.getcol(budget_ind).todense()
    summer = mat.getcol(summer_ind).todense()

    def safelog(x):
        if x <= 0.:
            return 0.
        else:
            return math.log(x)

    fns = [safelog, safelog, safelog, safelog]
    bs_check = lambda x: x[1] > 0. and x[2] > 0.
    bns_check = lambda x: x[1] > 0. and x[2] == 0.
    nbs_check = lambda x: x[1] == 0. and x[2] > 0.
    nbns_check = lambda x: x[1] == 0. and x[2] == 0.
    bs_arr = format_arr([screens, budget, summer], regy, fns, bs_check)
    bns_arr = format_arr([screens, budget, summer], regy, fns, bns_check)
    nbs_arr = format_arr([screens, budget, summer], regy, fns, nbs_check)
    nbns_arr = format_arr([screens, budget, summer], regy, fns, nbns_check)
    budget_basis_fns = [lambda x: 1, lambda x: x[0], lambda x: x[0]**2,
                        lambda x: x[1], lambda x: x[1]**2]
    no_budget_basis_fns = [lambda x: 1, lambda x: x[0], lambda x: x[0]**2]
    bs_coeffs = freg.coeffs(budget_basis_fns, bs_arr)
    bns_coeffs = freg.coeffs(budget_basis_fns, bns_arr)
    nbs_coeffs = freg.coeffs(no_budget_basis_fns, nbs_arr)
    nbns_coeffs = freg.coeffs(no_budget_basis_fns, nbns_arr)

    test, _, _, ids = rs.extract_feats([rs.metadata_feats], 'testcases.xml',
                                       global_feat_dict=key)
    test_len = test.shape[0]
    preds = []
    for i in range(test_len):
        prod = 0
        x = [test[i, screen_ind], test[i, budget_ind], test[i, summer_ind]]
        logx = tuple([safelog(feat) for feat in x])
        if bs_check(x):
            prod = freg.product(logx, bs_coeffs, budget_basis_fns)
        elif bns_check(x):
            prod = freg.product(logx, bns_coeffs, budget_basis_fns)
        elif nbs_check(x):
            prod = freg.product(logx, nbs_coeffs, no_budget_basis_fns)
        elif nbns_check(x):
            prod = freg.product(logx, nbns_coeffs, no_budget_basis_fns)
        if prod < 0:
            prod = 0
        preds.append(math.e**prod)
    util.write_predictions(preds, ids, 'screens_budget_summer_lglglg-2.csv')
def main():
    train_dir = "train"
    test_dir = "test"
    outputfile = "sample_predictions.csv"  # feel free to change this or take it as an argument

    # TODO put the names of the feature functions you've defined above in this list
    ffs = [first_last_system_call_feats, system_call_count_feats]

    # extract features
    print "extracting training features..."
    X_train, global_feat_dict, t_train, train_ids = extract_feats(ffs, train_dir)
    print "done extracting training features"
    print
    print "global_feat_dict"
    pprint(global_feat_dict)
    print "t_train"
    pprint(t_train)

    # TODO train here, and learn your classification parameters
    print "learning..."
    learned_W = np.random.random((len(global_feat_dict), len(util.malware_classes)))
    print "learned_W"
    pprint(learned_W)
    print "done learning"
    print

    # get rid of training data and load test data
    del X_train
    del t_train
    del train_ids
    print "extracting test features..."
    X_test, quoi, t_ignore, test_ids = extract_feats(ffs, test_dir, global_feat_dict=global_feat_dict)
    print "done extracting test features"
    print "Quoi"
    pprint(quoi)
    print "t_ignore"
    pprint(t_ignore)
    print

    # TODO make predictions on test data and write them out
    print "making predictions..."
    preds = np.argmax(X_test.dot(learned_W), axis=1)
    print "preds"
    pprint(preds)
    print "done making predictions"
    print

    print "writing predictions..."
    util.write_predictions(preds, test_ids, outputfile)
    print "done!"
def main():
    train_dir = "train"
    test_dir = "test"
    outputfile = "logistic.csv"  # feel free to change this or take it as an argument

    # TODO put the names of the feature functions you've defined above in this list
    ffs = [system_call_counts, system_call_count_feats]

    # extract features
    print "extracting training features..."
    X_train, global_feat_dict, t_train, train_ids = extract_feats(ffs, train_dir)
    print "done extracting training features"
    print

    # TODO train here, and learn your classification parameters
    print "learning..."
    # RF = RandomForestClassifier()
    # RF.fit(X_train, t_train)
    # learned_W = np.random.random((len(global_feat_dict), len(util.malware_classes)))
    X_train = X_train.toarray()
    y_train = to_categorical(t_train)
    model = Sequential()
    model.add(Dense(32, activation='relu', input_dim=X_train.shape[1]))
    model.add(Dense(y_train.shape[1], activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy')
    model.fit(X_train, y_train, epochs=200, batch_size=64)
    print "done learning"
    print

    # get rid of training data and load test data
    del X_train
    del t_train
    del train_ids
    print "extracting test features..."
    X_test, _, t_ignore, test_ids = extract_feats(ffs, test_dir, global_feat_dict=global_feat_dict)
    print "done extracting test features"
    print

    # TODO make predictions on test data and write them out
    print "making predictions..."
    # preds = RF.predict(X_test)
    # preds = np.argmax(X_test.dot(learned_W), axis=1)
    preds_vec = model.predict(X_test.toarray())
    preds = np.argmax(preds_vec, axis=1)
    print "done making predictions"
    print

    print "writing predictions..."
    util.write_predictions(preds, test_ids, outputfile)
    print "done!"
def main(saved_extraction=None, type_clf='tree', nb_tree=20):
    from sklearn.ensemble import RandomForestClassifier
    train_dir = "train"
    test_dir = "test"
    outputfile = "mypredictions.csv"

    # YOU ADD HERE THE NEW FUNCTIONS; THEY HAVE TO RETURN A COUNTER CLASS
    ffs = [first_last_system_call_feats, system_call_count_feats,
           syscall_name_counter, dll_type, failure_success,
           string_entropy, Api_call_counter]

    if saved_extraction:
        X_train, global_feat_dict, t_train, train_ids = np.load('train_extract.npy')
        X_test, _, t_ignore, test_ids = np.load('test_extract.npy')
    else:
        X_train, global_feat_dict, t_train, train_ids = extract_feats(ffs, train_dir)
        X_test, _, t_ignore, test_ids = extract_feats(ffs, test_dir, global_feat_dict=global_feat_dict)
        np.save('train_extract.npy', (X_train, global_feat_dict, t_train, train_ids))
        np.save('test_extract.npy', (X_test, _, t_ignore, test_ids))

    # Cross-validation for scoring purposes
    print "Number of Feature used in the analysis :", X_train.shape
    Class_weight = np.array([3.69, 1.62, 1.2, 1.03, 1.33, 1.26, 1.72, 1.33,
                             52.14, 0.68, 17.56, 1.04, 12.18, 1.91, 1.3]) / 100.0

    if type_clf == 'tree':
        clf = RandomForestClassifier(n_estimators=nb_tree)
    elif type_clf == 'Etree':
        clf = ExtraTreesClassifier(n_estimators=nb_tree)
    elif type_clf == 'SVC':
        clf = svm.SVC(kernel='rbf')

    if type_clf == 'tree' or type_clf == 'Etree':
        weight = []
        for i in range(0, len(t_train[:int(len(X_train) * 0.75)])):
            ind = t_train[i]
            weight.append(Class_weight[ind])
        clf.fit(X_train[:int(len(X_train) * 0.75)],
                t_train[:int(len(X_train) * 0.75)],
                sample_weight=weight)
    else:
        clf.fit(X_train[:int(len(X_train) * 0.75)],
                t_train[:int(len(X_train) * 0.75)])
    CV_hat = clf.predict(X_train[int(len(X_train) * 0.75):])
    d = (t_train[int(len(X_train) * 0.75):] == CV_hat)
    print "Estimation is:", float(d.sum()) / len(d)

    if type_clf == 'tree':
        clf = RandomForestClassifier(n_estimators=nb_tree)
    elif type_clf == 'SVC':
        clf = svm.SVC(kernel='rbf')
    elif type_clf == 'Etree':
        clf = ExtraTreesClassifier(n_estimators=nb_tree)

    if type_clf == 'tree' or type_clf == 'Etree':
        weight = []
        for i in range(0, len(t_train)):
            ind = t_train[i]
            weight.append(Class_weight[ind])
        clf.fit(X_train, t_train, sample_weight=weight)
    else:
        clf.fit(X_train, t_train)
    t_hat = clf.predict(X_test)
    util.write_predictions(t_hat, test_ids, outputfile)
def main(saved_extraction=None, type_clf='tree', nb_tree=20):
    from sklearn.ensemble import RandomForestClassifier
    train_dir = "train"
    test_dir = "test"
    outputfile = "mypredictions.csv"

    # YOU ADD HERE THE NEW FUNCTIONS; THEY HAVE TO RETURN A COUNTER CLASS
    ffs = [
        first_last_system_call_feats, system_call_count_feats,
        syscall_name_counter, dll_type, failure_success
    ]

    if saved_extraction:
        X_train, global_feat_dict, t_train, train_ids = np.load('train_extract.npy')
        X_test, _, t_ignore, test_ids = np.load('test_extract.npy')
    else:
        X_train, global_feat_dict, t_train, train_ids = extract_feats(ffs, train_dir)
        X_test, _, t_ignore, test_ids = extract_feats(ffs, test_dir, global_feat_dict=global_feat_dict)
        np.save('train_extract.npy', (X_train, global_feat_dict, t_train, train_ids))
        np.save('test_extract.npy', (X_test, _, t_ignore, test_ids))

    # clf = svm.SVC()
    # Cross-validation for scoring purposes
    if type_clf == 'tree':
        clf = RandomForestClassifier(n_estimators=nb_tree)
    elif type_clf == 'Etree':
        clf = ExtraTreesClassifier(n_estimators=nb_tree)
    elif type_clf == 'SVC':
        clf = svm.SVC()
    clf.fit(X_train[:int(len(X_train) * 0.75)],
            t_train[:int(len(X_train) * 0.75)])
    CV_hat = clf.predict(X_train[int(len(X_train) * 0.75):])
    d = (t_train[int(len(X_train) * 0.75):] == CV_hat)
    print "Estimation is:", float(d.sum()) / len(d)

    if type_clf == 'tree':
        clf = RandomForestClassifier(n_estimators=nb_tree)
    elif type_clf == 'SVC':
        clf = svm.SVC()
    elif type_clf == 'Etree':
        clf = ExtraTreesClassifier(n_estimators=nb_tree)
    clf.fit(X_train, t_train)
    t_train_hat = clf.predict(X_train)
    t_hat = clf.predict(X_test)
    util.write_predictions(t_hat, test_ids, outputfile)
    return X_train, global_feat_dict, t_train, train_ids
def main():
    X_train, t_train, train_ids = create_data_matrix(0, 10000, TRAIN_DIR)
    # X_valid, t_valid, valid_ids = create_data_matrix(1000, 2000, TRAIN_DIR)
    X_test, t_test, test_ids = create_data_matrix(0, 3724, TEST_DIR)

    # print 'Data matrix (training set):'
    # print np.array(X_train)
    # print 'Classes (training set):'
    # print np.array(t_train)

    clf = RandomForestClassifier(n_estimators=20, max_depth=None, max_features=1,
                                 criterion="gini", min_samples_split=1,
                                 min_samples_leaf=1, bootstrap=False)
    # clf = Regressor(
    #     layers=[
    #         Layer("Rectifier", units=100),
    #         Layer("Linear")],
    #     learning_rate=0.001,
    #     n_iter=100000)

    # use a full grid over all parameters
    # param_grid = {"max_depth": [3, None],
    #               "max_features": [1, 3, 10],
    #               "min_samples_split": [1, 3, 10],
    #               "min_samples_leaf": [1, 3, 10],
    #               "bootstrap": [True, False],
    #               "criterion": ["gini", "entropy"]}
    #
    # run grid search
    # grid_search = GridSearchCV(clf, param_grid=param_grid)
    # grid_search.fit(X_train, t_train)
    # preds = grid_search.predict(X_test)
    # print grid_search.best_params_

    clf = clf.fit(X_train, t_train)
    preds = clf.predict(X_test)

    # right = 0
    # wrong = 0
    # for p, pred in enumerate(preds):
    #     if np.round(pred) == t_valid[p]:
    #         right += 1
    #     else:
    #         wrong += 1
    # print right
    # print wrong

    ut.write_predictions(preds, test_ids, "result.csv")
def main(): print "# Loading features..." X_train, t_train, _ = pickle.load(open("../../features/all_tags/train.pickle")) X_test, _, test_ids = pickle.load(open("../../features/all_tags/test.pickle")) print "# Training RandomForestClassifier on train data..." RFC = RandomForestClassifier(n_estimators = 40, n_jobs = -1) RFC.fit(X_train, t_train) print "# Predicting test data..." pred = RFC.predict(X_test) util.write_predictions(pred, test_ids, "../../predictions/single_RF_predictions.csv") print "# Done!"
def main():
    train_dir = "train"
    test_dir = "test"
    outputfile = "mypredictions.csv"  # feel free to change this or take it as an argument

    lr = cl.LogisticRegression()
    knn = cl.kNN()
    ds = ff.Dataset()

    print "training..."
    X, y, ids = ds.getDataset(train_dir)
    lr.fit(X, y)
    knn.fit(X, y)
    del X
    del y

    print "training complete. Now preparing for submit"
    X, y, ids = ds.getDataset(test_dir)
    predsLR = lr.predict(X)
    pbLR = lr.classifier_().predict_proba(X)
    predskNN = knn.predict(X)
    pbkNN = knn.classifier_().predict_proba(X)
    featDict = ds.getFeatureDict()
    # print "feature", featDict['Swizzor_found']
    X_arr = X.toarray()

    finalpred = []
    for i in xrange(len(predsLR)):
        if X_arr[i][featDict['Swizzor_found']] > 0:
            choice = 10
        elif np.max(pbkNN[i]) > 0.8:
            # if kNN is more than 0.8 sure, it is very accurate
            choice = predskNN[i]
        elif np.max(pbkNN[i]) - np.max(pbLR[i]) > 0.4:
            # if kNN is 0.4 more sure than LR, use that
            choice = predskNN[i]
        else:
            choice = predsLR[i]
        finalpred.append(choice)

    print "writing predictions..."
    util.write_predictions(finalpred, ids, outputfile)
    print "done!"
def main():
    train_dir = "train"
    test_dir = "test"
    outputfile = "013.csv"  # feel free to change this or take it as an argument

    # TODO put the names of the feature functions you've defined above in this list
    ffs = [
        first_last_system_call_feats, system_call_count_feats,
        count_all_feats, count_all_reasons, count_all_flags
    ]

    # extract features
    print "extracting training features..."
    X_train, global_feat_dict, t_train, train_ids = extract_feats(ffs, train_dir)
    print "done extracting training features"
    print

    # TODO train here, and learn your classification parameters
    print "learning..."
    model = RandomForestClassifier(n_estimators=300, n_jobs=-1)
    model.fit(X_train, t_train)
    print "done learning"
    print

    # get rid of training data and load test data
    del X_train
    del t_train
    del train_ids
    print "extracting test features..."
    X_test, _, t_ignore, test_ids = extract_feats(ffs, test_dir, global_feat_dict=global_feat_dict)
    print "done extracting test features"
    print

    # TODO make predictions on test data and write them out
    print "making predictions..."
    preds = model.predict(X_test)
    # preds = np.argmax(X_test.dot(learned_W), axis=1)
    print preds
    print "done making predictions"
    print

    print "writing predictions..."
    util.write_predictions(preds, test_ids, outputfile)
    print "done!"
def GetProduceSolutionResults(self, request, context):
    """
    TA2-3 API call
    """
    logging.critical("Message received: GetProduceSolutionResults")
    request_id = request.request_id
    request_params = self._solution_score_map[request_id]
    start = solutiondescription.compute_timestamp()

    solution_id = request_params.fitted_solution_id
    solution = self._solutions[solution_id]

    inputs = self._get_inputs(solution.problem, request_params.inputs)
    try:
        output = solution.produce(inputs=inputs, solution_dict=self._solutions)[0]
        logging.critical("Produce predictions with rows = %s", len(output))
    except:
        logging.critical("Exception in produce: %s", solution.primitives)
        logging.critical("Exception in produce: %s", sys.exc_info()[0])
        output = None

    result = None
    search_id_str = self._solution_to_search[solution_id]
    outputDir = os.environ['D3MOUTPUTDIR'] + "/" + search_id_str
    if output is not None:
        uri = util.write_predictions(output, outputDir + "/predictions", request_id)
        uri = 'file://{uri}'.format(uri=os.path.abspath(uri))
        result = value_pb2.Value(csv_uri=uri)
    else:
        result = value_pb2.Value(error=value_pb2.ValueError(message="Output is NULL"))

    self._solution_score_map.pop(request_id, None)

    msg = core_pb2.Progress(state=core_pb2.COMPLETED, status="", start=start,
                            end=solutiondescription.compute_timestamp())
    steps = []
    for i in range(solution.num_steps()):
        steps.append(core_pb2.StepProgress(progress=msg))

    exposed_outputs = {}
    if request_params.expose_outputs is not None and len(request_params.expose_outputs) > 0:
        last_step_output = request_params.expose_outputs[len(request_params.expose_outputs) - 1]
    else:
        last_step_output = solution.outputs[0][2]
    exposed_outputs[last_step_output] = result

    yield core_pb2.GetProduceSolutionResultsResponse(progress=msg, steps=steps,
                                                     exposed_outputs=exposed_outputs)
def get_comparable_performance_test():
    result = write_predictions(raw_test_vua, test_dataloader_vua, RNNseq_model,
                               using_GPU, '../data/VUAsequence/VUA_seq_formatted_test.csv')
    f = open('../predictions/vua_seq_test_predictions_LSTMsequence_vua.csv', 'w')
    writer = csv.writer(f)
    writer.writerows(result)
    f.close()
    get_performance_VUAverb_test()
    get_performance_VUA_test()
def screens(basis_fns, fns, inv_fn, outfile):
    mat, key, regy, _ = rs.extract_feats([rs.metadata_feats])
    screen_ind = key['number_of_screens']
    screens = mat.getcol(screen_ind).todense()
    train_arr = format_arr([screens], regy, fns)
    coeffs = freg.coeffs(basis_fns, train_arr)
    test, _, _, ids = rs.extract_feats([rs.metadata_feats], 'testcases.xml',
                                       global_feat_dict=key)
    test_len = test.shape[0]
    preds = []
    for i in range(test_len):
        prod = freg.product((fns[0](test[i, screen_ind]),), coeffs, basis_fns)
        if prod < 0:
            prod = 0
        preds.append(inv_fn(prod))
    util.write_predictions(preds, ids, outfile)
def prediction(train_valid, test, pred_filename):
    import data_processing as dp
    dphelper = dp.data_processing()
    dense_train, sparse_train = dphelper.split(train_valid)
    dense_test, sparse_test = dphelper.split(test)

    #######
    import sgd_bias as sgd
    y_hat_dense, train_rmse_dense = sgd.sgd_bias(dense_train, dense_test, 'prediction')
    import baseline as bs
    y_hat_sparse, train_rmse_sparse = bs.baseline(sparse_train, sparse_test, 'prediction')
    #######

    print 'dense subset train rmse: %.16f' % train_rmse_dense
    print 'sparse subset train rmse: %.16f' % train_rmse_sparse

    test = dphelper.merge(test, y_hat_dense, y_hat_sparse)
    util.write_predictions(test, pred_filename)
def runCosine(training_set, user_list, validation_set, test_queries):
    global dataChoice
    users = {}
    for row in training_set:
        user_id = row['user']
        isbn = row['isbn']
        if user_id not in users:
            users[user_id] = {}
            users[user_id]['ratings'] = {}
        users[user_id]['ratings'][isbn] = row['rating']

    # calculate cosine distance and find closest match
    cosine.topMatch(users)

    # find mean rating per book
    books, global_mean = cosine.meanPerItem(users)

    total_error = 0.0
    sample_count = 0
    if dataChoice == 'validate':
        print "user\tprediction\tactual"
        for row in validation_set:
            user = row['user']
            isbn = row['isbn']
            prediction = cosine.predict(users, user, books, isbn, global_mean)
            print user, "\t", prediction, "\t\t", row['rating']
            total_error += abs(prediction - row['rating'])
            sample_count += 1
        return total_error / sample_count
    else:
        # dataChoice == 'full'
        for query in test_queries:
            user_id = query['user']
            isbn = query['isbn']
            query['rating'] = cosine.predict(users, user_id, books, isbn, global_mean)
        # Write the prediction file.
        util.write_predictions(test_queries, pred_filename)
def main(X_train=None, global_feat_dict=None):
    trainfile = "train.xml"
    testfile = "testcases.xml"
    outputfile = "mypredictions2.csv"  # feel free to change this or take it as an argument

    # TODO put the names of the feature functions you've defined above in this list
    ffs = [metadata_feats, unigram_feats]

    if X_train is None and global_feat_dict is None:
        # extract features
        print "extracting training features..."
        X_train, global_feat_dict, y_train, train_ids = extract_feats(ffs, trainfile)
        print "done extracting training features"
        print

    # TODO train here, and return regression parameters
    print "learning..."
    # learned_w = splinalg.lsqr(X_train, y_train)[0]
    learned_w = splinalg.lsmr(X_train, y_train)[0]
    print "done learning"
    print

    # get rid of training data and load test data
    del X_train
    del y_train
    del train_ids
    print "extracting test features..."
    X_test, _, y_ignore, test_ids = extract_feats(ffs, testfile, global_feat_dict=global_feat_dict)
    print "done extracting test features"
    print

    # TODO make predictions on test data and write them out
    print "making predictions..."
    preds = X_test.dot(learned_w)
    print "done making predictions"
    print

    print "writing predictions..."
    util.write_predictions(preds, test_ids, outputfile)
    print "done!"
def main():
    train_dir = "train"
    test_dir = "test"
    outputfile = "mypredictions.csv"  # feel free to change this or take it as an argument

    # TODO put the names of the feature functions you've defined above in this list
    ffs = [first_last_system_call_feats, system_call_count_feats]

    # extract features
    print "extracting training features..."
    X_train, global_feat_dict, t_train, train_ids = extract_feats(ffs, train_dir)
    print "done extracting training features"
    print

    # TODO train here, and learn your classification parameters
    print "learning..."
    learned_W = np.random.random((len(global_feat_dict), len(util.malware_classes)))
    print "done learning"
    print

    # get rid of training data and load test data
    del X_train
    del t_train
    del train_ids
    print "extracting test features..."
    X_test, _, t_ignore, test_ids = extract_feats(ffs, test_dir, global_feat_dict=global_feat_dict)
    print "done extracting test features"
    print

    # TODO make predictions on test data and write them out
    print "making predictions..."
    preds = np.argmax(X_test.dot(learned_W), axis=1)
    print "done making predictions"
    print

    print "writing predictions..."
    util.write_predictions(preds, test_ids, outputfile)
    print "done!"
def main():
    train_dir = "train"
    test_dir = "test"
    outputfile = "mypredictions.csv"  # feel free to change this or take it as an argument

    # extract features (ffs is assumed to be defined at module level here)
    print "extracting training features..."
    X_train, global_feat_dict, t_train, train_ids = extract_feats(ffs, train_dir)
    print "done extracting training features"
    print

    # TODO train here, and learn your classification parameters
    print "learning..."
    learned_W = np.random.random((len(global_feat_dict), len(util.malware_classes)))
    print "done learning"
    print

    # get rid of training data and load test data
    del X_train
    del t_train
    del train_ids
    print "extracting test features..."
    X_test, _, t_ignore, test_ids = extract_feats(ffs, test_dir, global_feat_dict=global_feat_dict)
    print "done extracting test features"
    print

    # TODO make predictions on test data and write them out
    print "making predictions..."
    preds = np.argmax(X_test.dot(learned_W), axis=1)
    print "done making predictions"
    print

    print "writing predictions..."
    util.write_predictions(preds, test_ids, outputfile)
    print "done!"
def main(): print "# Loading features..." X_train, t_train, _ = pickle.load(open("../../features/all_tags/train.pickle")) X_test, _, test_ids = pickle.load(open("../../features/all_tags/test.pickle")) dtrain = xgb.DMatrix(X_train, label=t_train) print "# Training XGBoost on training data..." param = {'bst:max_depth':30, 'eta':0.1, 'silent':2, 'objective':'multi:softprob', 'num_class': 15 } param['eval_metric'] = 'merror' param['min_child_weight'] = 3 param['nthread'] = 16 param['colsample_bytree'] = 0.5 evallist = [(dtrain,'train')] bst = xgb.train(param, dtrain, 500, evallist) print "# Predicting test data..." dout = xgb.DMatrix(X_test) t_probs = bst.predict(dout) t_pred = [prob.tolist().index(max(prob)) for prob in t_probs] util.write_predictions(t_pred, test_ids, "../../predictions/xgboost_predictions.csv") print "# Done!"
def screens_budget_lglglg():
    mat, key, regy, _ = rs.extract_feats([rs.metadata_feats])
    screen_ind = key['number_of_screens']
    budget_ind = key['production_budget']
    screens = mat.getcol(screen_ind).todense()
    budget = mat.getcol(budget_ind).todense()

    budget_fns = [lambda x: math.log(x) for i in range(3)]
    budget_check = lambda x: x[1] > 0.
    budget_arr = format_arr([screens, budget], regy, budget_fns, budget_check)
    no_budget_arr = format_arr([screens], regy,
                               [lambda x: math.log(x), lambda x: math.log(x)])
    budget_basis_fns = [lambda x: 1, lambda x: x[0], lambda x: x[0]**2,
                        lambda x: x[1], lambda x: x[1]**2]
    no_budget_basis_fns = [lambda x: 1, lambda x: x[0], lambda x: x[0]**2]
    budget_coeffs = freg.coeffs(budget_basis_fns, budget_arr)
    no_budget_coeffs = freg.coeffs(no_budget_basis_fns, no_budget_arr)

    test, _, _, ids = rs.extract_feats([rs.metadata_feats], 'testcases.xml',
                                       global_feat_dict=key)
    test_len = test.shape[0]
    preds = []
    for i in range(test_len):
        prod = 0
        if test[i, budget_ind] > 0.:
            x = (budget_fns[0](test[i, screen_ind]), budget_fns[1](test[i, budget_ind]))
            prod = freg.product(x, budget_coeffs, budget_basis_fns)
        else:
            x = (math.log(test[i, screen_ind]),)
            prod = freg.product(x, no_budget_coeffs, no_budget_basis_fns)
        if prod < 0:
            prod = 0
        preds.append(math.e**prod)
    util.write_predictions(preds, ids, 'screens_budget_lglglg-2.csv')
def main():
    final_ids = []
    final_prediction = []

    # fetch features for training and test data
    # substitute for a pickle load, for training data!
    print "# Loading Features..."
    X_train, t_train, train_ids = pickle.load(open("../../features/all_tags/train.pickle"))
    X_test, t_test, test_ids = pickle.load(open("../../features/all_tags/test.pickle"))

    # separate t_train only between 0 and 1, where 0 is None and 1 is any Malware
    none = util.malware_classes.index("None")
    t_train_bin = [0 if x == none else 1 for x in t_train]
    t_test_bin = [0 if x == none else 1 for x in t_test]

    # train a Random Forest on the data, using a binary classification only
    # (between Malware and None)
    print "# Training RandomForestClassifier with n_estimators = {}, for a binary classification between Malware or None...".format(N)
    RFC_bin = RandomForestClassifier(n_estimators=N, n_jobs=-1)
    RFC_bin.fit(X_train, t_train_bin)

    print "# Predicting Malware vs None..."
    # predict whether the test inputs are Malwares or Nones
    pred_bin = RFC_bin.predict(X_test)

    # fetch all datapoints that we considered as Malwares
    X_test_malware = []
    t_test_malware = []
    test_ids_malware = []
    for predicted, ID, true, features in zip(pred_bin, test_ids, t_test, X_test):
        # if we predicted None, this goes to our final prediction;
        # otherwise, we add it to X_test_malware
        if predicted == 0:
            final_prediction.append(none)
            final_ids.append(ID)
        else:
            X_test_malware.append(features)
            t_test_malware.append(true)
            test_ids_malware.append(ID)

    # fetch all the Malwares
    X_train_malware = []
    t_train_malware = []
    for true, features in zip(t_train, X_train):
        if true != util.malware_classes.index("None"):
            X_train_malware.append(features)
            t_train_malware.append(true)
    X_train_malware = np.asarray(X_train_malware)
    t_train_malware = np.asarray(t_train_malware)

    print "# Training another RandomForestClassifier with n_estimators = {}, for a multi-class classification between only Malwares...".format(64)
    # train a Random Forest on the data, using now only the Malwares
    RFC_malware = RandomForestClassifier(n_estimators=64, n_jobs=-1, class_weight='balanced')
    RFC_malware.fit(X_train_malware, t_train_malware)

    print "# Predicting whatever we had not classified as None before..."
    pred_malware = RFC_malware.predict(X_test_malware)
    for predicted, ID in zip(pred_malware, test_ids_malware):
        final_prediction.append(predicted)
        final_ids.append(ID)

    util.write_predictions(final_prediction, final_ids,
                           "../../predictions/multi_classifier_predictions.csv")
    print "# Done!"
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--cross_validate',
                        help='Run cross-validation (instead of output)',
                        action='store_true')
    parser.add_argument('-t', '--train_file', nargs=1, help='Training file')
    args = parser.parse_args()
    if args.train_file:
        trainfile = args.train_file[0]
    else:
        trainfile = "train.xml"
    testfile = "testcases.xml"
    outputfile = "mypredictions2.csv"  # feel free to change this or take it as an argument

    # put the names of the feature functions you've defined above in this list
    ffs = [metadata_feats, squared_terms]  # , prod_company, review_score] #, review_terms] #, unigram_feats, threshold_terms]

    # extract features
    print "extracting training features..."
    X_train, global_feat_dict, y_train, train_ids = extract_feats(ffs, trainfile)
    global_feat_dict_sorted = sorted(global_feat_dict.iteritems(), key=operator.itemgetter(1))
    print global_feat_dict_sorted
    # print X_train.sum(axis=0)
    # print "1:", X_train[0]
    # print "2:", X_train[1]
    # print "3:", X_train[2]
    print "done extracting training features"
    print

    if args.cross_validate:
        print "running cross-validation tests..."
        score = crossvalidate.getScore(X_train, y_train, splinalg.lsqr)
        print "MAE cross validation score:", score
        print "done cross-validation"
    else:
        # write out predictions on test data
        # train here, and return regression parameters
        print "learning..."
        learned_w = splinalg.lsqr(X_train, y_train)[0]
        print '\n'.join(['%i: %8.8f %s' % (n, learned_w[n], global_feat_dict_sorted[n][0])
                         for n in xrange(len(learned_w))])
        '''
        preds = np.absolute(X_train.dot(learned_w))
        myfile = open('bb.txt', 'wb')
        wr = csv.writer(myfile, dialect='excel')
        for i in range(len(preds)):
            wr.writerow([i, X_train[i, 0], X_train[i, 1], y_train[i], preds[i]])
        '''
        print "done learning"
        print

        # get rid of training data and load test data
        del X_train
        del y_train
        del train_ids
        print "extracting test features..."
        X_test, _, y_ignore, test_ids = extract_feats(ffs, testfile, global_feat_dict=global_feat_dict)
        print "done extracting test features"
        print

        # make predictions on test data and write them out
        print "making predictions..."
        preds = np.absolute(X_test.dot(learned_w))
        # blockbuster correction factor
        for i in range(len(preds)):
            if X_test[i, 1] > 50000000.0:
                preds[i] *= 0.85
        print "done making predictions"
        print

        print "writing predictions..."
        util.write_predictions(preds, test_ids, outputfile)
        print "done!"
X_sentiment_origin = X_sentiment.copy()
X_sentiment = X_sentiment_origin.copy()
y_train = y_train_origin.copy()

y_train = np.log(y_train)
mask = np.array(y_train > 14)
y_train = y_train[mask]
X_sentiment += 0.1
X_sentiment = np.log(X_sentiment[mask, :])
df = DataFrame(np.concatenate((y_train[:, np.newaxis], X_sentiment), axis=1))
scatter_matrix(df, alpha=0.2, figsize=(15, 15), diagonal='kde')
"""

"""
# TODO train here, and return regression parameters
print "learning..."
learned_w = splinalg.lsqr(X_train, y_train)[0]
print "done learning"
print

# get rid of training data and load test data
del X_train
del y_train
del train_ids

print "extracting test features..."
X_test, _, y_ignore, test_ids = extract_feats(ffs, testfile, global_feat_dict=global_feat_dict)
print "done extracting test features"
print
                           random_state=None, verbose=0, min_density=None,
                           compute_importances=None)
erf.fit(X_train, y_train)
print "POST-selection oob\t%.4f" % (erf.oob_score_)
# print "Test oob\t%.4f" % erf.score(X_test, y_test)
pred_selected = erf.predict(X_test)

# Output predictions
# y_pred = erf.predict(X_test)
from sklearn.metrics import accuracy_score
print accuracy_score(pred_full, pred_selected)
ids = np.load(open('ids', 'rb'))
import util
util.write_predictions(pred_selected, ids, 'predictions/erf_80var.csv')

"""
pos = np.arange(sorted_idx.shape[0]) + .5
discard_bottom = 60
pos_plot = pos[discard_bottom:]
fi_plot = feature_importance[sorted_idx][discard_bottom:]
names_plot = feature_names[sorted_idx][discard_bottom:]
pl.subplot(1, 1, 1)
pl.barh(pos_plot, fi_plot, align='center')
pl.yticks(pos_plot, names_plot)
pl.xlabel('Relative Importance')
pl.title('Variable Importance RF')
pl.show()
"""
def main():
    train_dir = "train"
    test_dir = "test"
    outputfile = "sample_predictions.csv"  # feel free to change this or take it as an argument

    # DONE put the names of the feature functions you've defined above in this list.
    # We added all of our feature engineering, and we also tried multiple pairings
    # of system calls: bigrams, trigrams and quadrigrams.
    # We ran each of these separately. The validation accuracy is reported in
    # the LaTeX table.

    # ffs without any grams, only feature engineering
    # ffs = [first_last_system_call_feats, system_call_count_feats, system_load_dll_feats,
    #        system_open_key_feats, system_vm_protect_feats, system_dump_line_feats,
    #        system_delete_file_feats, system_remove_directory_feats, system_create_directory_feats]

    # bigram ffs
    ffs = [
        first_last_system_call_feats, system_call_count_feats,
        system_load_dll_feats, system_open_key_feats, system_vm_protect_feats,
        system_dump_line_feats, system_delete_file_feats,
        system_remove_directory_feats, system_create_directory_feats,
        system_bigrams_feats
    ]

    # trigram ffs
    # ffs = [first_last_system_call_feats, system_call_count_feats, system_load_dll_feats,
    #        system_open_key_feats, system_vm_protect_feats, system_dump_line_feats,
    #        system_delete_file_feats, system_remove_directory_feats, system_create_directory_feats,
    #        system_trigrams_feats]

    # quadrigram ffs
    # ffs = [first_last_system_call_feats, system_call_count_feats, system_load_dll_feats,
    #        system_open_key_feats, system_vm_protect_feats, system_dump_line_feats,
    #        system_delete_file_feats, system_remove_directory_feats, system_create_directory_feats,
    #        system_quadrigrams_feats]

    # extract features
    print("extracting training features...")
    X_train, global_feat_dict, t_train, train_ids = extract_feats(ffs, train_dir)
    print("done extracting training features")
    print()

    # split X_train into a train (75%) and validation set (25%)
    a1, a2, a3, a4 = np.split(X_train.todense()[:][:3080], 4)
    X_train_train = sparse.csr_matrix(np.vstack((a1, a2, a3)))
    X_train_valid = sparse.csr_matrix(a4)

    # split t_train into a train (75%) and validation set (25%)
    a1, a2, a3, a4 = np.split(t_train[:3080], 4)
    t_train_train = np.concatenate((a1, a2, a3))
    t_train_valid = a4

    # As we can see, our validation technique is a bit simplistic. We are only
    # taking the last 25% chunk of the data, and we know the data is acquired
    # over a few days, so we are validating on the later days. This gives us a
    # somewhat skewed validation set. That being said, we decided to stick with
    # it because what really matters in this validation scheme is the overall
    # hierarchy of which models do better, because we then re-train the best
    # model on the full training set to make our predictions to Kaggle.
    # DONE train here, and learn your classification parameters.
    # We train naive Bayes, SVM, random forest and gradient boosted trees on
    # all three cases: bigrams, trigrams and quadrigrams.
    print("learning...")

    # We first start with a multinomial naive Bayes classifier, since this is
    # something we learned with the generative models, and it is a linear
    # model and thus relatively simple.
    # 1st - Multinomial Naive Bayes Classifier
    model_mnb = MultinomialNB()
    model_mnb.fit(X_train_train, t_train_train)
    x1 = categ_accuracy(model_mnb, X_train_valid, t_train_valid)
    print("MNB Classifier Accuracy: " + str(x1))
    # the results are not great, so we try a more complex linear model

    # Next, we wanted to try an SVM since we started learning about SVMs, and it
    # is a slightly more complex and generalizable linear model.
    # 2nd - SVM Classifier
    model_svm = svm.SVC()  # SVM Classifier
    model_svm.fit(X_train_train, t_train_train)
    x0 = categ_accuracy(model_svm, X_train_valid, t_train_valid)
    print("SVM Classifier Accuracy: " + str(x0))
    # the results are much better, but can still probably be improved.
    # We turn to ensemble methods.

    # Next, we wanted to try a random forest classifier as an easier-to-train
    # non-linear model that worked really well on the last practical.
    # 3rd - Random Forest Classifier
    model_rf = RandomForestClassifier()
    model_rf.fit(X_train_train, t_train_train)
    x2 = categ_accuracy(model_rf, X_train_valid, t_train_valid)
    print("RF Classifier Accuracy: " + str(x2))
    # this is quite good! We nonetheless try one last model.

    # Finally, we try gradient boosted trees as a supposedly better random forest.
    # 4th - Gradient Boosting Classifier
    model_gb = GradientBoostingClassifier()
    model_gb.fit(X_train_train, t_train_train)
    x3 = categ_accuracy(model_gb, X_train_valid, t_train_valid)
    print("GB Classifier Accuracy: " + str(x3))
    # does about the same as random forest, but not really better. Let's see
    # what happens after doing grid search to optimize parameters.

    # The best appears to be a random forest, so we will use grid search
    # to improve the parameters.
    # we save the best parameters here
    best_model_parameters = None
    # this is about the score from our random parameters; it will be our
    # benchmark to improve with grid search
    best_acc = 0  # zero initial accuracy

    # grid over a few max_depth values and n_estimators from 1 to 220
    for depth in [1, 20, 40, 60]:
        for n_estimators in [1, 20, 40, 60, 80, 100, 120, 140, 160, 180, 200, 220]:
            # setting up the model parameters dict
            model_parameters = {
                'max_depth': depth,
                'random_state': 0,
                'n_estimators': n_estimators
            }
            optimized_model_rf = RandomForestClassifier(**model_parameters)
            # fitting and validating the model with these parameters
            optimized_model_rf.fit(X_train_train, t_train_train)
            val_acc = categ_accuracy(optimized_model_rf, X_train_valid, t_train_valid)
            # updating the best parameters if the validation accuracy is better
            if val_acc > best_acc:
                best_acc = val_acc
                best_model_parameters = model_parameters

    # display the best parameters and the associated accuracy
    print('Best RF/GB model parameters:', best_model_parameters)
    print('Best RF Classifier Accuracy:', best_acc)

    # Doing grid search for gradient boosting is too computationally expensive.
    # However, gradient boosted trees are essentially a boosted random forest,
    # and we can see that from the fact that their accuracy scores are very
    # similar. Therefore, we infer that optimal parameters for random forest
    # will be similar to optimal parameters for gradient boosted trees, and
    # use the same optimal parameters.
    # train our model with optimal parameters from the random forest grid search
    optimized_model_gb = GradientBoostingClassifier(**best_model_parameters)
    optimized_model_gb.fit(X_train_train, t_train_train)
    val_acc_gb = categ_accuracy(optimized_model_gb, X_train_valid, t_train_valid)

    # display the best accuracy
    print('Best GB Classifier Accuracy:', val_acc_gb)

    # As listed in the report, we get better accuracy with a plain random
    # forest classifier for all types of grams, so we simply use a random
    # forest for the final training and for our submission.
    # We can also see that bigrams provide the best accuracy, so we stick to
    # bigrams for our predictions submission.

    # RF is the best model we have now, at about 0.87, so we'll train a RF on
    # the full data. Here, we retrain a new random forest model on the full
    # training data set (to have a better model than the one we used to get
    # validation accuracy) and use that for submission.
    model_rf_all_opt = RandomForestClassifier(**best_model_parameters)
    model_rf_all_opt.fit(X_train, t_train)
    print("done learning")
    print()

    # get rid of training data and load test data
    del X_train
    del t_train
    del train_ids
    print("extracting test features...")
    X_test, _, t_ignore, test_ids = extract_feats(ffs, test_dir, global_feat_dict=global_feat_dict)
    print("done extracting test features")
    print()

    # make predictions on test data and write them out
    print("making predictions...")
    preds = model_rf_all_opt.predict(X_test)
    print("done making predictions")
    print()

    print("writing predictions...")
    util.write_predictions(preds, test_ids, outputfile)
    print("done!")
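# The manual double loop above implements the same idea as scikit-learn's
# GridSearchCV. A minimal sketch of the equivalent search (the training
# arrays are assumed to exist as in the function above; cv=3 is an arbitrary
# choice, whereas the loop above used a single fixed validation split):
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {
    'max_depth': [1, 20, 40, 60],
    'n_estimators': [1, 20, 40, 60, 80, 100, 120, 140, 160, 180, 200, 220],
}
search = GridSearchCV(RandomForestClassifier(random_state=0), param_grid, cv=3)
search.fit(X_train_train, t_train_train)
print(search.best_params_, search.best_score_)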
import math
import numpy as np
import books
import visualize
import mf
import util
import shared_utils as su

# do this once to build the ratings and save them to ratings_tuple_std
books.build_ratings(filename="ratings_tuple_std", standardize=True, withhold=20000)

# load training data
data_train = su.unpickle("ratings_tuple_std")

# choose a number of features, limit the time the simulation runs
K = 5
max_steps = 2  # change this to something reasonable, like 200 or 500

# update this for each trial you do with a particular K
run = 0

data_mfact = mf.mfact(data_train["ratings"], data_train["N"], data_train["D"],
                      K, steps=max_steps, filename=("mfact_%d_run_%d" % (K, run)))

# make some predictions
predictions = books.make_predictions(data_train, data_mfact)

# write the predictions
util.write_predictions(predictions, ("predictions_%d_run_%d.csv" % (K, run)))
import numpy as np
import ensemble
from train_model_library import train_model_library
import util
import makePred

ensemble_library_pred, validation_labels, scaler, model_grid = train_model_library(
    n_folds_to_compute=1)
# note: this rebinds the `ensemble` module name to the returned ensemble object
ensemble, acc, n, c1acc = ensemble.generate_ensemble(ensemble_library_pred,
                                                     validation_labels,
                                                     n_init=3,
                                                     tolerance=.00001)
ids, features = util.load_test("kaggle_test_tf_idf_l1_norm.csv")
labels = makePred.makePrediction(ensemble, model_grid, features, scaler)
util.write_predictions(labels, "idflabels_lean_2.csv")
print("done")
tree_raw = np.loadtxt('predictions/syscall_count_by_type-1.csv', dtype=str, delimiter=';')
tree_reader = csv.reader(tree_raw, delimiter=',')
tree_reader.next()

preds = []
ids = []
for row in tree_reader:
    f_id = row[0]
    tree_pred = int(row[1])
    if PROPS[tree_pred] < PROPS[log_preds[f_id]]:
        preds.append(tree_pred)
    else:
        preds.append(log_preds[f_id])
    ids.append(f_id)
util.write_predictions(preds, ids, 'predictions/combined-2.csv')

'''mat, _, cat = pickle.load(open('matrix_train', 'rb'))
mats, cats = extract.split_data(mat, cat, 7)
correct = 0.
for i in range(7):
    train_mats = [mats[j] for j in range(7) if not i == j]
    train_cats = [cats[j] for j in range(7) if not i == j]
    train_mat, train_cat = extract.join_data(train_mats, train_cats)
    test_mat, test_cat = mats[i], cats[i]
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(train_mat, train_cat)
    tree_preds = clf.predict(test_mat)
    logreg = linmod.LogisticRegression()
import numpy as np
import util

# This is just about the dumbest possible predictor, but it shows the
# really basic things you need to know to read in the training data
# and write a valid prediction file.

pred_filename = 'pred-global-mean.csv'
train_filename = 'ratings-train.csv'
test_filename = 'ratings-test.csv'

training_data = util.load_train(train_filename)
test_queries = util.load_test(test_filename)

# Compute the mean rating.
num_train = len(training_data)
mean_rating = float(sum(map(lambda x: x['rating'], training_data))) / num_train
print "The mean rating is %0.3f." % (mean_rating)

# Use the global mean to make predictions.
# Iterate over the test set and add a 'rating' dictionary element.
for query in test_queries:
    query['rating'] = mean_rating

# Write the prediction file.
util.write_predictions(test_queries, pred_filename)
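# The util module itself is not shown in these snippets. For orientation,
# here is a minimal sketch of what the dict-based write_predictions used
# above might look like; the column names and dict keys are assumptions,
# not the actual implementation.
import csv

def write_predictions(queries, filename):
    with open(filename, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['user', 'isbn', 'rating'])  # assumed header layout
        for q in queries:
            writer.writerow([q['user'], q['isbn'], q['rating']])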
def main(load=False, test=False, both=False):
    train_dir = "train"
    test_dir = "test"
    outputfile = "treepredictions.csv"  # feel free to change this or take it as an argument

    if not load:
        # extract features
        print "extracting training features..."
        X_train, global_feat_dict, t_train, train_ids = extract_feats(ffs, train_dir)
        print "done extracting training features"
        print
        print "Saving features"
        with open("X_train", "w") as out:
            pickle.dump(X_train, out)
        with open("global_feat_dict", "w") as out:
            pickle.dump(global_feat_dict, out)
        with open("t_train", "w") as out:
            pickle.dump(t_train, out)
        with open("train_ids", "w") as out:
            pickle.dump(train_ids, out)
        print "Done saving"
    else:
        print "Loading previous features"
        with open("X_train", "r") as out:
            X_train = pickle.load(out)
        with open("global_feat_dict", "r") as out:
            global_feat_dict = pickle.load(out)
        with open("t_train", "r") as out:
            t_train = pickle.load(out)
        with open("train_ids", "r") as out:
            train_ids = pickle.load(out)
        print "Done loading"
    print

    # if we're verifying things, save some test data
    if not test:
        print "Getting holdout data..."
        Xs, ts, ids = (X_train, t_train, train_ids)
        n = Xs.shape[0]
        train_pct = 0.8
        X_train = Xs[-int(n * train_pct):]
        t_train = ts[-int(n * train_pct):]
        train_ids = ids[-int(n * train_pct):]
        X_holdout = Xs[:-int(n * train_pct)]
        t_holdout = ts[:-int(n * train_pct)]
        holdout_ids = ids[:-int(n * train_pct)]
        print

    # TODO train here, and learn your classification parameters
    print "learning..."
    num_trees = 100
    forest = RandomForestClassifier(n_estimators=num_trees)
    forest = forest.fit(X_train.todense(), t_train)
    # Random forest predictor
    forest_predictor, _ = sk_random_forest(X_train.toarray(), t_train, num_trees=num_trees)
    # logistic regression predictor
    # log_predictor, _ = sk_logistic(X_train, t_train)
    print "done learning"
    print

    # get rid of training data and load test data
    # del X_train
    # del t_train
    # del train_ids

    # if you want to write predictions for test data
    if test:
        # if you didn't save both sets of features, extract
        if not both:
            print "extracting test features..."
            X_test, _, t_ignore, test_ids = extract_feats(ffs, test_dir, global_feat_dict=global_feat_dict)
            print "done extracting test features"
            print
            print "Saving test features"
            with open("X_test", "w") as out:
                pickle.dump(X_test, out)
            with open("test_ids", "w") as out:
                pickle.dump(test_ids, out)
            print "Done saving"
            print
        else:
            print "Loading previous test features"
            with open("X_test", "r") as out:
                X_test = pickle.load(out)
            with open("test_ids", "r") as out:
                test_ids = pickle.load(out)
            print "Done loading"
            print

        # TODO make predictions here
        print "making predictions..."
        preds = forest.predict(X_test.toarray())
        print "done making predictions"
        print "writing predictions..."
        util.write_predictions(preds, test_ids, outputfile)
        print "done!"
    else:
        error = 0
        total = X_holdout.shape[0]
        print "making predictions..."
        # preds = np.argmax(X_test.dot(learned_W), axis=1)
        # preds = logreg.predict(X_test)
        random.seed(datetime.now())
        for index, feats in enumerate(X_holdout.toarray()):
            pred_forest = forest_predictor(feats)
            # pred_logistic = log_predictor(feats)
            # # if they agree, or disagree and both predict malware
            # if pred_forest == pred_logistic or (pred_forest != 8 and pred_logistic != 8):
            #     prediction = pred_forest
            # else:
            #     # grab the non-"None" label
            #     other = pred_forest if pred_forest != 8 else pred_logistic
            #     # flip a coin
            #     if random.random() < 0.39:
            #         prediction = 8
            #     else:
            #         prediction = other
            prediction = pred_forest
            if (prediction != t_holdout[index]):
                print "%s: expected %d but got %d" % (holdout_ids[index], t_holdout[index], prediction)
                error += 1
        print "Correct: %d, Incorrect: %d, Total: %d, Accuracy: %f" % (
            total - error, error, total, (total - error) / (1.0 * total))
        print "done making predictions"
        print
        print
print "done extracting training features" print # TODO train here, and return regression parameters print "learning..." learned_w = splinalg.lsqr(X_train,y_train)[0] print "done learning" print # get rid of training data and load test data del X_train del y_train del train_ids print "extracting test features..." X_test,_,y_ignore,test_ids = extract_feats(ffs, testfile, global_feat_dict=global_feat_dict) print "done extracting test features" print # TODO make predictions on text data and write them out print "making predictions..." preds = X_test.dot(learned_w) print "done making predictions" print print "writing predictions..." util.write_predictions(preds, test_ids, outputfile) print "done!" import pickle pickle.dump(test_ids, open('test_ids.p','wb'))
def GetFitSolutionResults(self, request, context):
    """
    TA2-3 API call
    """
    logging.info("Message received: GetFitSolutionResults")
    request_id = request.request_id
    request_params = self._solution_score_map[request_id]
    start = solutiondescription.compute_timestamp()

    solution_id = request_params.solution_id
    if solution_id not in self._solutions:
        logging.info("GetFitSolutionResults: Solution %s not found!", solution_id)
        msg = core_pb2.Progress(state=core_pb2.ERRORED, status="", start=start,
                                end=solutiondescription.compute_timestamp())
        # Clean up
        self._solution_score_map.pop(request_id, None)
        yield core_pb2.GetFitSolutionResultsResponse(progress=msg, steps=[],
                                                     exposed_outputs=[],
                                                     fitted_solution_id=None)
    else:
        solution = self._solutions[solution_id]
        msg = core_pb2.Progress(state=core_pb2.RUNNING, status="", start=start,
                                end=solutiondescription.compute_timestamp())

        fitted_solution = copy.deepcopy(solution)
        fitted_solution.id = str(uuid.uuid4())
        fitted_solution.create_pipeline_json(self._primitives)
        self._solutions[fitted_solution.id] = fitted_solution

        inputs = self._get_inputs(solution.problem, request_params.inputs)
        try:
            output = fitted_solution.fit(inputs=inputs, solution_dict=self._solutions)
        except:
            logging.info(fitted_solution.primitives)
            logging.info(sys.exc_info()[0])
            output = None

        result = None
        outputDir = os.environ['D3MOUTPUTDIR']
        if isinstance(output, np.ndarray):
            output = pd.DataFrame(data=output)
        if output is not None:
            uri = util.write_predictions(output, outputDir + "/predictions", fitted_solution)
            uri = 'file://{uri}'.format(uri=os.path.abspath(uri))
            result = value_pb2.Value(csv_uri=uri)
        else:
            result = value_pb2.Value(error=value_pb2.ValueError(message="Output is NULL"))

        yield core_pb2.GetFitSolutionResultsResponse(progress=msg, steps=[],
                                                     exposed_outputs=[],
                                                     fitted_solution_id=fitted_solution.id)

        msg = core_pb2.Progress(state=core_pb2.COMPLETED, status="", start=start,
                                end=solutiondescription.compute_timestamp())
        steps = []
        for i in range(fitted_solution.num_steps()):
            steps.append(core_pb2.StepProgress(progress=msg))

        exposed_outputs = {}
        if request_params.expose_outputs is not None and len(request_params.expose_outputs) > 0:
            last_step_output = request_params.expose_outputs[len(request_params.expose_outputs) - 1]
        else:
            last_step_output = fitted_solution.outputs[0][2]
        exposed_outputs[last_step_output] = result

        # Clean up
        self._solution_score_map.pop(request_id, None)
        yield core_pb2.GetFitSolutionResultsResponse(progress=msg, steps=steps,
                                                     exposed_outputs=exposed_outputs,
                                                     fitted_solution_id=fitted_solution.id)
                       bidir=True)
if using_GPU:
    RNNseq_model = RNNseq_model.cuda()
    state_dict = torch.load(args.rnn_model_path)['state_dict']
else:
    state_dict = torch.load(args.rnn_model_path, map_location='cpu')['state_dict']

# create new OrderedDict that does not contain `module.`
from collections import OrderedDict
new_state_dict = OrderedDict()
for k, v in state_dict.items():
    name = k[7:]  # remove the `module.` prefix added by DataParallel
    new_state_dict[name] = v
# load params
RNNseq_model.load_state_dict(new_state_dict)

result = write_predictions(raw_test_rcc, test_dataloader_rcc, RNNseq_model,
                           using_GPU, args.not_found_test_path)
logging.info("Write predictions to {}".format(args.not_found_test_path))
with open(args.not_found_test_path, 'w') as f:
    writer = csv.writer(f)
    writer.writerows(result)
logging.info("*" * 25 + " Mention Labeling By LSTM tagging model " + "*" * 25)

###############
# classifying #
###############
logging.info("*" * 25 + " Dataset Recognition By CNN Text Classifier " + "*" * 25)
args.kernel_sizes = [int(k) for k in args.kernel_sizes.split(',')]
logging.info("Loading tagFile")
idx_to_class = {}
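# A hedged refinement of the prefix-stripping loop above: k[7:] silently
# mangles keys that were saved without nn.DataParallel. This variant strips
# the "module." prefix only when it is actually present.
from collections import OrderedDict

def strip_dataparallel_prefix(state_dict, prefix='module.'):
    cleaned = OrderedDict()
    for k, v in state_dict.items():
        cleaned[k[len(prefix):] if k.startswith(prefix) else k] = v
    return cleaned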
def main():
    priors = [.0369, .0162, .012, .0103, .0133, .0126, .0172, .0133, .5214,
              .0068, .1756, .0104, .1218, .0191, .013]

    ##########################
    ####System Counts#########
    ##########################
    # define global set for creating data frames
    # test_tree_list, test_classes, test_ids = extract_tree("test")
    # globalSetTest = set()
    # dictListTest = list()
    # for tree in test_tree_list:
    #     dictListTest.append(perSysCallCount(tree, globalSetTest))

    # train_tree_list, train_classes, train_ids = extract_tree("train")
    # dictListTrain = list()
    # for tree in train_tree_list:
    #     dictListTrain.append(perSysCallCount(tree, globalSetTest))

    # newPerSysCallCountFile(dictListTest, test_classes, test_ids, "perSysCountsTest.csv", globalSetTest)
    # newCountFile(test_tree_list, test_classes, test_ids, "choppyTest.csv")
    # del test_tree_list, test_classes, dictListTest, test_ids

    # newPerSysCallCountFile(dictListTrain, train_classes, train_ids, "perSysCountsTrain.csv", globalSetTest)
    # newCountFile(train_tree_list, train_classes, train_ids, "choppyTrain.csv")
    # del train_tree_list, train_classes, train_ids, dictListTrain

    ###############################################
    #######Per-Tree, Per-System Call Counts########
    ###############################################
    """ Read in train and test as Pandas DataFrames """
    # df_train = pd.read_csv("choppyTrain.csv")
    # df_test = pd.read_csv("choppyTest.csv")
    df_train = pd.read_csv("perSysCountsTrain.csv")
    df_test = pd.read_csv("perSysCountsTest.csv")

    # store class values
    Y_train = df_train.Class.values
    testID = df_test.Id.values

    # row where testing examples start
    test_idx = df_train.shape[0]

    df_all = pd.concat((df_train, df_test), axis=0)
    del df_train
    del df_test
    df_all = df_all.drop(['Id'], axis=1)
    df_all = df_all.drop(['Class'], axis=1)

    vals = df_all.values
    del df_all
    X_train = vals[:test_idx]
    X_test = vals[test_idx:]
    del vals

    # clf = bnb(class_prior=priors)
    # clf.fit(X_train, Y_train)
    clf = mnb(class_prior=priors)
    clf.fit(X_train, Y_train)
    del X_train
    del Y_train

    # bnb_predict = clf.predict(X_test)
    mnb_predict = clf.predict(X_test)
    # util.write_predictions(bnb_predict, test_ids, "ChoppySingleBNB.csv")
    util.write_predictions(mnb_predict, testID, "PerSysCallCountsBNB.csv")
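# A minimal sketch of what class_prior does in the MultinomialNB fit above:
# it replaces the empirical class frequencies with the fixed priors list, so
# log P(class) comes from those values rather than the training labels. The
# toy data here is an assumption for illustration only.
import numpy as np
from sklearn.naive_bayes import MultinomialNB

X_toy = np.array([[2, 0], [1, 1], [0, 3], [0, 2]])
y_toy = np.array([0, 0, 1, 1])
clf_toy = MultinomialNB(class_prior=[0.9, 0.1])  # strong prior toward class 0
clf_toy.fit(X_toy, y_toy)
print clf_toy.predict_proba([[1, 1]])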
clf = SGDClassifier(penalty='elasticnet', alpha=0.000001)
clf.fit(X, y)
print clf.score(X, y)

# Output predictions
X_test = np.load(open('x_test', 'rb'))
from sklearn.preprocessing import StandardScaler
X_test = np.log(X_test + 1)
X_test = StandardScaler().fit_transform(X_test)
y_pred = clf.predict(X_test)
ids = np.load(open('ids', 'rb'))
import util
util.write_predictions(y_pred, ids, 'predictions/rf_pc10.csv')

"""
from sklearn.ensemble import ExtraTreesClassifier
erf = ExtraTreesClassifier(n_estimators=300, max_features='auto', bootstrap=True,
                           oob_score=True, criterion='gini', max_depth=None,
                           min_samples_split=2, min_samples_leaf=1, n_jobs=1,
                           random_state=None, verbose=0, min_density=None,
                           compute_importances=None)
erf.fit(X_train, y_train)
print "oob\t%.4f" % (erf.oob_score_)
"""
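# One caveat in the block above: StandardScaler().fit_transform(X_test) fits
# the scaler on the *test* statistics, which can drift from the training
# scaling. A hedged sketch of the usual fix, assuming X and y are the
# log-transformed but unscaled training arrays, and X_test_raw (a name assumed
# here) is the unscaled test matrix:
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

pipe = make_pipeline(StandardScaler(), SGDClassifier(penalty='elasticnet', alpha=0.000001))
pipe.fit(X, y)                     # scaler statistics come from training data only
y_pred = pipe.predict(X_test_raw)  # the same scaling is reused at test time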
def main():
    train_dir = "../../../Data/train"
    test_dir = "../../../Data/test"
    outputfile = "../../Output/Jeremiah.csv"  # feel free to change this or take it as an argument

    ################################
    #### Empirical Summary
    ################################
    # Get types & frequency of commands
    # raw count
    lesNames = call_freq_emp(train_dir)
    lesNames_freq = pd.Series(lesNames.values(), lesNames.keys())
    lesNames_freq = lesNames_freq / sum(lesNames_freq)
    lesNames_freq.sort()
    lesNames_freq

    # raw count by Type
    lesNames_byType = call_freq_byType(train_dir)
    lesNames_byType_freq = dict()
    for TypeName in util.malware_classes:
        lesNames_byType_freq[TypeName] = []
    for keyName in lesNames_byType.keys():
        namez = lesNames_byType[keyName]
        namez_freq = pd.Series(namez.values(), namez.keys())
        namez_freq = namez_freq / sum(namez_freq)
        namez_freq.sort(ascending=False)
        lesNames_byType_freq[keyName] = namez_freq[namez_freq > 0.01]
        print(keyName + " Finished!")
        sys.stdout.flush()
    lesNames_byType_freq  # most frequent commands in each class

    ## Bar plot
    t_label = np.array(util.malware_classes)[np.array(t_train)]
    bar_df = stats.itemfreq(t_label).T
    bar_df = stats.itemfreq(t_train).T
    bar_df = pd.DataFrame(data=bar_df).T
    bar_df.columns = ['name', 'count']
    bar_df[['count']] = bar_df[['count']].astype(int)
    ggplot(aes(x="name", weight="count"), bar_df) + \
        xlab("count") + geom_bar() + \
        ggtitle("Frequency Count for Malware Types")

    ################################
    #### Feature Extraction and Pruning
    ################################
    # TODO put the names of the feature functions you've defined above in this list
    ffs = [first_last_system_call_feats, system_call_count_feats,
           call_freq, dll_type]  # , get_all_keys]

    # extract features
    print "extracting training features..."
    X_train, global_feat_dict, t_train, train_ids = extract_feats(ffs, train_dir)
    X_train_dense = X_train.todense()
    del X_train
    print "done extracting training features"
    print
    sys.stdout.flush()

    # pruning
    X_train_prune, global_feat_dict_prune, featureFreq, prunId = \
        pruneFeatures(minFreq=1, X_train_dense=X_train_dense,
                      global_feat_dict=global_feat_dict)

    ################################
    #### CV-based training
    ################################
    n = X_train_dense.shape[0]
    nForest = 1000
    n_cv = 5
    print str(n_cv) + " fold learning initiated..."

    eRate_cv = []
    kf_cv = cv.KFold(n, n_folds=n_cv)
    clf_cv = es.RandomForestClassifier(n_estimators=nForest)

    i = 0
    for train_index, test_index in kf_cv:
        i += 1
        # create CV dataset and fit
        F_train, F_test = X_train_prune[train_index], X_train_prune[test_index]
        y_train, y_test = t_train[train_index], t_train[test_index]
        clf_fit = clf_cv.fit(F_train, y_train)
        # prediction
        clf_pred = clf_fit.predict(F_test)
        accuracy = Accuracy(clf_pred, y_test)[0]
        eRate_cv.append(accuracy)
        print("Fold " + str(i) + " Classification Accuracy = " + str(accuracy))
        sys.stdout.flush()

    print "done learning"
    print np.mean(eRate_cv)

    ################################
    # feature importance assessment:
    ################################
    # train here, and learn your classification parameters
    print "learning..."
    nForest = 1000
    clf = es.RandomForestClassifier(n_estimators=nForest, verbose=1, n_jobs=-1)
    clf_fit = clf.fit(X_train_dense, t_train)
    print "done learning"
    print

    # TODO: Figure out param Name that Feature Importance corresponds to
    ftImp = pd.DataFrame(sorted(global_feat_dict.keys()), columns=["Name"])
    ftImp["FeatureImp"] = clf_fit.feature_importances_
    ftImp_s = ftImp.sort(columns="FeatureImp", ascending=False)
    print_full(ftImp_s)
    ftImp_s.loc[ftImp_s['FeatureImp'] > 0.000, :]

    ####################################
    # in-sample prediction and misclassification rate
    ####################################
    print "making in-sample predictions..."
    clf_preds = clf_fit.predict(X_train_dense)
    clf_missId = ((clf_preds - t_train) != 0)
    clf_miss = t_train[(clf_preds - t_train) != 0]
    rate = 1 - np.mean(clf_missId)  # accuracy, i.e. 1 - error rate
    clf_miss = [util.malware_classes[i] for i in clf_miss]
    stats.itemfreq(clf_miss)
    print "done making in-sample predictions"

    # get rid of training data and load test data
    del X_train_dense  # X_train itself was already deleted right after densifying
    del t_train
    del train_ids
    print "extracting test features..."
    X_test, _, t_ignore, test_ids = extract_feats(ffs, test_dir, global_feat_dict=global_feat_dict)
    X_test_dense = X_test.todense()
    X_test_prune = X_test_dense.T[prunId].T
    print "done extracting test features"
    print

    # TODO make predictions on test data and write them out
    print "making predictions..."
    clf_preds = clf_fit.predict(X_test_prune)
    print "done making predictions"
    print

    print "writing predictions..."
    util.write_predictions(clf_preds, test_ids, outputfile)
    print "done!"
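# A hedged alternative to the custom pruneFeatures step above: sklearn's
# SelectFromModel prunes columns using the forest's own importances. The
# "median" threshold (keep the top half) is an illustrative assumption, and
# this would have to run before the `del X_train_dense` above.
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

selector = SelectFromModel(RandomForestClassifier(n_estimators=100), threshold="median")
X_train_small = selector.fit_transform(X_train_dense, t_train)
X_test_small = selector.transform(X_test_dense)  # keep the same columns at test time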
def main(load=False, test=False):
    train_dir = "train"
    test_dir = "test"
    outputfile = "mypredictions.csv"  # feel free to change this or take it as an argument

    if not load:
        # extract features
        print "extracting training features..."
        X_train, global_feat_dict, t_train, train_ids = extract_feats(ffs, train_dir)
        print "done extracting training features"
        print

        print "Saving features"
        with open("X_train", "wb") as out:
            pickle.dump(X_train, out)
        with open("global_feat_dict", "wb") as out:
            pickle.dump(global_feat_dict, out)
        with open("t_train", "wb") as out:
            pickle.dump(t_train, out)
        with open("train_ids", "wb") as out:
            pickle.dump(train_ids, out)
        print "Done saving"
        print
    else:
        print "Loading previous features"
        with open("X_train", "rb") as f:
            X_train = pickle.load(f)
        with open("global_feat_dict", "rb") as f:
            global_feat_dict = pickle.load(f)
        with open("t_train", "rb") as f:
            t_train = pickle.load(f)
        with open("train_ids", "rb") as f:
            train_ids = pickle.load(f)
        print "Done loading"
        print

    # TODO train here, and learn your classification parameters
    print "learning..."
    predictor, _ = sk_logistic(X_train, t_train)  # Start with logistic regression
    # distribs = train_generative(X_train, t_train, len(global_feat_dict))
    print "done learning"
    print

    # get rid of training data and load test data
    # del X_train
    # del t_train
    # del train_ids
    print "extracting test features..."
    X_test, _, t_ignore, test_ids = extract_feats(ffs, test_dir, global_feat_dict=global_feat_dict)
    print "done extracting test features"
    print

    error = 0
    total = X_train.shape[0]
    print "making predictions..."
    # preds = np.argmax(X_test.dot(learned_W), axis=1)
    # preds = gen_classifier(X_train, distribs)
    # for t_id, p, t in zip(train_ids, preds, t_train):
    #     if p != t:
    #         print "%s: expected %d but got %d" % (t_id, t, p)
    #         error += 1
    if test:
        preds = []
        for x in X_test:
            preds.append(predictor(x))
    else:
        # evaluate on the training set; no test predictions exist on this path
        for index, feats in enumerate(X_train):
            prediction = predictor(feats)
            if prediction != t_train[index]:
                print "%s: expected %d but got %d" % (
                    train_ids[index], t_train[index], prediction)
                error += 1
        print "Correct: %d, Incorrect: %d, Total: %d, Accuracy: %f" % (
            total - error, error, total, (total - error) / (1.0 * total))
    print "done making predictions"
    print

    # only write predictions when we actually predicted on the test set
    if test:
        print "writing predictions..."
        util.write_predictions(preds, test_ids, outputfile)
        print "done!"
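# A small hedged convenience for the save/load branches above: cache all four
# objects in a single pickle so the flags cannot get out of sync. The file
# name and the compute() callback are assumptions for illustration.
import pickle

def cached_feats(path="feats_cache.p", compute=None):
    try:
        with open(path, "rb") as f:
            return pickle.load(f)
    except IOError:
        data = compute()  # should return (X_train, global_feat_dict, t_train, train_ids)
        with open(path, "wb") as f:
            pickle.dump(data, f, protocol=2)
        return data

# usage: X_train, global_feat_dict, t_train, train_ids = cached_feats(
#     compute=lambda: extract_feats(ffs, "train"))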
def main(bayesian=False):
    outputfile = "match_predictions.csv"
    actualfile = "match_actual.csv"
    start_date = datetime.datetime(2003, 1, 1)
    middle_date = datetime.datetime(2012, 1, 1)
    middle_date_plus_one = datetime.datetime(2013, 1, 1)
    end_date = datetime.datetime(2014, 1, 1)

    print "extracting training features..."
    # bayesian technique uses all dates
    # for RMSE, we need train and test set
    if not bayesian:
        # use middle_date when you desire 2013 and 2014 for the test set
        X_train, feat_name_to_col_num, y_train, train_ids = extract_feats(start_date, middle_date_plus_one)
    else:
        X_train, feat_name_to_col_num, y_train, train_ids = extract_feats(start_date, end_date)
    print "done extracting training features"
    print

    print "learning..."
    prior_mean = np.zeros(len(feat_name_to_col_num))
    prior_cov = np.identity(len(feat_name_to_col_num))
    prior_hessian = np.linalg.inv(prior_cov)
    posterior_mean, posterior_hessian = PoissonRegression.fit_bayes_poisson(
        y_train, X_train.toarray(), prior_mean, prior_hessian)
    print "done learning"
    print "Learned Coeffs: " + str(posterior_mean)
    print "Learned Correlations: " + str(posterior_hessian)

    # only need a test set when not fully bayesian - i.e. when you want RMSE
    if not bayesian:
        del X_train
        del y_train
        del train_ids
        print "extracting test features..."
        X_test, _, y_ignore, test_ids = extract_feats(end_date, end_date, feat_name_to_col_num=feat_name_to_col_num)
        print "done extracting test features"
        print

    print "making predictions..."
    if not bayesian:
        preds = np.exp(X_test.toarray().dot(posterior_mean))
    else:
        preds = np.exp(X_train.toarray().dot(posterior_mean))
    print "done making predictions"
    print

    print "writing predictions..."
    if not bayesian:
        util.write_predictions(preds, test_ids, outputfile)
        util.write_predictions(y_ignore, test_ids, actualfile)
    else:
        util.write_predictions(preds, train_ids, outputfile)
        util.write_predictions(y_train, train_ids, actualfile)
    print "done writing"
    print "RMSE: " + str(mae.rmse())

    # get model evidence only when fully bayesian
    if bayesian:
        print "Marginal Likelihood: " + str(PoissonRegression.get_model_evidence(
            posterior_mean, prior_mean, prior_hessian, y_train, X_train.toarray()))
        print "Feature dictionary: " + str(feat_name_to_col_num)
        print "P-values: " + str(PoissonRegression.get_pvalues(posterior_mean, posterior_hessian))
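# The RMSE printed above relies on an external helper (mae.rmse()); a hedged,
# self-contained equivalent, assuming the predictions and the held-out actuals
# are aligned 1-D arrays:
import numpy as np

def rmse(preds, actuals):
    preds = np.asarray(preds, dtype=float)
    actuals = np.asarray(actuals, dtype=float)
    return np.sqrt(np.mean((preds - actuals) ** 2))

# e.g. print "RMSE: " + str(rmse(preds, y_ignore)) on the non-bayesian path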
import numpy
import util
import shared_utils

pred_filename = 'pred-user-hamming.csv'
train_filename = 'ratings-train.csv'
test_filename = 'ratings-test.csv'

training_data = util.load_train(train_filename)
test_queries = util.load_test(test_filename)

user_common_books = shared_utils.unpickle('user_common_books')
user_difference_ratings = shared_utils.unpickle('user_difference_ratings')
print user_common_books[0:]
print user_difference_ratings[0:]

ratings_filename = 'ratings_std'
mother = shared_utils.unpickle(ratings_filename)

'''
for query in test_queries:
    user = query['user']
    user_cluster = numpy.dot(R[user - 1, :], range(k))
    isbn = query['isbn']
    book_index = mother['book_isbn_to_index'][isbn]
    query['rating'] = U[user_cluster][book_index] * mother['variance'] + mother['mean']

util.write_predictions(test_queries, pred_filename)
'''
# Store data for each user to keep track of the per-user average.
users = {}
for user in user_list:
    users[user['user']] = {
        'total': 0,  # For storing the total of ratings.
        'count': 0,  # For storing the number of ratings.
    }

# Iterate over the training data to compute means.
for rating in training_data:
    user_id = rating['user']
    users[user_id]['total'] += rating['rating']
    users[user_id]['count'] += 1

# Make predictions for each test query.
for query in test_queries:
    user = users[query['user']]
    if user['count'] == 0:
        # Perhaps we did not have any ratings in the training set.
        # In this case, make a global mean prediction.
        query['rating'] = mean_rating
    else:
        # Predict the average for this user.
        query['rating'] = float(user['total']) / user['count']

# Write the prediction file.
util.write_predictions(test_queries, pred_filename)
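# A hedged refinement of the per-user mean above: shrink each user's average
# toward the global mean, so users with very few ratings do not get noisy
# estimates. The strength k=5 is an illustrative assumption.
def shrunk_user_mean(total, count, global_mean, k=5.0):
    # With count == 0 this reduces to the global mean, matching the fallback above.
    return (total + k * global_mean) / (count + k)

# usage: query['rating'] = shrunk_user_mean(user['total'], user['count'], mean_rating)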
""" # Visualize train fit from pandas.tools.plotting import scatter_matrix from pandas import DataFrame df = DataFrame(np.concatenate((y_test[:,np.newaxis], y_hat[:,np.newaxis]), axis=1)) scatter_matrix(df, alpha=0.2, figsize=(15, 15), diagonal='kde') print mean_absolute_error(np.exp(y_test), np.exp(y_hat)) """ #Output predictions y_out = np.exp(y_hat) import util test_ids = pickle.load(open('test_ids.p','rb')) util.write_predictions(y_out, test_ids, outfile) """ ################################## ## ## Model Selection ## ################################## ### OLS Train Scores rss = 0
def main():
    train_dir = "train"
    test_dir = "test"
    outputfile = "predictions3062019.csv"  # feel free to change this or take it as an argument

    # TODO put the names of the feature functions you've defined above in this list
    ffs = [system_call_termination_reason, system_call_bigrams, system_call_trigrams]
    # ffs = [first_last_system_call_feats, system_call_count_feats, system_call_termination_reason, system_call_count_feat_types, system_call_processes, system_call_unsuccessful]
    # ffs = [system_call_termination_reason]

    # extract features
    first = True
    if first:
        print "extracting training features..."
        X_train, global_feat_dict, t_train, train_ids = extract_feats(ffs, train_dir)
        # print "\n\n\n"
        # print len(global_feat_dict)
        print "done extracting training features"
        print
        # X_train, X_test, t_train, t_test = train_test_split(X_train, t_train, random_state=1)
        # pickle.dump((X_train, X_test, t_train, t_test, global_feat_dict, t_train, train_ids), open("save.p", "wb"))
    else:
        X_train, X_test, t_train, t_test, global_feat_dict, t_train, train_ids = pickle.load(
            open("save.p", "rb"))

    print(len(global_feat_dict))

    # TODO train here, and learn your classification parameters
    print "learning..."
    print(X_train.shape)
    # for n in [10, 100, 1000]:
    #     clf = RandomForestClassifier(n_estimators=100)
    #     clf.fit(X_train, t_train)
    #     print "done learning"
    #     print n
    #     print
    #     print "score"
    #     print clf.score(X_test, t_test)

    import xgboost
    clf = RandomForestClassifier(n_estimators=100000)
    # clf = LinearSVC()
    # clf = MLPClassifier(hidden_layer_sizes=(80, 80), max_iter=5000, random_state=1, alpha=0.05)
    # clf = GaussianNB()
    clf.fit(X_train, t_train)
    print "done learning"
    print
    # print "score"
    # print clf.score(X_test, t_test)

    # get rid of training data and load test data
    del X_train
    del t_train
    del train_ids
    print "extracting test features..."
    X_test, _, t_ignore, test_ids = extract_feats(ffs, test_dir, global_feat_dict=global_feat_dict)
    print "done extracting test features"
    print

    # TODO make predictions on test data and write them out
    print "making predictions..."
    preds = clf.predict(X_test)
    print "done making predictions"
    print

    print "writing predictions..."
    util.write_predictions(preds, test_ids, outputfile)
    print "done!"
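# n_estimators=100000 above is expensive and forest accuracy usually plateaus
# far earlier; a hedged sketch of choosing the tree count by cross-validation
# instead. The candidate grid is an assumption, and this would run right after
# extract_feats, before the `del X_train` above.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

for n in (100, 300, 1000):
    scores = cross_val_score(RandomForestClassifier(n_estimators=n), X_train, t_train, cv=5)
    print "n_estimators=%d: mean CV accuracy %.4f" % (n, scores.mean())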
for datum in test_data:
    '''cluster = clusters[users[datum['user']]['index']]
    sum_ratings = 0.
    num_ratings = 0
    for (u, r) in train_sorted[b_keys[datum['isbn']]]:
        if u in cluster:
            sum_ratings += r
            num_ratings += 1
    if num_ratings == 0:
        sum_errors += math.pow(mean_rating - datum['rating'], 2)
    else:
        sum_errors += math.pow((sum_ratings / num_ratings) - datum['rating'], 2)'''
    sum_errors += math.pow(cluster_avgs[users[datum['user']]['index']] - datum['rating'], 2)

print math.sqrt(sum_errors / len(test_data))

'''for query in test_queries:
    cluster = clusters[users[query['user']]]
    sum_ratings = 0.
    num_ratings = 0
    for (u, r) in train_sorted[b_keys[query['isbn']]]:
        if u in cluster:
            sum_ratings += r
            num_ratings += 1
    if num_ratings == 0:
        query['rating'] = mean_rating
    else:
        query['rating'] = sum_ratings / num_ratings

# Write the prediction file.
util.write_predictions(test_queries, pred_filename)'''
# format test data
test_ids = df_test.Id.values
df_test = df_test.drop(['Id'], axis=1)
X_test = df_test.values

print "Train features:", X_train.shape
print "Train class:", Y_train.shape
print "Test features:", X_test.shape

# RandomForestClassifier
RF = RandomForestClassifier(n_estimators=100, max_features='log2')
RF.fit(X_train, Y_train)
RF_pred = RF.predict(X_test)
write_predictions(RF_pred, test_ids, 'predicted_RF.csv')
# print 'RandomForestClassifier', categorization_accuracy('predicted_RF01.csv', 'actual_small.csv')
# 0.891444342226 - n_estimators=100, max_features='None'
# 0.894204231831 - n_estimators=50,  max_features='log2'
# 0.897884084637 - n_estimators=100, max_features='log2'
# 0.896964121435 - n_estimators=125, max_features='log2'
# 0.896044158234 - n_estimators=115, max_features='log2'

# # QuadraticDiscriminantAnalysis
# QD = QuadraticDiscriminantAnalysis()
# QD.fit(X_train, Y_train)
# QD_pred = QD.predict(X_test)
# write_predictions(QD_pred, test_ids, 'predicted_QD.csv')
# print 'QuadraticDiscriminantAnalysis', categorization_accuracy('predicted_QD.csv', 'actual_small.csv')
#
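# The commented accuracy log above is effectively a manual grid search; a
# hedged sketch of automating it with GridSearchCV over the same two
# hyperparameters:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(RandomForestClassifier(),
                    param_grid={'n_estimators': [50, 100, 115, 125],
                                'max_features': ['log2', None]},
                    cv=5)
grid.fit(X_train, Y_train)
print grid.best_params_, grid.best_score_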