def main(): """ Fit models and make predictions. We'll use one-hot encoding to transform our categorical features into binary features. y and X will be numpy array objects. """ model = linear_model.LogisticRegression(C=3) # the classifier we'll use # === load data in memory === # print "loading data" cwd = os.getcwd() trainDataLoc = cwd + '/../data/train.csv' testDataLoc = cwd + '/../data/test.csv' y, X = load_data(trainDataLoc) y_test, X_test = load_data(testDataLoc, use_labels=False) # === one-hot encoding === # # we want to encode the category IDs encountered both in # the training and the test set, so we fit the encoder on both encoder = preprocessing.OneHotEncoder() encoder.fit(np.vstack((X, X_test))) X = encoder.transform(X) # Returns a sparse matrix (see numpy.sparse) X_test = encoder.transform(X_test) # if you want to create new features, you'll need to compute them # before the encoding, and append them to your dataset after # === training & metrics === # mean_auc = 0.0 n = 10 # repeat the CV procedure 10 times to get more precise results for i in range(n): # for each iteration, randomly hold out 20% of the data as CV set X_train, X_cv, y_train, y_cv = cross_validation.train_test_split( X, y, test_size=.20, random_state=i * SEED) # if you want to perform feature selection / hyperparameter # optimization, this is where you want to do it # train model and make predictions model.fit(X_train, y_train) preds = model.predict_proba(X_cv)[:, 1] # compute AUC metric for this CV fold fpr, tpr, thresholds = metrics.roc_curve(y_cv, preds) roc_auc = metrics.auc(fpr, tpr) print "AUC (fold %d/%d): %f" % (i + 1, n, roc_auc) mean_auc += roc_auc print "Mean AUC: %f" % (mean_auc / n) # === Predictions === # # When making predictions, retrain the model on the whole training set model.fit(X, y) preds = model.predict_proba(X_test)[:, 1] #filename = raw_input("Enter name for submission file: ") filename = 'LogisticRegressionResults' save_results(preds, filename + ".csv")
def main():
    cwd = os.getcwd()
    trainDataLoc = cwd + '/../data/train.csv'
    testDataLoc = cwd + '/../data/test.csv'
    y, X = load_data(trainDataLoc)
    y_test, X_test = load_data(testDataLoc, use_labels=False)

    encoder = preprocessing.OneHotEncoder()
    encoder.fit(np.vstack((X, X_test)))
    X = encoder.transform(X)  # returns a sparse matrix (see scipy.sparse)
    X_test = encoder.transform(X_test)

    # model = findBestModel(X, y)  # best model found: rbf kernel, gamma=1, C=1
    X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(
        X, y, test_size=.20, random_state=SEED)
    model = svm.SVC(C=1, probability=True, kernel='rbf', gamma=1)
    model.fit(X_train, y_train)
    preds = model.predict_proba(X_cv)[:, 1]

    # compute AUC metric for this CV fold
    fpr, tpr, thresholds = metrics.roc_curve(y_cv, preds)
    roc_auc = metrics.auc(fpr, tpr)
    print("AUC: %f" % roc_auc)

    preds = model.predict_proba(X_test)[:, 1]
    save_results(preds, "SVM_classifier.csv")
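# The findBestModel helper referenced in the comment above is not shown in this
# file. Below is a hypothetical sketch of what such a search could look like;
# the parameter grid, 3-fold CV, and use of the old sklearn.grid_search module
# (matching the cross_validation API used elsewhere here) are assumptions.
def findBestModel_sketch(X, y):
    from sklearn.grid_search import GridSearchCV  # sklearn.model_selection in newer versions
    param_grid = {
        'kernel': ['rbf', 'linear'],
        'C': [0.1, 1, 10],
        'gamma': [0.1, 1, 10],
    }
    grid = GridSearchCV(svm.SVC(probability=True), param_grid,
                        scoring='roc_auc', cv=3)
    grid.fit(X, y)
    print("Best parameters: %s" % grid.best_params_)
    return grid.best_estimator_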
def main():
    filesToEnsemble = [
        'SVM_classifier.csv',
        'output.csv',
        'XGBoost_classifier.csv',
        # 'LogisticRegressionResults.csv',
        # 'logistic_regression_pred.csv',
        # 'sampleSubmission.csv',
    ]
    fdata = []
    itemSum = []
    # numItems is assumed to be a module-level constant: the number of rows in each submission file
    for i in range(numItems):
        itemSum.append((i, 0))
    cwd = os.getcwd()
    for item in filesToEnsemble:
        item = cwd + '/../output/' + item
        fdict = {}
        with open(item, 'r') as f:
            first = True
            for line in f:
                if not first:
                    contents = line.split(',', 2)
                    contents[1] = contents[1].strip('\n')
                    fdict[int(contents[0])] = float(contents[1])
                else:
                    first = False
                    continue
        # fList is a list of (row, probability) tuples
        fList = sorted(fdict.items(), key=lambda t: t[1])
        for i in range(len(fList)):
            fList[i] = (fList[i][0], i)
        # fList is now a list of (row, rank) tuples
        fdata.append(fList)

    # sum the ranks each row received across all submission files
    for l in fdata:
        for item in l:
            row = item[0]
            itemSum[row - 1] = (itemSum[row - 1][0], itemSum[row - 1][1] + item[1])

    # re-rank the rows by their summed rank and rescale to [0, 1)
    reSorted = sorted(itemSum, key=lambda t: t[1])
    for i in range(len(reSorted)):
        reSorted[i] = (reSorted[i][0] + 1, float(i) / numItems)
    backToNormal = sorted(reSorted, key=lambda t: t[0])
    save_results([x[1] for x in backToNormal], 'Ensembled_Results.csv')
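# Standalone illustration (not part of the pipeline) of the rank-averaging idea
# implemented above: each model's probabilities are replaced by their ranks,
# the ranks are summed across models, and the summed ranks are re-ranked and
# rescaled to [0, 1). scipy.stats.rankdata and the toy numbers are purely for
# demonstration.
def rank_average_demo():
    from scipy.stats import rankdata
    toy_preds = np.array([
        [0.10, 0.90, 0.40, 0.70],   # model A
        [0.20, 0.80, 0.50, 0.60],   # model B
        [0.05, 0.95, 0.30, 0.85],   # model C
    ])
    ranks = np.vstack([rankdata(p) for p in toy_preds])  # rank within each model
    summed = ranks.sum(axis=0)
    # re-rank the summed ranks and rescale so they read like probabilities
    ensembled = (rankdata(summed) - 1) / toy_preds.shape[1]
    print(ensembled)  # 0.0, 0.75, 0.25, 0.5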
def main():
    cwd = os.getcwd()
    trainDataLoc = cwd + '/../data/train.csv'
    testDataLoc = cwd + '/../data/test.csv'
    y, X = load_data(trainDataLoc)
    y_test, X_test = load_data(testDataLoc, use_labels=False)

    clf = xgb.XGBClassifier(max_depth=15, n_estimators=200, learning_rate=.4,
                            colsample_bytree=.8, seed=SEED)

    # fitting; note that y_test comes from load_data(..., use_labels=False),
    # so the eval_set used for early stopping does not contain real labels
    clf.fit(X, y, early_stopping_rounds=100, eval_metric="logloss",
            eval_set=[(X_test, y_test)])

    preds = clf.predict_proba(X_test)[:, 1]
    save_results(preds, "XGBoost_classifier.csv")
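# Sketch of an alternative early-stopping setup: since y_test above comes from
# load_data(..., use_labels=False), a held-out slice of the *labeled* training
# data is the more conventional eval_set. The 80/20 split and the function name
# are assumptions, not the project's code.
def fit_with_validation_early_stopping(clf, X, y):
    X_fit, X_val, y_fit, y_val = cross_validation.train_test_split(
        X, y, test_size=.20, random_state=SEED)
    clf.fit(X_fit, y_fit, early_stopping_rounds=100, eval_metric="logloss",
            eval_set=[(X_val, y_val)])
    return clf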
encoder.fit(np.vstack((X, X_test)))
X = encoder.transform(X)  # returns a sparse matrix (see scipy.sparse)
X_test = encoder.transform(X_test)

print("about to classify")
clf = AdaBoostClassifier(base_estimator=None, n_estimators=900,
                         learning_rate=1.8)
clf.fit(X, y)

# X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(
#     X, y, test_size=.20, random_state=SEED)
# model = svm.SVC(C=1, probability=True, kernel='rbf')
# model.fit(X_train, y_train)
# preds = model.predict_proba(X_cv)[:, 1]
# # compute AUC metric for this CV fold
# fpr, tpr, thresholds = metrics.roc_curve(y_cv, preds)
# roc_auc = metrics.auc(fpr, tpr)
# print("AUC: %f" % roc_auc)

predictions = clf.predict_proba(X_test)[:, 1]
save_results(predictions, 'AdaBoost_output.csv')
print("done")

# score = cross_val_score(clf, Matrix, salary)
# print(score)
# print(score.mean())
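# Sketch of the cross-validated scoring hinted at by the commented-out
# cross_val_score lines above, adapted to the variables in scope here
# (clf, X, y) and scored with AUC; the 5-fold choice is an assumption.
def evaluate_adaboost_sketch(clf, X, y):
    from sklearn import cross_validation  # sklearn.model_selection in newer versions
    scores = cross_validation.cross_val_score(clf, X, y, scoring='roc_auc', cv=5)
    print("AUC per fold: %s" % scores)
    print("Mean AUC: %f" % scores.mean())
    return scores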
def setup_argparse():
    parser = argparse.ArgumentParser(
        description='GitHub repository file vulnerability finder')

    # Add the command-line arguments
    parser.add_argument('-u', '--username', required=True, help='GitHub username')
    parser.add_argument('-r', '--repo', help='Repository name')
    parser.add_argument('-s', '--save',
                        help='File name to save output to (within the output/ directory). '
                             'If not provided, results are only displayed on the console.')
    parser.add_argument('-t', '--token', help='GitHub API token')
    parser.add_argument('--entropy', type=float,
                        help='Entropy threshold value in [0.0, 1.0] (default 0.45)')

    # Vulnerability types to check for
    vuln_group = parser.add_argument_group('Vulnerability types')
    vuln_group.add_argument('--api', help='Look for API keys', action='store_true')
    vuln_group.add_argument('-p', '--password', help='Look for passwords', action='store_true')
    vuln_group.add_argument('-e', '--email', help='Look for email addresses', action='store_true')
    vuln_group.add_argument('-b', '--bitcoin', help='Look for bitcoin', action='store_true')
    vuln_group.add_argument('-c', '--crypto', help='Look for cryptographic keys', action='store_true')

    args = parser.parse_args()

    # Validate the entropy argument
    if args.entropy is not None:
        if args.entropy > 1 or args.entropy < 0:
            raise argparse.ArgumentTypeError('Entropy value is not between 0 and 1')

    # Build the set of vulnerability types to scan for; default to all of them
    if args.api or args.password or args.email or args.bitcoin or args.crypto:
        types = set()
        if args.api:
            types.add('API')
        if args.password:
            types.add('Password')
        if args.email:
            types.add('Email')
        if args.crypto:
            types.add('Crypto')
    else:
        types = {'API', 'Password', 'Email', 'Crypto'}

    # Use authenticated requests if a token was supplied
    if args.token:
        headers = {'Authorization': f'token {args.token}'}
    else:
        headers = {}

    if args.repo:
        # Repository name provided on the command line
        print(f'Scraping {args.repo} repository')
        repo_files = RepoProcessing.get_repo_files(args.username, args.repo, headers)
        print(f'Received files from {args.username}/{args.repo}')
        if args.entropy is not None:
            v = find_vulnerabilities(repo_files, args.entropy)
        else:
            v = find_vulnerabilities(repo_files)
        display_results(v, types)
        if args.save:
            save_results(v, args.save)
    else:
        # No repository given: list the user's repositories and let them pick one (or all)
        print(f'Scraping repositories for user {args.username}')
        repo_names = RepoProcessing.get_user_repos(args.username, headers)
        print('Repository names:')
        for i in range(len(repo_names)):
            print(f'{i + 1}. {repo_names[i]}')
        print(f'{len(repo_names) + 1}. All repositories')
        repo_num = int(input(f'Enter a repository number to scrape ({len(repo_names) + 1} for all): ')) - 1
        if repo_num == len(repo_names):
            print('Scraping all repositories')
            all_files = RepoProcessing.get_all_files_for_user(args.username, headers)
            if args.entropy is not None:
                v = find_vulnerabilities(all_files, args.entropy)
            else:
                v = find_vulnerabilities(all_files)
            display_results(v, types)
        else:
            print(f'Scraping {repo_names[repo_num]} repository')
            repo_files = RepoProcessing.get_repo_files(args.username, repo_names[repo_num], headers)
            print(f'Received files from {args.username}/{repo_names[repo_num]}')
            if args.entropy is not None:
                v = find_vulnerabilities(repo_files, args.entropy)
            else:
                v = find_vulnerabilities(repo_files)
            display_results(v, types)
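# Example invocations, assuming a standard "python <script>.py" entry point
# (the script name main.py is an assumption; the flags are the ones defined above):
#   python main.py -u octocat -r Hello-World --api -p -s findings.txt
#   python main.py -u octocat -e -c --entropy 0.6 -t <github_api_token>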