def linear_regression_sklearn(df, xcols):
    """Fit a one-feature OLS regression with scikit-learn, plot the fit,
    and verify the coefficients against the closed-form normal equation.

    Fixes: removed the dead ``w = np.zeros(...)`` initialisation that was
    immediately overwritten, and the unused ``y_pred`` local.
    """
    y = df['target_proxy']
    X = df[list(xcols)[0]]
    # Standardize and split the training and test data
    X_std = standardize(X)
    ts = 0.3
    X_train, X_test, y_train, y_test = \
        train_test_split(X_std, y, test_size=ts, random_state=0)
    # NOTE(review): the model below is fit on the raw (unstandardized) full
    # sample rather than X_train -- confirm this is intentional
    X = np.transpose(np.array([X]))
    slr = LinearRegression()
    slr.fit(X, y.values)
    print('Slope: %.3f' % slr.coef_[0])
    print('Intercept: %.3f' % slr.intercept_)
    lin_regplot(X, y.values, slr)
    plt.xlabel('x val')
    plt.ylabel('Return')
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'scikit_lr_fit.png', dpi=300)
    plt.close()
    # Closed-form solution via the normal equation: w = (Xb'Xb)^-1 Xb'y
    Xb = np.hstack((np.ones((X.shape[0], 1)), X))
    z = np.linalg.inv(np.dot(Xb.T, Xb))
    w = np.dot(z, np.dot(Xb.T, y))
    print('Slope: %.3f' % w[1])
    print('Intercept: %.3f' % w[0])
def kfold_cross_validation(df, xcols, folds=10):
    """Run stratified k-fold CV with a StandardScaler -> PCA -> logistic
    regression pipeline and report per-fold and aggregate accuracy.

    Fixes: removed a leftover ``pdb.set_trace()`` breakpoint; migrated
    ``StratifiedKFold`` to the modern scikit-learn API (``n_splits`` +
    ``split()``; the old ``y=``/``n_folds=`` constructor was removed in
    scikit-learn 0.20).
    """
    y = df['target']
    X = df[list(xcols)]
    # Standardize and split the training and test data
    X_std = standardize(X)
    ts = 0.3
    X_train, X_test, y_train, y_test = train_test_split(
        X_std, y, test_size=ts, random_state=0)
    pipe_lr = Pipeline([('scl', StandardScaler()),
                        ('pca', PCA(n_components=2)),
                        ('clf', LogisticRegression(random_state=1))])
    # Without shuffling the fold assignment is deterministic
    kfold = StratifiedKFold(n_splits=folds).split(X_train, y_train)
    scores = []
    for k, (train, test) in enumerate(kfold):
        pipe_lr.fit(X_train[train], y_train.values[train])
        score = pipe_lr.score(X_train[test], y_train.values[test])
        scores.append(score)
        print('Fold: %s, Class dist.: %s, Acc: %.3f'
              % (k + 1, np.bincount(y_train.values[train]), score))
    print('\nCV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))
    # Cross-check with sklearn's own scorer
    scores = cross_val_score(estimator=pipe_lr, X=X_train, y=y_train.values,
                             cv=10, n_jobs=1)
    print('CV accuracy scores: %s' % scores)
    print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))
def heat_map(df, xcols):
    """Save a seaborn pairplot and a correlation heat map covering
    'target_proxy' plus the given feature columns.

    Fix: ``sns.pairplot(size=...)`` was renamed to ``height`` in seaborn
    0.9 and the old name was later removed.
    """
    y = df['target']
    X = df[list(xcols)]
    cols = ['target_proxy'] + list(xcols)
    # Standardize and split the training and test data
    X_std = standardize(X)
    ts = 0.3
    X_train, X_test, y_train, y_test = \
        train_test_split(X_std, y, test_size=ts, random_state=0)
    sns.set(style='whitegrid', context='notebook')
    sns.pairplot(df[cols], height=2.5)
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'corr_mat.png', dpi=300)
    plt.close()
    # Pearson correlation of every column against every other
    cm = np.corrcoef(df[cols].values.T)
    sns.set(font_scale=1.5)
    hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f',
                     annot_kws={'size': 15}, yticklabels=cols,
                     xticklabels=cols)
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'heat_map.png', dpi=300)
    plt.close()
def lda_scikit(df, xcols):
    """Project the features onto two linear discriminants with sklearn's
    LDA, fit a logistic regression in that space, and save decision-region
    charts for the train and test splits."""
    y = df['target']
    X = df[list(xcols)]
    # Standardize and split the training and test data
    X_std = standardize(X)
    holdout_frac = 0.3
    X_train, X_test, y_train, y_test = train_test_split(
        X_std, y, test_size=holdout_frac, random_state=0)
    lda = LDA(n_components=2)
    X_train_lda = lda.fit_transform(X_train, y_train)
    lr = LogisticRegression().fit(X_train_lda, y_train)
    # Training-set decision regions
    plot_decision_regions(X_train_lda, y_train.values, classifier=lr)
    plt.xlabel('LD 1')
    plt.ylabel('LD 2')
    plt.legend(loc='lower left')
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'lda_scikit.png', dpi=300)
    plt.close()
    # Test-set decision regions in the same projected space
    X_test_lda = lda.transform(X_test)
    plot_decision_regions(X_test_lda, y_test.values, classifier=lr)
    plt.xlabel('LD 1')
    plt.ylabel('LD 2')
    plt.legend(loc='lower left')
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'lda_scikit_test.png', dpi=300)
def random_forest_feature_importance(df, xcols):
    """Rank features by random-forest importance, chart the ranking, and
    print forest diagnostics.

    Fix: ``RandomForestClassifier.transform(...)`` was removed from
    scikit-learn (0.19); feature selection now goes through
    ``SelectFromModel``.
    """
    from sklearn.feature_selection import SelectFromModel
    y_s = df['target']
    x_s = df[list(xcols)]
    # Standardize and split the training and test data
    x_std = standardize(x_s)
    t_s = 0.3
    x_train, x_test, y_train, y_test = train_test_split(
        x_std, y_s, test_size=t_s, random_state=0)
    feat_labels = df[list(xcols)].columns
    forest = RandomForestClassifier(n_estimators=100, random_state=0,
                                    n_jobs=-1)
    forest.fit(x_train, y_train)
    importances = forest.feature_importances_
    indices = np.argsort(importances)[::-1]  # most important first
    for f in range(x_train.shape[1]):
        print("%2d) %-*s %f" % (f + 1, 30, feat_labels[indices[f]],
                                importances[indices[f]]))
    plt.title('Feature Importances')
    plt.bar(range(x_train.shape[1]), importances[indices],
            color='lightblue', align='center')
    plt.xticks(range(x_train.shape[1]), feat_labels[indices], rotation=90)
    plt.xlim([-1, x_train.shape[1]])
    plt.tight_layout()
    plt.savefig(IMG_ROOT + 'random_forest_{}.png'
                "".format(dt.datetime.now().strftime("%Y%m%d")), dpi=300)
    plt.close()
    # Keep only features whose importance clears the threshold
    x_selected = SelectFromModel(forest, threshold=0.05,
                                 prefit=True).transform(x_train)
    print(x_selected.shape)
    # Shows the percentage of falling into each class
    print("Class breakdowns: " + str(forest.predict_proba(x_test[0:1])))
    print('Training accuracy:', forest.score(x_train, y_train))
    print('Test accuracy:', forest.score(x_test, y_test))
def support_vector_machines(df, xcols, C=100):
    """Fit a linear-kernel SVC on standardized features, report train/test
    accuracy, save a decision-region chart, and return the fitted model."""
    y = df['target']
    X = df[list(xcols)]
    # Standardize, then hold out 30% for testing
    X_std = standardize(X)
    X_train, X_test, y_train, y_test = train_test_split(
        X_std, y, test_size=0.3, random_state=0)
    svm = SVC(kernel='linear', C=C, random_state=0)
    svm.fit(X_train, y_train)
    print('Training accuracy:', svm.score(X_train, y_train))
    print('Test accuracy:', svm.score(X_test, y_test))
    # Regions are drawn over the full standardized sample
    plot_decision_regions(X_std, y.values, classifier=svm)
    plt.title('Support Vector Machines')
    plt.xlabel(list(X.columns)[0])
    plt.ylabel(list(X.columns)[1])
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'svm_C' + str(C) + '.png', dpi=300)
    plt.close()
    return svm
def adalineGD(df, xcols, eta=0.001, n_iter=15):
    """Train an Adaline (adaptive linear neuron) classifier with batch
    gradient descent; save decision-region and SSE-per-epoch charts.

    Fix: ``eta`` and ``n_iter`` were previously ignored (the classifier
    was hard-coded to ``AdalineGD(n_iter=15, eta=0.001)``). They are now
    honoured, and the defaults match the old hard-coded values so default
    calls behave exactly as before.

    Parameters
    ==========
    eta : float
        learning rate between 0 and 1
    n_iter : int
        passes over the training dataset
    """
    t0 = time.time()
    # Perceptron-style binary targets must be -1/1, not 0/1
    y = df['target'].replace(0, -1)
    X = df[list(xcols)]
    # Standardize and split the training and test data
    X_std = standardize(X)
    ts = 0.3
    X_train, X_test, y_train, y_test = train_test_split(
        X_std, y, test_size=ts, random_state=0)
    ada = AdalineGD(n_iter=n_iter, eta=eta)
    ada.fit(X_std, y)
    plot_decision_regions(X_std, y.values, classifier=ada)
    plt.title('Adaline - Gradient Descent')
    plt.xlabel(list(X.columns)[0])
    plt.ylabel(list(X.columns)[1])
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.savefig(IMG_ROOT + 'dow/adaline_2.png', dpi=300)
    plt.close()
    # Convergence chart: SSE per epoch
    plt.plot(range(1, len(ada.cost_) + 1), ada.cost_, marker='o')
    plt.xlabel('Epochs')
    plt.ylabel('Sum-squared-error')
    plt.tight_layout()
    plt.savefig(IMG_ROOT + 'dow/adaline_3.png', dpi=300)
    plt.close()
def decision_tree(df, xcols, md=3):
    """Fit an entropy-criterion decision tree of depth ``md``, report
    train/test accuracy, chart decision regions, and export the tree to a
    graphviz .dot file."""
    y = df['target']
    X = df[list(xcols)]
    # Standardize, then hold out 30% for testing
    X_std = standardize(X)
    X_train, X_test, y_train, y_test = train_test_split(
        X_std, y, test_size=0.3, random_state=0)
    tree = DecisionTreeClassifier(criterion='entropy', max_depth=md,
                                  random_state=0)
    tree.fit(X_train, y_train)
    print('Training accuracy:', tree.score(X_train, y_train))
    print('Test accuracy:', tree.score(X_test, y_test))
    # Regions are drawn over the full standardized sample
    plot_decision_regions(X_std, y.values, classifier=tree)
    plt.title('Decision Tree')
    plt.xlabel(list(X.columns)[0])
    plt.ylabel(list(X.columns)[1])
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'dec_tree' + '.png', dpi=300)
    plt.close()
    export_graphviz(tree, out_file='tree.dot', feature_names=list(xcols))
def k_nearest_neighbors(df, xcols, k=5):
    """Run k-nearest neighbors algo and return the fitted classifier.

    Fix: the decision-region chart was titled 'Randaom Forest (Decision
    Tree Ensemble)' -- a copy/paste error from the random-forest helper;
    it is now labelled correctly.
    """
    y = df['target']
    X = df[list(xcols)]
    # Standardize and split the training and test data
    X_std = standardize(X)
    ts = 0.3
    X_train, X_test, y_train, y_test = train_test_split(
        X_std, y, test_size=ts, random_state=0)
    knn = KNeighborsClassifier(n_neighbors=k, p=2, metric='minkowski')
    knn.fit(X_train, y_train)
    print('Training accuracy:', knn.score(X_train, y_train))
    print('Test accuracy:', knn.score(X_test, y_test))
    plot_decision_regions(X_std, y.values, classifier=knn)
    plt.title('K-Nearest Neighbors')
    plt.xlabel(list(X.columns)[0])
    plt.ylabel(list(X.columns)[1])
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.savefig(IMG_ROOT + 'snp/kmeans/kkn.png', dpi=300)
    plt.close()
    return knn
def linear_regressor(df, xcols):
    """Fit the custom LinearRegressionGD on the first feature column, chart
    the SSE learning curve and the fitted line, then print the estimated
    return for an input of 20 plus the slope/intercept."""
    y = df['target_proxy']
    X = df[list(xcols)[0]]
    # Standardize and split the training and test data
    X_std = standardize(X)
    X_train, X_test, y_train, y_test = train_test_split(
        X_std, y, test_size=0.3, random_state=0)
    lr = LinearRegressionGD()
    X_train_col = np.transpose(np.array([X_train]))  # shape (n, 1)
    lr.fit(X_train_col, y_train)
    # Cost per training epoch
    plt.plot(range(1, lr.n_iter + 1), lr.cost_)
    plt.ylabel('SSE')
    plt.xlabel('Epoch')
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'cost.png', dpi=300)
    plt.close()
    lin_regplot(X_train_col, y_train, lr)
    plt.savefig(IMG_PATH + 'lin_reg_cost.png', dpi=300)
    plt.close()
    # Find the average return of a stock with PE = 20
    # Note: will give odd results if x values are standardized and input is not
    y_val_std = lr.predict([20.0])
    print("Estimated Return: %.3f" % y_val_std)
    print('Slope: %.3f' % lr.w_[1])
    print('Intercept: %.3f' % lr.w_[0])
def pml_knn_test():
    """Test our knn vs sklearn on the iris data set (petal length/width).

    Fixes: removed a duplicated ``y_combined`` computation and dead
    commented-out code.
    """
    # Get Data
    iris = datasets.load_iris()
    x_vals = iris.data[:, [2, 3]]
    y_vals = iris.target
    x_train, x_test, y_train, y_test = train_test_split(
        x_vals, y_vals, test_size=0.3, random_state=0)
    # NOTE(review): train and test are standardized independently, so the
    # two splits use slightly different scales -- confirm this is intended
    x_train_std = standardize(x_train)
    x_test_std = standardize(x_test)
    x_combined_std = np.vstack((x_train_std, x_test_std))
    y_combined = np.hstack((y_train, y_test))
    # Custom-KNN training data: features with the label as the last column
    iris_data = np.concatenate((x_train_std, np.array([y_train]).T), axis=1)
    # Sklearn KNN
    knn = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')
    knn.fit(x_train_std, y_train)
    plot_decision_regions(x_combined_std, y_combined, classifier=knn,
                          test_break_idx=range(105, 150))
    plt.xlabel('petal length [standardized]')
    plt.ylabel('petal width [standardized]')
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.savefig(IMG_ROOT + "PML/" + 'knn_sklearn.png', dpi=300)
    plt.close()
    # Custom KNN
    cust_knn = KNN(iris_data, k_nbrs=5, dont_div=True)
    plot_decision_regions(x_combined_std, y_combined, classifier=cust_knn,
                          test_break_idx=range(105, 150))
    plt.xlabel('petal length [cm]')
    plt.ylabel('petal width [cm]')
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.savefig(IMG_ROOT + "PML/" + 'knn_cust.png', dpi=300)
    plt.close()
def validation_curves(df, xcols):
    """Chart training vs. validation accuracy of a logistic-regression
    pipeline as a function of the inverse-regularization parameter C."""
    y = df['target']
    X = df[list(xcols)]
    # Standardize and split the training and test data
    X_std = standardize(X)
    X_train, X_test, y_train, y_test = train_test_split(
        X_std, y, test_size=0.3, random_state=0)
    pipe_lr = Pipeline([('scl', StandardScaler()),
                        ('clf', LogisticRegression(penalty='l2',
                                                   random_state=0))])
    param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
    train_scores, test_scores = validation_curve(
        estimator=pipe_lr, X=X_train, y=y_train, param_name='clf__C',
        param_range=param_range, cv=10)
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    # Mean accuracy with +/- one std-dev bands across the CV folds
    plt.plot(param_range, train_mean, color='blue', marker='o', markersize=5,
             label='training accuracy')
    plt.fill_between(param_range, train_mean + train_std,
                     train_mean - train_std, alpha=0.15, color='blue')
    plt.plot(param_range, test_mean, color='green', linestyle='--',
             marker='s', markersize=5, label='validation accuracy')
    plt.fill_between(param_range, test_mean + test_std,
                     test_mean - test_std, alpha=0.15, color='green')
    plt.grid()
    plt.xscale('log')
    plt.legend(loc='best')
    plt.xlabel('Parameter C')
    plt.ylabel('Accuracy')
    plt.ylim([0.8, 1.0])
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'val_curve.png', dpi=300)
    plt.close()
def learning_curves(df, xcols):
    """Chart training vs. validation accuracy of a logistic-regression
    pipeline as the training-set size grows from 10% to 100%."""
    y = df['target']
    X = df[list(xcols)]
    # Standardize and split the training and test data
    X_std = standardize(X)
    X_train, X_test, y_train, y_test = train_test_split(
        X_std, y, test_size=0.3, random_state=0)
    pipe_lr = Pipeline([('scl', StandardScaler()),
                        ('clf', LogisticRegression(penalty='l2',
                                                   random_state=0))])
    train_sizes, train_scores, test_scores = learning_curve(
        estimator=pipe_lr, X=X_train, y=y_train,
        train_sizes=np.linspace(0.1, 1.0, 10), cv=10, n_jobs=1)
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    # Mean accuracy with +/- one std-dev bands across the CV folds
    plt.plot(train_sizes, train_mean, color='blue', marker='o', markersize=5,
             label='training accuracy')
    plt.fill_between(train_sizes, train_mean + train_std,
                     train_mean - train_std, alpha=0.15, color='blue')
    plt.plot(train_sizes, test_mean, color='green', linestyle='--',
             marker='s', markersize=5, label='validation accuracy')
    plt.fill_between(train_sizes, test_mean + test_std,
                     test_mean - test_std, alpha=0.15, color='green')
    plt.grid()
    plt.xlabel('Number of training samples')
    plt.ylabel('Accuracy')
    plt.legend(loc='best')
    plt.ylim([0.8, 1.0])
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'learning_curve.png', dpi=300)
    plt.close()
def precision_vs_recall(df, xcols):
    """Fit an SVC pipeline, chart its confusion matrix, print precision /
    recall / F1, then grid-search C and gamma with an F1 scorer."""
    y = df['target']
    X = df[list(xcols)]
    # Standardize and split the training and test data
    X_std = standardize(X)
    # Need this just for specific cases, need postive results to be a value of 1
    y = y.map({4: 1, 0: 0})
    X_train, X_test, y_train, y_test = train_test_split(
        X_std, y, test_size=0.3, random_state=0)
    pipe_svc = Pipeline([('scl', StandardScaler()),
                         ('clf', SVC(random_state=1))])
    pipe_svc.fit(X_train, y_train)
    y_pred = pipe_svc.predict(X_test)
    confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
    print(confmat)
    # Render the confusion matrix with cell counts overlaid
    fig, ax = plt.subplots(figsize=(2.5, 2.5))
    ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
    for i in range(confmat.shape[0]):
        for j in range(confmat.shape[1]):
            ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')
    plt.xlabel('predicted label')
    plt.ylabel('true label')
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'confusion_matrix.png', dpi=300)
    plt.close()
    print('Precision: %.3f' % precision_score(y_true=y_test, y_pred=y_pred))
    print('Recall: %.3f' % recall_score(y_true=y_test, y_pred=y_pred))
    print('F1: %.3f' % f1_score(y_true=y_test, y_pred=y_pred))
    # Tune C / gamma / kernel against F1 rather than raw accuracy
    scorer = make_scorer(f1_score, pos_label=1)
    c_gamma_range = [0.01, 0.1, 1.0, 10.0]
    param_grid = [{'clf__C': c_gamma_range,
                   'clf__kernel': ['linear']},
                  {'clf__C': c_gamma_range,
                   'clf__gamma': c_gamma_range,
                   'clf__kernel': ['rbf']}]
    gs = GridSearchCV(estimator=pipe_svc, param_grid=param_grid,
                      scoring=scorer, cv=10, n_jobs=-1)
    gs = gs.fit(X_train, y_train)
    print(gs.best_score_)
    print(gs.best_params_)
def run_perceptron(train_df, xcols, eta=0.1, n_iter=10):
    '''
    Takes the pruned dataframe and runs it through the perceptron class

    Fix: removed a leftover pdb.set_trace() breakpoint that halted every run.

    Parameters
    ==========
    train_df : dataframe
        dataframe with the inputs and target
    eta : float
        learning rate between 0 and 1
    n_iter : int
        passes over the training dataset

    Return
    ======
    NONE
    '''
    time0 = time.time()
    # Perceptron binary classifier expects targets of -1 and 1, not 0 and 1
    y_df = train_df['target'].replace(0, -1)
    x_df = train_df[list(xcols)]
    # Standardize and split the training and test data
    x_std = standardize(x_df)
    t_s = 0.3
    x_train, x_test, y_train, y_test = train_test_split(
        x_std, y_df, test_size=t_s, random_state=0)
    plt.figure(figsize=(7, 4))
    plt.legend()
    ppn = Perceptron(eta, n_iter)
    ppn.fit(x_train, y_train.values)
    print('Training accuracy:', ppn.score(x_train, y_train))
    print('Test accuracy:', ppn.score(x_test, y_test))
    plot_decision_regions(x_train, y_train.values, classifier=ppn)
    plt.xlabel(x_df.columns[0])
    plt.ylabel(x_df.columns[1])
    plt.savefig(IMG_ROOT + "perceptron_{}.png"
                "".format(dt.datetime.now().strftime("%Y%m%d")))
    plt.close()
    # Convergence chart: misclassifications per pass
    plt.plot(range(1, len(ppn.errors_) + 1), ppn.errors_, marker='o')
    plt.xlabel('Iterations')
    plt.ylabel('Number of misclassifications')
    plt.savefig(IMG_ROOT + "perceptron_misses_{}.png"
                "".format(dt.datetime.now().strftime("%Y%m%d")))
    plt.close()
    time1 = time.time()
    print("Done training data and creating charts, took {0} seconds"
          "".format(time1 - time0))
def random_forest_regression(df, xcols):
    """Fit a depth-3 regression tree on the first feature (for the fit
    chart) and a 1000-tree random forest on the standardized train split;
    print MSE / R^2 and save a residual plot.

    Fix: ``criterion='mse'`` was renamed to ``'squared_error'`` in
    scikit-learn 1.0 and removed in 1.2.
    """
    y = df['target_proxy']
    X = df[list(xcols)[0]]
    X = np.transpose(np.array([X]))  # shape (n, 1)
    # Standardize and split the training and test data
    X_std = standardize(X)
    ts = 0.3
    X_train, X_test, y_train, y_test = \
        train_test_split(X_std, y, test_size=ts, random_state=0)
    # NOTE(review): the tree is fit on the raw full sample, while the forest
    # uses the standardized train split -- confirm this is intentional
    tree = DecisionTreeRegressor(max_depth=3)
    tree.fit(X, y)
    sort_idx = X.flatten().argsort()
    lin_regplot(X[sort_idx], y[sort_idx], tree)
    plt.xlabel('x-val')
    plt.ylabel('Return')
    plt.savefig(IMG_PATH + 'tree_regression.png', dpi=300)
    plt.close()
    forest = RandomForestRegressor(n_estimators=1000,
                                   criterion='squared_error',
                                   random_state=1, n_jobs=-1)
    forest.fit(X_train, y_train)
    y_train_pred = forest.predict(X_train)
    y_test_pred = forest.predict(X_test)
    print('MSE train: %.3f, test: %.3f' % (
        mean_squared_error(y_train, y_train_pred),
        mean_squared_error(y_test, y_test_pred)))
    print('R^2 train: %.3f, test: %.3f' % (
        r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)))
    # Residuals vs. predictions for both splits
    plt.scatter(y_train_pred, y_train_pred - y_train, c='black', marker='o',
                s=35, alpha=0.5, label='Training data')
    plt.scatter(y_test_pred, y_test_pred - y_test, c='lightgreen', marker='s',
                s=35, alpha=0.7, label='Test data')
    plt.xlabel('Predicted values')
    plt.ylabel('Residuals')
    plt.legend(loc='best')
    plt.hlines(y=0, xmin=-10, xmax=50, lw=2, color='red')
    plt.xlim([-10, 50])
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'slr_residuals.png', dpi=300)
def grid_search_analysis(df, xcols):
    """Grid-search an SVC pipeline over C / gamma / kernel, score the best
    configuration on the holdout, then run nested cross-validation for both
    the SVC and a decision tree for comparison."""
    y = df['target']
    X = df[list(xcols)]
    # Standardize and split the training and test data
    X_std = standardize(X)
    X_train, X_test, y_train, y_test = train_test_split(
        X_std, y, test_size=0.3, random_state=0)
    pipe_svc = Pipeline([('scl', StandardScaler()),
                         ('clf', SVC(random_state=1))])
    param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
    param_grid = [{'clf__C': param_range,
                   'clf__kernel': ['linear']},
                  {'clf__C': param_range,
                   'clf__gamma': param_range,
                   'clf__kernel': ['rbf']}]
    gs = GridSearchCV(estimator=pipe_svc, param_grid=param_grid,
                      scoring='accuracy', cv=10, n_jobs=-1)
    gs = gs.fit(X_train, y_train)
    print(gs.best_score_)
    print(gs.best_params_)
    # Refit the winning configuration and score it on the holdout
    clf = gs.best_estimator_
    clf.fit(X_train, y_train)
    print('Test accuracy: %.3f' % clf.score(X_test, y_test))
    # Nested CV: 2-fold inner search inside a 5-fold outer estimate
    gs = GridSearchCV(estimator=pipe_svc, param_grid=param_grid,
                      scoring='accuracy', cv=2)
    scores = cross_val_score(gs, X_train, y_train, scoring='accuracy', cv=5)
    print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))
    # Same nested-CV protocol for a depth-tuned decision tree
    gs = GridSearchCV(estimator=DecisionTreeClassifier(random_state=0),
                      param_grid=[{'max_depth': [1, 2, 3, 4, 5, 6, 7, None]}],
                      scoring='accuracy', cv=2)
    scores = cross_val_score(gs, X_train, y_train, scoring='accuracy', cv=5)
    print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))
def eval_on_curr_companies(model, df, inputs):
    """Predict a class for every row of ``df`` with ``model`` and return a
    dict mapping each predicted class to the list of tickers in it.

    Fixes: removed a leftover ``pdb.set_trace()`` breakpoint; replaced the
    bare try/except grouping idiom with ``dict.setdefault`` (the bare
    ``except:`` would also have swallowed KeyboardInterrupt etc.).
    """
    df_ind = df[['ticker', 'date', 'month']]
    # Standardize only the model-input columns; keep identifiers alongside
    df_trimmed = pd.DataFrame(standardize(df[inputs]), columns=inputs)
    df_combine = pd.concat([df_ind.reset_index(drop=True), df_trimmed],
                           axis=1)
    predictions = {}
    for ix, row in df_combine.iterrows():
        print(row['ticker'] + " " + row['date'] + " " + str(row['month']),
              end="")
        pred = model.predict(row[inputs])[0]
        predictions.setdefault(pred, []).append(row['ticker'])
        print(" Class Prediction: " + str(pred))
    return predictions
def run_perceptron_multi(df, xcols, eta=0.1, n_iter=40):
    """Train scikit-learn's multi-class perceptron on standardized features,
    print test accuracy, and save a decision-region chart.

    Fixes: ``eta`` and ``n_iter`` were previously ignored (hard-coded
    ``eta0=0.1, n_iter=40``); the defaults now match the old hard-coded
    values so default calls behave exactly as before. A large block of dead
    commented-out scatter-plot code was removed.
    """
    time0 = time.time()
    y = df['target']
    X = df[list(xcols)]
    # Standardize and split the training and test data
    X_std = standardize(X)
    ts = 0.3
    X_train, X_test, y_train, y_test = train_test_split(
        X_std, y, test_size=ts, random_state=0)
    ppn = perceptron_skl(n_iter=n_iter, eta0=eta, random_state=0)
    ppn.fit(X_train, y_train)
    y_pred = ppn.predict(X_test)
    print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
    plot_decision_regions(X_train, y_train.values, classifier=ppn)
    plt.savefig(IMG_ROOT + "dow/perceptron_multi.png")
    plt.close()
    time1 = time.time()
    print("Done training data and creating charts, took {0} seconds"
          "".format(time1 - time0))
def logisticRegression(df, xcols, C=100, penalty='l2'):
    """Fit a regularized logistic regression on standardized features,
    print diagnostics, chart 2-D decision regions when possible, and
    return the fitted model.

    C is the inverse regularization strength (C = 1/lambda): smaller C
    means stronger regularization and smaller weights. ``penalty`` picks
    the shrinkage norm used to curb overfitting.
    """
    # Need xcols to be a tuple for the timeme method to work VERY HACKY
    y = df['target']
    X = df[list(xcols)]
    # Standardize and split the training and test data
    X_std = standardize(X)
    X_train, X_test, y_train, y_test = train_test_split(
        X_std, y, test_size=0.3, random_state=0)
    lr = LogisticRegression(C=C, random_state=0, penalty=penalty)
    lr.fit(X_train, y_train)
    # Per-class probabilities for the first test sample
    print("Class breakdowns: " + str(lr.predict_proba(X_test[0:1])))
    print('Training accuracy:', lr.score(X_train, y_train))
    print('Test accuracy:', lr.score(X_test, y_test))
    print("y-intercept:" + str(lr.intercept_))
    print("coeffs:" + str(lr.coef_))
    try:
        # Region plot is only meaningful for two features
        plot_decision_regions(X_train, y_train.values, classifier=lr)
        plt.title('Logistic Regression')
        plt.xlabel(list(X.columns)[0])
        plt.ylabel(list(X.columns)[1])
        plt.legend(loc='upper left')
        plt.tight_layout()
        plt.savefig(IMG_ROOT + 'dow/log_reg_1.png', dpi=300)
        plt.close()
    except Exception as e:
        print("May have more than 2 variables")
    return lr
def ransac(df, xcols):
    """Robust single-feature regression with RANSAC to tolerate outliers;
    charts inliers, outliers, and the fitted line.

    Fix: the ``residual_metric`` argument was removed from RANSACRegressor
    in scikit-learn 0.20; for a 1-D target the old sum-of-absolute-residuals
    lambda is equivalent to ``loss='absolute_error'``.
    """
    y = df['target_proxy']
    X = df[list(xcols)[0]]
    X = np.transpose(np.array([X]))  # shape (n, 1)
    # Standardize and split the training and test data
    X_std = standardize(X)
    ts = 0.3
    X_train, X_test, y_train, y_test = \
        train_test_split(X_std, y, test_size=ts, random_state=0)
    ransac = RANSACRegressor(LinearRegression(),
                             max_trials=100,
                             min_samples=50,
                             loss='absolute_error',
                             residual_threshold=5.0,
                             random_state=0)
    ransac.fit(X, y)
    inlier_mask = ransac.inlier_mask_
    outlier_mask = np.logical_not(inlier_mask)
    # Evaluate the robust fit on a small reference grid for the line plot
    line_X = np.arange(3, 10, 1)
    line_y_ransac = ransac.predict(line_X[:, np.newaxis])
    plt.scatter(X[inlier_mask], y[inlier_mask], c='blue', marker='o',
                label='Inliers')
    plt.scatter(X[outlier_mask], y[outlier_mask], c='lightgreen', marker='s',
                label='Outliers')
    plt.plot(line_X, line_y_ransac, color='red')
    plt.xlabel('x-val')
    plt.ylabel('Returns')
    plt.legend(loc='best')
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'ransac_fit.png', dpi=300)
    plt.close()
def principal_component_analysis(df, xcols):
    """Manual PCA on the training split: eigen-decompose the covariance
    matrix, chart explained variance, and project onto the top two PCs.

    Fix: the explained-variance bar/step charts hard-coded ``range(1, 14)``
    (exactly 13 features); they now size themselves to the number of
    features actually supplied.
    """
    y = df['target']
    X = df[list(xcols)]
    # Standardize and split the training and test data
    X_std = standardize(X)
    ts = 0.3
    X_train, X_test, y_train, y_test = train_test_split(
        X_std, y, test_size=ts, random_state=0)
    cov_mat = np.cov(X_train.T)
    eigen_vals, eigen_vecs = np.linalg.eig(cov_mat)
    print('Eigenvalues \n%s' % eigen_vals)
    tot = sum(eigen_vals)
    var_exp = [(i / tot) for i in sorted(eigen_vals, reverse=True)]
    cum_var_exp = np.cumsum(var_exp)
    comps = range(1, len(var_exp) + 1)  # one bar per principal component
    plt.bar(comps, var_exp, alpha=0.5, align='center',
            label='individual explained variance')
    plt.step(comps, cum_var_exp, where='mid',
             label='cumulative explained variance')
    plt.ylabel('Explained variance ratio')
    plt.xlabel('Principal components')
    plt.legend(loc='best')
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'pca1.png', dpi=300)
    plt.close()
    # Pair each eigenvalue with its eigenvector, sorted largest first
    eigen_pairs = [(np.abs(eigen_vals[i]), eigen_vecs[:, i])
                   for i in range(len(eigen_vals))]
    eigen_pairs.sort(reverse=True)
    # Projection matrix from the two leading eigenvectors
    w = np.hstack((eigen_pairs[0][1][:, np.newaxis],
                   eigen_pairs[1][1][:, np.newaxis]))
    X_train_pca = X_train.dot(w)
    colors = ['r', 'b', 'g']
    markers = ['s', 'x', 'o']
    for l, c, m in zip(np.unique(y_train), colors, markers):
        plt.scatter(X_train_pca[y_train.values == l, 0],
                    X_train_pca[y_train.values == l, 1],
                    c=c, label=l, marker=m)
    plt.xlabel('PC 1')
    plt.ylabel('PC 2')
    plt.legend(loc='lower left')
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'pca2.png', dpi=300)
def polynomial_regression(df, xcols):
    """Compare a linear and a quadratic fit of the first feature against
    the proxy target; chart both fits and print MSE / R^2 for each."""
    y = df['target_proxy']
    X = df[list(xcols)[0]]
    X = np.transpose(np.array([X]))  # shape (n, 1)
    # Standardize and split the training and test data
    X_std = standardize(X)
    X_train, X_test, y_train, y_test = train_test_split(
        X_std, y, test_size=0.3, random_state=0)
    lr = LinearRegression()
    pr = LinearRegression()
    quadratic = PolynomialFeatures(degree=2)
    X_quad = quadratic.fit_transform(X)
    # fit linear features
    lr.fit(X, y)
    X_fit = np.arange(-2, 50, 1)[:, np.newaxis]
    y_lin_fit = lr.predict(X_fit)
    # fit quadratic features
    pr.fit(X_quad, y)
    y_quad_fit = pr.predict(quadratic.fit_transform(X_fit))
    # plot results
    plt.scatter(X, y.values, label='training points')
    plt.plot(X_fit, y_lin_fit, label='linear fit', linestyle='--')
    plt.plot(X_fit, y_quad_fit, label='quadratic fit')
    plt.legend(loc='best')
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'poly_regression.png', dpi=300)
    plt.close()
    y_lin_pred = lr.predict(X)
    y_quad_pred = pr.predict(X_quad)
    print('Training MSE linear: %.3f, quadratic: %.3f' % (
        mean_squared_error(y, y_lin_pred),
        mean_squared_error(y, y_quad_pred)))
    print('Training R^2 linear: %.3f, quadratic: %.3f' % (
        r2_score(y, y_lin_pred), r2_score(y, y_quad_pred)))
def logistic_regression_feature_importance(df, xcols, C=100, penalty='l2'):
    """Rank features by the magnitude of their logistic-regression
    coefficients, chart the ranking, and print model diagnostics.

    Fix: ``LogisticRegression.transform(...)`` was removed from
    scikit-learn (0.19); feature selection now goes through
    ``SelectFromModel``.
    """
    from sklearn.feature_selection import SelectFromModel
    y = df['target']
    X = df[list(xcols)]
    # Standardize and split the training and test data
    X_std = standardize(X)
    ts = 0.3
    X_train, X_test, y_train, y_test = train_test_split(
        X_std, y, test_size=ts, random_state=0)
    feat_labels = df[list(xcols)].columns
    lr = LogisticRegression(C=C, random_state=0, penalty=penalty)
    lr.fit(X_train, y_train)
    importances = lr.coef_[0]
    # Rank by absolute coefficient size (the sign only encodes direction)
    indices = np.argsort(abs(importances))[::-1]
    for f in range(X_train.shape[1]):
        print("%2d) %-*s %f" % (f + 1, 30, feat_labels[indices[f]],
                                importances[indices[f]]))
    plt.title('Feature Importances')
    plt.bar(range(X_train.shape[1]), importances[indices],
            color='lightblue', align='center')
    plt.xticks(range(X_train.shape[1]), feat_labels[indices], rotation=90)
    plt.xlim([-1, X_train.shape[1]])
    plt.tight_layout()
    plt.savefig(IMG_ROOT + 'snp/logistic_regression_feat.png', dpi=300)
    plt.close()
    # Keep only features whose coefficient magnitude clears the threshold
    X_selected = SelectFromModel(lr, threshold=0.05,
                                 prefit=True).transform(X_train)
    print(X_selected.shape)
    # Shows the percentage of falling into each class
    print("Class breakdowns: " + str(lr.predict_proba(X_test[0:1])))
    print('Training accuracy:', lr.score(X_train, y_train))
    print('Test accuracy:', lr.score(X_test, y_test))
    print("y-intercept:" + str(lr.intercept_))
    print("coeffs:" + str(lr.coef_))
def random_forest(df, xcols, estimators=5):
    """Run random forest algorithm: fit, print diagnostics, and chart
    decision regions.

    Fixes: removed a leftover ``pdb.set_trace()`` breakpoint; corrected the
    chart-title typo ('Randaom').
    """
    y = df['target']
    X = df[list(xcols)]
    # Standardize and split the training and test data
    X_std = standardize(X)
    t_s = 0.3
    X_train, X_test, y_train, y_test = train_test_split(
        X_std, y, test_size=t_s, random_state=0)
    forest = RandomForestClassifier(criterion='entropy',
                                    n_estimators=estimators,
                                    random_state=1, n_jobs=3)
    forest.fit(X_train, y_train)
    # Shows the percentage of falling into each class
    print("Class breakdowns: " + str(forest.predict_proba(X_test[0:1])))
    print('Training accuracy:', forest.score(X_train, y_train))
    print('Test accuracy:', forest.score(X_test, y_test))
    print("Feature Importances :" + str(forest.feature_importances_))
    plot_decision_regions(X_std, y.values, classifier=forest)
    plt.title('Random Forest (Decision Tree Ensemble)')
    plt.xlabel(list(X.columns)[0])
    plt.ylabel(list(X.columns)[1])
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.savefig(IMG_ROOT + 'snp/kmeans/random_forest.png', dpi=300)
    plt.close()
def nonlinear_svm(df, xcols, C=100, gamma=0.10):
    """Fit an RBF-kernel SVC on standardized features, report train/test
    accuracy, and save a decision-region chart."""
    y = df['target']
    X = df[list(xcols)]
    # Standardize, then hold out 30% for testing
    X_std = standardize(X)
    X_train, X_test, y_train, y_test = train_test_split(
        X_std, y, test_size=0.3, random_state=0)
    svm = SVC(kernel='rbf', random_state=0, gamma=gamma, C=C)
    svm.fit(X_train, y_train)
    print('Training accuracy:', svm.score(X_train, y_train))
    print('Test accuracy:', svm.score(X_test, y_test))
    # Regions are drawn over the full standardized sample
    plot_decision_regions(X_std, y.values, classifier=svm)
    plt.title('Support Vector Machines - Non Linear')
    plt.xlabel(list(X.columns)[0])
    plt.ylabel(list(X.columns)[1])
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'svm_nonlinear_C' + str(C) + '.png', dpi=300)
    plt.close()
def linear_discriminant_analysis(df, xcols):
    """Manual LDA: build within-/between-class scatter matrices, solve the
    generalized eigenproblem, chart per-discriminant 'discriminability',
    and project the training data onto the top two discriminants.

    Fixes: doubled backslashes (``'\\\\n'``) that printed a literal
    backslash-n instead of a newline; removed a duplicate
    ``d = len(xcols)`` assignment.
    """
    y = df['target']
    X = df[list(xcols)]
    # Standardize and split the training and test data
    X_std = standardize(X)
    ts = 0.3
    X_train, X_test, y_train, y_test = \
        train_test_split(X_std, y, test_size=ts, random_state=0)
    np.set_printoptions(precision=4)
    # Per-class feature means
    mean_vecs = []
    y_set = list(y.unique())
    for label in y_set:
        mean_vecs.append(np.mean(X_train[y_train.values == label], axis=0))
    d = len(xcols)  # number of features
    S_W = np.zeros((d, d))
    for label, mv in zip(y_set, mean_vecs):
        class_scatter = np.zeros((d, d))  # scatter matrix for each class
        for row in X_train[y_train.values == label]:
            row, mv = row.reshape(d, 1), mv.reshape(d, 1)  # column vectors
            class_scatter += (row - mv).dot((row - mv).T)
        S_W += class_scatter  # sum class scatter matrices
    print('Within-class scatter matrix: %s' % (S_W))
    print('Class label distribution: %s' % np.bincount(y_train))
    # Covariance-based (scaled) within-class scatter
    S_W = np.zeros((d, d))
    for label, mv in zip(y_set, mean_vecs):
        class_scatter = np.cov(X_train[y_train.values == label].T)
        S_W += class_scatter
    print('Scaled within-class scatter matrix: %s' % (S_W))
    mean_overall = np.mean(X_train, axis=0)
    S_B = np.zeros((d, d))
    for i, mean_vec in enumerate(mean_vecs):
        n = X_train[y_train == i + 1, :].shape[0]
        mean_vec = mean_vec.reshape(d, 1)  # make column vector
        mean_overall = mean_overall.reshape(d, 1)  # make column vector
        S_B += n * (mean_vec - mean_overall).dot(
            (mean_vec - mean_overall).T)
    print('Between-class scatter matrix: %s' % (S_B))
    # Generalized eigenproblem S_W^-1 S_B
    eigen_vals, eigen_vecs = np.linalg.eig(np.linalg.inv(S_W).dot(S_B))
    eigen_pairs = [(np.abs(eigen_vals[i]), eigen_vecs[:, i])
                   for i in range(len(eigen_vals))]
    # Sort the (eigenvalue, eigenvector) tuples from high to low
    eigen_pairs = sorted(eigen_pairs, key=lambda k: k[0], reverse=True)
    print('Eigenvalues in decreasing order:\n')
    for eigen_val in eigen_pairs:
        print(eigen_val[0])
    tot = sum(eigen_vals.real)
    discr = [(i / tot) for i in sorted(eigen_vals.real, reverse=True)]
    cum_discr = np.cumsum(discr)
    plt.bar(range(0, d), discr, alpha=0.5, align='center',
            label='individual "discriminability"')
    plt.step(range(0, d), cum_discr, where='mid',
             label='cumulative "discriminability"')
    plt.ylabel('"discriminability" ratio')
    plt.xlabel('Linear Discriminants')
    plt.ylim([-0.1, 1.1])
    plt.legend(loc='best')
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'lda1.png', dpi=300)
    plt.close()
    # Projection matrix from the two leading (real) eigenvectors
    w = np.hstack((eigen_pairs[0][1][:, np.newaxis].real,
                   eigen_pairs[1][1][:, np.newaxis].real))
    print('Matrix W:\n', w)
    X_train_lda = X_train.dot(w)
    colors = ['r', 'b', 'g']
    markers = ['s', 'x', 'o']
    for l, c, m in zip(np.unique(y_train), colors, markers):
        plt.scatter(X_train_lda[y_train.values == l, 0] * (-1),
                    X_train_lda[y_train.values == l, 1] * (-1),
                    c=c, label=l, marker=m)
    plt.xlabel('LD 1')
    plt.ylabel('LD 2')
    plt.legend(loc='lower right')
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'lda2.png', dpi=300)
def sbs_run(train_df, xcols, k_feats=1, est=None,
            test=pd.DataFrame(), name=None):
    """Sequential Backward Selection over the feature columns.

    Starting from the full set, sequentially remove the feature x that
    least reduces (or most increases) the value of the predictive score.

    Parameters
    ----------
    train_df : DataFrame with a 'target' column plus the columns in xcols.
    xcols : iterable of feature column names.
    k_feats : number of chosen columns to keep.
    est : the learning algorithm used to rank the features; defaults to a
        fresh KNeighborsClassifier(n_neighbors=3).  (The original used a
        classifier *instance* as the default argument, which was fitted in
        place and therefore shared state across calls.)
    test : optional hold-out DataFrame with a 'target' column; when empty,
        a 30% split of train_df is used instead.
    name : tag embedded in the saved plot's filename.

    Returns the removal order, last-removed first (only populated when
    k_feats == 1); otherwise an empty list.
    """
    if est is None:
        # Build the default estimator per call to avoid the shared
        # mutable-default pitfall.
        est = KNeighborsClassifier(n_neighbors=3)

    y_val = train_df['target']
    x_val = train_df[list(xcols)]

    # Standardize and split the training and test data
    x_std = standardize(x_val)
    if test.empty:
        test_sz = 0.3
        x_train, x_test, y_train, y_test = train_test_split(
            x_std, y_val, test_size=test_sz, random_state=0)
    else:
        x_train = x_std
        y_train = train_df['target']
        # BUG FIX: read the target column BEFORE slicing ``test`` down to
        # the feature columns — the original sliced first, so 'target'
        # was already gone when y_test was taken (KeyError).
        y_test = test['target']
        test = test[list(xcols)]
        x_test = standardize(test)

    # selecting features
    sbs = SBS(est, k_features=k_feats)
    sbs.fit(x_train, y_train)
    order = []
    if k_feats == 1:
        print("Removed Order, first to last: "
              "" + str(list(x_val.columns[sbs.removed_order +
                                          list(sbs.subsets_[-1])])))
        order = list(x_val.columns[sbs.removed_order +
                                   list(sbs.subsets_[-1])])[::-1]
    else:
        print("Removed Order, first to last:" +
              str(list(x_val.columns[sbs.removed_order])))
        print("Chosen columns: " +
              str(list(x_val.columns[list(sbs.subsets_[-1])])))

    # plotting performance of feature subsets
    # This will chart the accuracy of each model as we remove features
    k_feat = [len(k) for k in sbs.subsets_]
    plt.plot(k_feat, sbs.scores_, marker='o')
    plt.ylim([0.0, 1.1])
    plt.ylabel('Accuracy')
    plt.xlabel('Number of features')
    plt.grid()
    plt.tight_layout()
    dt_time = dt.datetime.now().strftime("%Y%m%d_%H_%M")
    plt.savefig(IMG_ROOT + 'sbs_{}_{}.png'.format(name, dt_time), dpi=300)
    plt.close()

    # Training and test accuracy with all variables
    ks5 = list(sbs.subsets_[-1])
    est.fit(x_train, y_train)
    print("With all variables:")
    print('Training accuracy:', est.score(x_train, y_train))
    print('Test accuracy:', est.score(x_test, y_test))

    # Training and test accuracy with only chosen variables for model
    print("With only chosen (no:{}) variables:".format(k_feats))
    est.fit(x_train[:, ks5], y_train)
    print('Training accuracy:', est.score(x_train[:, ks5], y_train))
    print('Test accuracy:', est.score(x_test[:, ks5], y_test))
    return order
def adaboost(df, xcols):
    """Compare a depth-1 decision tree against an AdaBoost ensemble of it.

    Fits both models on a standardized train/test split, prints their
    train/test accuracies, and saves a side-by-side decision-region plot.

    Parameters
    ----------
    df : DataFrame with a 'target' column plus the columns named in xcols.
        Targets are expected to use the values 4 (positive) and 0.
    xcols : feature column names.  The decision-region plot meshes over the
        first two standardized features, so the classifiers must be
        trained on exactly two columns for the contour predict to work.

    Side effects: writes 'adaboost.png' under IMG_PATH.
    """
    y = df['target']
    X = df[list(xcols)]

    # Need this just for specific cases, need positive results to be a
    # value of 1.
    # NOTE(review): .map() turns any label other than 4 or 0 into NaN —
    # confirm targets are restricted to {0, 4} upstream.
    y = y.map({4: 1, 0: 0})

    # Standardize and split the training and test data
    X_std = standardize(X)
    ts = 0.3
    X_train, X_test, y_train, y_test = \
        train_test_split(X_std, y, test_size=ts, random_state=0)

    # Weak learner: a single decision stump.
    tree = DecisionTreeClassifier(criterion='entropy', max_depth=1,
                                  random_state=0)
    ada = AdaBoostClassifier(base_estimator=tree, n_estimators=500,
                             learning_rate=0.1, random_state=0)

    tree = tree.fit(X_train, y_train)
    y_train_pred = tree.predict(X_train)
    y_test_pred = tree.predict(X_test)
    tree_train = accuracy_score(y_train, y_train_pred)
    tree_test = accuracy_score(y_test, y_test_pred)
    print('Decision tree train/test accuracies %.3f/%.3f'
          % (tree_train, tree_test))

    ada = ada.fit(X_train, y_train)
    y_train_pred = ada.predict(X_train)
    y_test_pred = ada.predict(X_test)
    ada_train = accuracy_score(y_train, y_train_pred)
    ada_test = accuracy_score(y_test, y_test_pred)
    print('AdaBoost train/test accuracies %.3f/%.3f'
          % (ada_train, ada_test))

    # Mesh over the first two features for the decision-region contours.
    x_min, x_max = X_train[:, 0].min() - 1, X_train[:, 0].max() + 1
    y_min, y_max = X_train[:, 1].min() - 1, X_train[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                         np.arange(y_min, y_max, 0.1))

    f, axarr = plt.subplots(1, 2, sharex='col', sharey='row', figsize=(8, 3))
    for idx, clf, tt in zip([0, 1], [tree, ada],
                            ['Decision Tree', 'AdaBoost']):
        clf.fit(X_train, y_train)
        Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        axarr[idx].contourf(xx, yy, Z, alpha=0.3)
        axarr[idx].scatter(X_train[y_train.values == 0, 0],
                           X_train[y_train.values == 0, 1],
                           c='blue', marker='^')
        axarr[idx].scatter(X_train[y_train.values == 1, 0],
                           X_train[y_train.values == 1, 1],
                           c='red', marker='o')
        axarr[idx].set_title(tt)
    axarr[0].set_ylabel(xcols[0], fontsize=12)
    # NOTE(review): text coordinates are hard-coded for a specific data
    # range — may land outside the axes for other inputs.
    plt.text(10.2, -1.2, s=xcols[1], ha='center', va='center', fontsize=12)
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'adaboost.png', bbox_inches='tight', dpi=300)
    # BUG FIX: close the figure — every other plotting helper in this
    # file does, and leaving it open leaks into subsequent plots.
    plt.close()
def nonlinear(df, xcols):
    """Fit and plot linear, quadratic and cubic regressions on one feature.

    Uses only the FIRST column named in xcols as the single regressor
    against 'target_proxy', compares R^2 across polynomial degrees 1-3,
    then repeats a linear fit on log(X) vs sqrt(y).

    Parameters
    ----------
    df : DataFrame with a 'target_proxy' column plus the columns in xcols.
    xcols : feature column names; only the first is used.

    Side effects: writes 'nonlinear_regr.png' and 'sqrt_log.png' under
    IMG_PATH.
    """
    y = df['target_proxy']
    X = df[list(xcols)[0]]
    X = np.transpose(np.array([X]))  # shape (n, 1) for sklearn

    # Standardize and split the training and test data
    # NOTE(review): the split results are never used below — the models
    # are fitted on the full, unstandardized X. Kept for parity with the
    # rest of the file; candidate for removal.
    X_std = standardize(X)
    ts = 0.3
    X_train, X_test, y_train, y_test = \
        train_test_split(X_std, y, test_size=ts, random_state=0)

    regr = LinearRegression()

    # create quadratic features
    quadratic = PolynomialFeatures(degree=2)
    cubic = PolynomialFeatures(degree=3)
    X_quad = quadratic.fit_transform(X)
    X_cubic = cubic.fit_transform(X)

    # fit features
    X_fit = np.arange(X.min(), X.max(), 1)[:, np.newaxis]

    regr = regr.fit(X, y)
    y_lin_fit = regr.predict(X_fit)
    linear_r2 = r2_score(y, regr.predict(X))

    regr = regr.fit(X_quad, y)
    y_quad_fit = regr.predict(quadratic.fit_transform(X_fit))
    quadratic_r2 = r2_score(y, regr.predict(X_quad))

    regr = regr.fit(X_cubic, y)
    y_cubic_fit = regr.predict(cubic.fit_transform(X_fit))
    cubic_r2 = r2_score(y, regr.predict(X_cubic))

    # plot results
    plt.scatter(X, y, label='training points', color='lightgray')
    plt.plot(X_fit, y_lin_fit,
             label='linear (d=1), $R^2=%.2f$' % linear_r2,
             color='blue', lw=2, linestyle=':')
    plt.plot(X_fit, y_quad_fit,
             label='quadratic (d=2), $R^2=%.2f$' % quadratic_r2,
             color='red', lw=2, linestyle='-')
    plt.plot(X_fit, y_cubic_fit,
             label='cubic (d=3), $R^2=%.2f$' % cubic_r2,
             color='green', lw=2, linestyle='--')
    plt.xlabel('x-val')
    plt.ylabel('Return')
    plt.legend(loc='best')
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'nonlinear_regr.png', dpi=300)
    plt.close()

    # BUG FIX: removed a leftover ``pdb.set_trace()`` debugger breakpoint
    # here — it halted every run of this function.

    # transform features
    # NOTE(review): log/sqrt produce NaN for non-positive X / negative y —
    # confirm the inputs are strictly positive.
    X_log = np.log(X)
    y_sqrt = np.sqrt(y)

    # fit features
    X_fit = np.arange(X_log.min() - 1, X_log.max() + 1, 1)[:, np.newaxis]
    regr = regr.fit(X_log, y_sqrt)
    y_lin_fit = regr.predict(X_fit)
    linear_r2 = r2_score(y_sqrt, regr.predict(X_log))

    # plot results
    plt.scatter(X_log, y_sqrt, label='training points', color='lightgray')
    plt.plot(X_fit, y_lin_fit,
             label='linear (d=1), $R^2=%.2f$' % linear_r2,
             color='blue', lw=2)
    plt.xlabel('x-val')
    plt.ylabel('Return')
    plt.legend(loc='best')
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'sqrt_log.png', dpi=300)
    # BUG FIX: close the final figure for consistency with the rest of
    # the file.
    plt.close()