def logistic_regression(self, use_glm=True):
    """
    (b) It seems the only statistically significant predictor is Lag2. How disappointing...
    """
    formula = "Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume"
    model = (
        smf.glm(formula, data=self.df, family=sm.families.Binomial())
        if use_glm
        else smf.logit(formula, data=self.transformedDF)
    )
    result = model.fit()
    if use_glm:
        probs = result.fittedvalues
        # Beware: the fitted value here is the probability of the index-0 class,
        # so the mapping below is inverted on purpose.
        pred_values = probs.map(lambda x: 0 if x > 0.5 else 1)
    else:
        # The probability of being 1.
        probs = Series(result.predict(sm.add_constant(
            self.df[["Lag1", "Lag2", "Lag3", "Lag4", "Lag5", "Volume"]])))
        pred_values = probs.map(lambda x: 1 if x > 0.5 else 0)
    """
    (c) Percentage of correct predictions: (54+557)/(54+557+48+430) = 56.1%.
    In weeks when the market goes up, the logistic regression is right most of the time:
    557/(557+48) = 92.1%. In weeks when the market goes down, it is right only
    54/(430+54) = 11.2% of the time.
    """
    tp.output_table(pred_values.values, self.transformedDF[self.y_col].values)
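# Hedged sketch, not the repo's tp.output_table: the (c) figures can be reproduced
# from any predictions/truth pair with pandas and numpy alone. The function name and
# arguments below are illustrative, not taken from the original code.
import numpy as np
import pandas as pd

def confusion_summary(pred_values, true_values):
    # Rows are the predicted Direction, columns the actual Direction.
    table = pd.crosstab(pd.Series(pred_values, name="Predicted"),
                        pd.Series(true_values, name="Actual"))
    accuracy = np.mean(np.asarray(pred_values) == np.asarray(true_values))
    print(table)
    print("Overall fraction correct: %.3f" % accuracy)
    return table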
def knn_fit(self, n_neighbors):
    weights = 'uniform'
    # weights = 'distance'
    clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
    clf.fit(self.train_X, self.train_y)
    test_X = self.test_set[self.x_cols].values
    test_y = self.test_set[self.y_col].values
    preds = clf.predict(test_X)
    tp.output_table(preds, test_y)
def lda_predict(self, fit_res, threshold=0.5):
    test_X = self.test_set[self.x_cols].values
    test_y = self.test_set[self.y_col].values
    if threshold == 0.5:
        pred_y = fit_res.predict(test_X)
    else:
        # Predict the first class whenever its posterior probability exceeds the custom threshold.
        pred_y_probs = fit_res.predict_proba(test_X)
        pred_y = np.array([fit_res.classes_[0] if pred_y_probs[i, 0] > threshold
                           else fit_res.classes_[1]
                           for i in range(pred_y_probs.shape[0])])
    tp.output_table(pred_y, test_y)
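# Hedged alternative to the loop above: the same posterior-threshold rule can be written
# with np.where. `fit_res` is assumed to be any fitted classifier exposing predict_proba
# and classes_ (e.g. LinearDiscriminantAnalysis); the helper name is illustrative.
import numpy as np

def threshold_predict(fit_res, test_X, threshold=0.5):
    # Column 0 of predict_proba is the posterior probability of fit_res.classes_[0].
    posterior_0 = fit_res.predict_proba(test_X)[:, 0]
    return np.where(posterior_0 > threshold, fit_res.classes_[0], fit_res.classes_[1])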
def fit_with_knn(self, n_neighbors):
    train_X = pp.scale(self.train_set[self.x_cols].values.astype(float))
    train_y = self.train_set[self.y_col]
    weights = 'uniform'
    # weights = 'distance'
    clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
    clf.fit(train_X, train_y)
    test_X = pp.scale(self.test_set[self.x_cols].values.astype(float))
    test_y = self.test_set[self.y_col].values
    preds = clf.predict(test_X)
    tp.output_table(preds, test_y)
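# Hedged sketch: pp.scale above standardizes the train and test matrices independently,
# so the two sets are centered and scaled with different statistics. Fitting a
# StandardScaler on the training rows only and reusing it on the test rows keeps both
# sets on the same scale. The function and argument names below are illustrative.
from sklearn import neighbors
from sklearn.preprocessing import StandardScaler

def fit_knn_shared_scaler(train_X, train_y, test_X, n_neighbors):
    scaler = StandardScaler().fit(train_X)            # statistics from the training data only
    clf = neighbors.KNeighborsClassifier(n_neighbors, weights='uniform')
    clf.fit(scaler.transform(train_X), train_y)
    return clf.predict(scaler.transform(test_X))      # test data scaled with the same statistics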
def test_all_methods(self):
    x_cols = ["Lag2"]
    formula = "Direction~Lag2"
    # print(self.df.shape[0])
    train_data = self.df.loc[(self.df["Year"] >= 1990) & (self.df["Year"] <= 2008), :]
    # print(train_data.shape[0])
    """ (d) Logistic regression """
    model = smf.glm(formula, data=train_data, family=sm.families.Binomial())
    result = model.fit()
    test_data = self.df.loc[self.df["Year"] > 2008, :]
    probs = Series(result.predict(sm.add_constant(test_data[["Lag2"]])))
    # As in logistic_regression above, the glm fitted probability belongs to the first class ("Down").
    pred_values = probs.map(lambda x: "Down" if x > 0.5 else "Up")
    tp.output_table(pred_values.values, test_data[self.y_col].values)
    train_X = train_data[x_cols].values
    train_y = train_data[self.y_col].values
    test_X = test_data[x_cols].values
    test_y = test_data[self.y_col].values
    """ (e) LDA """
    lda_res = LDA().fit(train_X, train_y)
    pred_y = lda_res.predict(test_X)
    tp.output_table(pred_y, test_y)
    """ (f) QDA """
    qda_res = QDA().fit(train_X, train_y)
    pred_y = qda_res.predict(test_X)
    tp.output_table(pred_y, test_y)
    """ (g) KNN with K = 1 """
    clf = neighbors.KNeighborsClassifier(1, weights="uniform")
    clf.fit(train_X, train_y)
    pred_y = clf.predict(test_X)
    tp.output_table(pred_y, test_y)
    """ (h) Logistic regression and LDA give the best results. """
    """ (i) Is the purpose of the last question to go through all the methods with no particular direction? (See the sketch below for one simple experiment.) """
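# Hedged sketch for (i): one simple experiment is to sweep K for the KNN classifier on the
# same Lag2-only split and compare held-out accuracy. train_X/train_y/test_X/test_y are
# assumed to be arrays built the same way as in test_all_methods; the helper name and the
# default k_values are illustrative.
import numpy as np
from sklearn import neighbors

def knn_accuracy_by_k(train_X, train_y, test_X, test_y, k_values=(1, 3, 5, 10, 20)):
    scores = {}
    for k in k_values:
        clf = neighbors.KNeighborsClassifier(k, weights="uniform")
        clf.fit(train_X, train_y)
        scores[k] = np.mean(clf.predict(test_X) == test_y)   # test-set accuracy for this K
    return scores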
def logistic_fit(self):
    model = smf.logit("%s~%s" % (self.y_col, "+".join(self.x_cols)), data=self.train_set)
    logistic_res = model.fit()
    prob_y = logistic_res.predict(self.test_X)
    pred_y = Series(prob_y).map(lambda x: 1 if x > 0.5 else 0)
    tp.output_table(pred_y, self.test_y.values)
def qda_fit(self):
    qda_res = QDA().fit(self.train_X.values, self.train_y.values)
    pred_y = qda_res.predict(self.test_X.values)
    tp.output_table(pred_y, self.test_y.values)
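# Hedged note: in current scikit-learn the LDA/QDA estimators live in
# sklearn.discriminant_analysis (the QDA name used above is assumed to be an alias
# imported elsewhere in this repo). A minimal standalone version of this fit, with
# illustrative names, looks like:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

def qda_predictions(train_X, train_y, test_X):
    qda_res = QuadraticDiscriminantAnalysis().fit(train_X, train_y)
    return qda_res.predict(test_X)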