class knn_classifier(Classifier):
    """K-nearest-neighbours model exposed through the ``Classifier`` interface.

    In binary mode a distance-weighted ``KNeighborsClassifier`` is used and
    predictions are thresholded class-1 probabilities; otherwise a
    distance-weighted ``KNeighborsRegressor`` is used, optionally boosted
    with ``AdaBoostRegressor``.
    """

    def __init__(self, ticker, inputSize=5, binary=True, n_neighbors=15,
                 risk=0.5, adaboost=False):
        """Store the configuration and build the underlying estimator.

        Args:
            ticker: identifier stored on the instance as ``self.ticker``.
            inputSize: stored as both ``self.days`` and ``self.inputSize``.
            binary: True for thresholded classification, False for regression.
            n_neighbors: number of neighbours handed to the KNN estimator.
            risk: probability threshold used by ``predict`` in binary mode.
            adaboost: when True (and ``binary`` is False) wrap the regressor
                in a 100-estimator ``AdaBoostRegressor``.
        """
        self.type = 'KNN'
        self.ticker = ticker
        self.days = inputSize
        self.inputSize = inputSize
        self.binary = binary
        self.risk_thresh = risk
        self.adaboost = adaboost
        if binary:
            self.clf = neighbors.KNeighborsClassifier(
                n_neighbors, weights='distance')
        else:
            self.clf = neighbors.KNeighborsRegressor(
                n_neighbors, weights='distance')
            # NOTE(review): the flattened source is ambiguous about where this
            # branch nests. It is kept under the regressor because binary
            # predict() needs predict_proba, which AdaBoostRegressor lacks.
            if adaboost:
                self.clf = AdaBoostRegressor(
                    base_estimator=self.clf, n_estimators=100)

    def predict(self, inputArray):
        """Predict for one sample or a batch of samples.

        Args:
            inputArray: array-like; a single 1-D sample is promoted to a
                one-row 2-D array, anything already 2-D is used as given.

        Returns:
            Binary mode: an integer array holding 1 where the column-1
            probability exceeds ``self.risk_thresh`` and 0 elsewhere.
            Otherwise: the output of the estimator's ``predict``.
        """
        inputArray = np.array(inputArray)
        # ndarray.reshape returns a new array, so the result must be kept;
        # only promote single samples so 2-D batches keep their row count.
        if inputArray.ndim < 2:
            inputArray = inputArray.reshape(1, -1)
        if self.binary:
            proba = self.clf.predict_proba(inputArray)
            return (np.array(proba)[:, 1] > self.risk_thresh) * 1
        return self.clf.predict(inputArray)

    def fit(self, X, Y):
        """Fit the underlying estimator on features ``X`` and targets ``Y``."""
        self.clf.fit(X, Y)
class gaussProcess_classifier(Classifier):
    """Gaussian-process model exposed through the ``Classifier`` interface.

    Binary mode uses ``GaussianProcessClassifier`` with thresholded class-1
    probabilities; otherwise ``GaussianProcessRegressor`` is used, optionally
    boosted with ``AdaBoostRegressor``.
    """

    def __init__(self, ticker, inputSize=5, binary=True, risk=0.5,
                 numTrainDays=300, adaboost=False):
        """Store the configuration and build the underlying estimator.

        Args:
            ticker: identifier stored on the instance as ``self.ticker``.
            inputSize: stored as both ``self.days`` and ``self.inputSize``.
            binary: True for thresholded classification, False for regression.
            risk: probability threshold used by ``predict`` in binary mode.
            numTrainDays: training-window length used by ``trainClf``.
            adaboost: when True (and ``binary`` is False) wrap the regressor
                in a 100-estimator ``AdaBoostRegressor``.
        """
        self.type = 'Gaussian Process'
        self.ticker = ticker
        self.days = inputSize
        self.inputSize = inputSize
        self.binary = binary
        self.risk_thresh = risk
        self.adaboost = adaboost
        self.numTrainDays = numTrainDays
        if binary:
            self.clf = GaussianProcessClassifier()
        else:
            self.clf = GaussianProcessRegressor()
            # NOTE(review): the flattened source is ambiguous about where this
            # branch nests. It is kept under the regressor because binary
            # predict() needs predict_proba, which AdaBoostRegressor lacks.
            if adaboost:
                self.clf = AdaBoostRegressor(
                    base_estimator=self.clf, n_estimators=100)

    def trainClf(self, endDay=None, numTrainDays=100):
        """Build a training set ending at ``endDay`` and fit the estimator.

        Args:
            endDay: end date handed to ``self.processData``. ``None`` (the
                default) means today's date, resolved when the method is
                called rather than when the class is defined.
            numTrainDays: ignored; the window length always comes from
                ``self.numTrainDays`` set in ``__init__``.
        """
        if endDay is None:
            endDay = date.today()
        # processData is not defined in this class; it is expected to come
        # from the Classifier base.
        X, Y = self.processData(endDay, self.numTrainDays)
        self.fit(X, Y)

    def predict(self, inputArray):
        """Predict for one sample or a batch of samples.

        Args:
            inputArray: array-like; a single 1-D sample is promoted to a
                one-row 2-D array, anything already 2-D is used as given.

        Returns:
            Binary mode: an integer array holding 1 where the column-1
            probability exceeds ``self.risk_thresh`` and 0 elsewhere.
            Otherwise: the output of the estimator's ``predict``.
        """
        inputArray = np.array(inputArray)
        # ndarray.reshape returns a new array, so the result must be kept;
        # only promote single samples so 2-D batches keep their row count.
        if inputArray.ndim < 2:
            inputArray = inputArray.reshape(1, -1)
        if self.binary:
            proba = self.clf.predict_proba(inputArray)
            return (np.array(proba)[:, 1] > self.risk_thresh) * 1
        return self.clf.predict(inputArray)

    def fit(self, X, Y):
        """Fit the underlying estimator on features ``X`` and targets ``Y``."""
        self.clf.fit(X, Y)
class dt_class(Classifier):
    """Decision-tree model exposed through the ``Classifier`` interface.

    Binary mode uses ``DecisionTreeClassifier`` with thresholded class-1
    probabilities; otherwise ``DecisionTreeRegressor``. Either can be wrapped
    in the matching AdaBoost ensemble.
    """

    def __init__(self, ticker, inputSize=5, binary=True, risk=0.5,
                 adaboost=False):
        """Store the configuration and build the underlying estimator.

        Args:
            ticker: identifier stored on the instance as ``self.ticker``.
            inputSize: stored as ``self.days`` and ``self.inputSize``; also
                used as the tree's ``max_depth``.
            binary: True for thresholded classification, False for regression.
            risk: probability threshold used by ``predict`` in binary mode.
            adaboost: when True wrap the tree in a 100-estimator AdaBoost
                ensemble (classifier or regressor to match ``binary``).
        """
        self.type = 'Decision Tree'
        self.ticker = ticker
        self.days = inputSize
        self.inputSize = inputSize
        self.binary = binary
        self.adaboost = adaboost
        self.risk_thresh = risk
        if binary:
            self.clf = tree.DecisionTreeClassifier(max_depth=inputSize)
            if adaboost:
                self.clf = AdaBoostClassifier(
                    base_estimator=self.clf, n_estimators=100)
        else:
            self.clf = tree.DecisionTreeRegressor(max_depth=inputSize)
            if adaboost:
                self.clf = AdaBoostRegressor(
                    base_estimator=self.clf, n_estimators=100)

    def predict(self, inputArray):
        """Predict for one sample or a batch of samples.

        Args:
            inputArray: array-like; a single 1-D sample is promoted to a
                one-row 2-D array, anything already 2-D is used as given.

        Returns:
            Binary mode: an integer array holding 1 where the column-1
            probability exceeds ``self.risk_thresh`` and 0 elsewhere.
            Otherwise: the output of the estimator's ``predict``.
        """
        inputArray = np.array(inputArray)
        # ndarray.reshape returns a new array, so the result must be kept;
        # only promote single samples so 2-D batches keep their row count.
        if inputArray.ndim < 2:
            inputArray = inputArray.reshape(1, -1)
        if self.binary:
            proba = self.clf.predict_proba(inputArray)
            return (np.array(proba)[:, 1] > self.risk_thresh) * 1
        return self.clf.predict(inputArray)

    def fit(self, X, Y):
        """Fit the underlying estimator on features ``X`` and targets ``Y``."""
        self.clf.fit(X, Y)
class svm_class(Classifier):
    """Sigmoid-kernel SVM model exposed through the ``Classifier`` interface.

    Binary mode uses ``SVC`` with probability estimates enabled and
    thresholded class-1 probabilities; otherwise ``SVR``. Either can be
    wrapped in the matching AdaBoost ensemble.
    """

    def __init__(self, ticker, inputSize=5, binary=True, risk=0.5,
                 adaboost=False):
        """Store the configuration and build the underlying estimator.

        Args:
            ticker: identifier stored on the instance as ``self.ticker``.
            inputSize: stored as both ``self.days`` and ``self.inputSize``.
            binary: True for thresholded classification, False for regression.
            risk: the binary-mode threshold is stored as ``1 - risk``.
            adaboost: when True wrap the SVM in a 100-estimator AdaBoost
                ensemble (classifier or regressor to match ``binary``).
        """
        self.type = 'SVM'
        self.ticker = ticker
        self.days = inputSize
        self.inputSize = inputSize
        self.binary = binary
        kern = 'sigmoid'
        # NOTE(review): the threshold is the complement of ``risk``, whereas
        # the other classifiers in this file use ``risk`` directly - confirm
        # this inversion is intended.
        self.risk_thresh = 1 - risk
        self.adaboost = adaboost
        if binary:
            # predict() relies on predict_proba, so probability estimates
            # must be enabled on the SVC.
            self.clf = svm.SVC(kernel=kern, probability=True)
            if adaboost:
                self.clf = AdaBoostClassifier(
                    base_estimator=self.clf, n_estimators=100)
        else:
            self.clf = svm.SVR(kernel=kern)
            if adaboost:
                self.clf = AdaBoostRegressor(
                    base_estimator=self.clf, n_estimators=100)

    def predict(self, inputArray):
        """Predict for one sample or a batch of samples.

        Args:
            inputArray: array-like; a single 1-D sample is promoted to a
                one-row 2-D array, anything already 2-D is used as given.

        Returns:
            Binary mode: an integer array holding 1 where the column-1
            probability exceeds ``self.risk_thresh`` and 0 elsewhere.
            Otherwise: the output of the estimator's ``predict``.
        """
        inputArray = np.array(inputArray)
        # ndarray.reshape returns a new array, so the result must be kept;
        # only promote single samples so 2-D batches keep their row count.
        if inputArray.ndim < 2:
            inputArray = inputArray.reshape(1, -1)
        if self.binary:
            proba = self.clf.predict_proba(inputArray)
            return (np.array(proba)[:, 1] > self.risk_thresh) * 1
        return self.clf.predict(inputArray)

    def fit(self, X, Y):
        """Fit the underlying estimator on features ``X`` and targets ``Y``."""
        self.clf.fit(X, Y)
clf.fit(subTrainFeature, subTrainLabel) predictedTrainProb = clf.predict(trainFeature) predictedTestProb = clf.predict(testFeature) for item in predictedTrainProb: newTrainFeature_temp.append(item) for item in predictedTestProb: newTestFeature_temp.append(item) newTrainFeature.append(newTrainFeature_temp) newTestFeature.append(newTestFeature_temp) newTrainFeature = np.array(newTrainFeature).T newTestFeature = np.array(newTestFeature).T clf = linear_model.LogisticRegression(penalty='l2', dual=False, class_weight='auto') clf.fit(newTrainFeature, trainLabel) predictedLabel = clf.predict_proba(newTestFeature) return(predictedLabel[:, 0]) if(__name__ == "__main__"): trainFeature, trainLabel, testFeature, testPlatform = readFeature(5, 0.5, 10, 0.6, 15, 0.6, 5, 0.6, 1) ''' selectFeature = SelectKBest(chi2, k = 55) selectFeature.fit(trainFeature, trainLabel) trainFeature_new = selectFeature.transform(trainFeature) testFeature_new = selectFeature.transform(testFeature) ''' trainFeature_new = trainFeature[:, :] testFeature_new = testFeature[:, :] ''' trainFeature_new = trainFeature[:, :26] testFeature_new = testFeature[:, :26]
preds.to_csv('/Users/IkkiTanaka/Documents/KDDCup/pred/xgb/sk_GBM2.csv', header=None, index=False) new_label = a.sort(0).iloc[(a.sort(0)[0] > 0.01).values][1].values clf = GradientBoostingClassifier(n_estimators=400, learning_rate=0.05, subsample=.96, max_depth=4, verbose=1, max_features=.96, random_state=None) new_dtrain_sp = dtrain_sp[new_label] new_dval = dval[new_label] clf.fit(dtrain_sp, label_dtrain[0].values) pred = clf.predict_proba(dval) print("ROC score", metrics.roc_auc_score(label_dval[0].values, pred[:, 1])) #GaussianNB from sklearn.naive_bayes import GaussianNB clf = GaussianNB() clf.fit(dtrain_sp, label_dtrain[0].values) pred = clf.predict_proba(dval) print("ROC score", metrics.roc_auc_score(label_dval[0].values, pred[:, 1])) scaler = StandardScaler() dtrain_sp = scaler.fit_transform(dtrain_sp) dval = scaler.transform(dval) from sklearn import svm clf = svm.SVC(C=1,
# Score `dtest` with `calibrated_clf` (both defined outside this excerpt) and
# write the result out next to the ids from the sample submission.
pred = calibrated_clf.predict_proba(dtest)
sample = pd.read_csv('/Users/IkkiTanaka/Documents/KDDCup/sampleSubmission.csv',header=None)
# Column 0 of the sample file is paired with column 1 of the probabilities.
preds = pd.concat([sample[0],pd.DataFrame(pred[:,1])],axis=1)
preds.to_csv('/Users/IkkiTanaka/Documents/KDDCup/pred/xgb/sk_GBM2.csv' ,header=None,index=False)

# Take the column-1 values of `a` for the rows whose column-0 value is above
# 0.01, after sorting by 0.
# NOTE(review): `a` is defined outside this excerpt - presumably a DataFrame of
# (score, feature index) pairs; confirm. `sort` is called twice on it here.
new_label = a.sort(0).iloc[(a.sort(0)[0]>0.01).values][1].values
clf = GradientBoostingClassifier(n_estimators=400,learning_rate=0.05,subsample=.96,max_depth=4,verbose=1,max_features=.96, random_state=None)
# NOTE(review): the filtered matrices below are built but never used; the fit
# and predict calls that follow use the unfiltered dtrain_sp / dval. Confirm
# which was intended.
new_dtrain_sp = dtrain_sp[new_label]
new_dval = dval[new_label]
clf.fit(dtrain_sp, label_dtrain[0].values)
pred = clf.predict_proba(dval)
# AUC of the column-1 probabilities against column 0 of the validation labels.
print("ROC score", metrics.roc_auc_score(label_dval[0].values, pred[:,1]))

# GaussianNB: same fit / predict / AUC report with a naive Bayes model.
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(dtrain_sp, label_dtrain[0].values)
pred = clf.predict_proba(dval)
print("ROC score", metrics.roc_auc_score(label_dval[0].values, pred[:,1]))

# Standardise features: the scaler is fitted on the training matrix only and
# then applied to the validation matrix. Both names are rebound in place.
scaler = StandardScaler()
dtrain_sp = scaler.fit_transform(dtrain_sp)
dval = scaler.transform(dval)