def getLogisticRegressionClf(self, X, Y):
    """Return a logistic-regression classifier trained on (X, Y).

    If ``self._gridSearchFlag`` equals ``True``, the base estimator and a
    parameter search space are passed to ``self.doRandomSearch`` and whatever
    that call returns is returned. Otherwise the base estimator is fitted
    on (X, Y) directly and returned.

    Reference:
    http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
    """
    name = "Logistic_Regression"

    # Baseline configuration used both as the search starting point and as
    # the final model when no search is requested.
    base_settings = dict(
        penalty='l2',
        dual=False,
        tol=0.0001,
        C=1.0,
        fit_intercept=True,
        intercept_scaling=1,
        class_weight=None,
        random_state=None,
        solver='liblinear',
        max_iter=100,
        multi_class='ovr',
        verbose=0,
    )
    estimator = LogisticRegression(**base_settings)

    # Only a flag that compares equal to True enables the search.
    if not (self._gridSearchFlag == True):
        estimator.fit(X, Y)
        return estimator

    log(name + " start searching param...")
    search_space = {
        # NOTE(review): both candidates are 'l2' -- presumably a placeholder
        # for a second penalty; confirm before changing.
        "penalty": ['l2', 'l2'],
        "C": sp_randf(1.0, 3.0),
        "solver": ['lbfgs', 'liblinear'],
    }
    return self.doRandomSearch(name, estimator, search_space, X, Y)
def getRandomForestClf(self, X, Y, param_list=None):
    """Return a random-forest classifier for (X, Y).

    Three paths exist:

    * ``self._gridSearchFlag`` equals ``True``: the base estimator and a
      parameter search space are passed to ``self.doRandomSearch`` and its
      result is returned.
    * otherwise, with ``param_list`` given: a fresh default ``rf()`` is
      configured via ``set_params(**param_list)`` and fitted on (X, Y).
    * otherwise, with ``param_list`` being ``None``: the base estimator
      defined below is fitted on (X, Y).

    Args:
        X: training features, forwarded unchanged to the estimator / search.
        Y: training targets, forwarded unchanged to the estimator / search.
        param_list: optional mapping of estimator parameter names to values;
            ignored when the random search is enabled.

    Reference:
    http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
    """
    clfName = "Random_Forest"

    # min_samples_split must be an integer >= 2 in current scikit-learn;
    # a value of 1 is rejected at fit time. Splitting a one-sample node is
    # impossible anyway, so 2 keeps the intended "split as far as possible".
    clf = rf(
        n_estimators=300,
        max_depth=None,
        min_samples_split=2,
        random_state=0,
        bootstrap=True,
        oob_score=True,
    )

    # Only a flag that compares equal to True enables the search.
    if self._gridSearchFlag == True:
        log(clfName + " start searching param...")
        tmpLowDepth = 8
        tmpHighDepth = 30
        param_dist = {
            "max_depth": sp_randint(tmpLowDepth, tmpHighDepth),
            "max_features": sp_randf(0, 1),
            # Lower bound is 2 (not 1) so every sampled value is accepted
            # by the estimator.
            "min_samples_split": sp_randint(2, 11),
            "min_samples_leaf": sp_randint(1, 11),
            "criterion": ["gini", "entropy"],
            "n_estimators": sp_randint(5, 12),
        }
        return self.doRandomSearch(clfName, clf, param_dist, X, Y)

    if param_list is not None:
        # Caller-supplied parameters replace the base configuration entirely:
        # they are applied on top of rf() defaults, not on top of `clf` above.
        clf = rf()
        clf.set_params(**param_list)
    clf.fit(X, Y)
    return clf
def getLogisticRegressionClf(self, X, Y):
    """Return a logistic-regression classifier for (X, Y).

    If ``self._gridSearchFlag`` equals ``True``, the base estimator and a
    parameter search space are passed to ``self.doRandomSearch`` and its
    result is returned. Otherwise the base estimator is returned as
    constructed -- this method does not call ``fit`` on it.

    Reference:
    http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
    """
    name = "Logistic_Regression"

    base_settings = dict(
        penalty='l2',
        dual=False,
        tol=0.0001,
        C=1.0,
        fit_intercept=True,
        intercept_scaling=1,
        class_weight=None,
        random_state=None,
        solver='liblinear',
        max_iter=100,
        multi_class='ovr',
        verbose=0,
    )
    estimator = LogisticRegression(**base_settings)

    # Only a flag that compares equal to True enables the search.
    if not (self._gridSearchFlag == True):
        # NOTE(review): the estimator is handed back unfitted here;
        # presumably the caller fits it -- confirm.
        return estimator

    log(name + " start searching param...")
    search_space = {
        # NOTE(review): both candidates are 'l2' -- presumably a placeholder
        # for a second penalty; confirm before changing.
        "penalty": ['l2', 'l2'],
        "C": sp_randf(1.0, 3.0),
        "solver": ['lbfgs', 'liblinear'],
    }
    return self.doRandomSearch(name, estimator, search_space, X, Y)
def getRandomForestClf(self, X, Y):
    """Return a random-forest classifier for (X, Y).

    If ``self._gridSearchFlag`` equals ``True``, the base estimator and a
    parameter search space are passed to ``self.doRandomSearch`` and its
    result is returned. Otherwise the base estimator is returned as
    constructed -- this method does not call ``fit`` on it.

    Args:
        X: training features, forwarded unchanged to the search.
        Y: training targets, forwarded unchanged to the search.

    Reference:
    http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
    """
    clfName = "Random_Forest"

    # min_samples_split must be an integer >= 2 in current scikit-learn;
    # a value of 1 is rejected at fit time. Splitting a one-sample node is
    # impossible anyway, so 2 keeps the intended "split as far as possible".
    clf = rf(
        n_estimators=300,
        max_depth=None,
        min_samples_split=2,
        random_state=0,
        bootstrap=True,
        oob_score=True,
    )

    # Only a flag that compares equal to True enables the search.
    if self._gridSearchFlag == True:
        log(clfName + " start searching param...")
        tmpLowDepth = 10
        tmpHighDepth = 50
        param_dist = {
            "max_depth": sp_randint(tmpLowDepth, tmpHighDepth),
            "max_features": sp_randf(0, 1),
            # Lower bound is 2 (not 1) so every sampled value is accepted
            # by the estimator.
            "min_samples_split": sp_randint(2, 11),
            "min_samples_leaf": sp_randint(1, 11),
            "criterion": ["gini", "entropy"],
            "n_estimators": sp_randint(100, 300),
        }
        clf = self.doRandomSearch(clfName, clf, param_dist, X, Y)

    return clf
def getExtraTressClf(self, X, Y):
    """Return an extra-trees classifier for (X, Y).

    If ``self._gridSearchFlag`` equals ``True``, the base estimator and a
    parameter search space are passed to ``self.doRandomSearch`` and its
    result is returned. Otherwise the base estimator is returned as
    constructed -- this method does not call ``fit`` on it.

    Args:
        X: training features; must expose ``.columns`` (its length drives the
            ``max_depth`` search range when the search is enabled).
        Y: training targets, forwarded unchanged to the search.

    Reference:
    http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html
    """
    clfName = "Extra_Trees"

    # max_features='sqrt' replaces the former 'auto': for classifiers 'auto'
    # was an alias of 'sqrt', and 'auto' is no longer accepted by current
    # scikit-learn releases.
    clf = ExtraTreesClassifier(
        n_estimators=10,
        criterion='gini',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features='sqrt',
        max_leaf_nodes=None,
        bootstrap=False,
        oob_score=False,
        n_jobs=1,
        random_state=None,
        verbose=0,
        warm_start=False,
        class_weight=None,
    )

    # Only a flag that compares equal to True enables the search.
    if self._gridSearchFlag == True:
        log(clfName + " start searching param...")

        # Depth is searched between 70% and 100% of the feature count.
        # Clamp so the range never contains 0 and is never empty for very
        # narrow inputs (assumes sp_randint's upper bound is exclusive, as
        # with scipy.stats.randint -- TODO confirm against the file imports).
        featureCount = len(X.columns)
        tmpLowDepth = max(1, int(featureCount * 0.7))
        tmpHighDepth = max(tmpLowDepth + 1, int(featureCount))

        param_dist = {
            "max_depth": sp_randint(tmpLowDepth, tmpHighDepth),
            "max_features": sp_randf(0, 1),
            # Lower bound is 2 (not 1) so every sampled value is accepted
            # by the estimator.
            "min_samples_split": sp_randint(2, 11),
            "min_samples_leaf": sp_randint(1, 11),
            # Single-valued candidate lists: the search always enables
            # bootstrap and out-of-bag scoring.
            "bootstrap": [True, True],
            "criterion": ["gini", "entropy"],
            "oob_score": [True, True],
            "n_estimators": sp_randint(100, 300),
        }
        clf = self.doRandomSearch(clfName, clf, param_dist, X, Y)

    return clf
test_parse_str = sys.argv[2] # "*s1024_pext_0_stru_his_txt_features.csv" output_prefix = sys.argv[3] basic_features = [] # do not include patch location lin_svm_params = {"C": [1.0, 10.0, 100.0]} rf_params = {"n_estimators": [10, 30, 50, 100], "max_depth": [10, 20, 40, None], "min_samples_leaf": [5, 20, 50, 100], "max_features": [0.2, 0.5, 'auto']} rf_param_dist = {"n_estimators": sp_randint(40, 100), "max_depth": sp_randint(10, 25), "min_samples_leaf": sp_randint(1, 10), "min_samples_split": sp_randint(4, 15), "max_features": sp_randf(loc=0.1, scale=0.45) } knn_params = {"n_neighbors": [3, 5, 10, 40, 60]} label_dict = {'NO': -1, 'TU': 1, 'BO': 1, 'OT': 1} algorithms = [#(LinearSVC(), lin_svm_params, "Linear SVM"), (RandomForestClassifier(class_weight="balanced", oob_score=False, n_jobs=4, bootstrap=True, criterion='gini'), rf_param_dist, "RandomForest", "RAND"), # (KNeighborsClassifier(), knn_params, "KNN Clf") ] add_features = [#["sprel_", "sp_"], # ["tx_", "wx_"], # ["wx_cE", "wx_cH", "sprel_", "sp_"], # ["HIST_", "wx_cE", "wx_cH"], ["hist_u", "hist_cG", "wx_", "sp_num", "sp_mean", "rel_area", "scale"],