示例#1
0
    def getLogisticRegressionClf(self, X, Y):
        """Build a logistic regression classifier for ``X`` / ``Y``.

        If ``self._gridSearchFlag`` equals ``True`` the estimator is passed to
        ``self.doRandomSearch`` along with a search space over ``penalty``,
        ``C`` and ``solver``, and the result of that call is returned.
        Otherwise the estimator is fitted directly on ``X`` and ``Y`` and
        returned.

        Reference:
        http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
        """
        model_label = "Logistic_Regression"

        # Baseline configuration used both for the direct fit and as the
        # starting estimator of the randomized search.
        base_settings = dict(
            penalty='l2',
            dual=False,
            tol=0.0001,
            C=1.0,
            fit_intercept=True,
            intercept_scaling=1,
            class_weight=None,
            random_state=None,
            solver='liblinear',
            max_iter=100,
            multi_class='ovr',
            verbose=0,
        )
        estimator = LogisticRegression(**base_settings)

        # No search requested: fit the baseline estimator and hand it back.
        if not (self._gridSearchFlag == True):
            estimator.fit(X, Y)
            return estimator

        log(model_label + " start searching param...")

        search_space = {
            "penalty": ['l2', 'l2'],
            "C": sp_randf(1.0, 3.0),
            "solver": ['lbfgs', 'liblinear'],
        }

        return self.doRandomSearch(model_label, estimator, search_space, X, Y)
示例#2
0
    def getRandomForestClf(self, X, Y, param_list=None):
        """Build a random forest classifier for ``X`` / ``Y``.

        Reference:
        http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

        Args:
            X: training features, forwarded unchanged to the estimator/search.
            Y: training labels, forwarded unchanged to the estimator/search.
            param_list: optional dict of estimator parameters. When given (and
                no search is requested) a fresh ``rf()`` is created and
                configured with ``set_params(**param_list)`` instead of the
                built-in defaults. Ignored when a search is requested.

        Returns:
            The result of ``self.doRandomSearch`` when ``self._gridSearchFlag``
            equals ``True``; otherwise the estimator fitted on ``X`` / ``Y``.
        """
        clfName = "Random_Forest"

        # scikit-learn requires an integer min_samples_split to be >= 2; the
        # previous value of 1 is rejected with a ValueError by current
        # releases (a single sample can never be split anyway).
        clf = rf(n_estimators=300, max_depth=None, min_samples_split=2,
                 random_state=0, bootstrap=True, oob_score=True)

        if self._gridSearchFlag == True:
            log(clfName + " start searching param...")
            tmpLowDepth = 8
            tmpHighDepth = 30

            param_dist = {
                "max_depth": sp_randint(tmpLowDepth, tmpHighDepth),
                "max_features": sp_randf(0, 1),
                # Lower bound is 2 so the search never draws the invalid
                # min_samples_split=1.
                "min_samples_split": sp_randint(2, 11),
                "min_samples_leaf": sp_randint(1, 11),
                "criterion": ["gini", "entropy"],
                "n_estimators": sp_randint(5, 12),
            }

            return self.doRandomSearch(clfName, clf, param_dist, X, Y)

        if param_list is not None:
            # Caller-supplied parameters replace the defaults entirely.
            clf = rf()
            clf.set_params(**param_list)
        clf.fit(X, Y)

        return clf
示例#3
0
    def getLogisticRegressionClf(self, X, Y):
        """Create a logistic regression classifier, optionally tuned.

        The baseline estimator is always constructed. When
        ``self._gridSearchFlag`` equals ``True`` it is passed, with a search
        space over ``penalty``, ``C`` and ``solver``, to
        ``self.doRandomSearch`` and that call's result is returned. Otherwise
        the baseline estimator is returned as constructed; this method does
        not call ``fit`` on it.

        Reference:
        http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
        """
        label = "Logistic_Regression"

        defaults = {
            'penalty': 'l2',
            'dual': False,
            'tol': 0.0001,
            'C': 1.0,
            'fit_intercept': True,
            'intercept_scaling': 1,
            'class_weight': None,
            'random_state': None,
            'solver': 'liblinear',
            'max_iter': 100,
            'multi_class': 'ovr',
            'verbose': 0,
        }
        model = LogisticRegression(**defaults)

        # Without a search request the caller gets the untouched baseline.
        if not (self._gridSearchFlag == True):
            return model

        log(label + " start searching param...")

        candidates = {
            "penalty": ['l2', 'l2'],
            "C": sp_randf(1.0, 3.0),
            "solver": ['lbfgs', 'liblinear'],
        }
        model = self.doRandomSearch(label, model, candidates, X, Y)
        return model
示例#4
0
    def getRandomForestClf(self, X, Y):
        """Create a random forest classifier, optionally tuned.

        Reference:
        http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

        Args:
            X: training features, forwarded to ``self.doRandomSearch``.
            Y: training labels, forwarded to ``self.doRandomSearch``.

        Returns:
            The result of ``self.doRandomSearch`` when ``self._gridSearchFlag``
            equals ``True``; otherwise the baseline estimator as constructed
            (this method does not call ``fit`` on it).
        """
        clfName = "Random_Forest"

        # scikit-learn requires an integer min_samples_split to be >= 2; the
        # previous value of 1 is rejected with a ValueError by current
        # releases (a single sample can never be split anyway).
        clf = rf(n_estimators=300,
                 max_depth=None,
                 min_samples_split=2,
                 random_state=0,
                 bootstrap=True,
                 oob_score=True)

        if self._gridSearchFlag == True:
            log(clfName + " start searching param...")
            tmpLowDepth = 10
            tmpHighDepth = 50

            param_dist = {
                "max_depth": sp_randint(tmpLowDepth, tmpHighDepth),
                "max_features": sp_randf(0, 1),
                # Lower bound is 2 so the search never draws the invalid
                # min_samples_split=1.
                "min_samples_split": sp_randint(2, 11),
                "min_samples_leaf": sp_randint(1, 11),
                "criterion": ["gini", "entropy"],
                "n_estimators": sp_randint(100, 300),
            }

            clf = self.doRandomSearch(clfName, clf, param_dist, X, Y)

        return clf
示例#5
0
    def getExtraTressClf(self, X, Y):
        """Create an extra-trees classifier, optionally tuned.

        Reference:
        http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html

        Args:
            X: training features. Must expose ``.columns`` when a search is
                requested, because the ``max_depth`` range is derived from
                the number of columns.
            Y: training labels, forwarded to ``self.doRandomSearch``.

        Returns:
            The result of ``self.doRandomSearch`` when ``self._gridSearchFlag``
            equals ``True``; otherwise the baseline estimator as constructed
            (this method does not call ``fit`` on it).
        """
        clfName = "Extra_Trees"

        # max_features='sqrt' is what 'auto' meant for classifiers; 'auto' is
        # deprecated and no longer accepted by recent scikit-learn releases.
        clf = ExtraTreesClassifier(n_estimators=10,
                                   criterion='gini',
                                   max_depth=None,
                                   min_samples_split=2,
                                   min_samples_leaf=1,
                                   min_weight_fraction_leaf=0.0,
                                   max_features='sqrt',
                                   max_leaf_nodes=None,
                                   bootstrap=False,
                                   oob_score=False,
                                   n_jobs=1,
                                   random_state=None,
                                   verbose=0,
                                   warm_start=False,
                                   class_weight=None)

        if self._gridSearchFlag == True:
            log(clfName + " start searching param...")
            n_columns = len(X.columns)
            # Depth range is 70%..100% of the column count. Clamp it so that
            # very narrow inputs (fewer than two columns) still give a
            # non-empty range with max_depth >= 1.
            # NOTE(review): assumes sp_randint behaves like scipy.stats.randint
            # (exclusive upper bound, requires low < high) -- confirm.
            tmpLowDepth = max(1, int(n_columns * 0.7))
            tmpHighDepth = max(tmpLowDepth + 1, int(n_columns))

            param_dist = {
                "max_depth": sp_randint(tmpLowDepth, tmpHighDepth),
                "max_features": sp_randf(0, 1),
                # Lower bound is 2: scikit-learn rejects an integer
                # min_samples_split below 2.
                "min_samples_split": sp_randint(2, 11),
                "min_samples_leaf": sp_randint(1, 11),
                "bootstrap": [True, True],
                "criterion": ["gini", "entropy"],
                "oob_score": [True, True],
                "n_estimators": sp_randint(100, 300),
            }

            clf = self.doRandomSearch(clfName, clf, param_dist, X, Y)

        return clf
        test_parse_str = sys.argv[2]    # "*s1024_pext_0_stru_his_txt_features.csv"
        output_prefix = sys.argv[3]

    basic_features = []  # do not include patch location

    lin_svm_params = {"C": [1.0, 10.0, 100.0]}
    rf_params = {"n_estimators": [10, 30, 50, 100],
                 "max_depth": [10, 20, 40, None],
                 "min_samples_leaf": [5, 20, 50, 100],
                 "max_features": [0.2, 0.5, 'auto']}

    rf_param_dist = {"n_estimators": sp_randint(40, 100),
                     "max_depth": sp_randint(10, 25),
                     "min_samples_leaf": sp_randint(1, 10),
                     "min_samples_split": sp_randint(4, 15),
                     "max_features": sp_randf(loc=0.1, scale=0.45)
                     }
    knn_params = {"n_neighbors": [3, 5, 10, 40, 60]}
    label_dict = {'NO': -1, 'TU': 1, 'BO': 1, 'OT': 1}

    algorithms = [#(LinearSVC(), lin_svm_params, "Linear SVM"),
                  (RandomForestClassifier(class_weight="balanced", oob_score=False, n_jobs=4,
                                          bootstrap=True, criterion='gini'), rf_param_dist, "RandomForest", "RAND"),
                  # (KNeighborsClassifier(), knn_params, "KNN Clf")
                  ]

    add_features = [#["sprel_", "sp_"],
                    #  ["tx_", "wx_"],
                    # ["wx_cE", "wx_cH", "sprel_", "sp_"],
                    # ["HIST_", "wx_cE", "wx_cH"],
                    ["hist_u", "hist_cG", "wx_", "sp_num", "sp_mean", "rel_area", "scale"],