Example No. 1
class ExtraTreesPreprocessorRegression(AutoSklearnPreprocessingAlgorithm):

    def __init__(self, n_estimators, criterion, min_samples_leaf,
                 min_samples_split, max_features,
                 max_leaf_nodes_or_max_depth="max_depth",
                 bootstrap=False, max_leaf_nodes=None, max_depth="None",
                 min_weight_fraction_leaf=0.0,
                 oob_score=False, n_jobs=1, random_state=None, verbose=0):

        self.n_estimators = int(n_estimators)
        self.estimator_increment = 10
        if criterion not in ("mse", ):
            raise ValueError("'criterion' is not in ('mse', ): "
                             "%s" % criterion)
        self.criterion = criterion

        if max_leaf_nodes_or_max_depth == "max_depth":
            self.max_leaf_nodes = None
            if max_depth == "None":
                self.max_depth = None
            else:
                self.max_depth = int(max_depth)
        else:
            if max_leaf_nodes == "None":
                self.max_leaf_nodes = None
            else:
                self.max_leaf_nodes = int(max_leaf_nodes)
            self.max_depth = None

        self.min_samples_leaf = int(min_samples_leaf)
        self.min_samples_split = int(min_samples_split)
        self.min_weight_fraction_leaf = float(min_weight_fraction_leaf)

        self.max_features = float(max_features)

        # ConfigSpace passes booleans as the strings "True"/"False"
        if isinstance(bootstrap, str):
            self.bootstrap = (bootstrap == "True")
        else:
            self.bootstrap = bool(bootstrap)

        self.oob_score = oob_score
        self.n_jobs = int(n_jobs)
        self.random_state = random_state
        self.verbose = int(verbose)
        self.preprocessor = None

    def fit(self, X, Y):
        import numpy as np
        from sklearn.ensemble import ExtraTreesRegressor

        num_features = X.shape[1]
        # Rescale the max_features fraction by log(n_features) + 1
        max_features = int(
            float(self.max_features) * (np.log(num_features) + 1))
        # Use at most half of the features
        max_features = max(1, min(int(X.shape[1] / 2), max_features))
        self.preprocessor = ExtraTreesRegressor(
            n_estimators=self.n_estimators, criterion=self.criterion,
            max_depth=self.max_depth, min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            min_weight_fraction_leaf=self.min_weight_fraction_leaf,
            bootstrap=self.bootstrap,
            max_features=max_features, max_leaf_nodes=self.max_leaf_nodes,
            oob_score=self.oob_score, n_jobs=self.n_jobs, verbose=self.verbose,
            random_state=self.random_state)
        self.preprocessor.fit(X, Y)

        return self

    def transform(self, X):
        if self.preprocessor is None:
            raise NotImplementedError("fit() must be called before transform()")
        return self.preprocessor.transform(X)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {'shortname': 'ETR',
                'name': 'Extra Trees Regressor Preprocessing',
                'handles_regression': True,
                'handles_classification': False,
                'handles_multiclass': False,
                'handles_multilabel': False,
                'is_deterministic': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (INPUT,)}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        cs = ConfigurationSpace()

        cs.add_hyperparameter(Constant("n_estimators", 100))
        cs.add_hyperparameter(Constant("criterion", "mse"))
        cs.add_hyperparameter(UniformFloatHyperparameter(
            "max_features", 0.5, 5, default=1))

        cs.add_hyperparameter(
            UnParametrizedHyperparameter(name="max_depth", value="None"))

        cs.add_hyperparameter(UniformIntegerHyperparameter(
            "min_samples_split", 2, 20, default=2))
        cs.add_hyperparameter(UniformIntegerHyperparameter(
            "min_samples_leaf", 1, 20, default=1))
        cs.add_hyperparameter(Constant(
            'min_weight_fraction_leaf', 0.))

        cs.add_hyperparameter(CategoricalHyperparameter(
            "bootstrap", ["True", "False"], default="False"))

        return cs
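
The distinctive step in fit() above is the rescaling of the max_features fraction before it reaches scikit-learn. A minimal standalone sketch of that logic, using synthetic data and an assumed factor of 1.0:

import numpy as np
from sklearn.ensemble import ExtraTreesRegressor

rng = np.random.RandomState(0)
X = rng.rand(200, 40)
y = X[:, 0] + 0.1 * rng.rand(200)

max_features_factor = 1.0  # plays the role of the max_features hyperparameter
num_features = X.shape[1]
max_features = int(max_features_factor * (np.log(num_features) + 1))
# Use at most half of the features, and at least one
max_features = max(1, min(num_features // 2, max_features))

est = ExtraTreesRegressor(n_estimators=100, max_features=max_features,
                          random_state=0)
est.fit(X, y)
print(max_features, est.feature_importances_.shape)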
Example No. 2
with open(out_filename + '_estimators_.txt', 'wt') as f:
    print(xfr.estimators_, file=f)
np.savetxt(out_filename + '_feature_importances_.txt', xfr.feature_importances_)
print(data_train.columns.shape, xfr.feature_importances_.shape)
# Write one "feature,importance" line per column
with open(out_filename + '_fimp.txt', 'wt') as f:
    for feat, imp in zip(data_train.columns, xfr.feature_importances_):
        print("%s,%g" % (feat, imp), file=f)

# Legacy scikit-learn API; see the SelectFromModel note after this example
transformed_train = xfr.transform(data_train)
transformed_test = xfr.transform(data_test)
end = time.perf_counter()
print("time = ", end - start, file=log)


suffix = '_tr.csv'
train_filename = os.path.splitext(os.path.basename(sys.argv[1]))[0] + suffix
train = pd.DataFrame(transformed_train)
train = pd.concat([data_train_in.loc[:, 'target'], train], axis=1)
train = pd.concat([data_train_in.loc[:, 'id'], train], axis=1)
train.to_csv(train_filename, index=False)
test_filename = os.path.splitext(os.path.basename(sys.argv[2]))[0] + suffix
test = pd.DataFrame(transformed_test)
if 'target' in data_test_in:
    test = pd.concat([data_test_in.loc[:, 'target'], test], axis=1)
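
Note that xfr.transform() relies on an API that scikit-learn deprecated in 0.17 and removed in 0.19; on current versions the equivalent is SelectFromModel. A sketch under that assumption, reusing the already-fitted xfr (the old transform() defaulted to the mean importance as its threshold, which "mean" reproduces):

from sklearn.feature_selection import SelectFromModel

# prefit=True wraps the fitted ensemble without refitting it
sfm = SelectFromModel(xfr, threshold="mean", prefit=True)
transformed_train = sfm.transform(data_train)
transformed_test = sfm.transform(data_test)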
Example No. 3
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor


class mixmodels:
    def __init__(self, nest=10):
        self.nest = nest
    def fit(self, data_train, target):
        self.target_train = target
        self.catcol = data_train.filter(like='var').columns.tolist()

        # Gradient boosting: select features via the legacy transform(),
        # then refit a fresh model on the selected columns
        self.gbr = GradientBoostingRegressor(n_estimators=self.nest, max_depth=7)
        self.gbr.fit(data_train, self.target_train)
        self.transformed_train_gbr = self.gbr.transform(data_train, threshold="0.35*mean")
        self.gbr_tr_fit = GradientBoostingRegressor(n_estimators=self.nest, max_depth=7)
        self.gbr_tr_fit.fit(self.transformed_train_gbr, self.target_train)

        # Extra trees: same select-then-refit scheme
        self.xfr = ExtraTreesRegressor(n_estimators=self.nest, max_depth=7)
        self.xfr.fit(data_train, self.target_train)
        self.transformed_train_xfr = self.xfr.transform(data_train, threshold="0.35*mean")
        self.xfr_tr_fit = ExtraTreesRegressor(n_estimators=self.nest, max_depth=7)
        self.xfr_tr_fit.fit(self.transformed_train_xfr, self.target_train)

        # Both model families again, restricted to the 'var*' columns
        self.gbr_cat_fit = GradientBoostingRegressor(n_estimators=self.nest, max_depth=7)
        self.gbr_cat_fit.fit(data_train[self.catcol], self.target_train)

        self.xfr_cat_fit = ExtraTreesRegressor(n_estimators=self.nest, max_depth=7)
        self.xfr_cat_fit.fit(data_train[self.catcol], self.target_train)
        return self

    def predict(self, data_test):
        mix_test_list = []

        transformed_test_gbr = self.gbr.transform(data_test, threshold="0.35*mean")
        mix_test_list.append(pd.Series(self.gbr_tr_fit.predict(transformed_test_gbr)))

        transformed_test_xfr = self.xfr.transform(data_test, threshold="0.35*mean")
        mix_test_list.append(pd.Series(self.xfr_tr_fit.predict(transformed_test_xfr)))

        mix_test_list.append(pd.Series(self.gbr_cat_fit.predict(data_test[self.catcol])))
        mix_test_list.append(pd.Series(self.xfr_cat_fit.predict(data_test[self.catcol])))

        # Average the four prediction columns
        mix_test = pd.concat(mix_test_list, axis=1)
        mix_ave = mix_test.mean(axis=1)
        mix_ave.name = 'target'
        return mix_ave

    def score(self, data_test, target_test):
        total_score = []
        transformed_test_gbr = self.gbr.transform(data_test, threshold="0.35*mean")
        total_score.append(self.gbr_tr_fit.score(transformed_test_gbr, target_test))
        transformed_test_xfr = self.xfr.transform(data_test, threshold="0.35*mean")
        total_score.append(self.xfr_tr_fit.score(transformed_test_xfr, target_test))
        total_score.append(self.gbr_cat_fit.score(data_test[self.catcol], target_test))
        total_score.append(self.xfr_cat_fit.score(data_test[self.catcol], target_test))
        # Unweighted mean of the four R^2 scores
        return sum(total_score) / float(len(total_score))

    def gini(self, data_test, target_test):
        weight = data_test.var11
        targets = target_test.tolist()
        gns = []
        transformed_test_gbr = self.gbr.transform(data_test, threshold="0.35*mean")
        gns.append(normalized_weighted_gini(targets, self.gbr_tr_fit.predict(transformed_test_gbr).tolist(), weight.tolist()))
        transformed_test_xfr = self.xfr.transform(data_test, threshold="0.35*mean")
        gns.append(normalized_weighted_gini(targets, self.xfr_tr_fit.predict(transformed_test_xfr).tolist(), weight.tolist()))
        gns.append(normalized_weighted_gini(targets, self.gbr_cat_fit.predict(data_test[self.catcol]).tolist(), weight.tolist()))
        gns.append(normalized_weighted_gini(targets, self.xfr_cat_fit.predict(data_test[self.catcol]).tolist(), weight.tolist()))
        return sum(gns) / float(len(gns))
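
Hypothetical usage of mixmodels; the names below (data_train, data_test, target_train, target_test) are placeholders, and the frames must follow the conventions the class assumes: predictor columns whose names contain 'var', plus a var11 weight column for gini().

model = mixmodels(nest=100).fit(data_train, target_train)
preds = model.predict(data_test)             # pd.Series named 'target'
print(model.score(data_test, target_test))   # unweighted mean R^2 of the four fits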
Example No. 4
with open(out_filename + '_estimators_.txt', 'wt') as f:
    print(xfr.estimators_, file=f)
np.savetxt(out_filename + '_feature_importances_.txt', xfr.feature_importances_)
print(data_train.columns.shape, xfr.feature_importances_.shape)
# Write one "feature,importance" line per column
with open(out_filename + '_fimp.txt', 'wt') as f:
    for feat, imp in zip(data_train.columns, xfr.feature_importances_):
        print("%s,%g" % (feat, imp), file=f)

# Legacy API: keep features with importance >= 0.4 x the mean importance
# (see the by-hand equivalent after this example)
transformed_train = xfr.transform(data_train, threshold="0.4*mean")
transformed_test = xfr.transform(data_test, threshold="0.4*mean")
end = time.perf_counter()
print("time = ", end - start, file=log)


suffix = '_tr.csv'
train_filename = os.path.splitext(os.path.basename(sys.argv[1]))[0] + suffix
train = pd.DataFrame(transformed_train)
train = pd.concat([data_train_in.loc[:, 'target'], train], axis=1)
train = pd.concat([data_train_in.loc[:, 'id'], train], axis=1)
train.to_csv(train_filename, index=False)
test_filename = os.path.splitext(os.path.basename(sys.argv[2]))[0] + suffix
test = pd.DataFrame(transformed_test)
if 'target' in data_test_in:
    test = pd.concat([data_test_in.loc[:, 'target'], test], axis=1)
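
For reference, the legacy threshold string can be reproduced by hand from feature_importances_: "0.4*mean" keeps exactly the columns whose importance is at least 0.4 times the mean importance. A sketch, assuming data_train and data_test are the DataFrames used above:

import numpy as np

importances = xfr.feature_importances_
# Boolean mask over columns: importance >= 0.4 x mean importance
mask = importances >= 0.4 * np.mean(importances)
transformed_train = data_train.values[:, mask]
transformed_test = data_test.values[:, mask]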
Example No. 5
    #pca = PCA(n_components=n_components)
    #X = pca.fit_transform(X)

    print(X.shape)
    print(type(X))
    print(y.shape)
    print(type(y))

    # Alternative: recursive feature elimination with a linear model
    estimator = Ridge()
    #selector = RFECV(estimator, step=1, cv=5)

    selector = ExtraTreesRegressor(n_estimators=50)
    selector = selector.fit(X, y)
    # Legacy API: keeps the features whose importance exceeds the mean
    X = selector.transform(X)
    print("Number of selected features : %d" % X.shape[1])
    print(X.shape)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.25,
                                                        random_state=42)

    # build a regressor
    clf = RandomForestRegressor(n_estimators=20)
    # use a full grid over all parameters
    param_grid = {
        "max_depth": [3, None],
        "max_features": [1, 3, 10],
        "min_samples_split": [2, 3, 10],
        "min_samples_leaf": [1, 3, 10],