示例#1
0
def pipe(stage_1,
         stage_2,
         y_1,
         y_2,
         to_pred,
         regr=RandomForestRegressor(max_depth=5, n_estimators=100)):
    """Select features on one split, then blend two regressors on another.

    Feature selectors and a 12-component PCA are fitted on ``stage_1``
    (with targets ``y_1``).  Two clones of ``regr`` are then trained on
    ``stage_2`` / ``y_2``: one on the selected columns only, the other on
    the selected columns plus the PCA projection.  The result is the
    equally weighted average of both models' predictions for ``to_pred``.

    The column groups come from the module-level names ``objs`` and
    ``ints``.
    """
    # This could probably be expressed with sklearn pipelines and feature
    # unions; it is kept as explicit steps to avoid writing custom
    # transformers.

    # One selector per column group, both fitted on the first-stage data.
    forest_selector = SelectFromModel(
        RandomForestRegressor(n_estimators=100, max_depth=5),
        threshold='median')
    lasso_selector = RandomizedLasso()

    forest_selector.fit(stage_1[objs], y_1)
    lasso_selector.fit(stage_1[ints], y_1)

    def _selected_columns(frame):
        # Side-by-side stack of the columns each selector retains.
        kept_objs = forest_selector.transform(frame[objs])
        kept_ints = lasso_selector.transform(frame[ints])
        return np.hstack([kept_objs, kept_ints])

    train_selected = _selected_columns(stage_2)
    predict_selected = _selected_columns(to_pred)

    projector = PCA(n_components=12, random_state=420)
    projector.fit(stage_1)

    # Independent, unfitted copies so the caller's estimator is untouched.
    plain_model = clone(regr)
    augmented_model = clone(regr)

    train_augmented = np.hstack(
        [train_selected, projector.transform(stage_2)])
    predict_augmented = np.hstack(
        [predict_selected, projector.transform(to_pred)])

    plain_model.fit(train_selected, y_2)
    augmented_model.fit(train_augmented, y_2)

    plain_half = plain_model.predict(predict_selected) / 2
    augmented_half = augmented_model.predict(predict_augmented) / 2
    return plain_half + augmented_half
示例#2
0
def main(train_label, train_feat, modelsdir, selfeat):
    """Train an ExtraTrees regressor and persist it with its preprocessing.

    Loads space-delimited features and labels, standardises the features,
    optionally selects features with ``RandomizedLasso``, tunes an
    ``ExtraTreesRegressor`` by randomized search (scored by mean absolute
    error), retrains with the best parameters and dumps the fitted objects
    into ``modelsdir``.

    Parameters
    ----------
    train_label : str
        Path of the label file, read with ``np.genfromtxt``.
    train_feat : str
        Path of the feature file; its basename also names the saved
        feature-selection index and mask files.
    modelsdir : str
        Directory that receives ``XRT.pkl``, ``scaler.pkl`` and, only when
        feature selection ran, ``sel_est.pkl`` and the index/mask files.
    selfeat : int or str
        Any value accepted by ``int()``; non-zero enables feature selection.

    Notes
    -----
    Reads ``n_jobs``, ``RR_Iter`` and ``folds`` from the module-level
    ``config`` mapping.
    """
    X_train = np.nan_to_num(np.genfromtxt(train_feat, delimiter=' '))
    y_train = np.nan_to_num(np.genfromtxt(train_label, delimiter=' '))

    n_jobs = int(config['n_jobs'])

    scaler = StandardScaler().fit(X_train)
    X_trains = scaler.transform(X_train)

    # Stays None when feature selection is disabled so that the persistence
    # step below can skip it (it used to fail with a NameError).
    sel_est = None
    if int(selfeat):
        print("Performing feature selection ...")
        sel_est = RandomizedLasso(alpha="bic",
                                  verbose=True,
                                  max_iter=1000,
                                  n_jobs=n_jobs,
                                  random_state=42,
                                  n_resampling=1000)

        sel_est.fit(X_trains, y_train)
        X_trains = sel_est.transform(X_trains)

        selected_mask = sel_est.get_support()
        selected_features = sel_est.get_support(indices=True)

        sel_feats_path = os.sep.join([modelsdir, os.path.basename(train_feat)])

        # saves indices
        np.savetxt(sel_feats_path + ".idx", selected_features, fmt="%d")
        # saves mask
        np.save(sel_feats_path + ".mask", selected_mask)

    estimator = ExtraTreesRegressor(random_state=42, n_jobs=n_jobs)

    # MAE is a loss, hence greater_is_better=False.
    mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

    # performs parameter optimization using random search
    print("Performing parameter optimization ... ")

    param_distributions = {
        "n_estimators": [5, 10, 50, 100, 200, 500],
        "max_depth": [3, 2, 1, None],
        "max_features": ["auto", "sqrt", "log2",
                         int(X_trains.shape[1] / 2.0)],
        "min_samples_split": sp_randint(1, 11),
        "min_samples_leaf": sp_randint(1, 11),
        "bootstrap": [True, False],
    }

    search = RandomizedSearchCV(estimator,
                                param_distributions,
                                n_iter=int(config['RR_Iter']),
                                scoring=mae_scorer,
                                n_jobs=n_jobs,
                                refit=True,
                                cv=KFold(X_train.shape[0],
                                         int(config['folds']),
                                         shuffle=True,
                                         random_state=42),
                                verbose=1,
                                random_state=42)

    # fits model using best parameters found
    search.fit(X_trains, y_train)

    # best_params_ holds exactly the keys of param_distributions, so it can
    # be forwarded as keyword arguments.
    estimator2 = ExtraTreesRegressor(verbose=1,
                                     random_state=42,
                                     n_jobs=n_jobs,
                                     **search.best_params_)

    print("Train the model with the best parameters ...")
    estimator2.fit(X_trains, y_train)

    from sklearn.externals import joblib
    joblib.dump(estimator2, modelsdir + "/XRT.pkl")
    joblib.dump(scaler, modelsdir + "/scaler.pkl")
    if sel_est is not None:
        joblib.dump(sel_est, modelsdir + "/sel_est.pkl")
示例#3
0
class LinearAll:
    """
    Linear variable selection followed by PLS / Ridge prediction models.

    fit() selects variables with a cross-validated LassoLars (optionally
    followed by a RandomizedLasso stability selection) and then grid-searches
    a PLS regression and a Ridge regression on the selected variables.
    predict() uses whichever of the two searches has the larger best_score_.

    Parameters
    ----------
    cv : int or cross-validation generator, optional (default 20)
        Passed as ``cv`` to both GridSearchCV instances and to LassoLarsCV.
    scoring : string, optional (default 'mean_squared_error')
        Passed as ``scoring`` to both GridSearchCV instances.
    n_jobs : int, optional (default 1)
        Passed as ``n_jobs`` to GridSearchCV, LassoLarsCV and RandomizedLasso.
    refit : boolean, optional (default False)
        Passed as ``refit`` to both GridSearchCV instances. predict() asserts
        that this is True.
    iid : boolean, optional (default False)
        Passed as ``iid`` to both GridSearchCV instances.
    pre_pred : boolean, optional (default True)
        Stored on the instance, but fit() currently overwrites it with False,
        so the pre-selection models are never computed.
    param_ridge_post : list of float, optional
        Ridge ``alpha`` grid searched after variable selection
        (default ``list(np.arange(1,3,0.1))``).
    rlasso_selection_threshold : float, optional (default 0.5)
        If 0, variables are selected as the non-zero coefficients of a
        LassoLars refit; otherwise it is passed as ``selection_threshold``
        to RandomizedLasso.

    Attributes
    ----------
    lasso_cv : fitted LassoLarsCV (set by fit()).
    lasso_refit, active : LassoLars refit and boolean column mask
        (set by fit() only when rlasso_selection_threshold == 0).
    rlasso : fitted RandomizedLasso
        (set by fit() only when rlasso_selection_threshold != 0).
    pls_post, ridge_post : GridSearchCV objects fitted after selection.
    pls_pre, ridge_pre : GridSearchCV objects fitted before selection
        (set by fit() only when the pre_pred branch runs).
    best_model : the GridSearchCV object chosen by predict().
    """

    def __init__ (self, cv=20, scoring = 'mean_squared_error',
                  n_jobs=1, refit=False, iid=False, pre_pred=True,
                  param_ridge_post=list(np.arange(1,3,0.1)),
                    rlasso_selection_threshold = 0.5):
        #self.__name__ = '__main__'
        """
        CAUTION: we changed to __main__ so that parallelization works
        """
        # Plain storage of the constructor arguments; see the class docstring
        # for how each one is used.
        self.cv = cv
        self.scoring = scoring
        self.n_jobs = n_jobs
        self.refit = refit
        self.iid = iid
        self.pre_pred =pre_pred
        self.param_ridge_post = param_ridge_post
        self.rlasso_selection_threshold = rlasso_selection_threshold

    def run_models(self, X, y, param_ridge):
        """
        Grid-search a PLS regression and a Ridge regression on (X, y).

        PLS is searched over ``n_components`` in range(1, 5); Ridge is
        searched over the ``alpha`` values in ``param_ridge``. Both searches
        use the instance's cv, scoring, n_jobs, refit and iid settings.

        Parameters
        ----------
        X : training data
        y : target values
        param_ridge : list
            Candidate ``alpha`` values for Ridge.

        Returns
        -------
        (pls_cv, ridge_cv) : tuple of fitted GridSearchCV objects
        """

        ##################################
        ## OLS CV
        ##################################
        # (disabled) cross-validated ordinary least squares baseline
        #ols = linear_model.LinearRegression(fit_intercept=True,
        #                                          normalize=False,
        #                                          copy_X=True)
        #ols_cv_score = cross_validation.cross_val_score(
        #        ols, X, y,
        #        cv=self.cv, scoring=self.scoring,
        #        n_jobs=self.n_jobs)
        """
        self.ols_cv_score.shape = (cv,)
        """

        ##################################
        ## PLS CV
        ##################################
        tuned_parameters = [{'n_components': range(1, 5)}]
        pls = PLSRegression()
        pls_cv = GridSearchCV(pls, tuned_parameters,
                                cv=self.cv, scoring=self.scoring,
                                n_jobs=self.n_jobs,
                                refit=self.refit, iid=self.iid)
        pls_cv.fit(X, y)


        ##################################
        ## Ridge CV
        ##################################
        tuned_parameters = [{'alpha': param_ridge}]
        # alpha=1 is only a placeholder; the grid search overrides it.
        ridge = linear_model.Ridge(alpha = 1)
        ridge_cv = GridSearchCV(ridge, tuned_parameters,
                                     cv=self.cv, scoring=self.scoring,
                                     n_jobs=self.n_jobs,
                                     refit=self.refit, iid=self.iid)
        ridge_cv.fit(X, y)

        return (pls_cv, ridge_cv)

    def fit(self, X, y):
        """
        Variable selection followed by prediction-model fitting.

        Variable selection: LassoLarsCV chooses the regularisation strength;
        the variables are then picked either by a LassoLars refit
        (rlasso_selection_threshold == 0) or by RandomizedLasso.
        Prediction models: PLS and Ridge grid searches, see run_models().

        Parameters
        ----------
        X : numpy array or sparse matrix of shape [n_samples,n_features]
            Training data
        y : numpy array of shape [n_samples, n_targets]
            Target values

        Returns
        -------
        self : returns an instance of self.
        """


        ##################################
        ## OLS Train
        ##################################
        # (disabled) training-set residual sum of squares of plain OLS
        #ols_train = linear_model.LinearRegression(fit_intercept=True,
        #                                         normalize=False,
        #                                          copy_X=True)
        #ols_train.fit(X, y)
        #self.rss_ols_train = np.sum((ols_train.predict(X) - y) ** 2)
        """
        fit_intercept=True, center the data
        copy=True, because centering data invovles X -= X_mean

        CAUTION:
        normalization=False, otherwise involves taking squares of X, lose precision

        self.rss_ols_train.shape = (1,1)
        """

        ##################################
        ## Pre Variable Selection Predictions
        ##################################
        # NOTE(review): this unconditionally overrides the constructor's
        # pre_pred argument, so the branch below never runs -- confirm that
        # this is intentional.
        self.pre_pred = False
        if self.pre_pred:
            print "Computing ... "
            param_ridge_pre = list(np.arange(1e9,2e9,1e8))
            self.pls_pre, self.ridge_pre = \
                self.run_models(X, y, param_ridge_pre)

        ##################################
        ## Lasso Variable Selection
        ##################################
        # The iteration and alpha limits are tied to the number of columns.
        self.lasso_cv = LassoLarsCV(fit_intercept=True, normalize=True, precompute='auto',
                            max_iter=X.shape[1]+1000, max_n_alphas=X.shape[1]+1000,
                            eps= 2.2204460492503131e-16,copy_X=True,
                            cv=self.cv, n_jobs=self.n_jobs)
        self.lasso_cv.fit(X, y)
        """
        normalize=True, lasso seems to be able to handle itself
        """

        if self.rlasso_selection_threshold == 0:
            # Refit a single LassoLars at the cross-validated alpha and keep
            # the columns with non-zero coefficients.
            self.lasso_refit = linear_model.LassoLars(alpha=self.lasso_cv.alpha_,
                                fit_intercept=True, normalize=True, precompute='auto',
                                max_iter=X.shape[1]+1000,
                                eps=2.2204460492503131e-16, copy_X=True,
                                fit_path=False)
            self.lasso_refit.fit(X, y)
            self.active = self.lasso_refit.coef_ != 0
            # assumes coef_ is 2-D here (one row per target) -- TODO confirm;
            # only the first row's mask is kept.
            self.active = self.active[0,:]
            X_selected = X[:, self.active]
        else:
            # Stability selection at the cross-validated alpha.
            self.rlasso = RandomizedLasso(alpha=self.lasso_cv.alpha_, scaling=0.5,
                                          sample_fraction=0.75, n_resampling=200,
                                          selection_threshold=self.rlasso_selection_threshold, fit_intercept=True,
                                          verbose=False, normalize=True, precompute='auto',
                                          max_iter=500, eps=2.2204460492503131e-16,
                                          random_state=None, n_jobs=self.n_jobs, pre_dispatch='3*n_jobs',)
            self.rlasso.fit(X, y)
            X_selected = self.rlasso.transform(X)

        ##################################
        ## Post Variable Selection Predictions
        ##################################
        self.pls_post, self.ridge_post = \
            self.run_models(X_selected, y, self.param_ridge_post)


        return self

    def predict(self, X_test):
        """
        Predict with the better of the two post-selection models.

        The model whose grid search has the larger best_score_ is stored in
        self.best_model (Ridge wins ties). X_test is reduced to the variables
        selected in fit() before being passed to that model's best_estimator_.

        Parameters
        ----------
        X_test : test data with the same columns as the X given to fit()

        Returns
        -------
        The prediction of the chosen model's best_estimator_.
        """
        # NOTE(review): the constructor default is refit=False, so this
        # assertion fails unless the instance was created with refit=True.
        assert(self.refit == True)
        if self.pls_post.best_score_ > self.ridge_post.best_score_:
            self.best_model = self.pls_post
            print "Chosen Model: pls"
        else:
            self.best_model = self.ridge_post
            print "Chosen Model: ridge"

        # Apply the same column selection that fit() used.
        if self.rlasso_selection_threshold == 0:
            X_test_selected = X_test[:, self.active]
        else:
            X_test_selected = self.rlasso.transform(X_test)
        return self.best_model.best_estimator_.predict(X_test_selected)
示例#4
0
def main(train_label, train_feat, modelsdir, selfeat):
  """Fit, tune and save an ExtraTrees regression model.

  Steps: read the space-delimited feature and label files, standardise the
  features, optionally run ``RandomizedLasso`` feature selection, tune an
  ``ExtraTreesRegressor`` with a randomized search scored by mean absolute
  error, retrain with the winning parameters and dump the fitted objects
  into ``modelsdir``.

  Parameters
  ----------
  train_label : str
      Path of the label file, read with ``np.genfromtxt``.
  train_feat : str
      Path of the feature file; its basename is reused for the saved
      feature-selection index and mask files.
  modelsdir : str
      Output directory for ``XRT.pkl``, ``scaler.pkl`` and, only when
      feature selection ran, ``sel_est.pkl`` and the index/mask files.
  selfeat : int or str
      Any value accepted by ``int()``; non-zero turns feature selection on.

  Notes
  -----
  ``n_jobs``, ``RR_Iter`` and ``folds`` are read from the module-level
  ``config`` mapping.
  """
  X_train = np.nan_to_num(np.genfromtxt(train_feat, delimiter=' '))
  y_train = np.nan_to_num(np.genfromtxt(train_label, delimiter=' '))

  n_jobs = int(config['n_jobs'])

  scaler = StandardScaler().fit(X_train)
  X_trains = scaler.transform(X_train)

  # Remains None without feature selection; the dump at the end checks this
  # instead of raising a NameError on an unbound name.
  sel_est = None
  if int(selfeat):
    print("Performing feature selection ...")
    sel_est = RandomizedLasso(alpha="bic", verbose=True, max_iter=1000,
                              n_jobs=n_jobs, random_state=42,
                              n_resampling=1000)

    sel_est.fit(X_trains, y_train)
    X_trains = sel_est.transform(X_trains)

    selected_mask = sel_est.get_support()
    selected_features = sel_est.get_support(indices=True)

    sel_feats_path = os.sep.join([modelsdir, os.path.basename(train_feat)])

    # saves indices
    np.savetxt(sel_feats_path + ".idx", selected_features, fmt="%d")
    # saves mask
    np.save(sel_feats_path + ".mask", selected_mask)

  estimator = ExtraTreesRegressor(random_state=42, n_jobs=n_jobs)

  # MAE is a loss, hence greater_is_better=False.
  mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

  # performs parameter optimization using random search
  print("Performing parameter optimization ... ")

  param_distributions = {
    "n_estimators": [5, 10, 50, 100, 200, 500],
    "max_depth": [3, 2, 1, None],
    "max_features": ["auto", "sqrt", "log2", int(X_trains.shape[1] / 2.0)],
    "min_samples_split": sp_randint(1, 11),
    "min_samples_leaf": sp_randint(1, 11),
    "bootstrap": [True, False],
  }

  search = RandomizedSearchCV(
    estimator, param_distributions,
    n_iter=int(config['RR_Iter']),
    scoring=mae_scorer, n_jobs=n_jobs, refit=True,
    cv=KFold(X_train.shape[0], int(config['folds']), shuffle=True,
             random_state=42),
    verbose=1, random_state=42)

  # fits model using best parameters found
  search.fit(X_trains, y_train)

  # best_params_ contains exactly the keys of param_distributions, so it is
  # forwarded as keyword arguments.
  estimator2 = ExtraTreesRegressor(verbose=1, random_state=42, n_jobs=n_jobs,
                                   **search.best_params_)

  print("Train the model with the best parameters ...")
  estimator2.fit(X_trains, y_train)

  from sklearn.externals import joblib
  joblib.dump(estimator2, modelsdir + "/XRT.pkl")
  joblib.dump(scaler, modelsdir + "/scaler.pkl")
  if sel_est is not None:
    joblib.dump(sel_est, modelsdir + "/sel_est.pkl")
示例#5
0
def _fraction_correct(y_true, y_pred):
    """Return the fraction of positions where *y_true* equals *y_pred*."""
    if len(y_true) == 0:
        return 0.0
    matches = sum(1 for yi, y_hati in zip(y_true, y_pred) if yi == y_hati)
    # float() keeps this a true division on Python 2 as well.
    return float(matches) / len(y_true)


def main():
    """Select features, train a decision tree and store quiz predictions.

    Uses the first ``train_size`` rows returned by ``get_data('data')``
    (last column = label) to fit a ``RandomizedLasso`` feature selector and
    a ``DecisionTreeClassifier``, prints the training accuracy and timing
    information, and writes the predictions for the quiz data through
    ``store_csv``.
    """
    start = time.time()
    MAX_TRAIN_SIZE = 126838
    train_size = 20000
    val_size = MAX_TRAIN_SIZE - train_size
    data, test_data = get_data('data')
    X = data[0:train_size, 0:-1]
    y = list(data[0:train_size, -1])
    print(X.shape)
    print(len(y))

    # use randomized lasso for feature selection
    selection_start = time.time()
    clfR = RandomizedLasso(
        alpha='aic',
        scaling=0.5,
        sample_fraction=0.75,
        n_resampling=200,
        selection_threshold=0.25,
        fit_intercept=True,
        verbose=False,
        normalize=True,
        precompute='auto',
        max_iter=500,
        eps=2.2204460492503131e-16,
        random_state=None,
        n_jobs=1,
        pre_dispatch='3*n_jobs',
    )
    clfR.fit(X, y)

    # Transform train data to the selected features.  The copy is a small
    # hack that avoids an "assignment destination is read-only" error.
    X = clfR.transform(np.array(X).copy())
    # Same treatment for the quiz data set.
    test_data = clfR.transform(np.array(test_data).copy())

    print('Dimensions after feature Reduction: ' + str(X.shape))
    # The elapsed time used to be read from a variable that is only
    # assigned at the very end of the function (UnboundLocalError).
    duration = time.time() - selection_start
    print("Elapsed Time For Feature Reduction: " + str(duration))

    # Training classifier
    training_start = time.time()
    clf1 = DecisionTreeClassifier(criterion='gini',
                                  splitter='best',
                                  max_depth=None,
                                  min_samples_split=2,
                                  min_samples_leaf=1,
                                  min_weight_fraction_leaf=0.0,
                                  max_features=None,
                                  random_state=None,
                                  max_leaf_nodes=None,
                                  class_weight=None,
                                  presort=False)
    clf1.fit(X, y)
    duration = time.time() - training_start
    print("Elapsed Time For Classifier Training: " + str(duration))

    # Fraction of training rows predicted correctly.  The counter used to
    # start at 1, which inflated the reported value.
    y_hat = clf1.predict(X)
    print("train: " + str(_fraction_correct(y, y_hat)))

    # validation data
    val_start = train_size
    val_end = train_size + val_size

    # TODO: put this back in.  val_size equals MAX_TRAIN_SIZE - train_size,
    # so this condition is never true and validation is currently disabled.
    if MAX_TRAIN_SIZE - train_size > val_size:
        print("Beginning test validation...")
        # The classifier was trained on the selected features, so the
        # validation rows need the same transformation before predicting.
        X_val = clfR.transform(np.array(data[val_start:val_end, 0:-1]).copy())
        y_val = list(data[val_start:val_end, -1])
        y_val_hat = clf1.predict(X_val)
        print("val: " + str(_fraction_correct(y_val, y_val_hat)))

    # quiz data
    print("Beginning quiz validation...")
    X_test = test_data[:, :]
    print(X_test.shape)
    y_test_hat = clf1.predict(X_test)
    store_csv(y_test_hat, "prediction")

    duration = time.time() - start
    print("Took this many seconds: " + str(duration))
示例#6
0
def _write_value(path, value):
    """Write ``str(value)`` followed by a newline to *path* as UTF-8."""
    with codecs.open(path, 'w', "utf-8") as handle:
        handle.write(str(value) + "\n")


def run(args):
    """Train, persist and evaluate an ExtraTrees regressor.

    Reads the training data, optionally scales it and selects features,
    tunes an ``ExtraTreesRegressor`` by randomized search (scored by mean
    absolute error), stores the retrained model and the preprocessing
    objects under ``args.models_dir``, writes the best parameters as YAML,
    and evaluates the tuned search object on every test set, saving MAE,
    RMSE, mean-baseline scores and predictions per test set.

    Parameters
    ----------
    args : object
        Attributes read here: ``training_data``, ``training_labels``,
        ``delimiter``, ``scale``, ``select_features``, ``output_folder``,
        ``iterations``, ``seed``, ``folds``, ``models_dir``, ``test_data``
        and ``test_labels``.
    """
    X_train = np.nan_to_num(
        np.genfromtxt(args.training_data, delimiter=args.delimiter))
    y_train = np.clip(np.genfromtxt(args.training_labels), 0, 1)

    # Both stay None when the corresponding option is off; the persistence
    # step below checks this instead of raising a NameError.
    scaler = None
    sel_est = None

    X_trains = X_train
    if args.scale:
        print("Scaling features (mean removal divided by std)...")
        scaler = StandardScaler().fit(X_train)
        X_trains = scaler.transform(X_train)

    # create output folders
    outF = args.output_folder + "/" + os.path.basename(
        args.training_data) + "--FS_" + str(
        args.select_features) + "--i_" + str(args.iterations)
    buildDir(outF)
    maskF = outF + "/masks/"
    buildDir(maskF)
    paramF = outF + "/parameters/"
    buildDir(paramF)

    # initializes numpy random seed
    np.random.seed(args.seed)

    # performs feature selection
    featsel_str = ".all-feats"
    if args.select_features:
        print("Performing feature selection ...")
        # initializes selection estimator
        sel_est = RandomizedLasso(alpha="bic", verbose=True, max_iter=1000,
                                  n_jobs=8, random_state=args.seed,
                                  n_resampling=1000)

        sel_est.fit(X_trains, y_train)
        X_trains = sel_est.transform(X_trains)

        selected_mask = sel_est.get_support()
        selected_features = sel_est.get_support(indices=True)

        sel_feats_path = os.sep.join(
            [maskF, os.path.basename(args.training_data)])

        # saves indices
        np.savetxt(sel_feats_path + ".idx", selected_features, fmt="%d")
        # saves mask
        np.save(sel_feats_path + ".mask", selected_mask)
        featsel_str = ".randcv"

    estimator = ExtraTreesRegressor(random_state=args.seed, n_jobs=1)

    # MAE is a loss, hence greater_is_better=False.
    mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

    # performs parameter optimization using random search
    print("Performing parameter optimization ... ")

    param_distributions = {
        "n_estimators": [5, 10, 50, 100, 200, 500],
        "max_depth": [3, 2, 1, None],
        "max_features": ["auto", "sqrt", "log2",
                         int(X_trains.shape[1] / 2.0)],
        "min_samples_split": sp_randint(1, 11),
        "min_samples_leaf": sp_randint(1, 11),
        "bootstrap": [True, False],
    }

    search = RandomizedSearchCV(estimator, param_distributions,
                                n_iter=args.iterations,
                                scoring=mae_scorer, n_jobs=8, refit=True,
                                cv=KFold(X_train.shape[0], args.folds,
                                         shuffle=True,
                                         random_state=args.seed),
                                verbose=1,
                                random_state=args.seed)

    # fits model using best parameters found
    search.fit(X_trains, y_train)

    # Retrain a standalone model with the winning parameters.  best_params_
    # holds exactly the keys of param_distributions.  The seed is fixed to
    # 42 here rather than args.seed, as before.
    estimator2 = ExtraTreesRegressor(verbose=1, random_state=42, n_jobs=8,
                                     **search.best_params_)
    estimator2.fit(X_trains, y_train)

    from sklearn.externals import joblib
    print("koooonnn %s" % args.models_dir)
    joblib.dump(estimator2, args.models_dir + "/XRT.pkl")
    if scaler is not None:
        joblib.dump(scaler, args.models_dir + "/scaler.pkl")
    if sel_est is not None:
        joblib.dump(sel_est, args.models_dir + "/sel_est.pkl")

    print("Best parameters:  %s" % (search.best_params_,))

    # saves parameters on yaml file
    param_path = os.sep.join([paramF, os.path.basename(
        args.training_data)]) + featsel_str + ".params.yaml"
    with codecs.open(param_path, "w", "utf-8") as param_file:
        yaml.dump(search.best_params_, stream=param_file)

    testF = os.sep.join([outF, "/test/"])
    buildDir(testF)

    # Training-label mean, used as a constant baseline prediction.
    m = y_train.mean()

    # evaluates model on the different test sets
    test_features = sorted(glob.glob(args.test_data + os.sep + "*"))
    test_labels = sorted(glob.glob(args.test_labels + os.sep + "*"))
    for test_feature, test_label in zip(test_features, test_labels):
        print("Evaluating on %s" % test_label)
        X_test = np.nan_to_num(
            np.genfromtxt(test_feature, delimiter=args.delimiter))
        y_test = np.clip(np.genfromtxt(test_label), 0, 1)

        # Apply the same preprocessing as for the training data.
        X_tests = X_test
        if args.scale:
            X_tests = scaler.transform(X_test)
        if args.select_features:
            X_tests = sel_est.transform(X_tests)

        # gets predictions on test set, clipped to the label range
        y_pred = np.clip(search.predict(X_tests), 0, 1)

        # evaluates on test set
        mae = mean_absolute_error(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))

        print("Test MAE = %2.8f" % mae)
        print("Test RMSE = %2.8f" % rmse)
        print("Prediction range: [%2.4f, %2.4f]" % (y_pred.min(),
                                                    y_pred.max()))

        # saves evaluation
        testFX = testF + "/" + os.path.basename(test_label)
        buildDir(testFX)
        buildDir(testFX + "/evaluation/")

        eval_path = os.sep.join([testFX, "evaluation", os.path.basename(
            args.training_data)]) + featsel_str + "--" + os.path.basename(
            test_label)
        _write_value(eval_path + ".mae", mae)
        _write_value(eval_path + ".rmse", rmse)

        # baseline on test set
        mu = m * np.ones(y_test.shape[0])
        maeB = mean_absolute_error(y_test, mu)
        rmseB = np.sqrt(mean_squared_error(y_test, mu))
        print("Test MAE Baseline= %2.8f" % maeB)
        print("Test RMSE Baseline= %2.8f" % rmseB)
        _write_value(eval_path + ".mae.Base", maeB)
        _write_value(eval_path + ".rmse.Base", rmseB)

        # Saves predictions.  These lines were tab-indented, which Python 2
        # aligns with the loop body, so they run once per test set.
        buildDir(testFX + "/predictions/")
        preds_path = os.sep.join([testFX, "predictions", os.path.basename(
            args.training_data)]) + featsel_str + "--" + os.path.basename(
            test_label) + ".preds"
        np.savetxt(preds_path, y_pred, fmt="%2.15f")
                       u'ACID', u'WARM', u'MUSKY', u'SWEATY', u'AMMONIA/URINOUS', u'DECAYED', u'WOOD',
                       u'GRASS', u'FLOWER', u'CHEMICAL']):
    attribute[idx] = attr


# In[ ]:

# Select the best features for each attribute using the known target values
# and save them, one CSV file per attribute.
# DataFrame.sort() no longer exists in current pandas; sort_index() gives the
# same index ordering on old and new versions.
features = pd.read_csv('all_features.csv', index_col=0).sort_index()
# Replace the file name with targets_for_feature_selection_LB_incl.csv if the
# LB data is included.
target = pd.read_csv('targets_for_feature_selection.csv',
                     index_col=0).sort_index()
for i in range(21):
    print(attribute[i])
    sys.stdout.flush()

    # Fit the selector only on rows that have a value for this attribute.
    Y = target[attribute[i]].dropna()
    X = features.loc[Y.index]
    selector = RandomizedLasso(alpha=0.025, selection_threshold=0.025,
                               n_resampling=200,
                               random_state=25).fit(X, Y)
    # The fitted selector is then applied to every row of the feature table.
    selected = pd.DataFrame(selector.transform(features))
    selected.index = features.index
    print('shape ', selected.shape)

    selected.to_csv('...path to features folder/selected_features/features_'+str(i)+'.csv')


# In[ ]:



示例#8
0
def do_ml(day):
    """Run the feature-selection and hyper-parameter search experiment.

    Loads the pickled feature matrix and targets from the working directory
    (``X_reg.p``/``y_reg.p``, or ``X_sax.p``/``y_sax.p`` when ``USE_SAX`` is
    set), holds out 20% of the rows as a test set and then:

    * optionally (``FEATURE_REDUCTION``) ranks feature counts with RFE,
    * optionally (``DIM_REDUCTION_SEARCH``) compares PCA and Isomap,
    * selects features with cross-validated RFE and grid-searches AdaBoost
      and random-forest regressors on them,
    * selects features with RandomizedLasso and grid-searches the same
      regressors on them.

    Fitted selectors and grid searches are pickled below
    ``performance-<day>-days/models``; test-set MSE values are printed.

    Args:
        day: Value used only to label the output directory and file names.
    """
    ################################################################
    # Modules to use
    ###############################################################
    USE_SAX = False
    FEATURE_REDUCTION = False
    DIM_REDUCTION_SEARCH = False

    # Create folders for administration. makedirs(exist_ok=True) also creates
    # the models sub-folder when only the parent folder is already present;
    # the previous mkdir pair skipped it in that situation.
    out_dir = 'performance-{}-days'.format(day)
    models_dir = '{}/models'.format(out_dir)
    os.makedirs(models_dir, exist_ok=True)
    ################################################################

    def _load(path):
        """Unpickle and return the object stored at ``path``."""
        # NOTE(review): pickle.load runs arbitrary code from the file; only
        # use this with files you produced yourself.
        with open(path, 'rb') as fh:
            return pickle.load(fh)

    def _dump(obj, path):
        """Pickle ``obj`` to ``path``, closing the file even on failure."""
        with open(path, 'wb') as fh:
            pickle.dump(obj, fh)

    print('Using sax: {}'.format(USE_SAX))
    data_tag = 'sax' if USE_SAX else 'reg'
    X = _load('X_{}.p'.format(data_tag))
    y = _load('y_{}.p'.format(data_tag))

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=1234)

    # Hyper-parameter grids, shared by every search in this function.
    params = {
        'RF': {
            "max_depth": [3, None],
            "max_features": [1, 3, 10, 'sqrt', 'log2', 'auto'],
            "min_samples_split": [2, 3, 10],
            "min_samples_leaf": [1, 3, 10],
            "bootstrap": [True, False],
            "criterion": ["mse"]
        },
        'AdaBoost': {
            'n_estimators': [10, 50, 100, 300, 500],
            'learning_rate': [1, 0.5, 0.1, 0.01, 0.001],
            'loss': ['linear', 'square', 'exponential']
        }
    }

    ################################################################
    # Dimensionality reduction
    ################################################################

    def pca_reduce(X_fit, X_apply, dim):
        """Fit PCA on ``X_fit``; return both matrices in ``dim`` components."""
        pca = PCA(n_components=dim)
        return pca.fit_transform(X_fit), pca.transform(X_apply)

    def isomap_reduce(X_fit, X_apply, dim):
        """Fit Isomap on ``X_fit``; return both matrices in ``dim`` dims."""
        iso = Isomap(n_components=dim)
        return iso.fit_transform(X_fit), iso.transform(X_apply)

    def _find_best_dim_red(dims, model, model_name, X, y, params):
        """Grid-search ``model`` on PCA- and Isomap-reduced data per ``dim``.

        Each fitted search is pickled into the models folder and one summary
        row per (dim, technique) is written to
        ``<out_dir>/<model_name>-dim_reduction.csv``.
        """
        # Split the raw features once. The reducer is then fitted on the
        # training rows only and re-used for the test rows, so both live in
        # the same reduced space. Previously the reducer was refitted
        # separately on the train and test parts.
        X_train_raw, X_test_raw, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=1234)

        rows_list = []
        for dim in dims:
            for f in [pca_reduce, isomap_reduce]:

                print(
                    'Start reducing dimensionality using {} to {} dimensions'.
                    format(f.__name__, dim))
                t0 = time.time()
                X_train_red, X_test_red = f(X_train_raw, X_test_raw, dim)
                t1 = time.time()
                print('Reducing dimensions cost {} seconds'.format(t1 - t0))

                # Optimize model using grid search and cross validation
                print('Start optimizing {} model'.format(model_name))
                t0 = time.time()
                optimized_model = GridSearchCV(model,
                                               params,
                                               cv=10,
                                               refit=True)
                optimized_model.fit(X_train_red, y_train)
                t1 = time.time()
                print('Optimizing took {} seconds'.format(t1 - t0))

                y_pred = optimized_model.predict(X_test_red)

                mse = sk.metrics.mean_squared_error(y_test, y_pred)
                print("MSE on test set: {}".format(mse))
                # administration
                rows_list.append({
                    'model': model_name,
                    'dimensions': dim,
                    'reduction technique': f.__name__,
                    'mse': mse,
                    'parameters': str(optimized_model.best_params_)
                })
                # store model
                _dump(
                    optimized_model,
                    '{}/{}-{}-{}.pickle'.format(models_dir, f.__name__,
                                                model_name, dim))

        adm_df = pd.DataFrame(rows_list)
        adm_df.to_csv('{}/{}-dim_reduction.csv'.format(out_dir, model_name))

    def dim_reduction_search(X, y):
        """Run the reduction search for AdaBoost and random forest."""
        dims = [10, 20, 30]

        _find_best_dim_red(dims, AdaBoostRegressor(), 'AdaBoost', X, y,
                           params['AdaBoost'])
        _find_best_dim_red(dims, RandomForestRegressor(), 'RandomForest', X, y,
                           params['RF'])

    ##############################################################
    # Feature selection
    ##############################################################

    # Recursive Feature Elimination
    def ReFeEl(nr_features,
               X_train,
               y_train,
               X_test,
               y_test,
               estimator,
               nr_models=5):
        """Fit one RFE selector per requested feature count.

        Returns:
            Up to ``nr_models`` ``(mse, selector)`` tuples, sorted by
            ascending test-set MSE.
        """
        print('Start selecting features')
        t1 = time.time()
        result = []
        for nr_feature in nr_features:
            selector = RFE(estimator,
                           n_features_to_select=nr_feature,
                           step=1)
            selector.fit(X_train, y_train)
            y_pred = selector.predict(X_test)
            mse = sk.metrics.mean_squared_error(y_test, y_pred)
            result.append((mse, selector))
        # sort models and take nr_models best ones
        result.sort(key=lambda x: x[0])
        result = result[:nr_models]

        t2 = time.time()
        print('selecting features took {} seconds'.format(t2 - t1))
        best_mse, best_selector = result[0]
        print(best_selector.support_)
        print(best_selector.ranking_)
        print('Minimum MSE: {}, number of selected features: {}'.format(
            best_mse, len(best_selector.support_[best_selector.support_])))

        return result

    if FEATURE_REDUCTION:
        if not USE_SAX:
            estimator = AdaBoostRegressor(learning_rate=0.001,
                                          loss='square',
                                          n_estimators=50)
        else:
            estimator = AdaBoostRegressor(learning_rate=0.01,
                                          loss='linear',
                                          n_estimators=50)
        _, total_nr_features = X.shape
        nr_features = range(1, total_nr_features)

        opt_features_models = ReFeEl(nr_features, X_train, y_train, X_test,
                                     y_test, estimator)
        print(opt_features_models)

        _dump(opt_features_models,
              'optimal_features_model_{}.pickle'.format(USE_SAX))

    # Feature Importance using Extra Trees
    def ET_feature_selection():
        """Print Extra-Trees feature importances and save a box plot."""
        estimator = ExtraTreesRegressor()
        estimator.fit(X, y)
        importances = estimator.feature_importances_
        print(importances)
        print((len(importances[importances < 0.01]), len(importances)))
        print(np.mean(importances))
        print(np.std(importances))

        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.boxplot(importances)
        fig.savefig('boxplot.png', bbox_inches='tight')
        # Blocks until the user presses enter.
        input('hallo')

    ##############################################################
    # Dimensionality reduction search
    ##############################################################
    if DIM_REDUCTION_SEARCH:
        dim_reduction_search(X, y)

    ##############################################################
    # Grid search + CV
    ##############################################################
    def append_deep_copy(rows_list, model_name, nr_features, mse, params):
        """Append a result row built from deep copies and return the list."""
        result = {
            'model': deepcopy(model_name),
            'nr_features': deepcopy(nr_features),
            'MSE': deepcopy(mse),
            'parameters': str(deepcopy(params))
        }
        rows_list.append(result)

        return rows_list

    def _tune_and_store(model, grid, X_tr, X_te, model_path):
        """Grid-search ``model`` on ``X_tr``, print test MSE, pickle it."""
        t0 = time.time()
        opt_model = GridSearchCV(model, grid, cv=10, refit=True)
        opt_model.fit(X_tr, y_train)
        t1 = time.time()

        print('Optimizing took {} seconds'.format((t1 - t0)))
        y_pred = opt_model.predict(X_te)
        mse = sk.metrics.mean_squared_error(y_test, y_pred)
        print("MSE on test set: {}".format(mse))
        _dump(opt_model, model_path)

    ##############################################################
    # Recursive Feature Elimination
    #############################################################

    for model, name in [(AdaBoostRegressor(), 'AdaBoost'),
                        (RandomForestRegressor(), 'RF')]:

        # Get features and store feature selector
        print('Optimizing {} using CV RFE'.format(name))
        t0 = time.time()
        selector = RFECV(model, step=1, cv=10)
        selector = selector.fit(X_train, y_train)
        X_train_transformed = selector.transform(X_train)
        X_test_transformed = selector.transform(X_test)
        t1 = time.time()
        print('Optimizing features done in {} seconds, storing model..'.format(
            t1 - t0))
        _dump(selector, '{}/RFE-{}-selector.pickle'.format(models_dir, name))

        # Optimize hyperparameters and evaluate model
        print('Start optimizing hyperparameters using determined features...')
        _tune_and_store(model, params[name], X_train_transformed,
                        X_test_transformed,
                        '{}/RFE-{}-model.pickle'.format(models_dir, name))

    ##############################################################
    # Feature Stability Selection
    #############################################################

    RL = RandomizedLasso(alpha='aic')
    print('Start optimizing using Randomized Lasso')
    t0 = time.time()
    # Fit on the training rows only, as the RFECV stage above does, so the
    # held-out rows do not influence which features are selected.
    RL.fit(X_train, y_train)
    t1 = time.time()
    print('Optimizing done in {} seconds'.format(t1 - t0))
    _dump(RL, '{}/RandomizedLasso-selector.pickle'.format(models_dir))

    X_train_RL = RL.transform(X_train)
    X_test_RL = RL.transform(X_test)
    print('Using RL features to optimize model..')
    for model, name in [(AdaBoostRegressor(), 'AdaBoost'),
                        (RandomForestRegressor(), 'RF')]:
        # The message used to print the estimator object and say "CV RFE",
        # although this stage works on the Randomized Lasso features.
        print('Optimizing {} using Randomized Lasso features'.format(name))
        _tune_and_store(
            model, params[name], X_train_RL, X_test_RL,
            '{}/RandomizedLasso-{}-model.pickle'.format(models_dir, name))

# Select the best features for each attribute, fitting only on rows that have
# a true target value, and write every feature's stability score to a file.
# sort_index() replaces the argument-less DataFrame.sort(), which sorted by
# index label and has been removed from pandas.
features = pd.read_csv('all_features_training_filledna.csv',
                       index_col=0).sort_index()
# targets_for_feature_selection.csv can be computed in
# opc_python/hulab/feature_selection. Replace it with
# targets_for_feature_selection_LB_incl.csv if LB data is included.
target = pd.read_csv('targets_for_feature_selection.csv',
                     index_col=0).sort_index()
for i in range(21):
    print(attribute[i])
    sys.stdout.flush()

    # Rows without a target value for this attribute are left out of the fit.
    Y = target[attribute[i]].dropna()
    X = features.loc[Y.index]
    selector = RandomizedLasso(alpha=0.025, selection_threshold=0.025,
                               n_resampling=200, random_state=25).fit(X, Y)
    selected = pd.DataFrame(selector.transform(features))
    selected.index = features.index
    print('shape ', selected.shape)

    # Label the kept columns; the 0.025 cut-off repeats selection_threshold
    # above. .loc with a boolean mask replaces the removed .ix indexer.
    selected.columns = X.loc[:, selector.scores_ > 0.025].columns

    # One "<feature name>: <score>" line per feature. The with-block closes
    # the file even when a write fails.
    with open('scores_'+str(i)+'.txt', 'w') as f:
        for feature_name, score in zip(features.columns.values,
                                       selector.scores_):
            f.write(feature_name + ': ')
            f.write(str(score))
            f.write('\n')
示例#10
0
def main():
    """Select features, train a decision tree and store quiz predictions.

    Reads the data via ``get_data('data')``. The first ``train_size`` rows
    are used for training, with the last column as the label. Features are
    selected with RandomizedLasso, a DecisionTreeClassifier is fitted on the
    selected features, and its predictions for the quiz set are passed to
    ``store_csv`` under the name ``"prediction"``. Shapes, timings and the
    fraction of correctly predicted training labels are printed.
    """
    start = time.time()
    MAX_TRAIN_SIZE = 126838
    train_size = 20000
    val_size = MAX_TRAIN_SIZE - train_size
    data, test_data = get_data('data')
    X = data[0:train_size, 0:-1]
    y = list(data[0:train_size, -1])
    print(X.shape)
    print(len(y))

    def _fraction_correct(y_true, y_hat):
        """Return the share of positions where both label sequences agree."""
        # Count from zero (the old counter started at 1) and divide as
        # float so Python 2 does not truncate the ratio.
        hits = sum(1 for yi, y_hati in zip(y_true, y_hat) if yi == y_hati)
        return hits / float(len(y_true))

    # use randomized lasso for feature selection
    stage_start = time.time()
    clfR = RandomizedLasso(alpha='aic',
                           scaling=0.5,
                           sample_fraction=0.75,
                           n_resampling=200,
                           selection_threshold=0.25,
                           fit_intercept=True,
                           verbose=False,
                           normalize=True,
                           precompute='auto',
                           max_iter=500,
                           eps=2.2204460492503131e-16,
                           random_state=None,
                           n_jobs=1,
                           pre_dispatch='3*n_jobs',
                           #memory=Memory(cachedir=None)
                           )
    # fit regression
    clfR.fit(X, y)

    # Transform train data to the selected features. The copy is a little
    # hack to fix an "assignment destination is read-only" error.
    X = clfR.transform(np.array(X).copy())
    # Transform the quiz dataset the same way.
    test_data = clfR.transform(np.array(test_data).copy())

    # `duration` used to be read here before its only assignment at the end
    # of the function, which raised UnboundLocalError. It is now measured
    # per stage.
    duration = time.time() - stage_start
    print('Dimensions after feature Reduction: ' + str(X.shape))
    print("Elapsed Time For Feature Reduction: " + str(duration))

    # Training classifier
    stage_start = time.time()
    clf1 = DecisionTreeClassifier(criterion='gini',
                                  splitter='best',
                                  max_depth=None,
                                  min_samples_split=2,
                                  min_samples_leaf=1,
                                  min_weight_fraction_leaf=0.0,
                                  max_features=None,
                                  random_state=None,
                                  max_leaf_nodes=None,
                                  class_weight=None,
                                  presort=False)

    clf1.fit(X, y)
    duration = time.time() - stage_start
    print("Elapsed Time For Classifier Training: " + str(duration))

    # predict on the training data and report the share of correct labels
    y_hat = clf1.predict(X)
    print("train: " + str(_fraction_correct(y, y_hat)))

    # validation data - rows following the training rows
    val_start = train_size
    val_end = train_size + val_size

    # TODO: put this back in
    # With val_size == MAX_TRAIN_SIZE - train_size this condition is never
    # true, so the validation pass is currently switched off.
    if MAX_TRAIN_SIZE - train_size > val_size:
        print("Beginning test validation...")
        # The classifier was trained on the selected features, so the
        # validation rows need the same transformation before predicting.
        X_val = clfR.transform(np.array(data[val_start:val_end, 0:-1]).copy())
        y_val = list(data[val_start:val_end, -1])
        y_val_hat = clf1.predict(X_val)
        print("val: " + str(_fraction_correct(y_val, y_val_hat)))

    # quiz data
    print("Beginning quiz validation...")
    X_test = test_data[:, :]
    print(X_test.shape)
    y_test_hat = clf1.predict(X_test)
    store_csv(y_test_hat, "prediction")
    end = time.time()
    duration = end - start
    print("Took this many seconds: " + str(duration))