Example #1
from sklearn import (ensemble, kernel_ridge, linear_model, multioutput,
                     neural_network, svm)
import lightgbm as lgb


def _get_base_ml_model(method):
    regressor = None
    if method == 'lr':
        regressor = linear_model.LinearRegression()
    elif method == 'huber':
        regressor = linear_model.HuberRegressor(max_iter=50)
        regressor = multioutput.MultiOutputRegressor(regressor)
    elif method == 'svr':
        regressor = svm.LinearSVR()
        regressor = multioutput.MultiOutputRegressor(regressor)
    elif method == 'kr':
        regressor = kernel_ridge.KernelRidge(kernel='rbf')
    elif method == 'rf':
        regressor = ensemble.RandomForestRegressor(n_estimators=50, n_jobs=8)
    elif method == 'gbm':
        regressor = lgb.LGBMRegressor(max_depth=20,
                                      num_leaves=1000,
                                      n_estimators=100,
                                      min_child_samples=5,
                                      random_state=42)
        regressor = multioutput.MultiOutputRegressor(regressor)
    elif method == 'nn':
        regressor = neural_network.MLPRegressor(hidden_layer_sizes=(25, 25),
                                                early_stopping=True,
                                                max_iter=1000000,
                                                alpha=0.01)

    return regressor
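A minimal usage sketch for the factory above; the synthetic dataset and the chosen method string are illustrative assumptions, not part of the original project:

from sklearn import datasets

X, y = datasets.make_regression(n_samples=200, n_targets=2, random_state=0)
model = _get_base_ml_model('huber')  # any method string handled above
model.fit(X, y)
print(model.predict(X[:3]).shape)  # (3, 2): one column per target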
Example #2
from sklearn import ensemble, multioutput, svm, tree


def modelsRegression():
    return [
            tree.DecisionTreeRegressor(),
            ensemble.RandomForestRegressor(n_estimators=10),

            multioutput.MultiOutputRegressor(tree.DecisionTreeRegressor()),
            multioutput.MultiOutputRegressor(svm.SVR(kernel='rbf')),
            multioutput.MultiOutputRegressor(ensemble.AdaBoostRegressor()),
            multioutput.MultiOutputRegressor(ensemble.RandomForestRegressor(n_estimators=10)),

            multioutput.RegressorChain(tree.DecisionTreeRegressor()),
            multioutput.RegressorChain(svm.SVR(kernel='rbf')),
            multioutput.RegressorChain(ensemble.AdaBoostRegressor()),
            multioutput.RegressorChain(ensemble.RandomForestRegressor(n_estimators=10))
            ]
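A hedged sketch of how such a list might be scored; the synthetic data and 3-fold cross-validation are assumptions, not from the original project:

from sklearn import datasets, model_selection

X, y = datasets.make_regression(n_samples=300, n_targets=2, random_state=0)
for est in modelsRegression():
    scores = model_selection.cross_val_score(est, X, y, cv=3, scoring='r2')
    print(type(est).__name__, round(scores.mean(), 3))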
Example #3
def retrain_the_best_model(x_train_val, y_train_val, model_dir):
    print("Retraining the best model")

    model_filepath = os.path.join(model_dir, "best_KNN_model")
    if os.path.exists(model_filepath):
        with open(model_filepath, "rb") as model_file:
            best_model = pickle.load(model_file)
            return best_model

    train_data = pd.read_csv(
        os.path.join(model_dir, "KNN_model_search_group_results.csv"))
    avg_r2_score = train_data["avg r2 score"]
    idx = np.argmax(avg_r2_score)

    best_data = train_data.iloc[idx]
    best_params = {
        "n_neighbors": int(best_data["n_neighbors"]),
        "weights": best_data["weights"],
    }
    if best_data["distance"] == "euclidean" or best_data[
            "distance"] == "manhattan":
        best_params["metric"] = best_data["distance"]
    elif best_data["distance"] == "lorentzian":
        best_params["metric"] = lorentzian_distance
    elif best_data["distance"] == "angular":
        best_params["metric"] = angular_distance
    else:
        raise ValueError(f"Unknown metric: {best_data['distance']}")

    print(f"\tBest params: {best_params}")
    best_model = multioutput.MultiOutputRegressor(
        KNeighborsRegressor(**best_params, algorithm='brute', n_jobs=-1))
    best_model.fit(x_train_val, y_train_val)

    return best_model
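The custom metrics lorentzian_distance and angular_distance are defined elsewhere in the project. A plausible sketch of such callables, purely as an assumption so the snippet is self-contained (KNeighborsRegressor accepts callable metrics with algorithm='brute'):

import numpy as np

def lorentzian_distance(a, b):
    # Hypothetical definition: log-scaled absolute differences.
    return np.sum(np.log1p(np.abs(a - b)))

def angular_distance(a, b):
    # Hypothetical definition: angle between the two feature vectors.
    cos = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-12)
    return np.arccos(np.clip(cos, -1.0, 1.0))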
Example #4
def model(self, **kwargs):
    C = kwargs.get('C', 1)
    tol = kwargs.get('tol', 1e-4)
    penalty = kwargs.get('penalty', 'l2')
    # Note: LogisticRegression is a classifier; MultiOutputClassifier is the
    # usual wrapper for classifiers.
    return multioutput.MultiOutputRegressor(
        linear_model.LogisticRegression(C=C, tol=tol, penalty=penalty)
    )
Example #5
def retrain_the_best_model(x_train_val, y_train_val, model_dir):
    print("Retraining the best model")

    model_filepath = os.path.join(model_dir, "best_RF_model")
    if os.path.exists(model_filepath):
        with open(model_filepath, "rb") as model_file:
            best_model = pickle.load(model_file)
            return best_model

    train_data = pd.read_csv(
        os.path.join(model_dir, "RF_model_search_group_results.csv"))
    avg_r2_score = train_data["avg r2 score"]
    idx = np.argmax(avg_r2_score)

    best_data = train_data.iloc[idx]
    best_params = {
        "n_estimators": int(best_data["n_estimators"]),
        "min_samples_split": (int(best_data["min_samples_split"])
                              if best_data["min_samples_split"] >= 1
                              else best_data["min_samples_split"]),
    }

    print(f"\tBest params: {best_params}")
    best_model = multioutput.MultiOutputRegressor(
        RandomForestRegressor(**best_params, n_jobs=-1))
    best_model.fit(x_train_val, y_train_val)

    return best_model
Example #6
def build_model(model_type, num_targets=1):
    if model_type == 'linear_regression':
        base = linear_model.SGDRegressor()
    elif model_type == 'random_forests':
        base = ensemble.RandomForestRegressor()
    elif model_type == 'gradient_boosting':
        base = ensemble.GradientBoostingRegressor()
    elif model_type == 'extra_trees':
        base = ensemble.ExtraTreesRegressor()
    elif model_type == 'bagging':
        base = ensemble.BaggingRegressor()
    elif model_type == 'adaboost':
        base = ensemble.AdaBoostRegressor()
    elif model_type == 'neural_network':
        base = neural_network.MLPRegressor()
    elif model_type == 'svm':
        base = svm.SVR(verbose=1)
    elif model_type == 'constant_mean':
        base = dummy.DummyRegressor(strategy='mean')
    elif model_type == 'constant_median':
        base = dummy.DummyRegressor(strategy='median')
    elif model_type == 'constant_zero':
        base = dummy.DummyRegressor(strategy='constant', constant=0)
    else:
        raise ValueError('invalid model type: {}'.format(model_type))

    # multiple outputs in the dataset => fit a separate regressor to each
    if num_targets > 1:
        return multioutput.MultiOutputRegressor(base)
    else:
        return base
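An illustrative call (the model type and target count here are arbitrary):

model = build_model('gradient_boosting', num_targets=3)
# -> MultiOutputRegressor(GradientBoostingRegressor()); the base estimator
#    supports only a single target natively, so each target gets its own copy.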
Example #7
	def getPreProcessor(self):
		if self.problem == 'classification':
			print(self.problem)
			self.svmMethod = svm.SVC
			self.pipe = pline.Pipeline([
				('scaler',pre.StandardScaler()),
				('clf',self.svmMethod(kernel='rbf',C=100,gamma=0.01))
			])
		elif self.problem == 'regression':
			print(self.problem)
			if isinstance(self.classes,pd.DataFrame):
				est = svm.SVR(kernel='rbf',C=100,gamma=0.01)
				self.svmMethod = mo.MultiOutputRegressor(est)
			else:
				self.svmMethod = svm.SVR(kernel='rbf',C=100,gamma=0.01)
			self.pipe = pline.Pipeline([
				('scaler',pre.StandardScaler()),
				('reg',self.svmMethod)
			])
		else:
			print('Preprocessor does not recognize this problem type; try a different parameter')
			return False

		# self.cutNull()
		optParams = self.pipeOptimizer()
		print("BEST PARAMS AND SCALER: \n {}".format(optParams))
		return self.pipe
Example #8
def model(self, **kwargs):
    loss = kwargs.get('loss', 'squared_loss')
    penalty = kwargs.get('penalty', 'l2')
    max_iter = kwargs.get('max_iter', 5)
    return multioutput.MultiOutputRegressor(
        linear_model.SGDRegressor(penalty=penalty,
                                  loss=loss,
                                  max_iter=max_iter))
Example #9
from sklearn import datasets, ensemble, multioutput


def multioutput_regression_example():
    X, y = datasets.make_regression(n_samples=10, n_targets=3, random_state=1)

    regr = ensemble.GradientBoostingRegressor(random_state=0)

    mo_regr = multioutput.MultiOutputRegressor(regr)
    mo_regr.fit(X, y)
    pred = mo_regr.predict(X)
    print('Prediction =\n', pred)
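    # Note (sketch): mo_regr.estimators_ holds one fitted
    # GradientBoostingRegressor per target, so len(mo_regr.estimators_) == 3
    # and pred.shape == (10, 3).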
Example #10
File: model.py  Project: Jamezzz5/mmlm
def set_regressor(reg):
    if reg == 'RandomForest':
        reg = RandomForestRegressor()
    elif reg == 'Linear':
        reg = lm.LinearRegression()
    elif reg == 'BayesianRidge':
        reg = mo.MultiOutputRegressor(lm.BayesianRidge())
    elif reg == 'MLP':
        reg = MLPRegressor()
    return reg
Example #11
    def fit_and_predict(self):
        """
        Insert your models here to do the work of predicting on the training set.
        """

        lifted_train_data, lifted_test_data = self.lift_data(
            self.X_tr, self.test_set)
        lifted_train_data, lifted_test_data = self.standardize_data(
            lifted_train_data, lifted_test_data)
        self.X_tr, self.test_set = self.standardize_data(
            self.X_tr, self.test_set)

        # Here are some good models you might want to use (they're not quite
        # as good as GradientBoostingRegressor): multi_Ada, multi_ridge_chain,
        # and tree. All of them predict latitude and longitude at the same
        # time rather than one after the other; multioutput.MultiOutputRegressor()
        # or multioutput.RegressorChain() is what yields multiple predictions
        # at once. Tree models don't need to be fit on lifted data; only ridge
        # does. multi_grad is a GradientBoostingRegressor that does pretty
        # well, though yours might be better.
        ridge = linear_model.Ridge(alpha=2.0, tol=0.0001)
        multi_ridge_chain = multioutput.RegressorChain(base_estimator=ridge,
                                                       order=[0, 1])
        tree = sklearn.tree.DecisionTreeRegressor(max_depth=9,
                                                  min_samples_split=200,
                                                  min_samples_leaf=100)

        Ada_tree = sklearn.tree.DecisionTreeRegressor(max_depth=2,
                                                      min_samples_split=100,
                                                      min_samples_leaf=50)
        AdaBoost = ensemble.AdaBoostRegressor(base_estimator=Ada_tree,
                                              learning_rate=0.02,
                                              n_estimators=60)
        multi_Ada = multioutput.MultiOutputRegressor(estimator=AdaBoost,
                                                     n_jobs=3)

        gradient_boost = ensemble.GradientBoostingRegressor(
            learning_rate=0.065, n_estimators=500)
        multi_grad = multioutput.MultiOutputRegressor(estimator=gradient_boost,
                                                      n_jobs=3)

        self.print_txt()
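For reference, a minimal standalone sketch of the difference between the two wrappers used above (synthetic data; not part of the original class):

from sklearn import datasets, linear_model, multioutput

X, y = datasets.make_regression(n_samples=100, n_targets=2, random_state=0)

# Independent: one Ridge per target, each fitted on X alone.
indep = multioutput.MultiOutputRegressor(linear_model.Ridge()).fit(X, y)

# Chained: the model for target 1 also sees the prediction for target 0.
chain = multioutput.RegressorChain(linear_model.Ridge(), order=[0, 1]).fit(X, y)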
Example #12
def compute_feature_importance(model, dataset, output_path):

    logger.info("Computing feature importance for individual parameters...")
    regr = multioutput.MultiOutputRegressor(model, n_jobs=1)
    regr.fit(dataset.training_x, dataset.training_y)

    fig = plot.feature_importances(forests=[i.rf for i in regr.estimators_] +
                                   [model.rf],
                                   names=dataset.names + ["joint prediction"],
                                   colors=dataset.colors + ["C0"])

    fig.savefig(os.path.join(output_path, "feature_importances.pdf"),
                bbox_inches='tight')
Example #13
	def pipeOptimizer(self):
		print("{} - STARTING PIPE OPTIMIZATION... \n".format(datetime.datetime.now()))

		# if preType == 'minmax':
		# 	chosenPre = pre.MinMaxScaler()
		# else:
		# 	chosenPre = pre.StandardScaler()
		optParams = self.testSVMParams(self.pipe)
		scaler = self.testScaler()

		if self.problem == 'classification':
			method = self.svmMethod(kernel=optParams['clf__kernel'],C=optParams['clf__C'],gamma=optParams['clf__gamma'])
		elif self.problem == 'regression':
			if isinstance(self.classes,pd.DataFrame):	
				method = mo.MultiOutputRegressor(svm.SVR(kernel=optParams['reg__estimator__kernel'],C=optParams['reg__estimator__C'],gamma=optParams['reg__estimator__gamma']))
			else:
				method = svm.SVR(kernel=optParams['reg__kernel'],C=optParams['reg__C'],gamma=optParams['reg__gamma'])

		self.pipe = pline.Pipeline([
			('scaler',scaler[0]),
			('clf',method)
		])
		optParams['scaler'] = scaler[1]
		return optParams
Example #14
def trainData(dir):
    """
    Using all the files in dir, train the data 
    """
    dir = dir + r"/pics/"
    print(dir)
    imgList = glob.glob(dir + r"*.jpg")
    imgList.extend(glob.glob(dir + r"*.jpeg"))
    imgList.extend(glob.glob(dir + r"*.png"))
    imgList.extend(glob.glob(dir + r"*.jfif"))
    dumpingGround = filedialog.askdirectory(
        parent=root, initialdir="/", title='Please select a dumping ground')
    #dumpingGround=r"/home/stu2/s15/dl1683/Courses/img/TrainedData"
    for img in imgList:
        processImg(img, dumpingGround)
        print("Moved imgs")

    for i in range(0, len(trainers)):
        multioutput.MultiOutputRegressor(clfB[i]).fit(x, pixels)
        print("Fitting blue")
        print("Dumping", names[i])
        joblib.dump(clfR[i], dumpingGround + r"/trainRed" + (names[i])[:10])
        joblib.dump(clfB[i], dumpingGround + r"/trainBlue" + (names[i])[:10])
        joblib.dump(clfG[i], dumpingGround + r"/trainGreen" + (names[i])[:10])
Example #15
    parser.add_argument(
        '--svrparams',
        type=str,
        dest='svr_params',
        required=False,
        help='Parameters of Epsilon-Support Vector Regressor constructor')

    args = parser.parse_args()

    print("#### Started %s ####" % os.path.basename(__file__))

    head_train, df_independent_train, df_dependent_train = read_csv_dataset(
        args.train_dataset_filename, args.num_of_dependent_columns)

    svr_kwargs = prepare_kwargs_for_regressor(args)
    model = sklmo.MultiOutputRegressor(sklsvm.SVR(**svr_kwargs))

    start_time = time.time()
    model.fit(df_independent_train, df_dependent_train)
    elapsed_time = time.time() - start_time
    print("Training time:", time.strftime("%H:%M:%S",
                                          time.gmtime(elapsed_time)))

    jl.dump(model, args.model_file)
    print(
        "Generated one-variable function Epsilon-Support Vector Regressor model '%s'"
        % args.model_file)

    print("#### Terminated %s ####" % os.path.basename(__file__))
Example #16
        required=False,
        help='Dump directory (directory to store metric values)')

    parser.add_argument('--xgbparams',
                        type=str,
                        dest='xgb_params',
                        required=False,
                        help='Parameters of XGBoost constructor')

    args = parser.parse_args()

    print("#### Started %s ####" % os.path.basename(__file__))

    head_train, df_independent_train, df_dependent_train = read_csv_dataset(
        args.train_dataset_filename, args.num_of_dependent_columns)

    xgb_kwargs = prepare_kwargs_for_regressor(args)
    model = sklmo.MultiOutputRegressor(xgb.XGBRegressor(**xgb_kwargs))

    start_time = time.time()
    model.fit(df_independent_train, df_dependent_train)
    elapsed_time = time.time() - start_time
    print("Training time:", time.strftime("%H:%M:%S",
                                          time.gmtime(elapsed_time)))

    jl.dump(model, args.model_file)
    print("Generated one-variable function xgboost model '%s'" %
          args.model_file)

    print("#### Terminated %s ####" % os.path.basename(__file__))
Example #17
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.33, random_state=0)

# First, PCA 2-D (which has .transform()) to illustrate and evaluate
lens = decomposition.PCA(n_components=2, random_state=0)
X_lens_train = lens.fit_transform(X_train)
X_lens_test = lens.transform(X_test)

# Normalize the lens within 0-1
scaler = preprocessing.MinMaxScaler()
X_lens_train = scaler.fit_transform(X_lens_train)
X_lens_test = scaler.transform(X_lens_test)

# Fit a model and predict the lens values from the original features
model = XGBRegressor(n_estimators=2000, max_depth=20, learning_rate=0.01)
model = multioutput.MultiOutputRegressor(model)
model.fit(X_train, X_lens_train)
preds = model.predict(X_test)

# Evaluate exhaustively
print("PREDICTION\t\tGROUND TRUTH")
for p, g in zip(preds, X_lens_test):
    print(p, g)
print("MAE", metrics.mean_absolute_error(X_lens_test, preds))

# Now TSNE (which has no .transform()) and a visual evaluation
lens = manifold.TSNE(n_components=2, init='pca', random_state=0)
X_lens_train = lens.fit_transform(X_train)

# Normalize the lens within 0-1
X_lens_train = scaler.fit_transform(X_lens_train)
Example #18
def predict_atlas(fpaths_refspace_train,
                  fpaths_secspace_train,
                  fpaths_refspace_predict,
                  outlier_removal_ref=None,
                  outlier_removal_sec=None,
                  outlier_removal_cov=None,
                  covariates_to_use=None,
                  regressor='MO-SVR',
                  n_jobs=1,
                  save_predictions=False,
                  save_pipeline=False,
                  verbose=False,
                  outlier_options_ref={},
                  outlier_options_sec={},
                  outlier_options_cov={},
                  regressor_options={'kernel': 'rbf'},
                  pipeline_options={
                      'zscore_X': False,
                      'zscore_y': False,
                      'pca_X': False,
                      'pca_y': False,
                      'rezscore_X': False,
                      'rezscore_y': False,
                      'subselect_X': None,
                      'subselect_y': None,
                      'add_covariates': None
                  }):
    """Predict a secondary channel feature space by fitting an atlas regression
    model on paired "secondary channel - reference channel" training data and
    then performing regression on "reference channel"-only test data.

    Input data is retrieved from files specified in lists of file paths and the
    predicted output data is written to the corresponding paths, appropriately
    named and tagged as 'PREDICTED'.

    The channel names for the predicted channels are added to the metadata
    channels index (also tagged as 'PREDICTED') and the full atlas regression
    objects are also added to the metadata.

    Parameters
    ----------
    fpaths_refspace_train : single string or list of strings
        A path or list of paths (either local from cwd or global) to npy files
        containing training feature space data for the reference channel used
        as the basis of prediction (usually the shape space).
    fpaths_secspace_train : single string or list of strings
        A path or list of paths (either local from cwd or global) to npy files
        containing training feature space data for the secondary channel that
        is to be the target of the regression.
    fpaths_refspace_predict : single string or list of strings
        A path or list of paths (either local from cwd or global) to npy files
        containing prediction feature space data for the reference channel
        based on which the target secondary channel will be predicted.
    outlier_removal_ref : string or None, optional, default None
        If None, no outlier removal is done on the reference feature space.
        Otherwise this must be a string denoting the method for outlier removal
        (one of `absolute_thresh`, `percentile_thresh`,
        `merged_percentile_thresh` or `isolation_forest`). Note that outlier
        removal is only done on training data, not on prediction data.
        See katachi.utilities.outlier_removal.RemoveOutliers for more info.
    outlier_removal_sec : string or None, optional, default None 
        If None, no outlier removal is done on the target feature space.
        Otherwise this must be a string denoting the method for outlier removal
        (see outlier_removal_ref above).
    outlier_removal_cov : string or None, optional, default None
        If None, no outlier removal is done based on covariate information.
        Otherwise this must be a string denoting the method for outlier removal
        (see outlier_removal_ref above).
    covariates_to_use : string, list of strings or None, optional, default None
        A string denoting the selection tree to select a covariate to be used
        for outlier detection from the HierarchicalData covariate object. Can
        also be a list of multiple such strings, in which case the covariates
        are merged into an fspace. The specified covariates must each be single
        numeric columns.
    regressor : string or sklearn regressor instance, optional, default 'MO-SVR'
        If a string, must be one of 'MO-SVR', 'MT-ENetCV', 'MT-Lasso', 'MLP'. 
        In the first case a multioutput SVR is used for regression, in the 
        second a Multi-Task Elastic Net with Cross Validation, in the third a 
        Multi-Task Lasso linear regression, and in the fourth a Multi-Layer 
        Perceptron. If an sklearn(-like) regressor instance is passed, it 
        must be a multivariate-multivariable regressor that supports the fit 
        and predict methods.
    n_jobs : int, optional, default 1
        Number of processes available for use during multi-processed model
        fitting and prediction. Works for 'MO-SVR', 'MT-ENetCV' and 'MT-Lasso' 
        regressors.
        WARNING: The 'MLP' regressor also performs multi-processing but does
        not seem to support an n_jobs argument.
    save_predictions : bool, optional, default False
        If True, the predictions are saved in the corresponding paths and the
        metadata is updated.
    save_pipeline : bool, optional, default False
        If True, the atlas pipeline object is saved in the corresponding paths
        as a separate file with the name `<prim_ID>_atlas_pipeline.pkl`.
    verbose : bool, optional, default False
        If True, more information is printed.
    outlier_options_ref : dict, optional, default {}
        kwarg dictionary for the chosen outlier removal method to be applied
        to the reference feature space.
        See katachi.utilities.outlier_removal.RemoveOutliers for more info.
    outlier_options_sec : dict, optional, default {}
        kwarg dictionary for the chosen outlier removal method to be applied
        to the target feature space.
        See katachi.utilities.outlier_removal.RemoveOutliers for more info.
    outlier_options_cov : dict, optional, default {}
        kwarg dictionary for the chosen outlier removal method to be applied
        to the covariates. The default is to fall back to the defaults of
        katachi.utilities.outlier_removal.RemoveOutliers.
    regressor_options : dict, optional, default is a standard RBF MO-SVR
        kwarg dictionary for the chosen regressor's instantiation.
        See the chosen regressor's doc string for more information.
    pipeline_options : dict, optional, default is no additional processing
        kwarg dictionary for AtlasPipeline instantiation.
        See the AtlasPipeline doc string for more information.

    Returns
    -------
    secspace_predict : array of shape (n_predict_samples, n_secspace_features)
        Predicted secondary channel feature space.
    refspace_predict_idx : array of shape (n_predict_samples)
        Index array mapping rows (cells) of secspace_predict to paths (prims)
        in fpaths_refspace_predict.
    atlas_pipeline : predict_atlas.AtlasPipeline instance
        Fitted instance of the regressor pipeline.
    """

    #--------------------------------------------------------------------------

    ### Load data

    if verbose: print "\n# Loading data..."

    # Handle cases of single paths for training data
    if type(fpaths_secspace_train) == str and type(
            fpaths_refspace_train) == str:
        fpaths_secspace_train = [fpaths_secspace_train]
        fpaths_refspace_train = [fpaths_refspace_train]
    elif (type(fpaths_secspace_train) == str
          or type(fpaths_refspace_train) == str
          or len(fpaths_secspace_train) != len(fpaths_refspace_train)):
        raise IOError("Different number of secondary and reference space " +
                      "input file paths specified.")

    # Handle cases of single paths for prediction data
    if type(fpaths_refspace_predict) == str:
        fpaths_refspace_predict = [fpaths_refspace_predict]

    # Load training data
    secspace_train = []
    refspace_train = []
    for secpath, refpath in zip(fpaths_secspace_train, fpaths_refspace_train):
        secspace_train.append(np.load(secpath))
        refspace_train.append(np.load(refpath))
    secspace_train = np.concatenate(secspace_train, axis=0)
    refspace_train = np.concatenate(refspace_train, axis=0)

    # Check that everything is fine
    if not secspace_train.shape[0] == refspace_train.shape[0]:
        raise IOError("Secondary and reference space do not have the same " +
                      "number of cells.")

    # Load prediction data
    refspace_predict = []
    refspace_predict_idx = []
    for idx, refpath in enumerate(fpaths_refspace_predict):
        refspace_predict.append(np.load(refpath))
        refspace_predict_idx.append(
            [idx for v in range(refspace_predict[-1].shape[0])])
    refspace_predict = np.concatenate(refspace_predict, axis=0)
    refspace_predict_idx = np.concatenate(refspace_predict_idx, axis=0)

    # Check that everything is fine
    if not refspace_train.shape[1] == refspace_predict.shape[1]:
        raise IOError("Reference feature spaces for training and prediction " +
                      "do not have the same number of features!")

    # Handle covariate loading
    if outlier_removal_cov is not None:

        # Sanity checks
        if covariates_to_use is None:
            raise IOError(
                "When outlier_removal_cov is not None, covariates " +
                "to use for determining outliers must be specified " +
                "in covariates_to_use!")

        # Handle single covariates
        if type(covariates_to_use) == str:
            covariates_to_use = [covariates_to_use]

        # Load covariates
        covars = []
        for refpath in fpaths_refspace_train:

            # Create covarpath
            revdir, reffile = os.path.split(refpath)
            covpath = os.path.join(revdir, reffile[:10] + '_covariates.pkl')

            # Load covar file
            with open(covpath, 'rb') as covfile:
                covtree = pickle.load(covfile)

            # Get relevant covariates
            covs2use = []
            for c2u in covariates_to_use:
                covs2use.append(np.expand_dims(covtree._gad(c2u), -1))
            covs2use = np.concatenate(covs2use, axis=1)

            # Add to other samples
            covars.append(covs2use)

        # Concatenate
        covars = np.concatenate(covars)

    #--------------------------------------------------------------------------

    ### Prepare regressor

    # Report
    if verbose: print "\n# Preparing regressor..."

    # Multi-Output Support Vector Regression with RBF Kernel
    if regressor == 'MO-SVR':
        svr = svm.SVR(**regressor_options)
        regressor = multioutput.MultiOutputRegressor(svr, n_jobs=n_jobs)

    # Multi-task Elastic Net Regression with Cross Validation
    elif regressor == 'MT-ENetCV':
        regressor = linear_model.MultiTaskElasticNetCV(random_state=42,
                                                       n_jobs=n_jobs)

    # Multivariate-Multivariable Linear Regression by Multi-Task Lasso
    elif regressor == 'MT-Lasso':
        regressor = linear_model.MultiTaskLassoCV(random_state=42,
                                                  n_jobs=n_jobs,
                                                  **regressor_options)

    # Multi-Layer Perceptron Regressor
    elif regressor == 'MLP':
        regressor = neural_network.MLPRegressor(random_state=42,
                                                **regressor_options)

    # Other regressor strings
    elif type(regressor) == str:
        raise ValueError('Regressor not recognized.')

    # Regressor object given as argument
    else:

        # Check if object has fit method
        fit_attr = getattr(regressor, "fit", False)
        if not callable(fit_attr):
            raise ValueError("Regressor object has no 'fit' method.")

        # Check if object has predict method
        predict_attr = getattr(regressor, "predict", False)
        if not callable(predict_attr):
            raise ValueError("Regressor object has no 'predict' method.")

    #--------------------------------------------------------------------------

    ### Remove outliers from training data

    # Find and remove outliers based on covariate values
    if outlier_removal_cov is not None:

        # Report
        if verbose:
            print "\n# Removing outliers based on covariates..."
            print "Started with %i," % refspace_train.shape[0],

        # Find and remove outliers
        orem_cov = RemoveOutliers(outlier_removal_cov, **outlier_options_cov)
        orem_cov.fit(covars)
        covars, (refspace_train, secspace_train) = orem_cov.transform(
            covars, [refspace_train, secspace_train])

        # Report
        if verbose:
            print "removed %i, kept %i samples" % (orem_cov.X_removed_,
                                                   refspace_train.shape[0])

    # Find and remove outliers based on reference space
    if outlier_removal_ref is not None:

        # Report
        if verbose:
            print "\n# Removing reference outliers..."
            print "Started with %i," % refspace_train.shape[0],

        # Find and remove outliers
        orem_ref = RemoveOutliers(outlier_removal_ref, **outlier_options_ref)
        orem_ref.fit(refspace_train)
        refspace_train, secspace_train = orem_ref.transform(
            refspace_train, secspace_train)

        # Report
        if verbose:
            print "removed %i, kept %i samples" % (orem_ref.X_removed_,
                                                   refspace_train.shape[0])

    # Find and remove outliers based on secondary space
    if outlier_removal_sec is not None:

        # Report
        if verbose:
            print "\n# Removing target outliers..."
            print "Started with %i," % refspace_train.shape[0],

        # Find and remove outliers
        orem_sec = RemoveOutliers(outlier_removal_sec, **outlier_options_sec)
        orem_sec.fit(secspace_train)
        secspace_train, refspace_train = orem_sec.transform(
            secspace_train, refspace_train)

        # Report
        if verbose:
            print "removed %i, kept %i samples" % (orem_sec.X_removed_,
                                                   refspace_train.shape[0])

    #--------------------------------------------------------------------------

    ### Fit and predict

    # Construct pipeline
    atlas_pipeline = AtlasPipeline(regressor,
                                   verbose=verbose,
                                   **pipeline_options)

    # Fit
    if verbose: print "\n# Fitting..."
    atlas_pipeline.fit(refspace_train, secspace_train)

    # Predict
    if verbose: print "\n# Predicting..."
    secspace_predict = atlas_pipeline.predict(refspace_predict)

    #--------------------------------------------------------------------------

    ### Update the metadata

    if save_predictions:

        if verbose: print "\n# Saving metadata..."

        # For each path...
        for idx, refpath in enumerate(fpaths_refspace_predict):

            # Load metadata file
            refdir, reffname = os.path.split(refpath)
            prim_ID = reffname[:10]
            metapath = os.path.join(refdir, prim_ID + "_stack_metadata.pkl")
            with open(metapath, "rb") as metafile:
                metadict = pickle.load(metafile)

            # Construct channel designation
            pattern = re.compile("8bit_(.+?(?=_))")
            secpath = fpaths_secspace_train[0]
            channel = re.search(pattern, secpath).group(1) + "_PREDICTED"

            # Add channel to metadata
            if channel not in metadict["channels"]:
                metadict["channels"].append(channel)

            # Save metadata
            with open(metapath, "wb") as outfile:
                pickle.dump(metadict,
                            outfile,
                            protocol=pickle.HIGHEST_PROTOCOL)

    #--------------------------------------------------------------------------

    ### Save fitted atlas pipeline as separate metadata file

    if save_pipeline:

        if verbose: print "\n# Saving pipeline..."

        # For each path...
        for idx, refpath in enumerate(fpaths_refspace_predict):

            # Load atlas metadata file if it exists
            refdir, reffname = os.path.split(refpath)
            prim_ID = reffname[:10]
            atlaspath = os.path.join(refdir, prim_ID + "_atlas_pipeline.pkl")
            if os.path.isfile(atlaspath):
                with open(atlaspath, "rb") as atlasfile:
                    atlasdict = pickle.load(atlasfile)
            else:
                atlasdict = {}

            # Construct designation
            pattern = re.compile("8bit_(.+?(?=\.))")
            secpath = fpaths_secspace_train[0]
            atlasname = re.search(pattern, secpath).group(1) + "_ATLASPIP"

            # Add pipeline to dict
            atlasdict[atlasname] = atlas_pipeline

            # Save atlas dict
            with open(atlaspath, "wb") as outfile:
                pickle.dump(atlasdict,
                            outfile,
                            protocol=pickle.HIGHEST_PROTOCOL)

    #--------------------------------------------------------------------------

    ### Save the predictions

    if save_predictions:

        if verbose: print "\n# Saving predictions..."

        # For each path...
        for idx, refpath in enumerate(fpaths_refspace_predict):

            # Construct outpath
            to_replace = refpath[refpath.index("8bit_") + 5:]
            secpath = fpaths_secspace_train[0]
            replace_by = secpath[secpath.index("8bit_") + 5:]
            replace_by = replace_by[:-4] + "_PREDICTED.npy"
            outpath = refpath.replace(to_replace, replace_by)

            # Write file
            np.save(outpath, secspace_predict[refspace_predict_idx == idx])

    #--------------------------------------------------------------------------

    ### Return results

    # Report
    if verbose: print "\nDone!"

    # Return
    return secspace_predict, refspace_predict_idx, atlas_pipeline
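A hedged invocation sketch; the .npy file names and regressor options below are placeholders, not paths from the original project:

secspace, predict_idx, pipeline = predict_atlas(
    fpaths_refspace_train=['prim_01_8bit_shape.npy'],
    fpaths_secspace_train=['prim_01_8bit_marker.npy'],
    fpaths_refspace_predict=['prim_02_8bit_shape.npy'],
    regressor='MO-SVR',
    regressor_options={'kernel': 'rbf'},
    n_jobs=4,
    verbose=True)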
Example #19
def model(self, **kwargs):
    C = kwargs.get('C', 100)
    gamma = kwargs.get('gamma', 0.01)
    kernel = kwargs.get('kernel', 'rbf')
    return multioutput.MultiOutputRegressor(
        svm.SVR(C=C, gamma=gamma, kernel=kernel))
Example #20
    images, parameters = pickle.load(g)

parameters_regressor = []
for params in parameters:
    (x, y), (MA, ma), angle = params
    parameters_regressor.append([x, y, MA, ma, angle])
X_train, X_test, y_train, y_test = train_test_split(images,
                                                    parameters_regressor,
                                                    test_size=0.3,
                                                    random_state=42)

n_estimator = 300

# clf = GradientBoostingRegressor(loss='ls', learning_rate=0.1, n_estimators=n_estimator, verbose=2, warm_start=False)
clf = AdaBoostRegressor(random_state=0, n_estimators=n_estimator)
clf = multioutput.MultiOutputRegressor(clf, n_jobs=None)
clf.fit(X_train, y_train)

filename = 'regression_seed42_adaboost' + str(n_estimator) + '_estimators.sav'
pickle.dump(clf, open(filename, 'wb'))

clf = pickle.load(open(filename, 'rb'))

y_predict = clf.predict(X_test)
iterator = np.arange(0, len(y_predict), 1)

Xdif = 0
Ydif = 0
MAdif = 0
madif = 0
angledif = 0
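The snippet breaks off here; a sketch of the per-parameter error accumulation that presumably follows (an assumption, not the original code):

for i in iterator:
    Xdif += abs(y_predict[i][0] - y_test[i][0])
    Ydif += abs(y_predict[i][1] - y_test[i][1])
    MAdif += abs(y_predict[i][2] - y_test[i][2])
    madif += abs(y_predict[i][3] - y_test[i][3])
    angledif += abs(y_predict[i][4] - y_test[i][4])
print('mean |dx|:', Xdif / len(y_predict))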
Example #21
    ensemble.GradientBoostingRegressor(),
    ensemble.ExtraTreesRegressor(),

    #     Nearest Neighbor

    #     svm
    svm.LinearSVR(),
    #     svm.SVR(),
    #     svm.NuSVR(),

    # tree
    tree.DecisionTreeRegressor(),
]

for alg in MLA:
    alg_name = alg.__class__.__name__
    print("starting:", alg_name)
    clf = multioutput.MultiOutputRegressor(alg)
    clf.fit(train_X, train_y)

    print("\ttrainig score:", clf.score(train_X, train_y))
    print("\ttesting score:", clf.score(test_X, test_y))
    print(
        "\ttrainig mse:",
        metrics.mean_squared_error(y_true=train_y,
                                   y_pred=clf.predict(train_X)))
    print(
        "\ttesting mse:",
        metrics.mean_squared_error(y_true=test_y, y_pred=clf.predict(test_X)))
    print('-' * 50)
Example #22
expected = pd.read_csv('training.csv')
expected = pd.merge(expected,
                    pred1,
                    left_on='image_name',
                    right_on='image_name',
                    how='right')
expected = expected.drop(
    columns=['image_name', 'x1_y', 'x2_y', 'y1_y', 'y2_y'])

X = X.values
Y = expected.values

xgb_model = xgb.XGBRegressor(silent=False)

ensemble_model = multioutput.MultiOutputRegressor(estimator=xgb_model)
ensemble_model.fit(X, Y)

dump(ensemble_model, 'ensemble_model.joblib.dat')

pred1 = pd.read_csv('data/prediction.csv')
pred2 = pd.read_csv('data/rfcn_resnet101_final_test.csv')
Jah = pred1
XXX = pred2

X = pd.merge(pred1,
             pred2,
             left_on='image_name',
             right_on='image_name',
             how='left')
names = X['image_name']
Example #23
def train(
    data: tuple[np.ndarray, np.ndarray],
    model="BayesianRidge",
    n_estimators=100,
    alpha=0.0001,
    alpha_1=1.0e-6,
    alpha_2=1.0e-6,
    lambda_1=1.0e-6,
    lambda_2=1.0e-6,
    n_iter=300,
    epsilon=1.35,
    alphas=[0.1, 0.5, 1],
    gcv_mode="auto",
    solver="auto",
    n_hidden=20,
    rbf_width=0,
    activation_func="selu"
    #  load_trained_model=0, update_trained_model=1, save_model=1, saved_model_path_string='stored_models',
) -> Any:
    """Sklearn model. Models as input parameter. Can be linear, ridge, Huber or much more.
    It also contain extreme learning machine model from sklearn extensions.

    Note:
        There are many parameters in this function, but each model uses just a few of them.
        Usually the default parameters are enough.

        Some of the models are regressors and some are classifiers. If a classifier is used,
        it is best to have the data binned into a limited number of values.

    Args:
        data (tuple[np.ndarray, np.ndarray]): Tuple (X, y) of input train vectors X and train outputs y.
            Pass input with no constant column; it is added by default in sklearn.
            Check `mydatapreprocessing` for how to generate such output.
        model ((str, object), optional): Model that will be used. You can pass the model itself or
            just the name of the class. All possible options are listed below. Defaults to 'BayesianRidge'.
        n_estimators (int, optional): Parameter of some model. Defaults to 100.
        alpha (float, optional): Parameter of some model. Defaults to 0.0001.
        alpha_1 (float, optional): Parameter of some model. Defaults to 1.e-6.
        alpha_2 (float, optional): Parameter of some model. Defaults to 1.e-6.
        lambda_1 (float, optional): Parameter of some model. Defaults to 1.e-6.
        lambda_2 (float, optional): Parameter of some model. Defaults to 1.e-6.
        n_iter (int, optional): Parameter of some model. Defaults to 300.
        epsilon (float, optional): Parameter of some model. Defaults to 1.35.
        alphas (list, optional): Parameter of some model. Defaults to [0.1, 0.5, 1].
        gcv_mode (str, optional): Parameter of some model. Defaults to 'auto'.
        solver (str, optional): Parameter of some model. Defaults to 'auto'.
        n_hidden (int, optional): Parameter of some model. Defaults to 20.
        rbf_width (int, optional): Parameter of some model. Defaults to 0.
        activation_func (str, optional): Parameter of some model. Defaults to 'selu'.

    Returns:
        Any: The fitted model (possibly wrapped in MultiOutputRegressor or MultiOutputClassifier).

    Options if string::

        ['PLSRegression', 'RandomForestRegressor', 'ExtraTreesRegressor', 'BaggingRegressor',
        'GradientBoostingRegressor', 'AdaBoostRegressor', 'VotingRegressor', 'StackingRegressor',
        'RandomForestClassifier', 'ExtraTreesClassifier', 'BaggingClassifier', 'GradientBoostingClassifier',
        'AdaBoostClassifier', 'VotingClassifier', 'StackingClassifier', 'GaussianProcessRegressor',
        'GaussianProcessClassifier', 'IsotonicRegression', 'ARDRegression', 'HuberRegressor', 'LinearRegression',
        'LogisticRegression', 'LogisticRegressionCV', 'PassiveAggressiveRegressor', 'SGDRegressor',
        'TheilSenRegressor', 'RANSACRegressor', 'PoissonRegressor', 'GammaRegressor', 'TweedieRegressor',
        'PassiveAggressiveClassifier', 'RidgeClassifier', 'RidgeClassifierCV', 'SGDClassifier', 'OneVsRestClassifier',
        'OneVsOneClassifier', 'OutputCodeClassifier', 'MultiOutputRegressor', 'RegressorChain',
        'MultiOutputClassifier', 'ClassifierChain', 'KNeighborsRegressor', 'RadiusNeighborsRegressor',
        'KNeighborsClassifier', 'RadiusNeighborsClassifier', 'MLPRegressor', 'MLPClassifier',
        'SelfTrainingClassifier', 'DecisionTreeRegressor', 'ExtraTreeRegressor', 'DecisionTreeClassifier',
        'ExtraTreeClassifier', 'TransformedTargetRegressor', 'BayesianRidge', 'ElasticNet', 'Hinge', 'Lars', 'LarsCV',
        'Lasso', 'LassoCV', 'LassoLarsIC', 'Log', 'ModifiedHuber', 'MultiTaskElasticNet', 'MultiTaskLasso',
        'MultiTaskLassoCV', 'OrthogonalMatchingPursuit', 'OrthogonalMatchingPursuitCV', 'Perceptron', 'Ridge',
        'RidgeCV', 'SquaredLoss', 'SVR',
        # Sklearn extensions
        'ELMClassifier', 'ELMRegressor', 'GenELMClassifier', 'GenELMRegressor']
    """
    from sklearn import (
        multioutput,
        linear_model,
        ensemble,
        tree,
        neighbors,
        gaussian_process,
    )

    X, y = get_inputs(data)

    # If string like 'LinearRegression', find class with such a name
    if isinstance(model, str):

        for i in [linear_model, ensemble, tree, neighbors, gaussian_process]:
            if model in i.__all__:
                model = getattr(i, model)
                break

        # If model is still a string, it was not found in sklearn;
        # it may come from the sklearn-extensions library
        if isinstance(model, str):

            import sklearn_extensions.extreme_learning_machines.elm as elm

            model = getattr(elm, model)

            # Model defined by string not found
            if isinstance(model, str):

                raise AttributeError(
                    mylogging.return_str(
                        "You defined a model that was not found in sklearn. You can use not only a string, "
                        "but also an object or the class itself. You can use the function `get_all_models` "
                        "to get a list of all possible models and then use one of them."))

    # If class, but no object was configured, create instance
    if callable(model):
        model = model()

    params = {
        "n_estimators": n_estimators,
        "alpha": alpha,
        "alpha_1": alpha_1,
        "alpha_2": alpha_2,
        "lambda_1": lambda_1,
        "lambda_2": lambda_2,
        "n_iter": n_iter,
        "epsilon": epsilon,
        "alphas": alphas,
        "gcv_mode": gcv_mode,
        "solver": solver,
        "n_hidden": n_hidden,
        "rbf_width": rbf_width,
        "activation_func": activation_func,
    }

    # Params, that are configured in function params as well as configurable in models
    used_params = {
        i: j
        for (i, j) in params.items() if i in model.get_params()
    }

    model.set_params(**used_params)

    if y.shape[1] == 1:
        setattr(model, "output_shape", "one_step")

        y = y.ravel()

    else:
        if model._estimator_type == "regressor":
            model = multioutput.MultiOutputRegressor(model)
        elif model._estimator_type == "classifier":
            model = multioutput.MultiOutputClassifier(model)

        setattr(model, "output_shape", "multi_step")

    model.fit(X, y)

    return model
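An illustrative call, under the assumption that get_inputs simply unpacks the (X, y) tuple (synthetic data with two targets, so the model is wrapped in MultiOutputRegressor):

import numpy as np

rng = np.random.default_rng(0)
X, y = rng.random((100, 5)), rng.random((100, 2))
fitted = train((X, y), model="BayesianRidge")
print(fitted.predict(X[:3]).shape)  # (3, 2)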
Example #24
stocks_lag = np.vstack(stocks_lag5)
stocks_lag_label = np.vstack(stocks_lag5_label).astype(np.float64)


stocks_lag5_train = stocks_lag[:-20]
stocks_lag5_label_train = stocks_lag_label[:-20]

test = stocks_lag[-20:]
test_label = stocks_lag_label[-20:]


import xgboost as xgb
import sklearn.multioutput as sk
import matplotlib.pyplot as plt

basemodel = xgb.XGBRegressor(n_estimators=20, max_depth=3, learning_rate=0.1)
model_xgb = sk.MultiOutputRegressor(basemodel)

m = model_xgb.fit(stocks_lag5_train, stocks_lag5_label_train)
p = m.predict(stocks_lag5_train)
p_around = np.around(p)
acc = np.mean(np.equal(p_around, stocks_lag5_label_train))
mse = ((p_around - stocks_lag5_label_train)**2).mean()

p_test = m.predict(test)
p_around_test = np.around(p_test)
acc_test = np.mean(np.equal(p_around_test, test_label))
mse_test = ((p_around_test - test_label)**2).mean()

plt.plot(test_label[:,0])
plt.plot(p_test[:,0])
Example #25
def model(self, **kwargs):
    return multioutput.MultiOutputRegressor(
        linear_model.LinearRegression())
Example #26
from preprocessing import PreProccessor as pp
from preprocessing import metrics_measurer as mm

df = pd.read_csv(config.DATA['output_path'] +
                 '/br/audio-economic-features.csv',
                 index_col=[0])
df.dropna(inplace=True)

classes = df[[
    'acousticness', 'danceability', 'duration_ms', 'energy',
    'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'speechiness',
    'tempo', 'time_signature', 'valence'
]]

pre = pp.PreProccessor(df, classes, 'regression')
pre.cutFeatures([
    'id', 'date', 'duration_ms', 'acousticness', 'danceability', 'duration_ms',
    'energy', 'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
    'speechiness', 'tempo', 'time_signature', 'valence'
])

pipe = pline.Pipeline([
    ('scaler', skpre.MinMaxScaler()),
    ('clf', mo.MultiOutputRegressor(svm.SVR(kernel='rbf', C=0.1, gamma=10)))
])

measurer = mm.Measures(df, classes, 10)

print("{} - R^2 Index - \n {}".format(datetime.datetime.now(),
                                      measurer.r2(pipe)))
measurer.regressionReport(pipe)