def test_lin_reg_sklearn_coherence():
    """Verify that the creme linear regression matches sklearn's SGDRegressor.

    Both models are trained online, one sample at a time, on the same
    standardized stream with a constant learning rate of 0.01 and no
    regularization on the sklearn side. Afterwards every weight and the
    intercept must agree.
    """

    class SquaredLoss:
        """Squared loss without the leading 2 in its gradient, as in sklearn."""

        def gradient(self, y_true, y_pred):
            return y_pred - y_true

    scaler = preprocessing.StandardScaler()
    creme_model = lm.LinearRegression(optimizer=optim.SGD(.01), loss=SquaredLoss())
    sklearn_model = sklm.SGDRegressor(learning_rate='constant', eta0=.01, alpha=.0)

    for features, target in datasets.TrumpApproval():
        # Update the running statistics first, then scale the same sample.
        scaled = scaler.fit_one(features).transform_one(features)
        creme_model.fit_one(scaled, target)
        sklearn_model.partial_fit([list(scaled.values())], [target])

    for position, weight in enumerate(creme_model.weights.values()):
        assert math.isclose(weight, sklearn_model.coef_[position])

    assert math.isclose(creme_model.intercept, sklearn_model.intercept_[0])
def lgdModel(server=1, intercept=None, coef=None):
    """
    Run a single-iteration SGD linear regression on one server's local data.

    The data is read from ``./server_dirs/<server>/regression_data.csv`` and
    column ``Y`` is regressed on column ``X``. When both ``intercept`` and
    ``coef`` are given they are used as the starting point of the fit;
    otherwise the model is fitted from scratch.

    :param server: the id of the server
    :type server: integer
    :param intercept: an intercept parameter to start from
    :type intercept: float
    :param coef: a coefficient to start from
    :type coef: float
    :return: dict with keys ``intercept`` and ``coefficient``
    """
    regressor = linear_model.SGDRegressor(tol=None, max_iter=1, verbose=0,
                                          warm_start=False, early_stopping=False)

    # Load data from local storage
    # TODO remove hardwiring
    data_dir = './server_dirs/' + str(server) + '/'
    frame = pd.read_csv(data_dir + 'regression_data.csv')

    # Explanatory variable (kept two-dimensional) and target variable
    features, target = frame[['X']], frame['Y']

    # Only continue from previous estimates when both of them are supplied
    init_kwargs = {}
    if intercept is not None and coef is not None:
        init_kwargs = {'intercept_init': intercept, 'coef_init': coef}
    regressor.fit(features, target, **init_kwargs)

    return {
        'intercept': regressor.intercept_[0],
        'coefficient': regressor.coef_[0],
    }
def __initialize_model(model_name, lamda=0, hyper_parameters=None):
    """
    initialize machine learning model.

    Args:
        model_name: learning algorithm name (one of the
            ``constants.MODEL_NAME_*`` values)
        lamda: coefficient of the regularization term; passed as ``alpha``
            to the estimators that accept it
        hyper_parameters: optional dict of extra keyword arguments, only used
            for the random forest regressor. See the parameters of
            RandomForestRegressor in scikit-learn. ``None`` means no extras.

    Returns:
        an initialized (unfitted) estimator, or None when ``model_name`` is
        not one of the recognized names
    """
    # A fresh dict per call: a ``{}`` default would be shared by all calls.
    if hyper_parameters is None:
        hyper_parameters = {}

    if model_name == constants.MODEL_NAME_LASSO:
        # note: alpha in scikit-learn represents lamda which is the constant
        # that multiplies the regularization term
        clf_lasso = linear_model.Lasso(alpha=lamda)
        return clf_lasso
    elif model_name == constants.MODEL_NAME_ELASTICNET:
        clf_elasticnet = ElasticNet(alpha=lamda)
        return clf_elasticnet
    elif model_name == constants.MODEL_NAME_RIDGE:
        clf_ridge = linear_model.Ridge(alpha=lamda)
        return clf_ridge
    elif model_name == constants.MODEL_NAME_RIDGECV:
        # RidgeCV selects its own alpha from the configured candidates
        clf_ridgecv = linear_model.RidgeCV(alphas=constants.lamdaArray)
        return clf_ridgecv
    elif model_name == constants.MODEL_NAME_LARS:
        clf_lars = linear_model.Lars(n_nonzero_coefs=1)
        return clf_lars
    elif model_name == constants.MODEL_NAME_BAYESIAN:
        clf_bayesian = linear_model.BayesianRidge()
        return clf_bayesian
    elif model_name == constants.MODEL_NAME_SGD:
        clf_sgd = linear_model.SGDRegressor(alpha=lamda)
        return clf_sgd
    elif model_name == constants.MODEL_NAME_RANDOM_FOREST:
        clf_random_forest = RandomForestRegressor(**hyper_parameters,
                                                  random_state=0,
                                                  n_jobs=-1)
        return clf_random_forest
    # Unknown names keep yielding None, as callers may rely on it.
    return None
def main():
    """Fit an SGD linear model on hand-built features of ``./data/train.csv``.

    The input columns (everything except ``Id`` and ``y``) are extended with
    their squares, exponentials and cosines plus a constant column. The model
    coefficients and training errors are printed, and the coefficients are
    written to ``./data/result.csv``.
    """
    # Read the data from the train.csv file (into a pandas data object)
    data = pd.read_csv('./data/train.csv')
    yData = data["y"]
    # ``axis`` is given by keyword: pandas 2.x no longer accepts it positionally.
    xData = data.drop(['Id', 'y'], axis=1)

    # Transform the data according to the model
    print(f'The input data looks like: {data.head()}\n')
    # Captured before new columns are added, so it names the raw inputs only.
    headers = xData.columns
    xData[['x6', 'x7', 'x8', 'x9', 'x10']] = data[headers].applymap(lambda x: x * x)
    xData[['x11', 'x12', 'x13', 'x14', 'x15']] = data[headers].applymap(math.exp)
    xData[['x16', 'x17', 'x18', 'x19', 'x20']] = data[headers].applymap(math.cos)
    xData['x21'] = 1
    print(f'The feature transformed data looks like: {xData.head()}')

    # Fit and predict on the same array type, so the estimator never sees a
    # mix of named (DataFrame) and unnamed (ndarray) features.
    features = xData.to_numpy()
    targets = yData.to_numpy()
    clf = linear_model.SGDRegressor()
    clf.fit(features, targets)
    predict = clf.predict(features)

    mse = mean_squared_error(yData, predict)
    print(f'Linear Coefficients: {clf.coef_}\n')
    print(f'Mean squared error: {mse}\n')
    print(
        f'Root mean squared error: {math.sqrt(mse)}\n'
    )

    # Write the CSV result
    pd.DataFrame(clf.coef_).to_csv('./data/result.csv', header=False, index=False)
    return
def models_evaluation(self):
    """Compute and print the SMVI for the stock and the base, and store it.

    One regressor from a fixed candidate list is fitted twice: once on the
    tweet-count / stock-volume data and once on the tweet-count / base-volume
    data, both obtained from ``self.get_model_data``. The absolute difference
    of the two resulting values is stored in ``self.SMVI``.
    """
    # Allows for easy selection for SMVI testing
    candidates = [
        svm.SVR(),
        linear_model.SGDRegressor(),
        linear_model.BayesianRidge(),
        linear_model.LassoLars(),
        linear_model.ARDRegression(),
        linear_model.PassiveAggressiveRegressor(),
        linear_model.TheilSenRegressor(),
        linear_model.LinearRegression()
    ]
    selected = candidates[2]
    horizon = 10000
    frame = self.joint_data_frame

    stock_train_x, stock_train_y, stock_predict_x = self.get_model_data(
        horizon,
        frame['# of Tweets'].tolist(),
        frame['Stock Volume'].tolist())
    base_train_x, base_train_y, base_predict_x = self.get_model_data(
        horizon,
        frame['# of Tweets'].tolist(),
        frame['Base Volume'].tolist())

    # The same estimator object is refitted; each prediction is taken right
    # after its own fit, so the two runs do not interfere.
    stock_prediction = selected.fit(stock_train_x, stock_train_y).predict(stock_predict_x)
    base_prediction = selected.fit(base_train_x, base_train_y).predict(base_predict_x)

    stock_smvi = (sum(stock_prediction) / horizon) / len(stock_train_x)
    base_smvi = (sum(base_prediction) / horizon) / len(base_train_x)

    os.system('clear')
    print('Stock SMVI: ', stock_smvi)
    print('Base SMVI: ', base_smvi)

    # Using the difference between the SMVI for the stock and the base allows
    # us to remove the possibility of a market crash
    self.SMVI = abs(abs(stock_smvi) - abs(base_smvi))
    print('Real SMVI (Unscaled): ', self.SMVI)
def __init__(self, df, run_prefix, max_iter, cv_count):
    """Prepare the tuning data and the list of candidate algorithms.

    ``df`` must provide a ``PHENO`` column (the target) and an ``ID`` column;
    the remaining columns become the feature matrix. The name of the best
    algorithm is read from the first cell of
    ``<run_prefix>.best_algorithm.txt``.
    """
    self.run_prefix = run_prefix
    self.max_iter = max_iter
    self.cv_count = cv_count

    # Target, features and sample IDs
    self.y_tune = df.PHENO
    features_with_id = df.drop(columns=['PHENO'])
    self.X_tune = features_with_id.drop(columns=['ID'])
    self.IDs_tune = features_with_id.ID

    # Name of the algorithm chosen by an earlier run
    best_algo_table = pd.read_csv(run_prefix + '.best_algorithm.txt',
                                  header=None, index_col=False)
    self.best_algo = str(best_algo_table.iloc[0, 0])

    self.algorithms = [
        linear_model.LinearRegression(),
        ensemble.RandomForestRegressor(),
        ensemble.AdaBoostRegressor(),
        ensemble.GradientBoostingRegressor(),
        linear_model.SGDRegressor(),
        svm.SVR(),
        neural_network.MLPRegressor(),
        neighbors.KNeighborsRegressor(),
        ensemble.BaggingRegressor(),
        xgboost.XGBRegressor()
    ]

    # Initialize a few variables we will be using later
    for attribute in ('log_table', 'best_algo_name_in', 'best_algo_df',
                      'hyperparameters', 'scoring_metric', 'cv_tuned',
                      'cv_baseline', 'algo', 'searchCVResults', 'rand_search',
                      'algo_tuned', 'tune_out'):
        setattr(self, attribute, None)
def execute(self):
    """Select features by recursive elimination with cross-validation.

    An ``SGDRegressor`` is used as the estimator, one feature is removed per
    step, and candidates are scored with the negative mean squared error on
    ``self.partitions.x_train`` / ``self.partitions.y_train``.

    Returns:
        tuple ``(n_features, best_features, feature_ranking)``, which are
        also stored on the instance:
        ``n_features`` is the number of selected features,
        ``best_features`` is the boolean selection mask, and
        ``feature_ranking`` ranks features from best (1) to worst.
    """
    # create model
    model = linear_model.SGDRegressor()

    # recursively eliminate features
    rfecv = RFECV(estimator=model, step=1, scoring="neg_mean_squared_error")
    rfecv.fit(self.partitions.x_train, self.partitions.y_train)
    # No transform() call here: it only returns a reduced copy of its input,
    # and nothing below needs one.

    # number of best features
    self.n_features = rfecv.n_features_

    # which categories are best
    self.best_features = rfecv.support_

    # rank features best (1) to worst
    self.feature_ranking = rfecv.ranking_

    return self.n_features, self.best_features, self.feature_ranking
def UnivariateStochasticTool():
    """Fit scikit-learn's SGD regressor on the GDP attribute and report it.

    Reads the module-level ``trainGdp`` / ``trainOutputs`` for training and
    ``testGdp`` / ``testOutputs`` for evaluation, prints the learnt model and
    both error measures, and draws the three plots.
    """
    regressor = linear_model.SGDRegressor(alpha=0.01, max_iter=1000)
    # The estimator expects a 2-D input: one single-feature row per sample.
    xx = [[el] for el in trainGdp]
    # fit() rather than partial_fit(): partial_fit performs a single epoch,
    # which would leave max_iter=1000 without any effect.
    regressor.fit(xx, trainOutputs)
    w0, w1 = regressor.intercept_[0], regressor.coef_[0]
    w = [w0, w1]
    print("-----with tool-----")
    print("Regression for attribute: GDP")
    print("\tThe learnt model: f(X,w) = " + str(w0) + " + " + str(w1) + " * X")

    computedOutputs = regressor.predict([[x] for x in testGdp])
    print("\tPrediction error (tool): ",
          str(mean_squared_error(testOutputs, computedOutputs)))
    print("\tPrediction error (manual): ",
          str(meanSquareError(testOutputs, computedOutputs)))

    plotDataForUni(gdpData, outputs, w, "Train & test data")
    plotDataForUni(trainGdp, trainOutputs, w, "Train data and the learnt model")
    plotData2ForUni(testGdp, testOutputs, computedOutputs,
                    "Computed vs real test data")
def choose_model(self, X, y):
    """
    Automatic model chooser.

    Every candidate model is fitted on 10 stratified shuffle splits (25% of
    the samples held out each time) and scored with the mean squared error of
    its predictions on the held-out part. The candidate with the lowest error
    averaged over all splits is stored in ``self._model``; it is left as
    fitted on the training part of the last split.

    :param X: data
    :param y: target
    :type X: ndarray or scipy.sparse matrix, (n_samples, n_features)
    :type y: ndarray, shape (n_samples,) or (n_samples, n_targets)
    """
    #{'linear', 'polynomial',logistic','logisticcv','elasticnet','elasticnetcv','orthogonal','orthogonalcv','theil','sgd','perceptron','passive_aggressive'}
    models = {
        'linear': linear_model.LinearRegression(),
        'logistic': linear_model.LogisticRegression(),
        'elasticnet': linear_model.ElasticNet(),
        'orthogonal': linear_model.OrthogonalMatchingPursuit(),
        'theil': linear_model.TheilSenRegressor(),
        'sgd': linear_model.SGDRegressor(),
        'passive_agressive': linear_model.PassiveAggressiveRegressor()
    }
    scores = {name: [] for name in models}

    # Keyword arguments: test_size cannot be passed positionally in recent
    # scikit-learn releases.
    # NOTE(review): the splitter stratifies on y, so y presumably has to hold
    # class-like values -- confirm this is intended for continuous targets.
    sss = StratifiedShuffleSplit(n_splits=10, test_size=0.25)
    for train_index, test_index in sss.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        for name, model in models.items():
            model.fit(X_train, y_train)
            # Score the predictions against the held-out targets.
            scores[name].append(
                metrics.mean_squared_error(y_test, model.predict(X_test)))

    #Choose http://blog.minitab.com/blog/adventures-in-statistics-2/how-to-choose-the-best-regression-model
    best_name = min(scores,
                    key=lambda name: sum(scores[name]) / len(scores[name]))
    self._model = models[best_name]
def SGD():
    """Train an SGDRegressor one sample at a time and evaluate it.

    The data comes from ``pre_process()``. Test predictions are mapped back
    to the original scale with ``config.column1_min`` / ``config.column1_max``
    before they are scored and plotted against the unscaled targets.
    """
    train_X, train_y, test_X, test_y, nonescaled_y = pre_process()

    regressor = linear_model.SGDRegressor()
    # Online learning: feed single-sample slices in their original order.
    for start in range(len(train_X)):
        stop = start + 1
        regressor.partial_fit(train_X[start:stop], train_y[start:stop])

    scaled_predictions = regressor.predict(test_X)

    # Undo the min-max scaling of the target column.
    value_range = config.column1_max - config.column1_min
    pred_vals = np.asarray([
        (scaled * value_range) + config.column1_min
        for scaled in scaled_predictions
    ])

    get_scores("---------SGDRegressor----------", pred_vals, nonescaled_y)
    plot(nonescaled_y, pred_vals, "SGDRegressor Prediction Vs Truth.png")
def fit_model(X, y):
    """ Grid-searches a decision tree regressor on the input data [X, y].

        The search covers 'max_depth' and 'min_samples_split', uses 10
        shuffle-split rounds with 20% of the data held out, and scores
        candidates with 'performance_metric'. Returns the best estimator. """

    # Cross-validation sets drawn from the training data
    splitter = ShuffleSplit(X.shape[0], n_iter = 10, test_size = 0.20, random_state = 0)

    # Candidate estimators; only the decision tree is searched below
    tree_regressor = DecisionTreeRegressor()
    sgd_regressor = linear_model.SGDRegressor()

    # Parameter grids, one per candidate estimator
    tree_grid = {
        'max_depth' : [3, 6, 9, 20, 100],
        'min_samples_split':[2, 3, 4, 5],
    }
    sgd_grid = {
        'loss':['squared_loss', 'huber'],
        'penalty': ['none', 'l2', 'l1', 'elasticnet'],
        'n_iter':[10, 75, 100, 500],
    }

    # Wrap 'performance_metric' so the grid search can use it for scoring
    scorer = make_scorer(performance_metric)

    # Search the tree grid and fit on the full data to find the optimal model
    search = GridSearchCV(tree_regressor, tree_grid, scoring = scorer, cv = splitter)
    fitted_search = search.fit(X, y)

    # The SGD alternative (sgd_regressor with sgd_grid) is currently not run.

    # Return the optimal model after fitting the data
    return fitted_search.best_estimator_
def load_default(self, machine_list='basic'):
    """
    Fit a default set of scikit-learn regressors on ``X_k_`` / ``y_k_``.

    The fitted machines are stored in ``self.estimators_`` under their names.
    A machine whose construction or fit raises ``ValueError`` is left out,
    and names that are not recognized are ignored.

    Parameters
    ----------
    machine_list: optional, list of strings
        List of default machine names to be loaded. The strings ``'basic'``
        (4 machines, the default) and ``'advanced'`` (7 machines) select
        predefined lists.

    Returns
    -------
    self : returns an instance of self.
    """
    if machine_list == 'basic':
        machine_list = ['tree', 'ridge', 'random_forest', 'svm']
    elif machine_list == 'advanced':
        machine_list = ['lasso', 'tree', 'ridge', 'random_forest', 'svm',
                        'bayesian_ridge', 'sgd']

    # Factories are called lazily so only the requested machines are built.
    factories = {
        'lasso': lambda: linear_model.LassoCV(random_state=self.random_state),
        'tree': lambda: DecisionTreeRegressor(random_state=self.random_state),
        'ridge': lambda: linear_model.RidgeCV(),
        'random_forest': lambda: RandomForestRegressor(random_state=self.random_state),
        'svm': lambda: LinearSVR(random_state=self.random_state),
        'sgd': lambda: linear_model.SGDRegressor(random_state=self.random_state),
        'bayesian_ridge': lambda: linear_model.BayesianRidge(),
    }

    self.estimators_ = {}
    for machine in machine_list:
        factory = factories.get(machine)
        if factory is None:
            continue
        try:
            self.estimators_[machine] = factory().fit(self.X_k_, self.y_k_)
        except ValueError:
            continue
    return self
def get_model(model_type, c=0, epsilon=0, gamma=0):
    """Return a new, unfitted regressor for ``model_type``.

    ``c`` and ``epsilon`` are used by the SVR variants, ``gamma`` only by the
    RBF kernel, and ``c`` additionally sets the kernel ridge regularization
    (``alpha = 1 / (2 * c)``). The remaining models use their defaults.

    Raises:
        ValueError: if ``model_type`` is not one of the known types.
    """
    if model_type == RBF:
        return svm.SVR(kernel='rbf', C=c, epsilon=epsilon, gamma=gamma)

    # Polynomial kernels differ only in their degree.
    for poly_type, degree in ((POLY2, 2), (POLY3, 3), (POLY4, 4)):
        if model_type == poly_type:
            return svm.SVR(kernel='poly', C=c, degree=degree, epsilon=epsilon)

    if model_type == LIN:
        return svm.SVR(kernel='linear', C=c, epsilon=epsilon)
    if model_type == Rand_F:
        return ensemble.RandomForestRegressor()
    if model_type == SGD:
        return linear_model.SGDRegressor()
    if model_type == KRR:
        return kernel_ridge.KernelRidge(kernel='linear', alpha=1/(2*c))
    if model_type == DT:
        return DecisionTreeRegressor()

    raise ValueError('unknown model type: ' + str(model_type))
def compute_params_SGDR(diamonds, prices, validation, validation_prices, _it, _lr):
    """Fit an SGD regressor and print its quality on a validation set.

    Args:
        diamonds: training features, convertible to a 2-D float array.
        prices: training targets, one per row of ``diamonds``.
        validation: validation features, same layout as ``diamonds``.
        validation_prices: validation targets.
        _it: maximum number of iterations (``max_iter``).
        _lr: initial learning rate (``eta0``).

    Returns:
        The fitted ``SGDRegressor``.
    """
    np_X = numpy.array(diamonds, dtype=float)
    np_X_validation = numpy.array(validation, dtype=float)

    # The targets are used as given. No transpose() is applied: its result
    # was never kept, and the call fails on plain lists.
    np_Y = prices
    np_Y_validation = validation_prices

    regr = linear_model.SGDRegressor(max_iter=_it, eta0=_lr)
    regr.fit(np_X, np_Y)
    diamonds_y_pred = regr.predict(np_X_validation)

    print('Coefficients: \n', regr.coef_)
    print('Intercept: \n', regr.intercept_)
    print("Mean squared error: %.2f"
          % mean_squared_error(np_Y_validation, diamonds_y_pred))
    print("R2 Score: %.2f" % r2_score(np_Y_validation, diamonds_y_pred))
    return regr
def run_sgdreg(down_station, input_list, include_time, sample_size,
               network_type, _tol, _eta0):
    """Train an SGD regressor for a station and plot its predictions.

    The result directory from ``util.get_result_dir`` is created if missing,
    the data is built with ``data.construct``, the model is fitted on the
    training part, and the predictions for the cross-validation part are
    handed to ``predict.plot_prediction``. The elapsed time is printed.
    """
    started_at = time.time()

    result_dir = util.get_result_dir(down_station, network_type, _tol, _eta0,
                                     sample_size)
    if not os.path.exists(result_dir):
        os.makedirs(result_dir)

    # Only the train / cross-validation parts and the target range are used.
    (y_train, x_train, y_cv, x_cv,
     _u4, _u5, _u6, _u7,
     train_y_max, train_y_min,
     _u10, _u11, _u12, _u13, _u14) = data.construct(
        down_station, input_list, include_time, sample_size, network_type)

    regressor = linear_model.SGDRegressor(max_iter=100000, tol=_tol, eta0=_eta0)
    regressor.fit(x_train, y_train)
    predictions = regressor.predict(x_cv)
    predict.plot_prediction(predictions, result_dir, y_cv, train_y_max,
                            train_y_min)

    elapsed = time.time() - started_at
    elapsed_text = time.strftime("Fitting time : %H:%M:%S", time.gmtime(elapsed))
    print(elapsed_text)
def BivariateStochasticTool():
    """Fit scikit-learn's SGD regressor on two attributes and report it.

    Uses the module-level ``trainInputs`` / ``trainOutputs`` for training and
    ``testInputs`` / ``testOutputs`` for evaluation, prints the learnt model
    and both error measures, and draws the three plots.
    """
    sgd = linear_model.SGDRegressor(alpha=0.01, max_iter=1000)
    sgd.fit(trainInputs, trainOutputs)

    bias = sgd.intercept_[0]
    slope1 = sgd.coef_[0]
    slope2 = sgd.coef_[1]
    w = [bias, slope1, slope2]

    print("-----with tool-----")
    print("Regression for attributes: GDP & Freedom")
    terms = [str(bias), str(slope1) + " * X1", str(slope2) + " * X2"]
    print("\tThe learnt model: f(X,w) = " + " + ".join(terms))

    predictions = sgd.predict(testInputs)
    tool_error = mean_squared_error(testOutputs, predictions)
    print("\tPrediction error (tool): ", str(tool_error))
    manual_error = meanSquareError(testOutputs, predictions)
    print("\tPrediction error (manual): ", str(manual_error))

    plotDataForBi(gdpData, freedomData, outputs, w, "Train & test data")
    plotDataForBi(trainGdp, trainFreedom, trainOutputs, w,
                  "Train data and the learnt model")
    plotData2ForBi(testGdp, testFreedom, testOutputs, predictions,
                   "Computed(green) vs real(red) test data")
def _fit_regression(self, dataset, target, level=None, features=None):
    """Fits an SGD regression on minibatches and returns the result.

    Args:
      dataset: src.data.dataset.Dataset, the data which is to be used
        for fitting.
      target: mapping describing the target variable; its 'name' entry
        is the name of the target.
      level: string, the target's sub-class. If this isn't specified, the
        system will assume that the target is monolithic.
      features: list(string), a subset of dataset.vocab which is to be
        used while fitting.

    Returns:
      regression_base.ModelResult, the fitted parameters.
    """
    iterator = self._iter_minibatches(dataset=dataset,
                                      target_name=target['name'],
                                      features=features,
                                      batch_size=self.params['batch_size'],
                                      level=level)

    # print() does not interpolate extra arguments the way logging does, so
    # the name is formatted into the message explicitly.
    print('REGRESSION: fitting target %s' % target['name'])
    model = linear_model.SGDRegressor(penalty=self.regularizer or 'none',
                                      alpha=self.lmbda,
                                      learning_rate='constant',
                                      eta0=self.params.get('lr', 0.001))

    # One minibatch per training step; x_features of the last batch is the
    # one handed to the weight extraction below.
    for _ in tqdm(range(self.params['num_train_steps'])):
        xi, yi, x_features = next(iterator)
        model.partial_fit(xi, yi)

    return ModelResult(model=model,
                       weights=self._sklearn_weights(model, x_features),
                       response_type='continuous')
def __init__(self, env, use_kernel=False, **agent_params):
    """Set up one linear value model per action of ``env``.

    With ``use_kernel`` a standard scaler and a union of four RBF samplers
    are fitted on 10000 sampled observations. Each model is an
    ``SGDRegressor`` with a constant learning rate, initialized with a single
    ``partial_fit`` on the features of a reset state and a target of 0.
    Keyword arguments override the default agent parameters.
    """
    self.env = env
    self.use_kernel = use_kernel

    if use_kernel:
        # Sample feature space and define scaler to detrend data
        samples = np.array(
            [env.observation_space.sample() for x in range(10000)])
        self.detrend = preprocessing.StandardScaler()
        self.detrend.fit(samples)

        # Use detrended data to generate feature space with RBF kernels
        rbf_gammas = (3.0, 2.0, 1.0, 0.5)
        self.featurizer = pipeline.FeatureUnion([
            ("rbf%d" % number, RBFSampler(gamma=gamma, n_components=100))
            for number, gamma in enumerate(rbf_gammas, start=1)
        ])
        self.featurizer.fit(self.detrend.transform(samples))

    # Generate linear value function model for each action in our action space
    self.models = []
    initial_target = np.array(0).ravel()
    for k in range(env.action_space.n):
        value_model = linear_model.SGDRegressor(learning_rate="constant")
        self.models.append(value_model)
        start_features = self.map_to_features(self.env.reset())
        value_model.partial_fit(start_features.reshape(1, -1), initial_target)

    defaults = {
        "epsilon_min": 0.01,
        "decay_rate": 0.02,
        "discount": 0.99,
        "iter": 1000
    }
    defaults.update(agent_params)
    self.agent_params = defaults
def testing_using_crossvalidation_regression(df, label, features, alpha,
                                             l1_ratio, penalty, loss, epsilon,
                                             label_std):
    """Score an SGD regressor with 5-fold cross-validation

    Parameters
    ----------
    df : pandas.DataFrame
        pandas dataframe of features and labels
    label : string
        name of the column holding the regression target
    features : list of strings
        list of feature labels to use in model training
    alpha : float
        weighting of the regularization term
    l1_ratio : float
        the Elastic Net mixing parameter, 0 <= l1_ratio <= 1
        l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1
    penalty : string
        penalty specification, 'none', 'l2', 'l1', or 'elasticnet'
    loss : string
        loss function passed to the regressor
    epsilon : float
        epsilon passed to the regressor
    label_std : float
        value the error is divided by

    Returns
    -------
    float
        mean absolute error averaged over the folds, divided by label_std
    """
    sgd_settings = dict(alpha=alpha,
                        loss=loss,
                        penalty=penalty,
                        l1_ratio=l1_ratio,
                        epsilon=epsilon,
                        max_iter=1000)
    regressor = linear_model.SGDRegressor(**sgd_settings)

    fold_scores = model_selection.cross_val_score(
        regressor, df[features], df[label],
        cv=5, scoring='neg_mean_absolute_error')

    # The scorer yields negated errors; flip the sign back before scaling.
    mean_score = fold_scores.mean()
    return -1.0 * mean_score / label_std
def train(training_data):
    """Fit an SGD regressor on 70% of the samples and predict the rest.

    ``training_data`` is shuffled in place. Each sample provides
    ``"amplitudes"`` (the features) and ``"rating"`` (the target).

    Returns a ``Bunch`` with the predictions, the matching true ratings, the
    held-out features and targets, and the fitted regressor.
    """
    random.shuffle(training_data)
    split_at = int(floor(len(training_data) * .7))

    amplitudes = [sample["amplitudes"] for sample in training_data]
    ratings = [sample["rating"] for sample in training_data]

    regressor = linear_model.SGDRegressor()
    regressor.fit(amplitudes[:split_at], ratings[:split_at])

    held_out_predictions = regressor.predict(amplitudes[split_at:])
    held_out_ratings = ratings[split_at:]

    return Bunch({
        "predicted": held_out_predictions,
        "actual": held_out_ratings,
        "x_data_test": amplitudes[split_at:],
        "y_data_test": ratings[split_at:],
        "reg": regressor
    })
def find_one_LRSGD_parameter_all_CV(data, columns_used, output, grid_size, feature_elim=False):
    """Grid-search SGDRegressor settings and report the best combination.

    Every combination of penalty, alpha, learning-rate schedule and eta0 is
    evaluated with ``run_validation_CV``. The collected errors are grouped by
    parameter combination and written to
    ``../results/SGD/<grid_size>/<output>_param_CV_error.csv``.

    Args:
        data, columns_used, output: passed through to ``run_validation_CV``;
            ``output`` is also part of the result file name.
        grid_size: name of the sub-directory the result file is written to.
        feature_elim: printed, and decides whether ``k`` is set to ``'all'``.

    Returns:
        The value of ``RMSE.argmin()`` on the grouped error table.
    """
    SGD_model_eval = []
    print("Feature ELim: ", feature_elim)
    # NOTE(review): k is assigned only when feature_elim is False, yet it is
    # used for every evaluated combination below -- confirm what should
    # happen when feature elimination is requested.
    if feature_elim == False:
        k = 'all'
    for penalty,alpha,lr,eta0 in itertools.product(['l1','l2','elasticnet'],[0.1,0.001,0.01,0.0001],\
                                                   ['constant','optimal'],[1,0.1,0.001,0.01,0.0001]):
        # Fixed seed and no shuffling, so runs can be compared across settings
        model = linear_model.SGDRegressor(penalty=penalty, alpha=alpha, learning_rate=lr, \
                                          eta0=eta0, random_state=4, shuffle=False)
        #now run it for all CV and find average error
        error = run_validation_CV(data, columns_used, output, model)
        SGD_model_eval.append([penalty, alpha, lr, eta0, k, error])
    # One row per parameter combination, indexed by the parameters themselves
    SGD_model_eval = pd.DataFrame(SGD_model_eval,\
                                  columns=['penalty','alpha','lr','eta0','k','RMSE']).groupby(by=['penalty','alpha','lr','eta0','k']).sum()
    print(SGD_model_eval.RMSE.argmin())
    SGD_model_eval.to_csv("../results/SGD/%s/%s_param_CV_error.csv" % (grid_size, output))
    return SGD_model_eval.RMSE.argmin()
def detect(self, results, job=None, type='alert'):
    """Load a job's data, log it, plot it and decompose its 'value' column.

    Args:
        results: logged at debug level only.
        job: mapping with the entries 'name' (used for log messages and the
            plot title) and 'data' (location of the CSV file to read).
            Required in practice, despite the ``None`` default.
        type: not used by this method.
    """
    logger = logging.getLogger(__name__)
    logger.debug(results)
    name = job['name']
    logger.info('Processing results for: %s', name)

    df = pd.read_csv(job['data'])
    # Floor division keeps the split point an integer on Python 3 as well;
    # a float cannot be used as a slice bound.
    half = len(df) // 2
    # .values replaces as_matrix(), which newer pandas no longer provides.
    df_train = df[:half]['value'].values
    logger.info([df_train])

    df.plot(title=name)
    self.decompose(df['value'])
    plt.show()
    logger.info(df)
def trainRegressionModel(self, training_dataset):
    """Train an SGD regressor on pairwise candidate comparisons.

    Each line of ``training_dataset`` is tab-separated; every field from the
    fourth one on has the form ``<index>:<candidate>``. Feature vectors from
    ``self.fe.calculateFeatures`` are consumed in the order in which the
    candidates appear in the file. For every pair of candidates on a line two
    training rows are produced: the concatenated features in both orders,
    labelled with the corresponding difference of the two indexes.

    Args:
        training_dataset: path of the training file.

    Returns:
        The fitted ``SGDRegressor``.
    """
    # Create matrix:
    features = self.fe.calculateFeatures(training_dataset, input='file')
    Xtr = []
    Ytr = []
    c = -1  # running position of the current candidate in ``features``
    # The context manager closes the file even if a line fails to parse.
    with open(training_dataset) as f:
        for line in f:
            data = line.strip().split('\t')
            cands = [cand.strip().split(':')[1] for cand in data[3:]]
            indexes = [int(cand.strip().split(':')[0]) for cand in data[3:]]
            featmap = {}
            for cand in cands:
                c += 1
                featmap[cand] = features[c]
            for i in range(0, len(cands) - 1):
                for j in range(i + 1, len(cands)):
                    indexi = indexes[i]
                    indexj = indexes[j]
                    indexdiffji = indexj - indexi
                    indexdiffij = indexi - indexj
                    positive = featmap[cands[i]]
                    negative = featmap[cands[j]]
                    # Both orderings are added, with mirrored labels.
                    v1 = np.concatenate((positive, negative))
                    v2 = np.concatenate((negative, positive))
                    Xtr.append(v1)
                    Xtr.append(v2)
                    Ytr.append(indexdiffji)
                    Ytr.append(indexdiffij)
    Xtr = np.array(Xtr)
    Ytr = np.array(Ytr)

    model = linear_model.SGDRegressor()
    model.fit(Xtr, Ytr)
    return model
def __init__(self, history_length, prediction_horizon, difference_learning,
             averaging, streaming, settings=None):
    """Create one constant-rate SGD regressor per observation dimension.

    ``settings``, when given and non-empty, must contain ``'eta0'`` and may
    contain ``'epochs'`` (default 1). Without settings the learning rate is
    0.0001 and a single epoch is used.
    """
    super().__init__(history_length,
                     prediction_horizon,
                     difference_learning,
                     averaging=averaging,
                     streaming=streaming)

    if settings:
        learning_rate = settings['eta0']
        epoch_count = settings.get('epochs', 1)
    else:
        learning_rate = 0.0001
        epoch_count = 1

    self.models_ = [
        linear_model.SGDRegressor(verbose=False,
                                  learning_rate='constant',
                                  eta0=learning_rate)
        for _ in range(self.observation_dimension)
    ]
    self.epochs_ = epoch_count
def regr(df, mod, modScale, sgdScale=1, ForMod=1):
    """Combine a univariate forecast with an elastic-net SGD regression.

    The regression is fitted with column 0 of ``df`` as target and columns
    1-8 as features, then applied to the data returned by the forecasting
    helper selected through ``ForMod``.

    Args:
        df: DataFrame; column 0 is the target, columns 1-8 the features.
        mod: passed to ``arimaRNN`` / ``esRNN``; replaced by the value
            ``esRNN`` returns whenever that helper runs.
        modScale: not read here; ``esRNN`` is always called with
            ``modScale=0``.
        sgdScale: 1 standardizes the features before fitting; any other
            value fits on the raw columns.
        ForMod: 1 uses ``rscript``; 2 calls ``arimaRNN`` first; every value
            other than 1 ends with ``esRNN``.

    Returns:
        Tuple ``(uniPred, multiPred, naive, ma, mod, hold)``: the univariate
        prediction, the first regression prediction, the mean of column 0,
        the last value of its 2-period rolling mean, the (possibly updated)
        ``mod``, and ``hold``.
    """
    hold = 0
    sgd = linear_model.SGDRegressor(max_iter=1000, alpha=0.0001, penalty='elasticnet')
    if sgdScale == 1:
        scaler = StandardScaler()
        normalized = scaler.fit_transform(df.iloc[:, 1:9])
        xTs = pd.DataFrame(normalized)
        sgd.fit(pd.DataFrame(normalized), df.iloc[:, 0])
        # NOTE(review): this `if` is followed by a separate if/else, so for
        # ForMod == 2 esRNN also runs and overwrites predDat / uniPred --
        # confirm whether `elif` was intended.
        if ForMod == 2:
            predDat, uniPred, arimaPred = arimaRNN(df, mod)
        if ForMod == 1:
            predDat, uniPred = rscript(df)
        else:
            predDat, uniPred, mod = esRNN(df, mod, modScale=0)
        # NOTE(review): fit_transform re-fits the scaler on predDat instead
        # of reusing the statistics of the training features -- confirm.
        if len(predDat) == 1:
            norm = scaler.fit_transform(np.array(predDat).reshape(-1, 1))
        else:
            norm = scaler.fit_transform(np.array(predDat).reshape(1, -1))
        print(norm)
        multiPred = sgd.predict(norm)
    else:
        sgd.fit(df.iloc[:, 1:9], df.iloc[:, 0])
        # hold receives the ARIMA prediction only in this unscaled branch.
        if ForMod == 2:
            predDat, uniPred, arimaPred = arimaRNN(df, mod)
            hold = arimaPred
        if ForMod == 1:
            predDat, uniPred = rscript(df)
        else:
            predDat, uniPred, mod = esRNN(df, mod, modScale=0)
        multiPred = sgd.predict([np.asarray(predDat)])
    # Baselines: overall mean and last 2-period moving average of the target
    naive = df.iloc[:, 0].mean()
    m = df.iloc[:, 0].rolling(2).mean()
    ma = m[m.shape[0] - 1]
    return uniPred, multiPred.item(0), naive, ma, mod, hold
def get_stats(path):
    """Compare several regressors on flat prices read from a CSV file.

    Rows with missing values and rows with a price of 100000 or more are
    dropped. The columns ``type``, ``size`` and ``locality`` are standardized
    and used to predict ``price`` on an 80/20 train/test split.

    Returns:
        numpy array with the median absolute test error of each estimator,
        in the order in which the estimators are listed below.
    """
    flats = pd.read_csv(path).dropna()
    # Get information only about flats with price < 100'000
    flats = flats[flats['price'] < 100000]

    raw_features = flats[['type', 'size', 'locality']].values
    feature_scaler = preprocessing.StandardScaler().fit(raw_features)
    X = feature_scaler.transform(raw_features)
    y = flats['price'].values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=20)

    estimators = [
        linear_model.LinearRegression(),
        linear_model.Ridge(alpha=0.1),
        linear_model.Lasso(alpha=0.1),
        linear_model.ElasticNet(alpha=0.01, l1_ratio=0.25),
        linear_model.BayesianRidge(n_iter=500),
        linear_model.OrthogonalMatchingPursuit(),
        linear_model.SGDRegressor(max_iter=2500, epsilon=0.01),
        SVR(kernel='rbf', epsilon=0.01, C=20)
    ]

    errors = np.array([])
    for estimator in estimators:
        estimator.fit(X_train, y_train)
        test_error = metrics.median_absolute_error(y_test,
                                                   estimator.predict(X_test))
        errors = np.append(errors, test_error)
    return errors
zero_count = ratings[-1].count(0) zero_feat_count = features[-1].count(0) if (zero_count > 4 or zero_feat_count > 4): ratings.pop() features.pop() features = np.array(features) ratings = np.array(ratings) not_feature_index = [4, 10] features = np.delete(features, not_feature_index, axis=1) features = preprocessing.scale(features) for i in range(22): min_error = np.inf best_alpha = -1 al = 0.01 for c in range(20): clf = linear_model.SGDRegressor(penalty='l1', alpha=al, n_iter=100) al = 0.01 * c ans = cross_val_predict(clf, features, ratings[:, i], cv=5) if top_row[ratings_start_point + i] == 'cspostur' or top_row[ratings_start_point + i] == 'cseyecon': ans = cross_val_predict(clf, features[12:, :], ratings[12:, i], cv=5) if (min_error > mean_squared_error(ratings[12:, i], ans)): min_error = mean_squared_error(ratings[12:, i], ans) best_clf = clf.fit(features[12:, :], ratings[12:, i]) best_alpha = al ssreg = np.sum((ans - np.mean(ratings[12:, i]))**2) sstot = np.sum((ratings[12:, i] - np.mean(ratings[12:, i]))**2)
trainOutputs, testOutputs = statisticalNormalisation(trainO, testO) #tool data normalisation # toolTrainInputs=tool_normalisation(trainI) myGD = GD(len(trainInputs[0])) myGD.train(trainInputs, trainOutputs) # myGD.train(trainI, trainO) model = "The MANUAL BATCH learnt model: " + str(myGD.intercept) for i in range(len(myGD.coef)): model += " + " + str(myGD.coef[i]) + " * x" + str(i + 1) print(model) computedTestOutputs = myGD.predict(testInputs) err = myGD.eroare(computedTestOutputs, testOutputs) #tool toolRegressor = linear_model.SGDRegressor(alpha=0.01) for ep in range(1000): toolRegressor.partial_fit(trainInputs, trainOutputs) model = "The TOOL learnt model: " + str(toolRegressor.intercept_[0]) for i in range(len(toolRegressor.coef_)): model += " + " + str(toolRegressor.coef_[i]) + " * x" + str(i + 1) print(model) toolComputed = toolRegressor.predict(testInputs) print("Eroare tool regresor:" + str(mean_squared_error(toolComputed, testOutputs))) print("Eroare tool pentru regresorul meu:" + str(mean_squared_error(computedTestOutputs, testOutputs))) print("Eroare fara tool:" + str(err))
print(y_score) # Predict on the test data: y_pred y_pred = svr.predict(X_test) # Compute and print R^2 and RMSE print("R^2: {}".format(svr.score(X_test, y_test))) rmse = np.sqrt(mean_squared_error(y_test , y_pred)) print("Root Mean Squared Error: {}".format(rmse)) ########################################## # SGD from sklearn import linear_model clf = linear_model.SGDRegressor(max_iter=1000, tol=1e-3) clf.fit(X_train, y_train) # Calling the score method, which compares the predicted values to the actual values y_score = clf.score(X_test, y_test) # The score is directly comparable to R-Square print(y_score) ################################## # comparing results to evaluate model
#See Ridge Documentation: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html#sklearn.linear_model.Ridge lr = linear_model.Ridge(alpha=1.0, fit_intercept=True, normalize=False, copy_X=True, max_iter=None, tol=0.001, solver='auto', random_state=None) #See SGD Documentation: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDRegressor.html#sklearn.linear_model.SGDRegressor sgd = linear_model.SGDRegressor(loss='squared_loss', penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=0.001, shuffle=True, verbose=0, epsilon=0.1, random_state=None, learning_rate='invscaling', eta0=0.01, power_t=0.25, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, warm_start=False, average=False)