Example #1
def svr_main(X, Y):
    X_train = X[:TRAIN_SIZE]
    Y_train = Y[:TRAIN_SIZE]
    X_test = X[TRAIN_SIZE:]
    Y_test = Y[TRAIN_SIZE:]

    clf = SVR(kernel='rbf', C=1e3, gamma=0.00001)
    #clf.fit(X_train,Y_train)
    #y_pred = clf.predict(X_test)
    #plt.plot(X_test, y_pred, linestyle='-', color='red') 

    #clf = GradientBoostingRegressor(n_estimators=100,max_depth=1)
    #clf = DecisionTreeRegressor(max_depth=25)
    #clf = ExtraTreesRegressor(n_estimators=2000,max_depth=14)
    #clf = xgb.XGBRegressor(n_estimators=2000,max_depth=25)
    #clf = RandomForestRegressor(n_estimators=1000,max_depth=26,n_jobs=7)
    predict_list = []
    for i in xrange(TEST_SIZE):
        X = [ [x] for x in xrange(i, TRAIN_SIZE+i)]
        clf.fit(X, Y[i:TRAIN_SIZE+i])
        y_pred = clf.predict([[TRAIN_SIZE+1+i]])  # predict expects a 2-D array of samples
        predict_list.append(y_pred)

    print "mean_squared_error:%s"%mean_squared_error(Y_test, predict_list)
    print "sqrt of mean_squared_error:%s"%np.sqrt(mean_squared_error(Y_test, predict_list))
    origin_data = Y_test
    print "origin data:%s"%origin_data
    plt.plot([ x for x in xrange(TRAIN_SIZE+1, TRAIN_SIZE+TEST_SIZE+1)], predict_list, linestyle='-', color='red', label='prediction model')  
    plt.plot(X_test, Y_test, linestyle='-', color='blue', label='actual model') 
    plt.legend(loc=1, prop={'size': 12})
    plt.show()
Example #2
class SVMLearner(object):

    def __init__(self, kernel="linear", C=1e3, gamma=0.1, degree=2, verbose=False):
        self.name = "{} Support Vector Machine Learner".format(kernel.capitalize())
        self.kernel = kernel
        if kernel == "linear":
            self.svr = SVR(kernel=kernel, C=C)
        elif kernel == "rbf":
            self.svr = SVR(kernel=kernel, C=C, gamma=gamma)
        elif kernel == "poly":
            self.svr = SVR(kernel=kernel, C=C, degree=degree)

    def addEvidence(self,dataX,dataY):
        """
        @summary: Add training data to learner
        @param dataX: X values of data to add
        @param dataY: the Y training values
        """
        # build and save the model
        self.svr.fit(dataX, dataY)
        
    def query(self,points):
        """
        @summary: Estimate a set of test points given the model we built.
        @param points: should be a numpy array with each row corresponding to a specific query.
        @returns the estimated values according to the saved model.
        """
        return self.svr.predict(points)
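A minimal usage sketch for this learner (synthetic data; assumes the class above is in scope):

import numpy as np

np.random.seed(0)
dataX = np.random.rand(100, 3)                # 100 samples, 3 features
dataY = dataX @ np.array([1.0, -2.0, 0.5])    # synthetic linear targets
learner = SVMLearner(kernel="rbf", C=1e3, gamma=0.1)
learner.addEvidence(dataX, dataY)             # fit the wrapped SVR
print(learner.query(dataX[:5]))               # estimates for the first five rows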
Example #3
def test_check_is_fitted():
    # Check is ValueError raised when non estimator instance passed
    assert_raises(ValueError, check_is_fitted, ARDRegression, "coef_")
    assert_raises(TypeError, check_is_fitted, "SVR", "support_")

    ard = ARDRegression()
    svr = SVR()

    try:
        assert_raises(NotFittedError, check_is_fitted, ard, "coef_")
        assert_raises(NotFittedError, check_is_fitted, svr, "support_")
    except ValueError:
        assert False, "check_is_fitted failed with ValueError"

    # NotFittedError is a subclass of both ValueError and AttributeError
    try:
        check_is_fitted(ard, "coef_", "Random message %(name)s, %(name)s")
    except ValueError as e:
        assert_equal(str(e), "Random message ARDRegression, ARDRegression")

    try:
        check_is_fitted(svr, "support_", "Another message %(name)s, %(name)s")
    except AttributeError as e:
        assert_equal(str(e), "Another message SVR, SVR")

    ard.fit(*make_blobs())
    svr.fit(*make_blobs())

    assert_equal(None, check_is_fitted(ard, "coef_"))
    assert_equal(None, check_is_fitted(svr, "support_"))
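The subclass relationship the comment above relies on can be verified directly (a quick sketch, not part of the original test; assumes a sklearn version where NotFittedError lives in sklearn.exceptions):

from sklearn.exceptions import NotFittedError

# NotFittedError multiply-inherits from both built-in exception types,
# which is why the assert_raises calls above accept either one.
assert issubclass(NotFittedError, ValueError)
assert issubclass(NotFittedError, AttributeError)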
Example #4
File: hotTweets.py Project: makseq/360
class HotTweets:
	''' Train and get tweet hotness '''

	def __init__(self, kernel='rbf', C=1e3, gamma=0.1, epsilon=0.1, n_comp=100):
		''' Prepare support vector regression ''' 
		self.svr = SVR(kernel=kernel, C=C, gamma=gamma, epsilon=epsilon, verbose=True)
		#self.svr = LogisticRegression(random_state=42, verbose=0)
		self.n_comp = n_comp

	def fit_scaler(self, dev, i_dev):
		''' Train normalizers for features and importances '''
		# importance scaler
		self.std_scaler_i = sklearn.preprocessing.StandardScaler()
		self.std_scaler_i.fit(i_dev)
		self.norm = sklearn.preprocessing.StandardScaler()
		self.norm.fit(dev[:,0:self.n_comp])
	
	def train(self, features, importances):
		''' Train regression '''
		importances = self.std_scaler_i.transform(importances)
		features = self.norm.transform(features[:,0:self.n_comp])
		self.svr.fit(features, importances)
		
		
	def predict(self, features):
		''' Predict importances '''
		features = self.norm.transform(features[:,0:self.n_comp])
		results = self.svr.predict(features)
		#print results[0:100:5]
		results = self.std_scaler_i.inverse_transform(results)
		#print results[0:100:5]
		return results
	def svm(self):
		"""
		C_range = np.logspace(-2, 10, 2)
		print C_range
		gamma_range = np.logspace(-9, 3, 2)
		print gamma_range
		param_grid = dict(gamma=gamma_range, C=C_range)
		cv = ShuffleSplit(len(self.search_inputs.y_train), n_iter=5, test_size=0.2, random_state=42)
		grid = GridSearchCV(SVR(verbose=True), param_grid=param_grid, cv=cv)
		#grid = GridSearchCV(svm.SVR(kernel='rbf', verbose=True), param_grid=param_grid, cv=cv)
		grid.fit(self.search_inputs.X_train, self.search_inputs.y_train)

		print("The best parameters are %s with a score of %0.2f"
			% (grid.best_params_, grid.best_score_))

		self.svm_preds = grid.predict(self.search_inputs.X_test)
		"""

		regression = SVR(kernel='rbf', C=1e3, gamma=0.1, verbose=True)
		regress_fit = regression.fit(self.search_inputs.X_train,self.search_inputs.y_train)
		self.svm_preds = regress_fit.predict(self.search_inputs.X_test)
		
		for i in range(len(self.svm_preds)):  # clamp every prediction into [1, 3]
			if self.svm_preds[i] < 1:
				self.svm_preds[i] = 1.00
			elif self.svm_preds[i] > 3:
				self.svm_preds[i] = 3.00
		self.search_inputs.fin_df['relevance'] = np.array(self.svm_preds) # easy swap in / out 
		final_file_svm = self.search_inputs.fin_df.to_csv(self.fin_file_name+'_svm.csv', float_format='%.5f', index=False)
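For orientation, a usage sketch of the HotTweets class above with synthetic data (the shapes are illustrative assumptions, not from the project):

import numpy as np

feats = np.random.rand(200, 150)     # 200 tweets, 150-dim feature vectors
imps = np.random.rand(200, 1)        # one importance score per tweet
ht = HotTweets(n_comp=100)
ht.fit_scaler(feats, imps)           # fit the feature and importance scalers
ht.train(feats, imps)                # train the SVR on scaled data
print(ht.predict(feats[:5]))         # importances back on the original scale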
Example #6
def getError1(signal, normedDay, period, phase):
    '''
    Gets the error for a list of points across a normed day given a sklearn 
    model, the period, and the phase of the fitted signal.
    
    Here I'm using the Euclidean distance as the error measurement.  This 
    requires a little more computation due to the need to fit an inverse
    model, but provides better fits.
    
    Returns the squared Euclidean error.
    '''
    
    if rank(normedDay.index[0]) > 0:
        t0= round((array(normedDay.index.get_level_values(0))- phase)%period,3)
    else:
        t0 = round((array(normedDay.index,dtype=float) - phase)%period,3)
    nD = Series(normedDay, index=t0)
    
    tUp = array([arange(0,period+.1,.1)]).T
    invSignal = SVR(kernel='rbf', C=signal.C, gamma=signal.gamma, 
                    epsilon=signal.epsilon)
    
    invSignal.fit(array([signal.predict(tUp)]).T, tUp.flatten())
    
    xDiff = nD - signal.predict(array([array(nD)]).T)
    yDiff = nD - signal.predict(array([nD.index]).T)
    
    error = sum(pow(xDiff/period,2) + pow(yDiff/2,2))
    return error
Example #7
    def fit(self, start_date, end_date):

        for ticker in self.tickers:
            self.stocks[ticker] = Stock(ticker)

        params_svr = [{
            'kernel': ['rbf', 'sigmoid', 'linear'],
            'C': [0.01, 0.1, 1, 10, 100],
            'epsilon': [0.0000001, 0.000001, 0.00001]
            }]
        params = ParameterGrid(params_svr)

        # Find the split for training and CV
        mid_date = train_test_split(start_date, end_date)
        for ticker, stock in self.stocks.items():

            X_train, y_train = stock.get_data(start_date, mid_date, fit=True)
            # X_train = self.pca.fit_transform(X_train.values)
            X_train = X_train.values
            # pdb.set_trace()
            X_cv, y_cv = stock.get_data(mid_date, end_date)
            # X_cv = self.pca.transform(X_cv.values)
            X_cv = X_cv.values

            lowest_mse = np.inf
            for i, param in enumerate(params):
                svr = SVR(**param)
                # ada = AdaBoostRegressor(svr)
                svr.fit(X_train, y_train.values)
                mse = mean_squared_error(
                    y_cv, svr.predict(X_cv))
                if mse <= lowest_mse:
                    lowest_mse = mse  # track the best CV error seen so far
                    self.models[ticker] = svr

        return self
Example #8
File: svr.py Project: rcurtin/benchmarks
    def RunSVRScikit():
      totalTimer = Timer()

      # Load input dataset.
      Log.Info("Loading dataset", self.verbose)
      # Use the last row of the training set as the responses.
      X, y = SplitTrainData(self.dataset)

      # Get all the parameters.
      opts = {}
      if "c" in options:
        opts["C"] = float(options.pop("c"))
      if "epsilon" in options:
        opts["epsilon"] = float(options.pop("epsilon"))
      if "gamma" in options:
        opts["gamma"] = float(options.pop("gamma"))
      opts["kernel"] = "rbf"

      if len(options) > 0:
        Log.Fatal("Unknown parameters: " + str(options))
        raise Exception("unknown parameters")

      try:
        with totalTimer:
          # Perform SVR.
          model = SSVR(**opts)
          model.fit(X, y)
      except Exception as e:
        return -1

      return totalTimer.ElapsedTime()
    def train(self, x, y, param_names, random_search=100,
              kernel_cache_size=2000, **kwargs):
        if self._debug:
            print "First training sample\n", x[0]
        start = time.time()
        scaled_x = self._set_and_preprocess(x=x, param_names=param_names)

        # Check that each input is between 0 and 1
        self._check_scaling(scaled_x=scaled_x)

        if self._debug:
            print "Shape of training data: ", scaled_x.shape
            print "Param names: ", self._used_param_names
            print "First training sample\n", scaled_x[0]
            print "Encode: ", self._encode

        # Do a random search
        c, gamma = self._random_search(random_iter=random_search, x=scaled_x,
                                       y=y, kernel_cache_size=kernel_cache_size)

        # Now train model
        try:
            svr = SVR(gamma=gamma, C=c, random_state=self._rng,
                      cache_size=kernel_cache_size)
            svr.fit(scaled_x, y)
            self._model = svr
        except Exception, e:
            print "Training failed", e.message
            svr = None
Example #10
def predict_device_byday_SVR():
    X,Y_unique,Y_all,X_raw = load_device_counter_byday()

    from sklearn.svm import SVR
    model = SVR()
    # model = SVR(kernel='linear')
    training_size = 160
    # model.fit(X[:training_size],Y_unique[:training_size])
    model.fit(X[:training_size],Y_all[:training_size])

    start_index = 180
    end_index = 190
    X_to_predict = X[start_index:end_index]
    # X_to_predict.append([date_str_toordinal('2017-04-18')])
    # X_to_predict.append([date_str_toordinal('2017-03-27')])

    print X_to_predict
    # Y_real = Y_unique[start_index:end_index]
    Y_real = Y_all[start_index:end_index]
    print X_raw[start_index:end_index]
    y_predicted=model.predict(X_to_predict)
    # print y_predicted
    y_predicted = np.array(y_predicted).astype(int)
    print y_predicted
    print Y_real
    # print y_predicted - np.array(Y_real)

    # plt.subplot(111)
    # plt.scatter(X_to_predict,Y_real,c='r')
    plt.scatter(X_to_predict,y_predicted)
    # plt.plot(X_to_predict,y_predicted)
    plt.show()
Example #11
def main(args):
    (training_file, label_file, test_file, test_label, c, e) = args
    svr = SVR(C=float(c), epsilon=float(e), kernel='rbf')
    X = load_feat(training_file)
    y = [float(line.strip()) for line in open(label_file)]
    
    X = np.asarray(X)
     
    y = np.asarray(y)
    
    test_X = load_feat(test_file)
    test_X = np.asarray(test_X)
    test_X[np.isnan(test_X)] = 0

    svr.fit(X, y)
    
    pred = svr.predict(test_X)
    if test_label != 'none':
        test_y = [float(line.strip()) for line in open(test_label)]
        test_y = np.asarray(test_y)
        print 'MAE: ', mean_absolute_error(test_y, pred)
        print 'RMSE: ', sqrt(mean_squared_error(test_y, pred))
        print 'corrpearson: ', sp.stats.pearsonr(test_y, pred)
        print 'r-sqr: ', sp.stats.linregress(test_y, pred)[2] ** 2
        print mquantiles(test_y, prob=[0.10, 0.90])
        print mquantiles(pred, prob=[0.10, 0.90])
    with open(test_file + '.svr.pred', 'w') as output:
        for p in pred:
            print >>output, p
    return
def train_learning_model_svm(df):
    X_all, y_all = preprocess_data(df)
    X_train, X_test, y_train, y_test = split_data(X_all, y_all)

    regressor = SVR()
    regressor.fit(X_train, y_train)
    calculate_results(regressor, X_train, X_test, y_train, y_test)
Example #13
def getCharacteristicSignal(normedDays, phase, period, plotAxis=False):
    series = pandas.Series()
    for day in normedDays:
        series = series.append(day)
    '''Shift the times to give relative time of day'''
    t0 = array(series.index, dtype=float)
    t0 = (t0 - phase) % period
    t0 = array([t0]).T
    
    '''Shift the array to fit the edges'''
    tExt = array([array([t0-period,t0,t0+period]).flatten()]).T
    seriesExt = numpy.array([array(series),array(series),
                             array(series)]).flatten()
    
    '''Fit the model'''
    svr_rbf = SVR(kernel='rbf', C=1e4, gamma=.03, epsilon=.01)
    y_rbf = svr_rbf.fit(tExt, seriesExt)
    
    '''Predict a new characteristic signal'''
    t1 = array([arange(0,period, period/100.)]).T
    signal = y_rbf.predict(t1)
    
    if plotAxis:
        plotAxis.plot(t1, signal)
        colors = ['b','g','r','c']
        for i,day in enumerate(normedDays):
            timesAdjusted = array(normedDays[i].index,dtype=float)
            timesAdjusted = (timesAdjusted - phase) % period
            plotAxis.plot(timesAdjusted, day, 'o', label=str(i), 
                          color=colors[i])
        plotAxis.set_title('Characteristic Signal')
        plotAxis.legend(loc='best')
        plotAxis.set_xbound(0,period)
        plotAxis.set_ybound(-1.1,1.1)
    return signal
Example #14
    def svr(self, X, y):
        """ Train support vector regression model

        Parameters
        ----------
        X : numpy ndarray with numeric values
            Array containing input parameters
            for the model. Model will try to
            learn the output y[i] in terms of
            inputs X[i]

        y : columnar numpy array with numeric values
            Array containing single column of
            output values. Entry at y[i] corresponds
            to value of the underlying experiment
            for input parameters X[i]

        Returns
        -------
        result : model
                Model learnt from the incoming
                inputs and outputs

        """
        clf = SVR(C=1.0, epsilon=0.2)
        clf.fit(X, y)
        return clf
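Since the method is a thin wrapper around sklearn, an equivalent standalone sketch with synthetic data (same hyperparameters; the inputs are illustrative) is:

import numpy as np
from sklearn.svm import SVR

X = np.random.rand(50, 4)         # 50 experiments, 4 input parameters each
y = np.sin(X[:, 0]) + X[:, 1]     # synthetic outputs, y[i] for inputs X[i]
clf = SVR(C=1.0, epsilon=0.2)     # the hyperparameters used above
clf.fit(X, y)
print(clf.predict(X[:3]))         # model estimates for the first three inputs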
Example #15
def svm_regression(y, x=None, ker='rbf', opt=0.1, show=False):
  """
  Pass an array, with or without x-axis values, and this returns the SVM.
  A kernel (ker) can also be specified: 'rbf' 'linear', 'poly'
  """
  from sklearn.svm import SVR
  if x is None: # Assume linearly spaced points
    x = np.arange(0, len(y))
  x = np.asarray(x).reshape(-1, 1) # sklearn expects a 2-D feature array
  # Fit the regression model
  if ker == 'linear':
    svr = SVR(kernel=ker, C=1e3)
  elif ker == 'poly':
    if type(opt) is not int:
      print('Need a degree for a polynomial fit, not' + str(opt))
      return None
    svr = SVR(kernel=ker, C=1e3, degree=opt)
  else:
    svr = SVR(kernel='rbf', C=1e3, gamma=opt) # default is radial basis func
  y_svr = svr.fit(x, y).predict(x) # Fit
  
  # And plot if requested
  if show:
    plt.scatter(x, y, c='k', label='data')
    plt.plot(x, y_svr, c='b', label='SVR model')
    plt.xlabel('data')
    plt.ylabel('target')
    plt.title('Support Vector Regression')
    plt.legend()
    plt.show()
  return y_svr
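A quick way to exercise this helper on a noisy sine wave (synthetic data; assumes the function above plus numpy/matplotlib are imported in the surrounding module):

import numpy as np

np.random.seed(1)
y_noisy = np.sin(np.linspace(0, 4 * np.pi, 80)) + 0.1 * np.random.randn(80)
fit_rbf = svm_regression(y_noisy, opt=0.1)                        # RBF kernel, gamma=0.1
fit_poly = svm_regression(y_noisy, ker='poly', opt=3, show=True)  # cubic polynomial kernel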
Example #16
def rollingMeanScale(series, period, plotAxis=False):
    svr_rbf = SVR(kernel='rbf', C=1e4, gamma=.01, epsilon=.01)
    '''Fit Model to Data Series'''
    tS= numpy.array([series.index]).T
    y_rbf = svr_rbf.fit(tS, list(series))
    '''Up-sample to get rid of bias'''
    fFit = arange(series.index[0],series.index[-1]+.1,.25)
    trend = y_rbf.predict(numpy.array([fFit]).T)
    
    '''Take rolling mean over 1-day window'''
    shift = int(round(period/.5))
    rMean = pandas.rolling_mean(trend, shift*2)
    rMean = numpy.roll(rMean, -shift)
    rMean[:shift]=rMean[shift]
    rMean[-(shift+1):]=rMean[-(shift+1)]
    rMean = pandas.Series(rMean, index=fFit)
    
    '''Adjust Data Series by subtracting out trend'''
    series = series - array(rMean[array(series.index, dtype=float)])
    series = scaleMe(series)-.5
    
    if plotAxis:
        plotAxis.plot(fFit, trend, label='Series Trend')
        plotAxis.plot(fFit, rMean, label='Rolling Mean')
        plotAxis.set_title('Detrend the Data')
        plotAxis.legend(loc='lower left')

    return series
def machinelearning(csv_file):
  # parse CSV
  d = {}
  d['date'] = []
  d['radiation'] = []
  d['humidity'] = []
  d['temperature'] = []
  d['wind'] = []
  d['demand'] = []

  dictreader = csv.DictReader(csv_file, fieldnames=['date', 'radiation', 'humidity', 'temperature', 'wind', 'demand'], delimiter=',')

  next(dictreader)
  for row in dictreader:
    for key in row:
      d[key].append(row[key])

  # interpolate weather data
  interpolate(d['radiation'])
  interpolate(d['humidity'])
  interpolate(d['temperature'])
  interpolate(d['wind'])

  # train machine learning algorithm
  training_x = np.array(zip(d['radiation'], d['humidity'], d['temperature'], d['wind'])[:32])
  training_y = np.array(d['demand'][:32])

  poly_svr = SVR(kernel='poly', degree=2)
  poly_svr.fit(training_x, training_y)

  prediction_x = np.array(zip(d['radiation'], d['humidity'], d['temperature'], d['wind'])[32:])
  demand_predictions = poly_svr.predict(prediction_x)

  return demand_predictions
Example #18
def train_SVR(viper):

	from sklearn.svm import SVR
	model = SVR(C=10, kernel='rbf', shrinking=False, verbose=True)
	model.fit(viper.train_feat, viper.train_y)

	return model
Example #19
def Sand_SVR(X_train, Y_train, X_test, Y_test, cv_iterator):
    
    #===========================================================================
    # param_grid = {'C':[100,500,1000, 5000, 10000, 100000],
    #               'epsilon':[0.075,0.1, 0.125]
    #               }
    #  
    # svr = SVR(cache_size = 1000, random_state=42)
    # search = GridSearchCV(svr, param_grid, scoring="mean_squared_error", cv=cv_iterator)
    #===========================================================================
    #search.fit(X_train, Y_train["Sand"])
    #search.grid_scores_
    
    #svr = search.best_estimator_ 
    #svr.fit(X_train, Y_train["SAND"])
    
    #test = cross_val_score(svr, X_train.astype('float64'), Y_train["Ca"].astype('float64'), scoring="mean_squared_error", cv=cv_iterator)
    
    svr = SVR(C=10000)
    svr.fit(X_train, Y_train["Sand"])
    
    yhat_svr = svr.predict(X_test)
    test_error = math.sqrt(mean_squared_error(Y_test["Sand"], yhat_svr))
    
    return svr, test_error
Example #20
def CaSVRModel(X_train, Y_train, X_test, Y_test, cv_iterator):
#     
#     param_grid = {'C':[10000],
#                    'epsilon':[0.001, 0.01, 0.05, 0.1, 0.15, 1]
#                    }
#       
#     svr = SVR(random_state=42, cache_size=1000, verbose=2)
#     search = GridSearchCV(svr, param_grid, scoring="mean_squared_error", n_jobs= 1, iid=True, cv=cv_iterator)
#     search.fit(X_train, Y_train["Ca"])
#     #search.grid_scores_
#       
#     model = search.best_estimator_

    #scaler = StandardScaler()

    model = SVR(C=10000, epsilon = 0.01, cache_size=1000)
    model.fit(X_train, Y_train["Ca"])
    #model.fit(X_train, Y_train["Ca"])
    
    #model.fit(X_train, Y_train["Ca"])
    
    #test = cross_val_score(svr, X_train.astype('float64'), Y_train["Ca"].astype('float64'), scoring="mean_squared_error", cv=cv_iterator)
    
    yhat_svr = model.predict(X_test)
    test_error = math.sqrt(mean_squared_error(Y_test["Ca"], yhat_svr))
    
    return model, test_error
def test_regression():

    X, y = make_regression(n_samples=1000,
                           n_features=5,
                           n_informative=2,
                           n_targets=1,
                           random_state=123,
                           shuffle=False)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=123)

    svm = SVR(kernel='rbf')
    svm.fit(X_train, y_train)

    imp_vals, imp_all = feature_importance_permutation(
        predict_method=svm.predict,
        X=X_test,
        y=y_test,
        metric='r2',
        num_rounds=1,
        seed=123)

    assert imp_vals.shape == (X_train.shape[1], )
    assert imp_all.shape == (X_train.shape[1], 1)
    assert imp_vals[0] > 0.2
    assert imp_vals[1] > 0.2
    assert sum(imp_vals[3:]) <= 0.01
Example #22
def learn(X, y):
    # do pca
    pca = PCA(n_components=6)
    pca_6 = pca.fit(X)

    print('variance ratio')
    print(pca_6.explained_variance_ratio_)
    X = pca.fit_transform(X)

    # X = np.concatenate((X_pca[:, 0].reshape(X.shape[0], 1), X_pca[:, 5].reshape(X.shape[0], 1)), axis=1)
    # do svr
    svr_rbf = SVR(kernel='rbf', C=1)
    svr_rbf.fit(X, y)
    # print(model_rbf)

    y_rbf = svr_rbf.predict(X)
    print(y_rbf)
    print(y)

    # see difference
    y_rbf = np.transpose(y_rbf)
    deviation(y, y_rbf)

    # pickle model
    with open('rbfmodel.pkl', 'wb') as f:
        pickle.dump(svr_rbf, f)

    with open('pcamodel.pkl', 'wb') as f:
        pickle.dump(pca_6, f)
Example #23
def train_SVM(X, Y, kernel='rbf', shrinking=True,  tol=0.001, cache_size=1500, verbose=True, max_iter=-1):
	"""Assumes all irrelevant features have been removed from X and Y"""
	"""Learns several hundred SVMs"""

	clf = SVR(kernel=kernel, tol=tol, cache_size=cache_size, verbose=verbose, max_iter=max_iter)
	pipeline = Pipeline(zip([ "imputate", "vart", "scale", "svm" ], [ Imputer(), VarianceThreshold(), StandardScaler(), clf ]))
	
	param_grid = dict(svm__C=[0.1, 1, 10, 100, 1000],
										svm__gamma=[0.001, 0.01, 1, 10])

	
	grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=3)
	
	results = []

	for i in range(Y[0].shape[1]):
		Y_new = np.fromiter((x[:, i][0, 0] for x in Y), np.double)
		X_new = np.array([np.matrix(x.data).flatten().tolist() for x in X], np.double)
		#X_new = np.fromiter((np.matrix(x.data) for x in X), np.double)

		X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X_new, Y_new, test_size = 0.2)
		X_train = flatten(X_train)
		X_test = flatten(X_test)

		grid_search.fit(X_train, Y_train)
		results.append((grid_search.best_estimator_, grid_search.score(X_test, Y_test)))
		print("Best estimators (C): {0}, Score: {1}".format(grid_search.best_estimator_, grid_search.score(X_test, Y_test)))
	return results
Example #24
def train_model(train, test, labels):
    clf = SVR(C=1.0, epsilon=0.2)
    clf.fit(train, labels)
    #clf = GaussianNB()
    #clf.fit(train, labels)
    print "Good!"
    predictions = clf.predict(test)
    print predictions.shape
    predictions = pd.DataFrame(predictions, columns = ['relevance'])
    print "Good again!"
    print "Predictions head -------"
    print predictions.head()
    print predictions.shape
    print "TEST head -------"
    print test.head()
    print test.shape
    test['id'].to_csv("TEST_TEST.csv",index=False)
    predictions.to_csv("PREDICTIONS.csv",index=False)
    #test = test.reset_index()
    #predictions = predictions.reset_index()
    #test = test.groupby(level=0).first()
    #predictions = predictions.groupby(level=0).first()
    predictions = pd.concat([test['id'],predictions], axis=1, verify_integrity=False)
    print predictions
    return predictions
class SVRegression:
    def __init__(self, kernel_value, c_value, iter_value):
        self.kernel = kernel_value
        self.c = c_value
        self.iter = iter_value
        self.svr_lin = None
    
    def fit_predict(self, x_train, y_train, x_test):
        self.svr_lin = SVR(kernel=self.kernel, C=self.c, max_iter=self.iter)
        y_lin = self.svr_lin.fit(x_train, y_train).predict(x_test)
        return y_lin
    
    def computeC(self, x_train):
        print "ARRAY ", type(x_train)
        print x_train
        array = x_train.todense()
        print "ARRAY ", type(array)
        print array
        result = array.sum(axis=1, dtype='float')
        result = pow(result, 2)
        total = result.sum(axis=0, dtype='float')
        rows, columns = x_train.shape
        total = float(total)/float(rows)
        total = pow(total,-1)
        print "C", total
        self.c = total

    def computeAccuracy(self, x, y):
        return self.svr_lin.score(x, y)
def compute_mse(regressor, horizon):
    # get wind park and corresponding target. 
    windpark = NREL().get_windpark(NREL.park_id['tehachapi'], 3, 2004, 2005)
    target = windpark.get_target()

    # use power mapping for pattern-label mapping. 
    feature_window = 3
    mapping = PowerMapping()
    X = mapping.get_features_park(windpark, feature_window, horizon)
    y = mapping.get_labels_turbine(target, feature_window, horizon)

    # train roughly for the year 2004, test for 2005.
    train_to = int(math.floor(len(X) * 0.5))
    test_to = len(X)
    train_step, test_step = 25, 25
    X_train=X[:train_to:train_step]
    y_train=y[:train_to:train_step]
    X_test=X[train_to:test_to:test_step]
    y_test=y[train_to:test_to:test_step]

    if(regressor == 'svr'):
        reg = SVR(kernel='rbf', epsilon=0.1, C = 100.0,\
                gamma = 0.0001).fit(X_train,y_train)
        mse = mean_squared_error(reg.predict(X_test),y_test)
    elif(regressor == 'knn'):
        reg = KNeighborsRegressor(10, 'uniform').fit(X_train,y_train)
        mse = mean_squared_error(reg.predict(X_test),y_test)
    return mse
Example #27
    def RunSVRScikit(q):
      totalTimer = Timer()

      # Load input dataset.
      Log.Info("Loading dataset", self.verbose)
      # Use the last row of the training set as the responses.
      X, y = SplitTrainData(self.dataset)

      # Get all the parameters.
      c = re.search("-c (\d+\.\d+)", options)
      e = re.search("-e (\d+\.\d+)", options)
      g = re.search("-g (\d+\.\d+)", options)

      C = 1.0 if not c else float(c.group(1))
      epsilon = 1.0 if not e else float(e.group(1))
      gamma = 0.1 if not g else float(g.group(1))

      try:
        with totalTimer:
          # Perform SVR.
          model = SSVR(kernel='rbf', C=C, epsilon=epsilon, gamma=gamma)
          model.fit(X, y)
      except Exception as e:
        q.put(-1)
        return -1

      time = totalTimer.ElapsedTime()
      q.put(time)
      return time
def train_svm(train_file, avg={}):
    test_X, test_Y, weight = load_data(train_file, avg)
    svr = SVR(kernel='rbf', C=100, gamma=1, verbose=True, cache_size=1024)
    print("start train")
    svr.fit(test_X, test_Y)
    print("train finish")
    return svr
Example #29
def train_svm(data):
    test_X, test_Y = load_data(data)
    svr = SVR(kernel='rbf', C=100, gamma=1)
    print("start train")
    svr.fit(test_X, test_Y)
    print("train finish")
    return svr
Example #30
# This method is useful because you can use it on any model and on out-of-sample data.

vi2 = permutation_vi(mod_rf, test_X, test_y)

(ggplot(vi2.melt(), aes(y="value", x='variable')) + geom_boxplot() +
 coord_flip() + ylim(0, 10))
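permutation_vi is a helper defined earlier in this notebook; a minimal sketch of such a permutation-importance function (my reconstruction, under the assumption that it returns one row of per-feature scores per shuffle round) might be:

import numpy as np
import pandas as pd

def permutation_vi_sketch(model, X, y, n_rounds=30):
    # Ratio of permuted MSE to baseline MSE, per feature and per round;
    # values well above 1 mean the model genuinely used that feature.
    base_mse = np.mean((model.predict(X) - y) ** 2)
    rows = []
    for _ in range(n_rounds):
        scores = {}
        for col in X.columns:
            X_perm = X.copy()
            X_perm[col] = np.random.permutation(X_perm[col].values)
            scores[col] = np.mean((model.predict(X_perm) - y) ** 2) / base_mse
        rows.append(scores)
    return pd.DataFrame(rows)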

# %% -----------------------------------------

# Let's use a completely different class of model, and the
# method still works. This is the model "agnostic" bit.

from sklearn.svm import SVR

mod_svr = SVR()
mod_svr.fit(X, y)

vi3 = permutation_vi(mod_svr, X, y)

(ggplot(vi3.melt(), aes(y="value", x='variable')) + geom_boxplot() +
 coord_flip() + ylim(0, 10))

# %% -----------------------------------------

# Problematic when features are highly correlated.

# Set seed
np.random.seed(123)

# Generate correlated predictors
Example #31
# Fitting SVR to the dataset
'''
Gaussian RBF (Radial Basis Function) is another popular kernel used in SVM
models. The RBF kernel is a function whose value depends on the distance
from the origin or from some reference point.
'''
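Concretely, the RBF kernel value for two points x and x' is exp(-gamma * ||x - x'||^2); a short sanity check against sklearn (illustrative, not part of the original tutorial):

import numpy as np
from sklearn.metrics.pairwise import rbf_kernel

x1, x2, gamma = np.array([[1.0, 2.0]]), np.array([[2.0, 0.0]]), 0.5
manual = np.exp(-gamma * np.sum((x1 - x2) ** 2))   # kernel computed by hand
assert np.isclose(manual, rbf_kernel(x1, x2, gamma=gamma)[0, 0])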
### Feature scaling is necessary for the SVR model, because it is not applied automatically
from sklearn.preprocessing import StandardScaler
sc_X=StandardScaler()
sc_Y=StandardScaler()
X=sc_X.fit_transform(X)
y=sc_Y.fit_transform(y.reshape(-1, 1)).ravel()  # StandardScaler expects a 2-D array

from sklearn.svm import SVR
regressor = SVR(kernel = 'rbf')
regressor.fit(X, y)

# Predicting a new result
y_pred = regressor.predict(sc_X.transform(np.array([[6.5]])))  # transform (not fit_transform) the query point
y_pred = regressor.predict(X)
y_pred = sc_Y.inverse_transform(y_pred)

# Visualising the SVR results
plt.scatter(X, y, color = 'red')
plt.plot(X, regressor.predict(X), color = 'blue')
plt.title('Truth or Bluff (SVR)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()
# Predicting a new result
reg.predict(X_test)



# Fitting Polynomial Regression to the dataset
from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures(degree = 4)
X_poly = poly_reg.fit_transform(X)
poly_reg.fit(X_poly, y)
lin_reg_2 = LinearRegression()
lin_reg_2.fit(X_poly, y)

# Fitting SVR to the dataset
from sklearn.svm import SVR
regressor = SVR(kernel = 'rbf')
regressor.fit(X, y)

# Fitting Decision tree to dataset

from sklearn.tree import DecisionTreeRegressor
regressor=DecisionTreeRegressor(random_state=0)
regressor.fit(X,y)
regressor.predict(X_test)
from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(y_test,predictions))
print(confusion_matrix(y_test,predictions))


# Fitting Random Forest Regression to the dataset
from sklearn.ensemble import RandomForestRegressor
def main():
    # Load data and run brief analysis on it
    raw_data = load_data('train.csv')
    quick_analysis(raw_data)

    plt.hist(raw_data['SalePrice'])
    plt.show()

    # View all unique values of categorical features
    non_numeric_cols = raw_data.loc[:, raw_data.dtypes == object]

    for col in non_numeric_cols.columns:
        print(non_numeric_cols[col].value_counts())

    # Analyze correlations between features and the label
    corr_matrix = raw_data.corr()
    sale_correl = corr_matrix['SalePrice'].sort_values(ascending=False)
    print(sale_correl)

    # Feature engineering the following:
    #   Grade = OverallQual / OverallCond
    #   Age = YrSold - YearBuilt
    #   RemodAge = YrSold - YearRemodAdd
    #   TotalSF = TotalBsmtSF + 1stFlrSF + 2ndFlrSF

    raw_data['Grade'] = raw_data['OverallQual'] / raw_data['OverallCond']
    raw_data['Age'] = raw_data['YrSold'] - raw_data['YearBuilt']
    raw_data['RemodAge'] = raw_data['YrSold'] - raw_data['YearRemodAdd']
    raw_data['TotalSF'] = raw_data['TotalBsmtSF'] + raw_data[
        '1stFlrSF'] + raw_data['2ndFlrSF']

    # Correlation matrix for the new features
    corr_matrix = raw_data.corr()
    sale_correl = corr_matrix['SalePrice'].sort_values(ascending=False)
    print(sale_correl)

    # Check correlation of new features with their respective components
    age_correl = corr_matrix['Age'].sort_values(ascending=False)
    print('Age correlations:', age_correl, '\n')

    remod_age_correl = corr_matrix['RemodAge'].sort_values(ascending=False)
    print('RemodAge correlations:', remod_age_correl, '\n')

    grade_correl = corr_matrix['Grade'].sort_values(ascending=False)
    print('Grade correlations:', grade_correl, '\n')

    totalsf_correl = corr_matrix['TotalSF'].sort_values(ascending=False)
    print('TotalSF correlations:', totalsf_correl, '\n')

    # Correlation matrix vizualization
    corr_plot(raw_data, 'SalePrice', fig_size=(4, 4))
    corr_plot(raw_data, 'SalePrice', plot_type='hist', fig_size=(4, 4))

    # Change type of columns to reflect their nature. Concretely, change the YrSold, MoSold, MSZoning and OverallCond features to categorical ones
    raw_data['YrSold_C'] = raw_data['YrSold'].copy().astype(str)
    raw_data['MoSold'] = raw_data['MoSold'].astype(str)
    raw_data['MSZoning'] = raw_data['MSZoning'].astype(str)
    raw_data['OverallCond_C'] = raw_data['OverallCond'].copy().astype(str)

    num_cols = [
        'OverallQual',
        'OverallCond',
        'YearBuilt',
        'YearRemodAdd',
        'TotalBsmtSF',
        '1stFlrSF',
        '2ndFlrSF',
        'GarageCars',
        'GarageArea',
        'FullBath',
        'YrSold',
    ]
    cat_cols = [
        'MSZoning',
        'Street',
        'Utilities',
        'Neighborhood',
        'ExterQual',
        'ExterCond',
        'BsmtQual',
        'BsmtCond',
        'Heating',
        'CentralAir',
        'PavedDrive',
        'SaleType',
        'SaleCondition',
        'YrSold_C',
        'MoSold',
        'OverallCond_C',
    ]

    # Create a list of all values that the categorical features can take
    cat_cols_categs = [raw_data[col].unique() for col in cat_cols]
    print(cat_cols_categs)

    # Create the pipeline to process data
    num_pipeline = Pipeline([
        ('feat_sel', FeatureSelector(num_cols, True)),
        ('Grade',
         FeatureCreator(['OverallCond', 'OverallQual'],
                        lambda x, y: x / y,
                        as_dataframe=True,
                        feat_name='Grade')),
        ('Age',
         FeatureCreator(['YrSold', 'YearBuilt'],
                        lambda x, y: x - y,
                        as_dataframe=True,
                        feat_name='Age')),
        ('RemodAge',
         FeatureCreator(['YrSold', 'YearRemodAdd'],
                        lambda x, y: x - y,
                        as_dataframe=True,
                        feat_name='RemodAge')),
        ('TotalSF',
         FeatureCreator(['TotalBsmtSF', '1stFlrSF', '2ndFlrSF'],
                        lambda x, y: x + y,
                        as_dataframe=True,
                        feat_name='TotalSF')),
        ('drop_cat_feat',
         FeatureDropper(['YrSold', 'OverallCond'], as_dataframe=True)),
        ('imputer_mean', Imputer(strategy='mean')),
        ('std_scaler', RobustScaler())
    ])

    cat_pipeline = Pipeline([
        ('feat_sel', FeatureSelector(cat_cols, True)),
        ('imputer_most_frequent', CategoricalImputer()),
        ('encode', OneHotEncoder(categories=cat_cols_categs, sparse=False)),
    ])
    feat_union = FeatureUnion(transformer_list=[
        ('num_features', num_pipeline),
        ('cat_features', cat_pipeline),
    ])

    # Create the train data and labels
    train_labels = raw_data['SalePrice'].copy()
    train_feat = feat_union.fit_transform(raw_data)

    # Check the linear regression model
    lin_reg = LinearRegression()
    print('Linear regression best hyperparameters:')
    final_lr_model = find_best_estimator(lin_reg, [{}], train_feat,
                                         train_labels)

    # Check the decision tree model
    hyperparams_vals = [
        {
            'max_features': [6, 10, 12, 16, 18, 20, 24]
        },
    ]

    dt_reg = DecisionTreeRegressor(random_state=42)
    print('Decision tree best hyperparameters:')
    final_dt_model = find_best_estimator(dt_reg, hyperparams_vals, train_feat,
                                         train_labels)

    # Check the random forest model
    hyperparams_vals = [
        {
            'n_estimators': [200, 225, 250],
            'max_features': [16, 24, 30]
        },
        {
            'bootstrap': [False],
            'n_estimators': [220, 225],
            'max_features': [24, 28]
        },
    ]

    forest_reg = RandomForestRegressor(n_jobs=-1, random_state=42)
    print('Random forest best hyperparameters:')
    final_rf_model = find_best_estimator(forest_reg, hyperparams_vals,
                                         train_feat, train_labels)

    # Check the XGBoost model
    hyperparams_vals = [
        {
            'n_estimators': [450, 500, 400],
            'max_features': [2, 4, 8],
            'max_depth': [3, 4, None]
        },
    ]

    xgbr_reg = XGBRegressor(learning_rate=0.05, n_threads=-1, random_state=42)
    print('XGBoost regressor best hyperparameters:')
    final_xgb_model = find_best_estimator(xgbr_reg, hyperparams_vals,
                                          train_feat, train_labels)

    # Check the SVM model
    hyperparams_vals = [
        {
            'kernel': ['linear', 'sigmoid', 'rbf'],
            'gamma': ['auto', 'scale']
        },
        {
            'kernel': ['poly'],
            'gamma': ['auto', 'scale'],
            'degree': [3, 4, 5]
        },
    ]

    svm_reg = SVR()
    print('Support vector machine best hyperparameters:')
    final_svm_model = find_best_estimator(svm_reg, hyperparams_vals,
                                          train_feat, train_labels)

    # Check the ElasticNet model
    hyperparams_vals = [
        {
            'alpha': [0.0005, 0.005, 0.05, 0.2],
            'l1_ratio': [0.1, 0.25, 0.75, 0.9]
        },
    ]

    enet_reg = ElasticNet(max_iter=100000000, tol=0.001)
    print('ElasticNet best hyperparameters:')
    final_enet_model = find_best_estimator(enet_reg, hyperparams_vals,
                                           train_feat, train_labels)

    # Check the feature importances for both random forest algorithms
    rf_feat_imp = final_rf_model.feature_importances_
    xgb_feat_imp = final_xgb_model.feature_importances_

    other_feat = ['Grade', 'RemodAge', 'TotalSF']
    all_features = num_cols.copy()
    print(num_cols)
    for cat_values in cat_cols_categs.copy():
        all_features.extend(cat_values)
    all_features.extend(other_feat.copy())

    print('Random forest feature importances:')
    for feat in sorted(zip(rf_feat_imp, all_features), reverse=True):
        print(feat)

    print('\nXGBoost feature importances:')
    for feat in zip(xgb_feat_imp, all_features):
        print(feat)

    # Load and process test data
    test_data = load_data('test.csv')
    test_data['YrSold_C'] = test_data['YrSold'].copy().astype(str).replace(
        'nan', None)
    test_data['MoSold'] = test_data['MoSold'].astype(str).replace('nan', None)
    test_data['MSZoning'] = test_data['MSZoning'].astype(str).replace(
        'nan', None)
    test_data['OverallCond_C'] = test_data['OverallCond'].copy().astype(
        str).replace('nan', None)
    test_feat = feat_union.transform(test_data)

    # Predict using the combination of Random Forest and XGBoost
    rf_predictions = final_rf_model.predict(test_feat)
    xgb_predictions = final_xgb_model.predict(test_feat)
    predictions = rf_predictions * 0.35 + xgb_predictions * 0.65

    # Save resulting predictions
    pred_df = pd.DataFrame()
    pred_df['Id'] = test_data['Id']
    pred_df['SalePrice'] = predictions.flatten()

    print(pred_df)
    pred_df.to_csv('submission_rf_xgb.csv', index=False)

    # Predict using only the XGBoost model
    xgb_predictions = final_xgb_model.predict(test_feat)
    predictions = xgb_predictions.copy()

    pred_df = pd.DataFrame()
    pred_df['Id'] = test_data['Id']
    pred_df['SalePrice'] = predictions.flatten()

    print(pred_df)
    pred_df.to_csv('submission_xgb.csv', index=False)
Example #34
from sklearn.externals import joblib
from sklearn.linear_model import BayesianRidge, LinearRegression, ElasticNet, Lasso
from sklearn.svm import SVR
from sklearn.ensemble.gradient_boosting import GradientBoostingRegressor
import os

default_model_dir = 'models'

_models = {
    'bayesian_ridge': BayesianRidge(),
    'linear_regression': LinearRegression(),
    'elastic_net': ElasticNet(),
    'lasso': Lasso(),
    'svr': SVR(kernel='linear'),
    'gbr': GradientBoostingRegressor(n_estimators=300, max_depth=5)
}


def get_model_names():
    """
    Get supported model names.

    :return:
    """
    return list(_models.keys())


def get_models(model_name):
    """
    Get models.
Example #35
# Add noise to targets
# y[::5] adds a random offset only to every fifth element of the 1-D array
y[::5] += 3 * (0.5 - np.random.rand(8))
print(y)
# [ 0.04361009  0.17796574  0.22978773  0.24928643  0.32014619  0.13695542
#   0.70427365  0.72169941  0.78309245  0.80656999  0.3792032   0.9218538
#   0.96352582  0.99939807  0.9366527   1.22007951  0.86145011  0.78439525
#   0.72848344  0.65509942 -0.91410799  0.37470255  0.27513696  0.24822033
#   0.09237645  0.42416063  0.01079613 -0.06667189 -0.07494893 -0.22322095
#  -0.86223429 -0.54825618 -0.59995522 -0.85384305 -0.98249348 -2.24695741
#  -0.99667893 -0.99887435 -0.99247294 -0.96955196]

# #############################################################################
# Fit regression model
# support vector machine regression(kinds of regression)
svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
svr_lin = SVR(kernel='linear', C=1e3)
svr_poly = SVR(kernel='poly', C=1e3, degree=4)

y_rbf = svr_rbf.fit(X, y).predict(X)
y_lin = svr_lin.fit(X, y).predict(X)
y_poly = svr_poly.fit(X, y).predict(X)

# #############################################################################
# Look at the results
lw = 2
plt.scatter(X, y, color='darkorange', label='data')
plt.plot(X, y_rbf, color='navy', lw=lw, label='RBF model')
plt.plot(X, y_lin, color='c', lw=lw, label='Linear model')
plt.plot(X, y_poly, color='cornflowerblue', lw=lw, label='Polynomial model')
plt.xlabel('data')
Example #36
def ml():
    """Function tasked with running the machine learning alglorythm

    """
    import numpy 
    from sklearn import linear_model
    from sklearn.metrics import mean_squared_error, r2_score
    import statsmodels.api as sm
    from sklearn.model_selection import train_test_split
    
    resp=[] #define response list
    
    def convert(): # convert the dataset into a final dataset file by removing the lines containing empty ratings
        out=open("data/new_dataoutput.txt","w")
        f = open("data/testoutput.txt")
        for line in f:         
            li=line.split("|")
            if li[2]=='' or li[0]=='':
                continue
            else:
                out.write(line) # write new dataset
                resp.append(float(li[2])) # import response into list
        f.close()
        out.close()        
    
    convert()
    
    #export features from new (final) dataset file
    data=numpy.loadtxt("data/new_dataoutput.txt",delimiter="|", usecols = (3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40))
    
    train_Y=[]
    test_Y=[]
    
    
    #Random split using built-in function:
    train_X, test_X, train_Y, test_Y = train_test_split(data, resp, test_size=0.5, random_state=42)
    
    #Ordinary Least Squares:
    reg = linear_model.LinearRegression()
    reg.fit(train_X,train_Y) #Fit the model
    pred_Y = reg.predict(test_X) #Make predictions
    print("\nOrdinary Least Squares prediction:")
    print("Mean squared error: %.2f" % mean_squared_error(test_Y, pred_Y)) #Calculates MSE of predictions
    print('Variance score: %.2f' % r2_score(test_Y, pred_Y)) #Calculates R2 of predictions
    
    # Analysis of the linear coefficients significance (least squares regression):
    X2 = sm.add_constant(train_X)
    est = sm.OLS(train_Y, X2)#Define the model
    est2 = est.fit()#Fit the model
    print(est2.summary())
    
    #Ridge regression
    reg = linear_model.Ridge(alpha=.5)
    reg.fit(train_X,train_Y)#Fit the model
    pred_Y = reg.predict(test_X)#Make predictions
    print("\nRidge Regression prediction:")
    print("Mean squared error: %.2f" % mean_squared_error(test_Y, pred_Y))#Calculates MSE of predictions
    print('Variance score: %.2f' % r2_score(test_Y, pred_Y))#Calculates R2 of predictions
    
    reg = linear_model.RidgeCV(alphas=[0.1, 1.0, 10.0], cv=3)
    reg.fit(train_X,train_Y)#Fit the model
    pred_Y = reg.predict(test_X)#Make predictions
    print("\nRidge Regression with Generalized Cross-Validation prediction:")
    print("Mean squared error: %.2f" % mean_squared_error(test_Y, pred_Y))#Calculates MSE of predictions
    print('Variance score: %.2f' % r2_score(test_Y, pred_Y))#Calculates R2 of predictions
    
    reg = linear_model.Lasso(alpha=0.1)
    reg.fit(train_X,train_Y)#Fit the model
    pred_Y = reg.predict(test_X)#Make predictions
    print("\nLasso Model:")
    print("Mean squared error: %.2f" % mean_squared_error(test_Y, pred_Y))#Calculates MSE of predictions
    print('Variance score: %.2f' % r2_score(test_Y, pred_Y))#Calculates R2 of predictions
    
    reg = linear_model.Lars(n_nonzero_coefs=1)
    reg.fit(train_X,train_Y)#Fit the model
    pred_Y = reg.predict(test_X)#Make predictions
    print("\nLeast Angle Regression (LARS) Model:")
    print("Mean squared error: %.2f" % mean_squared_error(test_Y, pred_Y))#Calculates MSE of predictions
    print('Variance score: %.2f' % r2_score(test_Y, pred_Y))#Calculates R2 of predictions
    
    reg = linear_model.LassoLars(alpha=.1)
    reg.fit(train_X,train_Y)#Fit the model
    pred_Y = reg.predict(test_X)#Make predictions
    print("\nLARS Lasso Model:")
    print("Mean squared error: %.2f" % mean_squared_error(test_Y, pred_Y))#Calculates MSE of predictions
    print('Variance score: %.2f' % r2_score(test_Y, pred_Y))#Calculates R2 of predictions
    
    reg = linear_model.BayesianRidge()
    reg.fit(train_X,train_Y)#Fit the model
    pred_Y = reg.predict(test_X)#Make predictions
    print("\nBayesian Ridge Regressor predictions:")
    print("Mean squared error: %.2f" % mean_squared_error(test_Y, pred_Y))#Calculates MSE of predictions
    print('Variance score: %.2f' % r2_score(test_Y, pred_Y))#Calculates R2 of predictions
    
    #SVR:
    from sklearn.svm import SVR
    reg = SVR(kernel='rbf', C=1e3, gamma=0.1)
    reg.fit(train_X,train_Y)#Fit the model
    pred_Y = reg.predict(test_X)#Make predictions
    print("\nSupported Vector Machine Regressor predictions:")
    print("Mean squared error: %.2f" % mean_squared_error(test_Y, pred_Y))#Calculates MSE of predictions
    print('Variance score: %.2f' % r2_score(test_Y, pred_Y))#Calculates R2 of predictions
    
    from sklearn import tree
    reg=tree.DecisionTreeRegressor()
    reg.fit(train_X,train_Y)#Fit the model
    pred_Y = reg.predict(test_X)#Make predictions
    #print(pred_Y,resp)
    print("\nDecision Tree Regressor predictions:")
    print("Mean squared error: %.2f" % mean_squared_error(test_Y, pred_Y))#Calculates MSE of predictions
    print('Variance score: %.2f' % r2_score(test_Y, pred_Y))#Calculates R2 of predictions
    
    from sklearn.ensemble import AdaBoostRegressor
    from sklearn.tree import DecisionTreeRegressor
    
    rng = numpy.random.RandomState(1)
    reg = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),n_estimators=300, random_state=rng)
    reg.fit(train_X,train_Y)#Fit the model
    pred_Y = reg.predict(test_X)#Make predictions
    print("\nAda Boost Regressor predictions:")
    print("Mean squared error: %.2f" % mean_squared_error(test_Y, pred_Y))#Calculates MSE of predictions
    print('Variance score: %.2f' % r2_score(test_Y, pred_Y))#Calculates R2 of predictions
    
    #Dimensionality reduction with PCA and then linear regression
    from sklearn.decomposition import PCA
    pca = PCA(n_components=2)
    pca_data=pca.fit_transform(data) #Do PCA dimensional reduction
     #Random split using built-in function:
    pca_train_X, pca_test_X, train_Y, test_Y = train_test_split(pca_data, resp, test_size=0.5, random_state=42)
    reg = linear_model.LinearRegression() # Do least squares
    reg.fit(pca_train_X,train_Y) #Fit the model
    pred_Y = reg.predict(pca_test_X)#Make predictions
    print("\nLeast squares with features compressed into 2 principal components:")
    print("Mean squared error: %.2f" % mean_squared_error(test_Y, pred_Y))#Calculates MSE of predictions
    print('Variance score: %.2f' % r2_score(test_Y, pred_Y))#Calculates R2 of predictions
    
    #Add polynomial features
    from sklearn.preprocessing import PolynomialFeatures
    poly = PolynomialFeatures(degree=3)
    a=poly.fit_transform(train_X) #add polynomial features to training set
    b=poly.fit_transform(test_X) #add polynomial features to testing set
    reg = linear_model.LinearRegression() #Least squares
    reg.fit(a,train_Y)#Fit the model
    pred_Y = reg.predict(b)#Make predictions
    print("\nLeast squares with polynomial features:")
    print("Mean squared error: %.2f" % mean_squared_error(test_Y, pred_Y))#Calculates MSE of predictions
    print('Variance score: %.2f' % r2_score(test_Y, pred_Y))#Calculates R2 of predictions
Example #37
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

dataset = pd.read_csv('Position_Salaries.csv')
x = dataset.iloc[:, 1].values
y = dataset.iloc[:, 2].values

from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
sc_y = StandardScaler()
x = sc_x.fit_transform(x.reshape(len(x), 1))
y = sc_y.fit_transform(y.reshape(len(y), 1))

from sklearn.svm import SVR
regressor = SVR(kernel='rbf', degree=3)
regressor.fit(x, y)

# regressor.predict()
# the query point is an x value, so scale it with sc_x (not sc_y)
y_pred = sc_y.inverse_transform(
    regressor.predict(sc_x.transform(np.array([[6.5]]))))
print(y_pred)

plt.scatter(x, y, color='red')
#plt.plot(x, y)
#plt.plot(x, y, marker='o', markersize=3, color="red")
#plt.plot(x, y_pred, color = 'black')
plt.plot(x, regressor.predict(x), color='blue')
plt.title('Truth or Bluff (SVR model)')
plt.xlabel('Position')
plt.ylabel('Salary')
Example #38
    for i in range(len(df_total)):
        Y11.append(abs(Y1[i][0]))

    #getting rid of 0's
    for i in range(len(Y11)):
        if Y11[i] > 1 and Y11[i] < 121:
            a = Y11[i]
            Y.append(a)

    for i in range(len(Y)):
        X.append([i])

    print X

    #initializes the model
    svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)

    X_predicted = []
    Y_predicted = []

    #set the total number of positions to iterate over, so that the
    #look-ahead index (i+24) still falls inside the data
    total = len(X) - TrainingWindow - 2

    #iterations of i for every n, where n is window size. 3 in this case
    PredictionWindowArray = []
    MAPE = []
    TrueError = []
    PredictionWindow = 2
    i = 0
Example #39
X_train, X_test, y_train, y_test = train_test_split(train_X_reduced,
                                                    train_y,
                                                    test_size=0.2)

##################

np.set_printoptions(precision=4)
pd.set_option('precision', 4)

model_ridge = Ridge(alpha=12.0, random_state=seed)
model_KRR = KernelRidge(alpha=0.2,
                        kernel='polynomial',
                        degree=2,
                        coef0=2.0,
                        gamma=0.0032)
model_svr = SVR(C=44.73, epsilon=0.0774, gamma=0.0004, kernel='rbf')
model_byr = BayesianRidge()
model_ENet = ElasticNet(alpha=0.0001,
                        l1_ratio=0.551,
                        random_state=seed,
                        max_iter=10000)
model_lasso = Lasso(alpha=0.0004, random_state=seed)
model_lsvr = LinearSVR(C=0.525, epsilon=0.04, random_state=seed)
model_lasso_lars = LassoLars(alpha=1.22e-05)

model_rforest = RandomForestRegressor(n_estimators=300,
                                      max_features=0.4,
                                      min_samples_split=4,
                                      random_state=seed)

model_GBoost = GradientBoostingRegressor(n_estimators=2000,
# Initialize models

clf_line = LinearRegression()
clf_ridg = Ridge(alpha=300, tol=1e-05, solver='sparse_cg', max_iter=5000)
clf_laso = Lasso(alpha=0.1, tol=1e-05, max_iter=5000)
clf_lala = LassoLars(alpha=0.001, max_iter=5000)
clf_enet = ElasticNet(alpha=0.1, tol=0.001, l1_ratio=0.2, max_iter=5000)

clf_xgbr = xgb.XGBRegressor() # not yet
clf_xgrf = xgb.XGBRFRegressor() # not yet

# clf_rf = RandomForestRegressor(criterion='mae', max_features='sqrt', n_estimators=200, max_depth=10)
clf_tree = ExtraTreesRegressor(criterion='mae', max_features='sqrt', n_estimators=200, max_depth=10)
clf_ada = AdaBoostRegressor(n_estimators=3, loss='linear')
# clf_grad = GradientBoostingRegressor() # not yet
clf_svr = SVR(kernel='rbf', C=0.1)

# ori 5
# base_model_name = ['RandomForest', 'ExtraTree', 'AdaBoost', 'GradientBoosting', 'SVR']
# base_model_list = [clf_rf, clf_tree, clf_ada, clf_grad, clf_svr]

# new 5
base_model_name = ['Ridge', 'SVR', 'XgbReg', 'ExtraTree', 'AdaBoost']
base_model_list = [clf_ridg, clf_svr, clf_xgbr, clf_tree, clf_ada]


# base_model_name = ['LinearReg', 'Ridge', 'Lasso', 'LassoLars', 'ElasticNet', 'XgbReg', 'XgbRf', 'ExtraTree', 'AdaBoost', 'SVR']
# base_model_list = [clf_line, clf_ridg, clf_laso, clf_lala, clf_enet, clf_xgbr, clf_xgrf, clf_tree, clf_ada, clf_svr]

# base_model_name = ['LinearReg', 'Ridge', 'Lasso', 'LassoLars', 'ElasticNet', 'Xgb', 'RandomForest', 'ExtraTree', 'AdaBoost', 'GradientBoosting', 'SVR']
# base_model_list = [clf_line, clf_ridg, clf_laso, clf_lala, clf_enet, clf_bxgb, clf_rf, clf_tree, clf_ada, clf_grad, clf_svr]
Example #41
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import accuracy_score

n = 15  # number of iterations of the training and testing process
scaler = StandardScaler()
linear_regression = LinearRegression()
forest = RandomForestRegressor()
boosting = GradientBoostingRegressor(random_state=60)
regressor = SVR(
    kernel='linear'
)  # kernels are rbf, linear and polynomial, of which the polynomial kernel gives the highest MAE and MSE

warnings.filterwarnings("ignore")
pd.set_option('display.width', 10000000)
pd.set_option('display.max_columns', 10000000)
# pd.set_option('display.max_rows', 10000000)

DataSet = pd.read_csv("Video_Games_Sales_as_at_22_Dec_2016.csv")

# Wii Sports is not a game; it's a bundle of games that sold around 82.53 million
# copies, far more than any other game in the dataset.
# That makes it a huge outlier that would hurt the accuracy of any model, so we remove it
DataSet.drop(index=[0], inplace=True)
DataSet.drop(
    'Developer', axis=1, inplace=True
Example #42
dataset = pd.read_csv("Position_Salaries.csv")
X = dataset.iloc[:, 1:-1].values
y = dataset.iloc[:, -1].values


#feature scalling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_y = StandardScaler()
X = sc_X.fit_transform(X)
y = sc_y.fit_transform(y.reshape(-1,1).astype(float))

#fitting the svr model to the dataset
#create regressor
from sklearn.svm import SVR
regressor = SVR(kernel = "rbf")
regressor.fit(X,y)



#predicting a new result using svr
y_pred = sc_y.inverse_transform(regressor.predict(sc_X.transform(np.array([[6.5]])))) 



#visualing the svr result
X_grid = np.arange(min(X), max(X),0.1)
X_grid = X_grid.reshape((len(X_grid),1))
plt.scatter(X, y, color='red')
plt.plot(X_grid, regressor.predict(X_grid),color="blue")
plt.title("regression model")
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
"""
# Feature Scaling 
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()#objects
sc_Y = StandardScaler()#objects that are going to scale x and y
X = X.reshape(-1,1)
Y = Y.reshape(-1,1)
X = sc_X.fit_transform(X)#fitting and transforming these to the scale these x and y 
Y = sc_Y.fit_transform(Y)


#fitting the SVR to the dataset
#create our regressor
from sklearn.svm import SVR
regressor = SVR(kernel='rbf')
regressor.fit(X,Y)


#predicting the results with the SVR
Y_pred = sc_Y.inverse_transform(regressor.predict(sc_X.transform(np.array([[6.5]]))))
#X and Y were feature-scaled before fitting, so the query value 6.5 must be wrapped in an
#array and transformed with sc_X before predicting; the prediction comes back on the
#scaled salary axis, so sc_Y.inverse_transform recovers the salary on its original scale

#visualising the SVR results
plt.scatter(X , Y , color='red')
plt.plot(X, regressor.predict(X),color='blue')#plot the fitted SVR curve over the scaled positions
plt.title("Truth or Bluff(SVR)")
Example #44
# Importing the dataset
dataset = pd.read_csv('Position_Salaries.csv')
X = dataset.iloc[:, 1:2].values
y = dataset.iloc[:, 2].values

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_y = StandardScaler()
X = sc_X.fit_transform(X)
y = sc_y.fit_transform(y.reshape(-1, 1))

# Fitting SVR to the dataset
from sklearn.svm import SVR
regressor = SVR(kernel='rbf')
regressor.fit(X, y.ravel())  # ravel() flattens y to the 1D shape fit() expects

# Predicting a new result; X was scaled, so the query value must be transformed first
y_pred = regressor.predict(sc_X.transform([[6.5]]))
y_pred = sc_y.inverse_transform(y_pred.reshape(-1, 1))

# Visualising the SVR results
plt.scatter(X, y, color='purple')
plt.plot(X, regressor.predict(X), color='blue')
plt.title('Truth or Bluff (SVR)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()

import time
from sklearn import preprocessing
from sklearn.svm import SVR

features = ['High', 'Low', 'Open']
X = preprocessing.scale(df_svr[features])
y = df_svr.Price

# Take first 90% as the train data
n_split = int(len(df_svr) * 0.9)

# Define training and testing
X_train, X_test = X[:n_split], X[n_split:]
y_train, y_test = y[:n_split], y[n_split:]
test_date = df_svr.Date[n_split:]

time_Start = time.time()

# Regressor (SVR performs regression, despite the clf naming)
clf = SVR(kernel='rbf', C=1e3, gamma=0.1)
clf = clf.fit(X_train, y_train)

# Prediction
prediction = clf.predict(X_test)
petur.print_evaluation(y_test, prediction, "SVR")
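# (Added note, hedged: petur.print_evaluation is project-specific and not
# shown here; a minimal stand-in would report standard regression metrics)
# from sklearn.metrics import mean_squared_error, mean_absolute_error
# print("SVR MSE:", mean_squared_error(y_test, prediction))
# print("SVR MAE:", mean_absolute_error(y_test, prediction))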

time_End = time.time()
print("Seconds to run:", time_End - time_Start)

#==============================================================================
# Plots
#==============================================================================
# Define time series data
real_price = pd.Series(y_test)
    from sklearn.preprocessing import PolynomialFeatures
    #for degree in range(2, 6):
    #    model = make_pipeline(PolynomialFeatures(degree=degree), linear_model.Ridge())
    #    scores = cross_val_score(model, x_scaled, y, cv=kf, scoring='neg_mean_squared_error')
    #    print("MSE: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))
    model = make_pipeline(PolynomialFeatures(degree=3), linear_model.Ridge())
    if(verbose): print('polyl2::Cross validating')
    scores = cross_val_score(model, x_scaled, y, cv=kf, scoring='neg_mean_squared_error')
    scores_map['PolyRidge'] = scores

if(alg == 'svr'):
    from sklearn.svm import SVR
    from sklearn.model_selection import GridSearchCV
    
    if(verbose): print('SVR::Initiating SVR')
    svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
    #grid_sv = GridSearchCV(svr_rbf, cv=kf, param_grid={"C": [1e0, 1e1, 1e2, 1e3], "gamma": np.logspace(-2, 2, 5)}, scoring='neg_mean_squared_error')
    #grid_sv.fit(x_scaled, y)
    #print("Best classifier :", grid_sv.best_estimator_)
    if(verbose): print('SVR::Cross validating')
    scores = cross_val_score(svr_rbf, x_scaled, y, cv=kf, scoring='neg_mean_squared_error')
    scores_map['SVR'] = scores
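    # (Added note: with scoring='neg_mean_squared_error', cross_val_score
    # returns negated MSE values, so flip the sign when reporting, e.g.:)
    # print("SVR MSE: %0.3f (+/- %0.3f)" % (-scores.mean(), scores.std()))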


if(alg == 'tree'):
    from sklearn.tree import DecisionTreeRegressor
    
    desc_tr = DecisionTreeRegressor(max_depth=5)
    #grid_sv = GridSearchCV(desc_tr, cv=kf, param_grid={"max_depth" : [1, 2, 3, 4, 5, 6, 7]}, scoring='neg_mean_squared_error')
    #grid_sv.fit(x_scaled, y)
    #print("Best classifier :", grid_sv.best_estimator_)
import numpy as np
import pandas as pd
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score

dfTraining = pd.read_csv("DataSetTraining.csv")
dfTesting = pd.read_csv("DataSetTesting.csv")

 
X_train = dfTraining[["anio","mes","dia"]]
y_train=dfTraining.AvgMedicion

X_testing = dfTesting[["anio","mes","dia"]]
y_testing = dfTesting.AvgMedicion


print("-------------------- Normal SVM -------------------------")

clf = SVR(C=1.0, epsilon=0.2)
clf.fit(X_train, y_train) 
scores = cross_val_score(clf, X_train, y_train, cv = 10)
res1=clf.predict(X_testing)

print("---------------------------------------------")
for index, element in enumerate(res1):
    error = (abs(element - y_testing[index]) / y_testing[index]) * 100  # absolute percentage error
    print('Predicted Value: ', element, ' Real value: ', y_testing[index], " % Error: ", error)

Example #48
                    all_columns].rename(columns={
                        "fd_num_" + str(i): "scaled_x",
                        "norm_cells_" + str(i): "norm_y"
                    })
    ],
                           axis=0,
                           ignore_index=True)

X_columns = ["scaled_x"] + [
    "MAX_CONC"
] + X_PubChem_properties + X_targets + X_target_pathway + X_cancer_cell_lines

scaler = MinMaxScaler().fit(train_drug[X_columns])
Xtrain_drug = scaler.transform(train_drug[X_columns])
y_train_drug = train_drug["norm_y"].values

print("Linear SVR")

param_tested_C = [0.01, 0.1, 1, 5, 10, 100, 500]
param_tested_epsilon = [0.001, 0.01, 0.1, 1]
param_grid = dict(C=param_tested_C, epsilon=param_tested_epsilon)

splitter_loo = LeaveOneOut()
grid = GridSearchCV(SVR(kernel="linear"),
                    param_grid=param_grid,
                    cv=splitter_loo,
                    scoring="neg_mean_absolute_error")
grid.fit(Xtrain_drug, y_train_drug)

print("Dataset:4, best C:", grid.best_params_["C"])
print("Dataset:4, best_epsilon", grid.best_params_["epsilon"])
Example #49
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.25,
                                                        shuffle=False)

    X_train, X_mean, X_std = normalize(X_train)
    X_test = normalize_test(X_test, X_mean, X_std)

    y_train, y_mean, y_std = normalize(y_train)
    # y_test = normalize_test(y_test, y_mean, y_std)

    # ==============
    # MODEL CREATION
    # ==============

    svr_model = SVR()
    rf_model = RandomForestRegressor(n_estimators=100)
    adb_model = AdaBoostRegressor(n_estimators=100)
    xgb_model = XGBRegressor()

    svr_model.fit(X_train, y_train)
    joblib.dump(
        svr_model,
        path + 'models/' + str(data_interval) + 'min/svr_' + stock + '.pkl')
    # svr_model = joblib.load(path+'models/'+str(data_interval)+'min/svr_'+stock+'.pkl')

    rf_model.fit(X_train, y_train)
    joblib.dump(
        rf_model,
        path + 'models/' + str(data_interval) + 'min/rf_' + stock + '.pkl')
    # rf_model = joblib.load(path+'models/'+str(data_interval)+'min/rf_'+stock+'.pkl')
def save_model(path, aaindex_r2_list, learning_set, validation_set, threshold=5, regressor='pls',
               no_fft=False, train_on_all=False):
    """
    Function Save_Model saves the best -s THRESHOLD models as 'Pickle' files (pickle.dump),
    which can be loaded again for doing predictions. Also, in Save_Model included is the def cross_validation
    -based computing of the k-fold CV performance of the n component-optimized model on all data
    (learning + validation set); by default  k  is 5 (n_samples = 5).
    Plots of the CV performance for the t best models are stored inside the folder CV_performance.
    """
    regressor = regressor.lower()
    try:
        os.mkdir('CV_performance')
    except FileExistsError:
        pass
    try:
        os.mkdir('Pickles')
    except FileExistsError:
        pass

    try:
        os.remove('CV_performance/_CV_Results.txt')
    except FileNotFoundError:
        pass
    file = open('CV_performance/_CV_Results.txt', 'w')
    file.write('5-fold cross-validated performance of top models for validation set across all data.\n\n')
    if no_fft:
        file.write("No FFT used in this model construction, performance represents"
                   " model accuracies on raw encoded sequence data.\n\n")
    file.close()

    for t in range(threshold):
        try:
            idx = aaindex_r2_list[t][0]
            parameter = aaindex_r2_list[t][7]

            # Estimating the CV performance of the n_component-fitted model on all data
            xy_learn = XY(full_path(idx), learning_set)
            xy_test = XY(full_path(idx), validation_set)
            if no_fft is False:
                x_test, y_test, _ = xy_test.get_x_and_y()
                x_learn, y_learn, _ = xy_learn.get_x_and_y()
            else:
                _, y_test, x_test = xy_test.get_x_and_y()
                _, y_learn, x_learn = xy_learn.get_x_and_y()

            x = np.concatenate([x_learn, x_test])
            y = np.concatenate([y_learn, y_test])

            if regressor == 'pls' or regressor == 'pls_cv':
                # n_components according to lowest MSE for validation set
                regressor_ = PLSRegression(n_components=parameter.get('n_components'))

            elif regressor == 'rf':
                regressor_ = RandomForestRegressor(
                    random_state=parameter.get('random_state'),
                    n_estimators=parameter.get('n_estimators'),
                    max_features=parameter.get('max_features')
                )

            elif regressor == 'svr':
                regressor_ = SVR(C=parameter.get('C'), gamma=parameter.get('gamma'))

            elif regressor == 'mlp':
                regressor_ = MLPRegressor(
                    hidden_layer_sizes=parameter.get('hidden_layer_sizes'),
                    activation=parameter.get('activation'),
                    solver=parameter.get('solver'),
                    learning_rate=parameter.get('learning_rate'),
                    learning_rate_init=parameter.get('learning_rate_init'),
                    max_iter=parameter.get('max_iter'),
                    random_state=parameter.get('random_state')
                )

            else:
                raise SystemError("Did not find specified regression model as valid option. "
                                  "See '--help' for valid regression model options.")

            # perform 5-fold cross-validation on all data (on X and Y)
            n_samples = 5
            y_test_total, y_predicted_total = cross_validation(x, y, regressor_, n_samples)

            r_squared = r2_score(y_test_total, y_predicted_total)
            rmse = np.sqrt(mean_squared_error(y_test_total, y_predicted_total))
            stddev = np.std(y_test_total, ddof=1)
            nrmse = rmse / stddev
            pearson_r = np.corrcoef(y_test_total, y_predicted_total)[0][1]
            # ranks for Spearman correlation
            y_test_total_rank = np.array(y_test_total).argsort().argsort()
            y_predicted_total_rank = np.array(y_predicted_total).argsort().argsort()
            spearman_rho = np.corrcoef(y_test_total_rank, y_predicted_total_rank)[0][1]
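            # (Added example: argsort().argsort() yields each element's rank,
            # e.g. np.array([0.3, 0.1, 0.2]).argsort().argsort() -> [2, 0, 1])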

            with open('CV_performance/_CV_Results.txt', 'a') as f:
                f.write('Regression type: {}; Parameter: {}; Encoding index: {}\n'.format(
                    regressor.upper(), parameter, idx[:-4]))
                f.write('R2 = {:.5f}; RMSE = {:.5f}; NRMSE = {:.5f}; Pearson\'s r = {:.5f};'
                        ' Spearman\'s rho = {:.5f}\n\n'.format(r_squared, rmse, nrmse, pearson_r, spearman_rho))

            figure, ax = plt.subplots()
            ax.scatter(y_test_total, y_predicted_total, marker='o', s=20, linewidths=0.5, edgecolor='black')
            ax.plot([min(y_test_total) - 1, max(y_test_total) + 1],
                    [min(y_predicted_total) - 1, max(y_predicted_total) + 1], 'k', lw=2)
            ax.legend([
                '$R^2$ = {}\nRMSE = {}\nNRMSE = {}\nPearson\'s $r$ = {}\nSpearman\'s '.format(
                    round(r_squared, 3), round(rmse, 3), round(nrmse, 3), round(pearson_r, 3))
                + r'$\rho$ = {}'.format(str(round(spearman_rho, 3)))
            ])
            ax.set_xlabel('Measured')
            ax.set_ylabel('Predicted')
            plt.savefig('CV_performance/' + idx[:-4] + '_' + str(n_samples) + '-fold-CV.png', dpi=250)
            plt.close('all')

            if train_on_all:
                # fit on all available data (learning + validation set; FFT or noFFT is defined already above)
                regressor_.fit(x, y)
            else:
                # fit (only) on full learning set (FFT or noFFT is defined already above)
                regressor_.fit(x_learn, y_learn)

            file = open(os.path.join(path, 'Pickles/'+idx[:-4]), 'wb')
            pickle.dump(regressor_, file)
            file.close()

        except IndexError:
            break

    return ()
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('Salaries.csv')
X = dataset.iloc[:, 1:2].values
y = dataset.iloc[:, 2:3].values

from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_y = StandardScaler()
X = sc_X.fit_transform(X)
y = sc_y.fit_transform(y)

from sklearn.svm import SVR
regressor = SVR(kernel = 'rbf')
regressor.fit(X, y.ravel())  # ravel() flattens y to the 1D shape fit() expects

y_pred = sc_y.inverse_transform(regressor.predict(sc_X.transform(np.array([[6.5]]))).reshape(-1, 1))

Example #52
df_y = df.iloc[:, 4].values

# remove NA values

# categorical to continous

# add new columns

# remove columns
df_x = df_x[:, 3:4]
# split the data into train and test
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in scikit-learn 0.20
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, train_size=0.8)
# perform the logic
from sklearn.svm import SVR
regressor = SVR()
regressor = regressor.fit(x_train, y_train)
result_re = regressor.predict(x_test)
# threshold the regression output at 0.4 to obtain binary labels
result_final = [1 if value > 0.4 else 0 for value in result_re]

result_final = np.asarray(result_final)

#print(result_re)
# consume the result
def get_r2(x_learn, x_valid, y_learn, y_valid, regressor='pls'):
    """
    The function Get_R2 takes features and labels from the learning and validation set.

    When using 'pls' as regressor, the MSE is calculated for all LOOCV sets for predicted vs true labels
    (mse = mean_squared_error(y_test_loo, y_pred_loo) for a fixed number of components for PLS regression.
    In the next iteration, the number of components is increased by 1 (number_of_components += 1)
    and the MSE is calculated for this regressor. The loop breaks if i > 9.
    Finally, the model of the single AAindex model with the lowest MSE is chosen.

    When using other regressors the parameters are tuned using GridSearchCV.

    This function returnes performance (R2, (N)RMSE, Pearson's r) and model parameters.
    """
    regressor = regressor.lower()
    mean_squared_error_list = []

    if regressor == 'pls':
        # PLS regression with LOOCV n_components tuning as described by Cadet et al.
        # https://doi.org/10.1186/s12859-018-2407-8
        # https://doi.org/10.1038/s41598-018-35033-y
        # Hyperparameter (N component) tuning of PLS regressor
        for n_comp in range(1, 10):  # n_comp = 1, 2,..., 9
            pls = PLSRegression(n_components=n_comp)
            loo = LeaveOneOut()

            y_pred_loo = []
            y_test_loo = []

            for train, test in loo.split(x_learn):
                x_learn_loo = []
                y_learn_loo = []
                x_test_loo = []

                for j in train:
                    x_learn_loo.append(x_learn[j])
                    y_learn_loo.append(y_learn[j])

                for k in test:
                    x_test_loo.append(x_learn[k])
                    y_test_loo.append(y_learn[k])

                pls.fit(x_learn_loo, y_learn_loo)
                y_pred_loo.append(pls.predict(x_test_loo)[0][0])

            mse = mean_squared_error(y_test_loo, y_pred_loo)

            mean_squared_error_list.append(mse)

        mean_squared_error_list = np.array(mean_squared_error_list)
        # idx = np.where(...) finds best number of components
        idx = np.where(mean_squared_error_list == np.min(mean_squared_error_list))[0][0] + 1
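        # (equivalently: idx = int(np.argmin(mean_squared_error_list)) + 1)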

        # Model is fitted with best n_components (lowest MSE)
        best_params = {'n_components': idx}
        regressor_ = PLSRegression(n_components=best_params.get('n_components'))

    # other regression options (CV tuning)
    elif regressor == 'pls_cv':
        params = {'n_components': list(np.arange(1, 10))}  # n_comp = 1, 2,..., 9
        regressor_ = GridSearchCV(PLSRegression(), param_grid=params, iid=False, cv=5)  # the iid parameter is deprecated and redundant in newer scikit-learn versions
    elif regressor == 'rf':
        params = {                 # similar parameter grid as Xu et al., https://doi.org/10.1021/acs.jcim.0c00073
            'random_state': [42],  # state determined
            'n_estimators': [100, 250, 500, 1000],  # number of individual decision trees in the forest
            'max_features': ['auto', 'sqrt', 'log2']  # “auto” -> max_features=n_features,
            # “sqrt” -> max_features=sqrt(n_features) “log2” -> max_features=log2(n_features)
        }
        regressor_ = GridSearchCV(RandomForestRegressor(), param_grid=params, iid=False, cv=5)

    elif regressor == 'svr':
        params = {                      # similar parameter grid as Xu et al.
            'C': [2 ** 0, 2 ** 2, 2 ** 4, 2 ** 6, 2 ** 8, 2 ** 10, 2 ** 12],  # Regularization parameter
            'gamma': [0.1, 0.01, 0.001, 0.0001, 0.00001]  # often 1 / n_features or 1 / (n_features * X.var())
        }
        regressor_ = GridSearchCV(SVR(), param_grid=params, iid=False, cv=5)

    elif regressor == 'mlp':
        params = {
            # feedforward network trained via backpropagation – here only using a single hidden layer
            'hidden_layer_sizes': [i for i in range(1, 12)],  # size of the single hidden layer: 1, 2, ..., 11
            'activation': ['relu'],  # rectified linear unit
            'solver': ['adam', 'lbfgs'],  # ADAM: A Method for Stochastic Optimization , or Limited-memory BFGS
            'learning_rate': ['constant'],  # learning rate given by ‘learning_rate_init’
            'learning_rate_init': [0.001, 0.01, 0.1],  # only used when solver=’sgd’ or ‘adam’
            'max_iter': [1000, 200],  # for stochastic solvers (‘sgd’, ‘adam’) determines epochs
            'random_state': [42]
        }
        regressor_ = GridSearchCV(MLPRegressor(), param_grid=params, iid=False, cv=5)

    else:
        raise SystemError("Did not find specified regression model as valid option. See '--help' for valid "
                          "regression model options.")

    regressor_.fit(x_learn, y_learn)  # fit model

    if regressor != 'pls':      # take best parameters for the regressor and the AAindex
        best_params = regressor_.best_params_

    y_pred = []
    for y_p in regressor_.predict(x_valid):  # predict validation entries with fitted model
        y_pred.append(float(y_p))

    r2 = r2_score(y_valid, y_pred)
    rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
    nrmse = rmse / np.std(y_valid, ddof=1)
    # ranks for Spearman's rank correlation
    y_val_rank = np.array(y_valid).argsort().argsort()
    y_pred_rank = np.array(y_pred).argsort().argsort()
    with warnings.catch_warnings():  # catch the RuntimeWarning raised when an array has no variance,
        warnings.simplefilter("ignore")  # e.g. [2, 2, 2, 2], which would mean a divide by zero
        pearson_r = np.corrcoef(y_valid, y_pred)[0][1]
        spearman_rho = np.corrcoef(y_val_rank, y_pred_rank)[0][1]

    return r2, rmse, nrmse, pearson_r, spearman_rho, regressor, best_params
Example #54
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Using linear regression model
lr_model = LinearRegression(n_jobs=-1)
lr_model.fit(X_train, y_train)

# Using quadratic regression with 2 polynomial features
quad1_model = make_pipeline(PolynomialFeatures(2), Ridge())
quad1_model.fit(X_train, y_train)

quad2_model = make_pipeline(PolynomialFeatures(3), Ridge())
quad2_model.fit(X_train, y_train)

# Using SVM radial basis function (RBF) model
rbf_model = SVR(kernel='rbf', C=1e3, gamma=0.1)
rbf_model.fit(X_train, y_train)

# KNN Regression
knn_model = KNeighborsRegressor(n_neighbors=2)
knn_model.fit(X_train, y_train)

# Get confidence scores for each model
lr_confidence = lr_model.score(X_test, y_test)
quad1_confidence = quad1_model.score(X_test, y_test)
quad2_confidence = quad2_model.score(X_test, y_test)
rbf_confidence = rbf_model.score(X_test, y_test)
knn_confidence = knn_model.score(X_test, y_test)

# print confidence scores for each model--Quad 2 performs best
print("lr confidence: ", lr_confidence)
Example #55
                       random_state=rand_st)
    scores = cross_validate(estimator=rgr,
                            X=data_np,
                            y=target_np,
                            scoring=scorers,
                            cv=5)

    scores_RMSE = np.asarray([math.sqrt(-x) for x in scores['test_Neg_MSE']
                              ])  #Turns negative MSE scores into RMSE
    scores_Expl_Var = scores['test_expl_var']
    print("Neural Network RMSE:: %0.2f (+/- %0.2f)" %
          ((scores_RMSE.mean()), (scores_RMSE.std() * 2)))
    print("Neural Network Expl Var: %0.2f (+/- %0.2f)" %
          ((scores_Expl_Var.mean()), (scores_Expl_Var.std() * 2)))
    print("CV Runtime:", time.time() - start_ts)

if norm_features == 1:
    #SciKit SVM - Cross Val
    start_ts = time.time()
    rgr = SVR(kernel='linear', gamma=0.1, C=1.0)  # gamma is ignored by the linear kernel
    scores = cross_validate(rgr, data_np, target_np, scoring=scorers, cv=5)

    scores_RMSE = np.asarray([math.sqrt(-x) for x in scores['test_Neg_MSE']
                              ])  #Turns negative MSE scores into RMSE
    scores_Expl_Var = scores['test_expl_var']
    print("SVM RMSE:: %0.2f (+/- %0.2f)" % ((scores_RMSE.mean()),
                                            (scores_RMSE.std() * 2)))
    print("SVM Expl Var: %0.2f (+/- %0.2f)" % ((scores_Expl_Var.mean()),
                                               (scores_Expl_Var.std() * 2)))
    print("CV Runtime:", time.time() - start_ts)
Example #56
def BOP():

    models = [
        (LinearRegression(), "Linear Regression"),
        (KNeighborsRegressor(), "KNN Regression"),
        (DecisionTreeRegressor(), "Decision Tree"),
        (SVR(), "SVR"),
        (RandomForestRegressor(), "Random Forest"),
        (GradientBoostingRegressor(), "Gradient Booster"),
        (ExtraTreesRegressor(), "Extra Trees Regressor"),
        (AdaBoostRegressor(), "AdaBoost Regressor"),
    ]

    big_score = 0
    big_name = ""
    big_y_pred = 0
    big_error = 0

    for model, name in models:
        model.fit(x1, y1)
        y_pred = model.predict(x2)
        error = mean_squared_error(y2, y_pred)
        # score() expects features and true labels, not predictions
        score = model.score(x2, y2)

        # keep the best-scoring model, skipping a suspicious perfect fit
        if score > big_score and score != 1:
            big_score = score
            big_name = name
            big_y_pred = y_pred
            big_error = error

    plotgraph(big_y_pred, big_name, big_error, big_score)
Example #57
from sklearn import datasets
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, explained_variance_score
from sklearn.utils import shuffle

# Load housing data
data = datasets.load_boston()

# Shuffle the data
X, y = shuffle(data.data, data.target, random_state=7)

# Split the data into training and testing datasets
num_training = int(0.8 * len(X))
X_train, y_train = X[:num_training], y[:num_training]
X_test, y_test = X[num_training:], y[num_training:]

# Create Support Vector Regression model
sv_regressor = SVR(kernel='linear', C=1.0, epsilon=0.1)
# Train Support Vector Regressor
sv_regressor.fit(X_train, y_train)

# Evaluate performance of Support Vector Regressor
y_test_pred = sv_regressor.predict(X_test)
mse = mean_squared_error(y_test, y_test_pred)
evs = explained_variance_score(y_test, y_test_pred)
print("nn #### Performance #### ")
print("Mean squared error =", round(mse, 2))
print("Explained variance score =", round(evs, 2))

# Test the regressor on test datapoint
test_data = [
    3.7, 0, 18.4, 1, 0.87, 5.95, 91, 2.5052, 26, 666, 20.2, 351.34, 15.27
]
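# (Hedged continuation, added: the snippet presumably ends by predicting on
# this datapoint)
print("Predicted price:", sv_regressor.predict([test_data])[0])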
Example #58
def compare1():

    f1, axes = subplots(2, 3)

    models = [
        (LinearRegression(), "Linear Regression"),
        (joblib.load('lasso.pkl'), "Lasso"),  # pre-trained model loaded from disk
        (KNeighborsRegressor(), "KNN Regression"),
        (DecisionTreeRegressor(), "Decision Tree"),
        (SVR(), "SVR"),
        (ElasticNet(), "Elastic Net"),
    ]

    for ax, (model, name) in zip(axes.flat, models):
        if name != "Lasso":  # the Lasso model is already fitted
            model.fit(x1, y1)
        y_pred = model.predict(x2)
        error = round(mean_squared_error(y2, y_pred), 6)
        # score() expects features and true labels; it returns R^2, not MSE
        score = round(model.score(x2, y2), 6)
        ax.plot(y_pred)
        ax.set_title(name + ": MSE = " + str(error) + "   R^2 = " + str(score))

    f1.show()
Example #59
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

dataset = pd.read_csv('Position_Salaries.csv')

x = dataset.iloc[:, 1:2].values
y = dataset.iloc[:, 2].values
y = np.reshape(y, (-1, 1))

from sklearn.preprocessing import StandardScaler

sc_x = StandardScaler()
sc_y = StandardScaler()
x = sc_x.fit_transform(x)

# StandardScaler cannot be applied to a 1D array (hence the reshape above);
# alternatively, sklearn.preprocessing.scale could be used:
# from sklearn.preprocessing import scale
# y = scale(y)
y = sc_y.fit_transform(y)

plt.scatter(x, y, color='red')
plt.plot(x, y, color='red')
plt.show()

from sklearn.svm import SVR

regressor = SVR(kernel='rbf')
regressor.fit(x, y.ravel())  # ravel() flattens y to the 1D shape fit() expects

plt.scatter(x, y, color='red')
plt.plot(x, regressor.predict(x), color='blue')
plt.show()
Example #60
from sklearn.svm import SVR

def analyze(data, label, num_folds):
    # Partition data into folds
    n = len(data) // num_folds
    data_folds = [data[i:i+n] for i in range(0, len(data), n)]
    label_folds = [label[i:i+n] for i in range(0, len(label), n)]

    lin_reg_error = 0
    
    cs = [4**c for c in range(-10, 0, 1)]
    svm_error = [0] * len(cs)
    svm_std = [0] * len(cs)
    # for i in range(0, num_folds):
    #     test_data = data_folds[i]
    #     test_label = label_folds[i]
    #     train_data = []
    #     train_label = []
    #     for j in range(num_folds):
    #         if i != j:
    #             train_data += data_folds[j]
    #             train_label += label_folds[j]

    # model = linear_model.LinearRegression()
    # model.fit(data, label)
    # return model
        # lin_reg_error += np.mean(abs(model.predict(test_data) - test_label))
        #
        # for i2 in range(len(cs)):
        #     svm_classifier = SVR(gamma=cs[i2])
        #     svm_classifier.fit(train_data, train_label)
        #     svm_error[i2] += np.mean(abs(svm_classifier.predict(test_data) - test_label))
        #     svm_std[i2] += np.std(abs(svm_classifier.predict(test_data) - test_label))

    svm_c = SVR(gamma=4**-7)
    svm_c.fit(data, label)
    return svm_c
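# (Hedged usage note, added: analyze returns the fitted SVR model; with
# hypothetical arrays train_x, train_y and new_x, a caller would do:)
# model = analyze(train_x, train_y, num_folds=10)
# predictions = model.predict(new_x)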