def separate_coord_nu_svr(train_sequences_lat=train_sequences,
                          train_sequences_long=train_sequences,
                          val_sequences_lat=val_sequences,
                          val_sequences_long=val_sequences,
                          training_latitudes=training_latitudes,
                          training_longitudes=training_longitudes,
                          val_latitudes=val_latitudes,
                          val_longitudes=val_longitudes,
                          test_sequences=test_sequences,
                          sub_name='separate_coord_nu_svr'):
    """Fit one NuSVR per output coordinate and print validation MSEs.

    Returns the fitted (latitude, longitude) regressors.
    """
    # separate svr for each coordinate
    def _fit_and_score(train_x, train_y, val_x, val_y, c, nu_value):
        # Train a single-output NuSVR and return it with its validation MSE.
        model = svm.NuSVR(C=c, nu=nu_value, verbose=10)
        model.fit(train_x, train_y)
        return model, get_mse(model, val_x, val_y, is_multioutput=False)

    svr_lat, mse_lat = _fit_and_score(train_sequences_lat, training_latitudes,
                                      val_sequences_lat, val_latitudes,
                                      0.1, 0.3)
    svr_long, mse_long = _fit_and_score(train_sequences_long, training_longitudes,
                                        val_sequences_long, val_longitudes,
                                        0.001, 0.7)

    print(mse_lat)
    print(mse_long)
    print((mse_lat + mse_long) / 2)
    return svr_lat, svr_long
def separate_coord_grid_search():
    """Grid-search NuSVR hyperparameters separately for each coordinate."""
    # gridsearch on 2 single output models
    from sklearn import metrics
    from sklearn.model_selection import GridSearchCV

    # Negated MSE so GridSearchCV's "greater is better" convention holds.
    neg_mse = metrics.make_scorer(metrics.mean_squared_error,
                                  greater_is_better=False)
    param_grid = {'C': [0.1, 0.01, 0.001], 'nu': [0.3, 0.7, 0.9, 1]}

    # Latitude first, then longitude — one independent search per target.
    for targets in (training_latitudes, training_longitudes):
        search = GridSearchCV(svm.NuSVR(), param_grid, cv=5,
                              scoring=neg_mse, n_jobs=-1, verbose=10)
        search.fit(train_sequences, targets)
        print(search.best_params_)
def test_svr(): """ Test Support Vector Regression """ diabetes = datasets.load_diabetes() for clf in (svm.NuSVR(kernel='linear', nu=.4, C=1.0), svm.NuSVR(kernel='linear', nu=.4, C=10.), svm.SVR(kernel='linear', C=10.)): clf.fit(diabetes.data, diabetes.target) assert_greater(clf.score(diabetes.data, diabetes.target), 0.02)
def test_SVR(): """ Test Support Vector Regression """ diabetes = datasets.load_diabetes() for clf in (svm.NuSVR(kernel='linear', nu=.4), svm.NuSVR(kernel='linear', nu=.4, C=10.), svm.SVR(kernel='linear', C=10.), svm.sparse.NuSVR(kernel='linear', nu=.4), svm.sparse.NuSVR(kernel='linear', nu=.4, C=10.), svm.sparse.SVR(kernel='linear', C=10.)): clf.fit(diabetes.data, diabetes.target) assert clf.score(diabetes.data, diabetes.target) > 0.02
def test_svr():
    # Test Support Vector Regression
    diabetes = datasets.load_diabetes()
    # FIX: svm.LinearSVR(C=10.) was listed twice with identical parameters;
    # the duplicate added nothing and doubled the fit time.
    for clf in (svm.NuSVR(kernel='linear', nu=.4, C=1.0),
                svm.NuSVR(kernel='linear', nu=.4, C=10.),
                svm.SVR(kernel='linear', C=10.),
                svm.LinearSVR(C=10.)):
        clf.fit(diabetes.data, diabetes.target)
        assert clf.score(diabetes.data, diabetes.target) > 0.02

    # non-regression test; previously, BaseLibSVM would check that
    # len(np.unique(y)) < 2, which must only be done for SVC
    svm.SVR().fit(diabetes.data, np.ones(len(diabetes.data)))
    svm.LinearSVR().fit(diabetes.data, np.ones(len(diabetes.data)))
def trainFixed():
    '''
    train a machine learner based on data from some fixed parameter point.  save to fixed.pkl
    '''
    # NOTE: Python 2 source (print statements; "/" below is integer division —
    # confirm before porting to Python 3, where these become floats).
    print "Entering train fixed"

    trainAndTarget = np.loadtxt('traindata.dat')
    # columns 0-1 are features, column 2 is the regression target;
    # presumably column 1 is the mass parameter — TODO confirm
    traindata = trainAndTarget[:, 0:2]
    targetdata = trainAndTarget[:, 2]
    massPoints = np.unique(traindata[:, 1])
    # rows per (mass point, half) slice, and the offset of the second half
    chunk = len(traindata) / len(massPoints) / 2
    shift = len(traindata) / 2

    #plot for fixed mu=0 training
    print "training fixed"
    clf = svm.NuSVR()
    # train only on the 5th mass point's slice, taken from both halves
    reducedtrain = np.concatenate(
        (traindata[4 * chunk:5 * chunk, 0],
         traindata[4 * chunk + shift:5 * chunk + shift, 0]))
    reducedtarget = np.concatenate(
        (targetdata[4 * chunk:5 * chunk],
         targetdata[4 * chunk + shift:5 * chunk + shift]))

    # NuSVR expects 2-D inputs, hence the reshape to a single-column matrix
    clf.fit(reducedtrain.reshape((len(reducedtrain), 1)), reducedtarget)
    joblib.dump(clf, 'fixed.pkl')
def wardCV(data, labels, cut_level, connect): '''calculate cross-validated amount of ward-clusters''' #loop for list accuracies = np.zeros(len(cut_level)) for i in cut_level: #reduce to set amount of clusters agglo = sklcl.WardAgglomeration(connectivity=connect, n_clusters=i) cross = sklcv.KFold(n=len(labels), n_folds=len(labels)) pred_vec = np.zeros_like(labels) for train_i, test_i in cross: use_train = agglo.fit_transform(data[train_i]) use_test = agglo.transform(data[test_i]) scaler = sklpre.StandardScaler() use_train = scaler.fit_transform(use_train) use_test = scaler.transform(use_test) model = sklsvm.NuSVR(kernel='linear', nu=1, C=100) model.fit(use_train, labels[train_i]) pr = model.predict(use_test) pred_vec[test_i] = pr #save accuracy accuracies[cut_level == i], _ = ss.spearmanr(pred_vec, labels) #based on loo-accuracy, select the optimal number of features #TODO -smooth this? accuracies = ssig.medfilt(accuracies) best_model = cut_level[accuracies.argmax()] return best_model
def _estimate_model(self):
    """Estimates SVR model.

    Returns
    -------
    model : sklearn LinearSVR or SVR model or grid search cv object
        Fitted object.
    """
    # Pick the underlying estimator: linear kernel has a dedicated class,
    # otherwise dispatch on the regression type (eps vs nu).
    if self.kernel == 'linear':
        self.underlying = svm.LinearSVR(**self.kwargs)
    elif self.type == 'eps':
        self.underlying = svm.SVR(kernel=self.kernel, **self.kwargs)
    elif self.type == 'nu':
        self.underlying = svm.NuSVR(kernel=self.kernel, **self.kwargs)
    else:
        raise NotImplementedError(
            'Type not implemented. Choices are eps or nu.')

    # Optionally wrap in a cross-validated grid search.
    if self.cv_folds is None:
        model = self.underlying
    else:
        model = model_selection.GridSearchCV(self.underlying,
                                             self.parameters,
                                             cv=self.cv_folds,
                                             scoring=self.score)
    model.fit(self.x_train, self.y_train)
    return model
def test_c_samples_scaling():
    """Test C scaling by n_samples
    """
    X = iris.data[iris.target != 2]
    y = iris.target[iris.target != 2]
    X2 = np.r_[X, X]
    y2 = np.r_[y, y]

    clfs = [svm.SVC(tol=1e-6, kernel='linear', C=0.1),
            svm.SVR(tol=1e-6, kernel='linear', C=100),
            svm.LinearSVC(tol=1e-6, C=0.1),
            linear_model.LogisticRegression(penalty='l1', tol=1e-6, C=100),
            linear_model.LogisticRegression(penalty='l2', tol=1e-6),
            svm.NuSVR(tol=1e-6, kernel='linear')]

    def _relative_shift(clf):
        # Relative change of coef_ when the dataset is duplicated.
        coef_small = clf.fit(X, y).coef_
        coef_big = clf.fit(X2, y2).coef_
        return linalg.norm(coef_big - coef_small) / linalg.norm(coef_small)

    for clf in clfs:
        # Without scaling, duplicating samples noticeably moves the solution.
        clf.set_params(scale_C=False)
        assert_true(_relative_shift(clf) > 1e-3)
        # With scaling, the solution is (nearly) invariant to duplication.
        clf.set_params(scale_C=True)
        assert_true(_relative_shift(clf) < 1e-5)
def multioutput_model():
    """Fit one MultiOutputRegressor-wrapped NuSVR predicting both coordinates."""
    # labels for multioutput model: column 0 = latitude, column 1 = longitude
    train_labels = np.zeros((training_latitudes.shape[0], 2))
    train_labels[:, 0] = np.array(training_latitudes)
    train_labels[:, 1] = np.array(training_longitudes)
    val_labels = np.zeros((val_latitudes.shape[0], 2))
    val_labels[:, 0] = np.array(val_latitudes)
    val_labels[:, 1] = np.array(val_longitudes)

    # multi output model
    from sklearn.multioutput import MultiOutputRegressor
    from sklearn.linear_model import BayesianRidge
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.kernel_ridge import KernelRidge
    from sklearn import metrics

    # define and fit
    wrapper = MultiOutputRegressor(svm.NuSVR(), n_jobs=-1)
    wrapper.fit(train_sequences, train_labels)

    # per-coordinate validation error
    predictions = wrapper.predict(val_sequences)
    for column in (0, 1):
        print(metrics.mean_squared_error(val_labels[:, column],
                                         predictions[:, column]))
    return wrapper
def main(args):
    """Train a NuSVR on Kaggle house-price data and write a submission CSV.

    Parameters
    ----------
    args : argparse.Namespace
        Must provide ``data_dir`` containing train.csv and test.csv.
    """
    train_file = os.path.join(args.data_dir, 'train.csv')
    train_df = clean_data(pd.read_csv(train_file))
    print(train_df.info())

    # BUG FIX: 'LotArea' was listed twice, which silently double-weighted
    # that feature in the kernel distance.
    feature_columns = [
        'MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt'
    ]
    features = train_df[feature_columns].values
    targets = train_df['SalePrice'].values

    clf = svm.NuSVR()
    clf.fit(features, targets)

    test_file = os.path.join(args.data_dir, 'test.csv')
    test_df = clean_data(pd.read_csv(test_file))
    print(test_df.info())
    predicts = clf.predict(test_df[feature_columns].values)

    ids = test_df['Id'].values
    with open('/tmp/kaggle_submit.csv', 'w') as fileobj:
        writer = csv.writer(fileobj)
        writer.writerow(['Id', 'SalePrice'])
        # renamed loop variable: ``id`` shadowed the builtin
        for row_id, price in zip(ids, predicts):
            writer.writerow([row_id, price])
def main():
    # NOTE: Python 2 source (xrange, print statements, dict.keys() indexing).
    # Load per-year fantasy stat files for seasons 2008-2012.
    id2year2stats = load_files(
        {year: 'fant%d.csv' % year
         for year in xrange(2008, 2013)}, SPECIAL_CASE_TRADES)

    def id_to_useful_name(id):
        # Resolve a player id to (name, team, fantasy position) using any
        # recorded year (Python 2: dict.keys() is a list, so [0] works).
        year2stats = id2year2stats[id]
        any_year = year2stats[year2stats.keys()[0]]
        return (any_year['Name'], any_year['Tm'], any_year['FantasyFantPos'])

    # "Current" players are those with stats in the season before BASE_YEAR.
    current_players = set(id for id in id2year2stats
                          if BASE_YEAR - 1 in id2year2stats[id])

    matrix, identifiers, features = construct_feature_matrix(id2year2stats)
    id2name = {ident[ID]: id_to_useful_name(ident[ID])
               for ident in identifiers}

    from sklearn import linear_model
    from sklearn import ensemble
    from sklearn import svm

    # One shared seed so every model is cross-validated on the same folds.
    seed = randint(0, 2 ** 32 - 1)
    for model in [linear_model.LinearRegression(),
                  linear_model.Ridge(),
                  ensemble.RandomForestRegressor(),
                  ensemble.ExtraTreesRegressor(),
                  ensemble.AdaBoostRegressor(),
                  ensemble.GradientBoostingRegressor(),
                  svm.SVR(),
                  svm.NuSVR(), ]:
        print str(model).split('(')[0]
        cross_validate(matrix, identifiers, features, id2name, model,
                       n_folds=10, seed=seed)
        print

    # Final predictions use a RandomForest, restricted to current players.
    model = ensemble.RandomForestRegressor()
    current_predictions, current_ids = \
        predict_current_year(matrix, identifiers, features, id2name, model)
    current_predictions, current_ids = zip(
        *[(pred, ident) for pred, ident in
          zip(current_predictions, current_ids)
          if ident[ID] in current_players])
    current_predicted_ranks = position_ranking_lists(
        current_ids, current_predictions, id2name)
    dump_predictions(current_predicted_ranks)
    return
def test_sk_NuSVR():
    """Smoke-test uploading a fitted sklearn NuSVR with one feature vector."""
    print("Testing sklearn, NuSVR...")
    X, y = iris_data
    model = svm.NuSVR()
    model.fit(X, y)
    meta = {'name': "NuSVR test"}
    feature_vector = X[0, :]
    upload(model, feature_vector, meta)
def test_unfitted():
    X = "foo!"  # input validation not required when SVM not fitted

    # Both estimator families must refuse to predict before fitting.
    cases = ((svm.SVC(), r".*\bSVC\b.*\bnot\b.*\bfitted\b"),
             (svm.NuSVR(), r".*\bNuSVR\b.*\bnot\b.*\bfitted\b"))
    for estimator, pattern in cases:
        with pytest.raises(Exception, match=pattern):
            estimator.predict(X)
def __init__(self, provider):
    """Train a NuSVR on the provider's learning data.

    Targets are divided by the provider's multiplier before fitting.
    """
    self.provider = provider
    self.mult = self.provider.multiplier
    # Single pass over the learn data; names avoid shadowing builtin input().
    features = []
    targets = []
    for sample in self.provider.getLearnData():
        features.append(sample[0])
        targets.append(sample[1][0] / self.mult)
    self.regressor = svm.NuSVR()
    self.regressor.fit(features, targets)
def test_unfitted():
    X = "foo!"  # input validation not required when SVM not fitted

    # Unfitted estimators must raise with a message naming the class.
    cases = ((svm.SVC(gamma="scale"), r".*\bSVC\b.*\bnot\b.*\bfitted\b"),
             (svm.NuSVR(gamma='scale'), r".*\bNuSVR\b.*\bnot\b.*\bfitted\b"))
    for estimator, message in cases:
        assert_raises_regexp(Exception, message, estimator.predict, X)
def validation(data, target, constant):
    """Grid-search a polynomial NuSVR, then report manual K-fold and full-set RMSE.

    Parameters
    ----------
    data, target : array-like
        Features and regression targets.
    constant : unused
        Kept for interface compatibility with callers.

    Returns
    -------
    float
        Mean RMSE over the manual CVSize-fold cross-validation.
    """
    score = 0
    regressor = svm.NuSVR(kernel="poly")
    param_grid = {
        'C': np.linspace(20.0, 40.0, 10),
        'nu': np.linspace(0.0001, 1, 5)
    }
    grid_search = sklearn.grid_search.GridSearchCV(
        regressor,
        param_grid,
        scoring=sklearn.metrics.make_scorer(sklearn.metrics.mean_squared_error,
                                            greater_is_better=False),
        cv=5,
        n_jobs=-1)
    grid_search.fit(data, target)
    clf = grid_search.best_estimator_
    print(clf)

    # BUG FIX: "/" gave a float chunk size, which makes the derived slice
    # indices below raise TypeError on Python 3; use integer division.
    chunk_size = len(data) // CVSize
    for x in range(CVSize):
        # These describe where to cut to get our crossdat
        first_step = x * chunk_size
        second_step = (x + 1) * chunk_size
        # Get the data parts we train on
        cross_data = np.vstack((data[:first_step], data[second_step:]))
        cross_target = np.append(target[:first_step], target[second_step:])
        # fit and save the coef
        clf.fit(cross_data, cross_target)
        # Find mean squared error on the held-out chunk
        sample_data = data[first_step:second_step]
        sample_target = target[first_step:second_step]
        pred = clf.predict(sample_data)
        RMSE = mean_squared_error(sample_target, pred) ** 0.5
        score += RMSE
    score = score / CVSize
    print("Cross-Validation RMSE: {} ".format(score))

    # Get global score on the full dataset
    clf.fit(data, target)
    pred = clf.predict(data)
    RMSE = mean_squared_error(target, pred) ** 0.5
    print("RMSE on whole dataset {}".format(RMSE))
    return score
def test_svr_coef_sign():
    # Test that SVR(kernel="linear") has coef_ with the right sign.
    # Non-regression test for #2933.
    X = np.random.RandomState(21).randn(10, 3)
    y = np.random.RandomState(12).randn(10)

    linear_svrs = (svm.SVR(kernel='linear'),
                   svm.NuSVR(kernel='linear'),
                   svm.LinearSVR())
    for svr in linear_svrs:
        fitted = svr.fit(X, y)
        # The primal decision function must reproduce predict() exactly.
        manual = X @ fitted.coef_.ravel() + fitted.intercept_
        assert_array_almost_equal(fitted.predict(X), manual)
def SelectModel(regressor):
    """Return an unfitted regression model selected by name.

    Parameters
    ----------
    regressor : str
        One of 'svr', 'nusvr', 'linear', or 'RF'.

    Returns
    -------
    sklearn estimator

    Raises
    ------
    ValueError
        If the name is not recognized.  (Previously an unknown name fell
        through every branch and raised UnboundLocalError on return.)
    """
    if regressor == 'svr':
        return svm.SVR()
    if regressor == 'nusvr':
        return svm.NuSVR()
    if regressor == 'linear':
        return LinearRegression()
    if regressor == 'RF':
        return RandomForestRegressor(n_estimators=1500, n_jobs=-1)
    raise ValueError("Unknown regressor: {!r}".format(regressor))
def manualGridSearch(train, test, dict_names, y, X, Y, tuning_parameters):
    """Manual grid search over kernel/gamma/C combinations.

    For each feature-set pair (X[i], Y[i]) and each kernel, builds a
    DataFrame of per-(C, dataset) scores indexed by gamma: RMSE for
    regression targets (y in {1, 2}, NuSVR) or class-0 precision for
    classification targets (y in {0, 3}, SVC).  Results are stored in
    ``gridSearch[kernel][nX]``.
    """
    gridSearch = defaultdict(dict)
    for i in range(len(X)):
        nX = X[i]
        nY = Y[i]
        for k in tuning_parameters['kernel']:
            d = 3
            print(k)
            listK = []
            # BUG FIX: the original tested ``k == 'poly'`` and then ran
            # ``int(k[-1])``, which crashes with ValueError (int('y')) on the
            # plain kernel name 'poly' — while degree-suffixed names such as
            # 'poly2' never matched the equality test at all.  Parse the
            # degree suffix only when one is present.
            if k.startswith('poly') and k != 'poly':
                d = int(k[-1])
                k = 'poly'
            for g in tuning_parameters['gamma']:
                print("    " + str(g))
                col = []
                if 'poly' in k and type(g) != float:
                    # non-numeric gammas are skipped for poly kernels;
                    # pad with zeros so the result matrix stays rectangular
                    for c in tuning_parameters['C']:
                        for name in dict_names:
                            col.append(0)
                else:
                    for c in tuning_parameters['C']:
                        if 'poly' in k:
                            k = 'poly'
                        print("        " + str(c))
                        for name in dict_names:
                            print("            " + name)
                            X_train, X_test, y_train, y_test = getTrainTest(
                                train, test, name, nX, nY, y)
                            if y == 1 or y == 2:
                                # regression target -> RMSE (lower is better)
                                clf = svm.NuSVR(kernel=k, C=c, gamma=g, degree=d)
                                clf.fit(X_train, y_train)
                                # np.float/np.int were removed from numpy;
                                # the builtin types are the exact equivalents.
                                y_pred = clf.predict(X_test).astype(float)
                                y_true = y_test.astype(float)
                                rmse = np.sqrt((np.square(y_pred - y_true)).mean())
                                col.append(rmse)
                            elif y == 0 or y == 3:
                                # classification target -> precision of class 0
                                clf = svm.SVC(kernel=k, C=c, gamma=g, degree=d)
                                clf.fit(X_train, y_train)
                                y_pred = clf.predict(X_test).astype(int)
                                y_true = y_test.astype(int)
                                precision = metrics.precision_score(
                                    y_true=y_true, y_pred=y_pred, pos_label=0)
                                col.append(precision)
                listK.append(col)
            # rows: one per (C, dataset) pair; columns: one per gamma
            A = pd.DataFrame(np.column_stack(listK))
            A.index = len(tuning_parameters['C']) * dict_names
            A.columns = tuning_parameters['gamma']
            gridSearch[k][nX] = A
    return gridSearch
def test_immutable_coef_property():
    # Check that primal coef modification are not silently ignored
    fitted = [est.fit(iris.data, iris.target)
              for est in (svm.SVC(kernel='linear'),
                          svm.NuSVC(kernel='linear'),
                          svm.SVR(kernel='linear'),
                          svm.NuSVR(kernel='linear'))]
    fitted.append(svm.OneClassSVM(kernel='linear').fit(iris.data))
    for clf in fitted:
        # Rebinding coef_ must fail outright...
        assert_raises(AttributeError, clf.__setattr__, 'coef_', np.arange(3))
        # ...and so must in-place element writes.
        assert_raises((RuntimeError, ValueError),
                      clf.coef_.__setitem__, (0, 0), 0)
def test_immutable_coef_property():
    # Check that primal coef modification are not silently ignored
    supervised = [cls(kernel="linear").fit(iris.data, iris.target)
                  for cls in (svm.SVC, svm.NuSVC, svm.SVR, svm.NuSVR)]
    estimators = supervised + [svm.OneClassSVM(kernel="linear").fit(iris.data)]
    for clf in estimators:
        # Rebinding coef_ must fail...
        with pytest.raises(AttributeError):
            setattr(clf, "coef_", np.arange(3))
        # ...and so must writing into the returned array.
        with pytest.raises((RuntimeError, ValueError)):
            clf.coef_[0, 0] = 0
def nu_svr_example():
    """Fit a NuSVR on random Gaussian data; print predictions and train score."""
    n_samples, n_features = 10, 5
    np.random.seed(0)
    X = np.random.randn(n_samples, n_features)
    Y = np.random.randn(n_samples)

    model = svm.NuSVR(nu=0.5, kernel='rbf', degree=3, max_iter=-1)
    model.fit(X, Y)

    X_test = np.random.randn(5, n_features)
    print('Prediction =', model.predict(X_test))
    print('Score =', model.score(X, Y))
def SelectModel(regressor):
    """Return an unfitted regression model selected by name.

    Parameters
    ----------
    regressor : str
        One of 'svr', 'nusvr', 'Gausian', 'Nearest_Neighbors_uniform',
        or 'Nearest_Neighbors_distance'.

    Returns
    -------
    sklearn estimator

    Raises
    ------
    ValueError
        If the name is not recognized.  (Previously an unknown name fell
        through every branch and raised UnboundLocalError on return.)
    """
    if regressor == 'svr':
        return svm.SVR()
    if regressor == 'nusvr':
        return svm.NuSVR()
    if regressor == 'Gausian':
        return GaussianProcess(corr='cubic', theta0=1e-2, thetaL=1e-4,
                               thetaU=1e-1, random_start=100)
    if regressor == 'Nearest_Neighbors_uniform':
        # n_neighbors comes from module scope
        return neighbors.KNeighborsRegressor(n_neighbors, weights='uniform')
    if regressor == 'Nearest_Neighbors_distance':
        return neighbors.KNeighborsRegressor(n_neighbors, weights='distance')
    raise ValueError("Unknown regressor: {!r}".format(regressor))
def normalCV_NuSVR_cpu(X, Y, n_folds, c, kernel):
    """K-fold cross-validated NuSVR predictions.

    Returns two arrays aligned by sample index: out-of-fold predictions
    and the corresponding true targets.
    """
    model = svm.NuSVR(kernel=kernel, C=c, verbose=0, max_iter=100000)
    folds = KFold(n_splits=n_folds, random_state=None)
    preds = np.zeros(len(Y))
    trues = np.zeros(len(Y))
    for fit_idx, held_out in folds.split(X=X):
        model.fit(X[fit_idx], Y[fit_idx])
        preds[held_out] = model.predict(X[held_out])
        trues[held_out] = Y[held_out]
    return preds, trues
def loocv_NuSVR_cpu(X, Y, c, kernel):
    """Leave-one-out cross-validated NuSVR predictions.

    Returns two arrays aligned by sample index: out-of-fold predictions
    and the corresponding true targets.
    """
    model = svm.NuSVR(kernel=kernel, C=c, verbose=0, max_iter=100000)
    preds = np.zeros(len(Y))
    trues = np.zeros(len(Y))
    for fit_idx, held_out in LeaveOneOut().split(X=X):
        model.fit(X[fit_idx], Y[fit_idx])
        preds[held_out] = model.predict(X[held_out])
        trues[held_out] = Y[held_out]
    return preds, trues
def trainAdaptive():
    '''
    train a machine learner on parametrized data examples.  save to adaptive.pkl
    '''
    # NOTE: Python 2 source (print statements; "/" below is integer division).
    print "Entering train adaptive"

    trainAndTarget = np.loadtxt('traindata.dat')
    # columns 0-1 are features (presumably observable + mass parameter —
    # TODO confirm), column 2 is the regression target
    traindata = trainAndTarget[:, 0:2]
    targetdata = trainAndTarget[:, 2]
    massPoints = np.unique(traindata[:, 1])
    # chunk and shift are computed but never used in this function;
    # kept here mirroring the sibling trainFixed()
    chunk = len(traindata) / len(massPoints) / 2
    shift = len(traindata) / 2

    print "training adaptive"
    # unlike trainFixed(), train on ALL mass points (parametrized learning)
    clf = svm.NuSVR()
    clf.fit(traindata, targetdata)
    joblib.dump(clf, 'adaptive.pkl')
def model_svm(s, t, s_, t_, flagLinear):
    # bad r2
    # NOTE: Python 2 source (print statements).
    # Fit a kernel NuSVR (flagLinear == 0) or a LinearSVR (otherwise) on the
    # training pair (s, t), then print coefficients and train/test R^2
    # against (s, t) and the held-out (s_, t_).
    if flagLinear == 0:
        #http://scikit-learn.org/stable/modules/generated/sklearn.svm.NuSVR.html#sklearn.svm.NuSVR
        clf = sksvm.NuSVR(nu=0.5, C=1.0, kernel='rbf', degree=5, gamma='auto',
                          coef0=0.0, shrinking=True, tol=0.001, cache_size=200,
                          verbose=False, max_iter=1000)
        clf.fit(s, t)
    else:
        # http://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVR.html#sklearn.svm.LinearSVR
        # this loss function is L1, thus insensitive to outliers
        clf = sksvm.LinearSVR(epsilon=0.0, tol=0.0001, C=1.0,
                              loss='epsilon_insensitive', fit_intercept=True,
                              intercept_scaling=1.0, dual=True, verbose=0,
                              random_state=None, max_iter=1000)
        clf.fit(s, t)
    # NOTE(review): coef_ is only defined for linear models — with the rbf
    # NuSVR branch above, this line should raise AttributeError; confirm
    # whether the flagLinear == 0 path is actually exercised.
    print 'coeffs = ', clf.coef_, ' intercept = ', clf.intercept_
    r2_train = clf.score(s, t)
    r2_test = clf.score(s_, t_)
    print 'r2_train=', r2_train, ' r2_test=', r2_test
def test_immutable_coef_property():
    """Check that primal coef modification are not silently ignored"""
    # Dense and sparse SVM variants (legacy svm.sparse API), plus linear models.
    dense = (svm.SVC, svm.NuSVC, svm.SVR, svm.NuSVR)
    sparse = (svm.sparse.SVC, svm.sparse.NuSVC, svm.sparse.SVR,
              svm.sparse.NuSVR)
    svms = [cls(kernel='linear').fit(iris.data, iris.target) for cls in dense]
    svms.append(svm.OneClassSVM(kernel='linear').fit(iris.data))
    svms.extend(cls(kernel='linear').fit(iris.data, iris.target)
                for cls in sparse)
    svms.append(svm.LinearSVC().fit(iris.data, iris.target))
    svms.append(linear_model.LogisticRegression().fit(iris.data, iris.target))
    for clf in svms:
        # Rebinding coef_ must fail, as must writing into the array.
        assert_raises(AttributeError, clf.__setattr__, 'coef_', np.arange(3))
        assert_raises(RuntimeError, clf.coef_.__setitem__, (0, 0), 0)
def svrmodel(self, testlen, ntrain, kernel='linear', batch=10000):
    """Walk-forward NuSVR prediction of 'closeratio' over rolling date windows.

    Parameters
    ----------
    testlen : int
        Number of distinct dates per test period.
    ntrain : int
        Number of periods of history used for training each test period.
    kernel : str
        Kernel passed to sklearn.svm.NuSVR.
    batch : int
        Approximate sub-sample size; training rows are split into
        n_rows // batch interleaved batches and predictions are averaged.

    Returns
    -------
    pandas.DataFrame
        All test rows with an averaged 'predratio' column added.
    """
    hsmadata = self.hsmadata
    dates = pd.Series(hsmadata['date'].unique()).sort_values()
    dates.index = range(0, len(dates))
    ntest = len(dates) // testlen

    hsma = pd.DataFrame()
    for i in range(ntrain, ntest):
        # Train on the ntrain periods before the test window, leaving a
        # self.day gap to avoid look-ahead.
        traindata = hsmadata[
            (hsmadata['date'] >= dates[(i - ntrain) * testlen])
            & (hsmadata['date'] < dates[i * testlen - self.day])].copy()
        testdata = hsmadata[(hsmadata['date'] >= dates[i * testlen]) & (
            hsmadata['date'] < dates[(i + 1) * testlen])].copy()
        traindata.index = range(0, traindata.shape[0])
        testdata['predratio'] = 0

        # FIX: positional axis argument was removed from DataFrame.drop;
        # axis=1 is the explicit equivalent.
        traindata = traindata.iloc[:, 2:]
        traindatax = traindata.drop(['closeratio'], axis=1)
        traindatay = traindata['closeratio']
        testdatax = testdata[traindatax.columns]

        scaler = preprocessing.StandardScaler().fit(traindatax)
        traindatas = scaler.transform(traindatax)
        testdatas = scaler.transform(testdatax)

        n1 = traindatas.shape[0]
        nbatch = n1 // batch
        for j in range(0, nbatch):
            # Interleaved sub-sample j, j+nbatch, j+2*nbatch, ...
            rows = list(range(j, n1, nbatch))
            # FIX: .ix was removed from pandas.  The index was reset above,
            # so positional .iloc selection is the exact equivalent.
            traindataxb = pd.DataFrame(traindatas).iloc[rows]
            traindatayb = traindata['closeratio'].iloc[rows]
            svrmodel = svm.NuSVR(kernel=kernel)
            svrmodel.fit(traindataxb, traindatayb)
            testdata['predratio'] = testdata[
                'predratio'] + svrmodel.predict(testdatas)
        # Average the per-batch predictions.
        testdata['predratio'] = testdata['predratio'] / nbatch

        hsma = pd.concat([hsma, testdata], ignore_index=True)

    return (hsma)