train_df = pd.read_csv('/home/fotis/dev_projects/explanation_framework/input/Crimes_Workload/{0}'.format(train_datasets[p]), index_col=0)
sub = np.load('/home/fotis/dev_projects/explanation_framework/input/Subqueries/{0}'.format(sub_datasets[p]))
logger.info('Finished loading\nCommencing Evaluation')
aggregates = ['count', 'sum_', 'avg']
agg_map = {'count': 4, 'sum_': 5, 'avg': 6}
for agg in aggregates:
    logger.info("Evaluating Aggregates : {0}".format(agg))
    X_train = train_df[['x', 'y', 'x_range', 'y_range']].values
    y_train = train_df[agg].values
    sc = StandardScaler()
    sc.fit(X_train)
    X_train = sc.transform(X_train)
    # Training models
    logger.info("Model Training Initiation\n=====================")
    mars_ = Earth(feature_importance_type='gcv')
    vigilance_x = np.linspace(0.01, 3, Config.vigilance_x_frequency)
    for sens_x in vigilance_x:
        lsnr = PR(mars_, vigil_x=sens_x)
        lsnr.fit(X_train, y_train)
        logger.info("Accuracy Evaluation on Test set with vigil_x={0}\n=====================".format(sens_x))
        for i in range(1000):
            # Obtain query from test set
            dataset = p
            printProgressBar(i, 1000, prefix='Progress:', suffix='Complete', length=50)
            q = test_df.iloc[i].values[:4].reshape(1, -1)
            q = sc.transform(q)
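# Note: printProgressBar is called above but not defined in this snippet.
# A minimal sketch of such a helper (an assumption based on the common
# console-progress recipe, not necessarily this project's implementation):
def printProgressBar(iteration, total, prefix='', suffix='', length=50, fill='#'):
    """Print an in-place console progress bar."""
    filled = int(length * iteration // total)
    bar = fill * filled + '-' * (length - filled)
    percent = 100.0 * iteration / float(total)
    print('\r{0} |{1}| {2:.1f}% {3}'.format(prefix, bar, percent, suffix), end='')
    if iteration >= total:
        print()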
y = train['transactionRevenue']
X = train.drop(["fullVisitorId", "transactionRevenue"], axis=1)

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from pyearth import Earth

regression_OLS = LinearRegression()
regression_Lasso = Lasso(precompute=True, max_iter=10000, alpha=3.0)
regression_RF = RandomForestRegressor(max_leaf_nodes=100, max_features=100)
#regression_SVR = SVR(kernel='rbf', C=1e3, gamma=0.1)
regression_spline = Earth()

from sklearn import model_selection
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2)
print('training data has %d observations with %d features' % X_train.shape)
print('test data has %d observations with %d features' % X_test.shape)

from sklearn.metrics import mean_squared_error, explained_variance_score
model_names = ['Linear Regression OLS', 'Linear Regression Lasso', 'Random Forest', 'Spline_regression']
model_list = [regression_OLS, regression_Lasso, regression_RF, regression_spline]
count = 0
for regression in model_list:
    model = regression.fit(X_train, y_train)
    y_preds = model.predict(X_test)
df = pd.read_csv(dataset, sep='\t')  # equivalent to pd.read_table(dataset)
gt_mapping = {'0/0': 0, '0/1': 1, '1/1': 2}
df['GT_GATK'] = df['GT_GATK'].map(gt_mapping)
df['GT_Varscan'] = df['GT_Varscan'].map(gt_mapping)
df['GT_Freebayes'] = df['GT_Freebayes'].map(gt_mapping)
X = df.values[:100, 5:]
X = set_missing_values(X)
#print(df.columns[12])
y = np.random.randint(2, size=(int(np.shape(X)[0]), ))
#print(X)
#print(y)
earth_classifier = Pipeline([('earth', Earth(allow_missing=True)),
                             ('logistic', LogisticRegression())])
#earth_classifier = Pipeline([('earth', Earth(allow_missing=True)),
#                             ('logistic', RandomForestClassifier())])
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
ec = earth_classifier.fit(X_train, y_train)
y_hat = earth_classifier.predict(X_test)
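# Note: set_missing_values is used above but defined elsewhere. A plausible
# sketch (assumption: it replaces non-numeric sentinel entries such as './.'
# or 'NA' with NaN so that Earth(allow_missing=True) can handle them; the
# final cast assumes all remaining entries are numeric):
import numpy as np

def set_missing_values(X, sentinels=('./.', 'NA', '')):
    X = X.astype(object)
    for s in sentinels:
        X[X == s] = np.nan
    return X.astype(float)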
clf.fit(X_train, y_train)
y_eval = clf.predict(X)
prediction = pd.DataFrame(y_eval, columns=['predictions']).to_csv('outElasticNet.csv')

regr_2.fit(X_train, y_train)
y_eval = regr_2.predict(X)
prediction = pd.DataFrame(y_eval, columns=['predictions']).to_csv('outAdaBoostRegressor.csv')

clf = linear_model.Lars(n_nonzero_coefs=1)
clf.fit(X_train, y_train)
y_eval = clf.predict(X)
prediction = pd.DataFrame(y_eval, columns=['predictions']).to_csv('outLARS.csv')

"""
clf = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4), n_estimators=300, random_state=rng)
clf.fit(X_train, y_train)
y_eval = clf.predict(X)
prediction = pd.DataFrame(y_eval, columns=['predictions']).to_csv('outAdaBoostRegressor.csv')
"""

from pyearth import Earth
clf = Earth()
clf.fit(X_train, y_train)
y_eval = clf.predict(X)
prediction = pd.DataFrame(y_eval, columns=['predictions']).to_csv('outMARS.csv')
test = pd.read_csv('../data/modeltest.csv', index_col=0)
label = train['Response'].values
featextra = pd.read_csv('../feat/improve.csv', index_col=0)
train = pd.concat([train, featextra.loc[train.index]], axis=1)
test = pd.concat([test, featextra.loc[test.index]], axis=1)
featextra = pd.read_csv('../feat/duplicate.csv', index_col=0)
train = pd.concat([train, featextra.loc[train.index]], axis=1)
test = pd.concat([test, featextra.loc[test.index]], axis=1)
feat = train.columns.drop('Response')

# Build an Earth model with a LogisticRegression pipeline
earth_pipe = Pipeline([('earth', Earth(use_fast=True, allow_missing=True, penalty=0.5, max_degree=3)),
                       ('log', LogisticRegression())])
earth_pipe.fit(train[feat], label)

# Parameter tuning
#param_grid = {'earth__penalty': np.arange(1, 11, 2), 'earth__max_degree': range(1, 4)}
#gs1 = GridSearchCV(earth_pipe, param_grid, n_jobs=1, pre_dispatch=1,
#                   cv=StratifiedKFold(label, n_folds=5, shuffle=True),
#                   scoring='log_loss', verbose=2)
#gs1.fit(train[feat], label)
#print(gs1.best_params_)
#print(gs1.best_score_)
##----------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt
from pyearth import Earth


def translation_correction(cell_mesh, cell_mesh_2, buffer_cell,
                           x_pos, y_pos, z_pos, x_pos_new, y_pos_new, z_pos_new,
                           closest_no_conflict, directory):
    x_min = np.min([np.min(cell_mesh[:, 0]), np.min(cell_mesh_2[:, 0])]) - buffer_cell
    x_max = np.max([np.max(cell_mesh[:, 0]), np.max(cell_mesh_2[:, 0])]) + buffer_cell
    y_min = np.min([np.min(cell_mesh[:, 1]), np.min(cell_mesh_2[:, 1])]) - buffer_cell
    y_max = np.max([np.max(cell_mesh[:, 1]), np.max(cell_mesh_2[:, 1])]) + buffer_cell
    z_min = np.min([np.min(cell_mesh[:, 2]), np.min(cell_mesh_2[:, 2])]) - buffer_cell
    z_max = np.max([np.max(cell_mesh[:, 2]), np.max(cell_mesh_2[:, 2])]) + buffer_cell
    num_pts = len(x_pos)
    X = []; Y = []; Z = []; U = []; V = []; W = []
    for kk in range(0, num_pts):
        idx = closest_no_conflict[kk]
        if idx < len(closest_no_conflict):
            U.append(x_pos_new[idx] - x_pos[kk])
            V.append(y_pos_new[idx] - y_pos[kk])
            W.append(z_pos_new[idx] - z_pos[kk])
            X.append(x_pos_new[idx]); Y.append(y_pos_new[idx]); Z.append(z_pos_new[idx])
    # --> limit to points that aren't too close to the cell
    X_safe = []; Y_safe = []; Z_safe = []; U_safe = []; V_safe = []; W_safe = []
    num_pts = len(U)
    for kk in range(0, num_pts):
        x_out = X[kk] < x_min or X[kk] > x_max
        y_out = Y[kk] < y_min or Y[kk] > y_max
        z_out = Z[kk] < z_min or Z[kk] > z_max
        if x_out or y_out or z_out:
            X_safe.append(X[kk]); Y_safe.append(Y[kk]); Z_safe.append(Z[kk])
            U_safe.append(U[kk]); V_safe.append(V[kk]); W_safe.append(W[kk])
    X_safe = np.asarray(X_safe); Y_safe = np.asarray(Y_safe); Z_safe = np.asarray(Z_safe)
    U_safe = np.asarray(U_safe); V_safe = np.asarray(V_safe); W_safe = np.asarray(W_safe)
    # --> fit MARS models: each displacement component as a function of z position
    model_U = Earth(max_degree=2, max_terms=10)
    model_U.fit(Z_safe, U_safe)
    model_V = Earth(max_degree=2, max_terms=10)
    model_V.fit(Z_safe, V_safe)
    model_W = Earth(max_degree=2, max_terms=10)
    model_W.fit(Z_safe, W_safe)
    # --> predict the drift at each new bead position
    pred_U = model_U.predict(z_pos_new)
    pred_V = model_V.predict(z_pos_new)
    pred_W = model_W.predict(z_pos_new)
    # --> correct new bead positions
    for kk in range(0, len(x_pos_new)):
        x_pos_new[kk] = x_pos_new[kk] - pred_U[kk]
        y_pos_new[kk] = y_pos_new[kk] - pred_V[kk]
        z_pos_new[kk] = z_pos_new[kk] - pred_W[kk]
    # --> correct new cell position; all three models were fit against z,
    #     so each one is evaluated at the mesh z coordinate (column 2)
    pred_cell_0 = model_U.predict(cell_mesh_2[:, 2])
    pred_cell_1 = model_V.predict(cell_mesh_2[:, 2])
    pred_cell_2 = model_W.predict(cell_mesh_2[:, 2])
    cell_mesh_2_new = np.zeros(cell_mesh_2.shape)
    cell_mesh_2_new[:, 0] = cell_mesh_2[:, 0] - pred_cell_0
    cell_mesh_2_new[:, 1] = cell_mesh_2[:, 1] - pred_cell_1
    cell_mesh_2_new[:, 2] = cell_mesh_2[:, 2] - pred_cell_2
    # --> plot MARS models
    Z_line = np.linspace(np.min(Z), np.max(Z), 100)
    pred_line_U = model_U.predict(Z_line)
    pred_line_V = model_V.predict(Z_line)
    pred_line_W = model_W.predict(Z_line)
    plt.figure(figsize=(15, 5))
    plt.subplot(1, 3, 1)
    plt.plot(Z, U, 'b.', label='x raw')
    plt.plot(Z_line, pred_line_U, 'k--', label='fit')
    plt.xlabel('z position'); plt.ylabel('displacement')
    plt.tight_layout(); plt.legend(); plt.title('x displacements')
    plt.subplot(1, 3, 2)
    plt.plot(Z, V, 'r.', label='y raw')
    plt.plot(Z_line, pred_line_V, 'k--', label='fit')
    plt.xlabel('z position'); plt.ylabel('displacement')
    plt.tight_layout(); plt.legend(); plt.title('y displacements')
    plt.subplot(1, 3, 3)
    plt.plot(Z, W, 'g.', label='z raw')
    plt.plot(Z_line, pred_line_W, 'k--', label='fit')
    plt.xlabel('z position'); plt.ylabel('displacement')
    plt.tight_layout(); plt.legend(); plt.title('z displacements')
    plt.savefig(directory + '/translation_correction.png')
    return x_pos_new, y_pos_new, z_pos_new, cell_mesh_2_new
def fit_and_predict(X_training, Y_training, X_validation, hprm, assignments={}):
    #assert type(hprm['learning.model.benchmarks.independent_models']) == bool
    #assert type(hprm['learning.model.benchmarks.individual_inputs']) == bool
    # if not hprm['learning.model.benchmarks.independent_models']:
    #     Y_hat_training, Y_hat_validation, model = call_fitter(inputs_training, Y_training, inputs_validation, hprm)
    #     Y_hat_training = pd.DataFrame(Y_hat_training, index=Y_training.index, columns=Y_training.columns)
    #     Y_hat_validation = pd.DataFrame(Y_hat_validation, index=inputs_validation.index, columns=Y_training.columns)
    # else:
    #     Y_hat_training = pd.DataFrame(0, index=Y_training.index, columns=Y_training.columns)
    #     Y_hat_validation = pd.DataFrame(0, index=inputs_validation.index, columns=Y_training.columns)
    #     model = {}
    #     if hprm['learning.model.benchmarks.individual_inputs']:
    #         for ii, site_name in enumerate(Y_training.columns):
    #             print('\r{0}/{1}'.format(ii, Y_training.shape[1]), end='')
    #             columns_to_keep = [(name_input, transformation, parameter, location)
    #                                for (name_input, transformation, parameter, location) in inputs_training.columns
    #                                if (name_input not in assignments
    #                                    or location in assignments[name_input])]
    #             Y_hat_training[site_name], Y_hat_validation[site_name], model[site_name] = call_fitter(
    #                 inputs_training[columns_to_keep], Y_training[site_name], inputs_validation[columns_to_keep], hprm)
    #     else:
    #         for ii, site_name in enumerate(Y_training.columns):
    #             print('\r{0}/{1}'.format(ii, Y_training.shape[1]), end='')
    #             Y_hat_training[site_name], Y_hat_validation[site_name], model[site_name] = call_fitter(
    #                 inputs_training, Y_training[site_name], inputs_validation, hprm)
    # return Y_hat_training, Y_hat_validation, model

    # def call_fitter(X_training, Y_training, X_validation, hprm):
    X_mean = X_training.mean(axis=0)
    X_std = X_training.std(axis=0)
    X_training = (X_training - X_mean) / X_std
    X_validation = (X_validation - X_mean) / X_std
    method = hprm['learning.model']
    if Y_training.ndim == 2 and Y_training.shape[1] == 1:
        Y_training = Y_training[:, 0]
    if method in {'random_forests', 'regression_tree'}:
        pass
    elif method in {'xgboost', 'svr', 'mars'}:
        assert Y_training.ndim == 1
    if method == 'mars':
        model = Earth(verbose=hprm['mars.verbose'],
                      thresh=hprm['mars.thresh'])
    elif method == 'random_forests':
        model = RandomForestRegressor(n_estimators=hprm['random_forests.n_estimators'])
    elif method == 'regression_tree':
        model = DecisionTreeRegressor()
    elif method == 'svr':
        model = SVR(C=hprm['svr.C'], epsilon=hprm['svr.epsilon'])
    elif method == 'xgboost':
        model = XGBRegressor()
    else:
        raise ValueError
    model.fit(X_training.values, Y_training.values)
    Y_hat_training = model.predict(X_training.values)
    Y_hat_validation = model.predict(X_validation.values)
    return Y_hat_training, Y_hat_validation, model
model = Earth(max_terms=50, max_degree=3)
model.fit(X, y)
# Print the model
#print(model.trace())
print(model.summary())

print("MARS degree 5")
model = Earth(max_terms=20, max_degree=5)
model.fit(X, y)
# Print the model
#print(model.trace())
print(model.summary())

"""
print("=====================================")
print("MARS degree 1")
model = Earth(max_terms=70, max_degree=1)
print("Score: {}".format(crossValidation(model, X, y)))

print("MARS degree 3")
model = Earth(max_terms=50, max_degree=3)
crossValidation(model, X, y)
print("Score: {}".format(crossValidation(model, X, y)))
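# Note: crossValidation is referenced above but not shown. A minimal sketch
# (an assumption, not the original helper) using scikit-learn's
# cross_val_score, which works because pyearth's Earth follows the
# scikit-learn estimator API:
from sklearn.model_selection import cross_val_score

def crossValidation(model, X, y, folds=5):
    scores = cross_val_score(model, X, y, cv=folds, scoring='r2')
    return scores.mean()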
def accuracy_on_higgs():
    logger.info("Starting Accuracy Tests on Higgs")
    logger.info("================================")
    df = pd.read_csv('input/sample_higgs_0.01.csv', index_col=0)
    X = df[['m_bb', 'm_wwbb']].dropna().values
    y = df['label']
    min_ = np.min(X, axis=0)
    max_ = np.max(X, axis=0)
    X = (X - min_) / (max_ - min_)
    data = np.column_stack((X, y))
    x = np.linspace(0.1, 0.9, 7)
    xx, yy = np.meshgrid(x, x)
    DIMS = X.shape[1]
    cov = np.identity(DIMS) * 0.001
    cluster_centers = np.column_stack((xx.ravel(), yy.ravel()))
    query_centers = []
    # Generate queries over cluster centers
    for c in cluster_centers:
        queries = np.random.multivariate_normal(np.array(c), cov, size=40)
        query_centers.append(queries)
    query_centers = np.array(query_centers).reshape(-1, DIMS)
    ranges = np.random.uniform(low=0.005**(1/3), high=0.25**(1/3),
                               size=(query_centers.shape[0], DIMS))
    queries = []
    empty = 0
    for q, r in zip(query_centers, ranges):
        b = generate_boolean_vector(data, q, r, 2)
        res = data[b]
        if res.shape[0] == 0:
            empty += 1
        ans = float(np.mean(res[:, -1])) if res.shape[0] != 0 else 0
        qt = q.tolist()
        qt += r.tolist()
        qt.append(ans)
        queries.append(qt)
    qs = np.array(queries).reshape(-1, 2 * DIMS + 1)
    X_train, X_test, y_train, y_test = train_test_split(
        qs[:, :qs.shape[1] - 1], qs[:, -1], test_size=0.4, random_state=0)
    earth = Earth()
    lsnr = PR(earth)
    lsnr.fit(X_train, y_train)
    y_hat = np.array([float(lsnr.get_model(x.reshape(1, -1)).predict(x.reshape(1, -1)))
                      for x in X_test])
    r2 = metrics.r2_score(y_test, y_hat)
    kl = kl_divergence_error(y_test, y_hat)
    nrmse = np.sqrt(metrics.mean_squared_error(y_test, y_hat)) / np.mean(y_test)
    logger.info("R2 Score : {}\nNRMSE : {}\nKL-Divergence : {}".format(r2, nrmse, kl))
    # Linear Regression comparison
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    y_hat_lr = lr.predict(X_test)
    r2_lr = metrics.r2_score(y_test, y_hat_lr)
    kl_lr = kl_divergence_error(y_test, y_hat_lr)
    nrmse_lr = np.sqrt(metrics.mean_squared_error(y_test, y_hat_lr)) / np.mean(y_test)
    logger.info("R2 Score : {}\nNRMSE : {}\nKL-Divergence : {}".format(r2_lr, kl_lr, nrmse_lr))
    dic = {}
    dic['LPM'] = [('r2', r2), ('kl', kl), ('nrmse', nrmse)]
    dic['LR'] = [('r2', r2_lr), ('kl', kl_lr), ('nrmse', nrmse_lr)]
    # Polynomial regression comparison
    for count, degree in enumerate(np.arange(3, 10, 2)):
        model = make_pipeline(PolynomialFeatures(degree), Ridge())
        model.fit(X_train, y_train)
        y_hat = model.predict(X_test)
        r2_p = metrics.r2_score(y_test, y_hat)
        kl_p = kl_divergence_error(y_test, y_hat)
        nrmse_p = np.sqrt(metrics.mean_squared_error(y_test, y_hat)) / np.mean(y_test)
        dic["LR ({})".format(degree)] = [('r2', r2_p), ('kl', kl_p), ('nrmse', nrmse_p)]
        print("R2 for degree {} : {}".format(degree, metrics.r2_score(y_test, y_hat)))
    logger.info("==============================================")
    with open('output/Accuracy/multiple_methods_higgs.pkl', 'wb') as handle:
        pickle.dump(dic, handle)
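# Note: generate_boolean_vector and kl_divergence_error are imported from
# elsewhere in this project; the sketches below are assumptions about their
# behavior, not the original implementations.
def generate_boolean_vector(data, q, r, dims):
    """Select rows whose first `dims` columns fall inside the hyper-rectangle
    centered at q with half-widths r (assumed query semantics)."""
    b = np.ones(data.shape[0], dtype=bool)
    for d in range(dims):
        b &= (data[:, d] >= q[d] - r[d]) & (data[:, d] <= q[d] + r[d])
    return b

def kl_divergence_error(y_true, y_pred, eps=1e-12):
    """KL divergence between the two value vectors normalized to distributions
    (assumed definition; values are clipped to keep the logs finite)."""
    p = np.clip(y_true, eps, None); p = p / p.sum()
    q = np.clip(y_pred, eps, None); q = q / q.sum()
    return float(np.sum(p * np.log(p / q)))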
    10 * numpy.random.normal(size=m)
y2 = 100 * \
    (numpy.cos((X[:, 5] + X[:, 6]) / 20) - 4.0) + \
    10 * numpy.random.normal(size=m)
y = numpy.concatenate([y1[:, None], y2[:, None]], axis=1)
missing = numpy.random.binomial(1, .2, (m, n)).astype(bool)
X_full = X.copy()
X[missing] = None
idx5 = (1 - missing[:, 5]).astype(bool)
idx6 = (1 - missing[:, 6]).astype(bool)

# Fit an Earth model
model = Earth(max_degree=5, minspan_alpha=.5, allow_missing=True,
              enable_pruning=True, thresh=.001, smooth=True, verbose=True)
model.fit(X, y)

# Print the model
print(model.trace())
print(model.summary())

# Plot the model
y_hat = model.predict(X)
fig = plt.figure()
for j in [0, 1]:
    ax1 = fig.add_subplot(3, 4, 1 + 2 * j)
import numpy as np
import matplotlib.pyplot as plt
from pyearth import Earth


def generate_data():
    # First segment: slope +1 on x in [1, 50]
    alpha1 = 0
    beta1 = 1
    n_samples = 50
    noise1 = 2 * np.random.randn(n_samples)
    x1 = np.linspace(1, 50, 50)
    y1 = alpha1 + beta1 * x1 + noise1
    # Second segment: slope -1, chosen so the two lines meet at x = 50
    alpha2 = (alpha1 + beta1 * 50) * 2
    beta2 = -1
    n_samples = 50
    noise2 = 2 * np.random.randn(n_samples)
    x2 = np.linspace(50, 100, 50)
    y2 = alpha2 + beta2 * x2 + noise2
    x = np.concatenate((x1, x2), axis=None)
    y = np.concatenate((y1, y2), axis=None)
    return x, y


x, y = generate_data()
mars = Earth(max_terms=2)
mars.fit(x, y)
Y_hat = mars.predict(x)
plt.scatter(x, y)
plt.scatter(x, Y_hat)
plt.show()
def getModel(config, modelname):
    info("Getting {0} Model".format(modelname), ind=0)
    problemType = config['problem']
    modelData = getModelData(config, modelname)
    modelParams = modelData.get('params')
    retval = None

    ###########################################################################
    # Classification
    ###########################################################################
    if isClassification(problemType):
        if modelname == "logistic": retval = classifier(modelname, LogisticRegression(), modelParams)
        if modelname == "sgd": retval = classifier(modelname, SGDClassifier(), modelParams)
        if modelname == "passagg": retval = classifier(modelname, PassiveAggressiveClassifier(), modelParams)
        if modelname == "mlp": retval = classifier(modelname, MLPClassifier(), modelParams)
        if modelname == "xgboost": retval = classifier(modelname, XGBClassifier(), modelParams)
        if modelname == "gaussproc": retval = classifier(modelname, GaussianProcessClassifier(), modelParams)
        if modelname == "lda": retval = classifier(modelname, LinearDiscriminantAnalysis(), modelParams)
        if modelname == "qda": retval = classifier(modelname, QuadraticDiscriminantAnalysis(), modelParams)
        if modelname == "nb": retval = classifier(modelname, GaussianNB(), modelParams)
        if modelname == "nbbern": retval = classifier(modelname, BernoulliNB(), modelParams)
        if modelname == "nbmulti": retval = classifier(modelname, MultinomialNB(), modelParams)
        if modelname == "dtree": retval = classifier(modelname, DecisionTreeClassifier(), modelParams)
        if modelname == "kneighbors": retval = classifier(modelname, KNeighborsClassifier(), modelParams)
        if modelname == "rneighbors": retval = classifier(modelname, RadiusNeighborsClassifier(), modelParams)
        if modelname == "svmlin": retval = classifier(modelname, LinearSVC(), modelParams)
        if modelname == "svmnupoly": retval = classifier(modelname, NuSVC(), modelParams)
        if modelname == "svmnulinear": retval = classifier(modelname, NuSVC(), modelParams)
        if modelname == "svmnusigmoid": retval = classifier(modelname, NuSVC(), modelParams)
        if modelname == "svmnurbf": retval = classifier(modelname, NuSVC(), modelParams)
        if modelname == "svmepspoly": retval = classifier(modelname, SVC(), modelParams)
        if modelname == "svmepslinear": retval = classifier(modelname, SVC(), modelParams)
        if modelname == "svmepssigmoid": retval = classifier(modelname, SVC(), modelParams)
        if modelname == "svmepsrbf": retval = classifier(modelname, SVC(), modelParams)
        if modelname == "rf": retval = classifier(modelname, RandomForestClassifier(), modelParams)
        if modelname == "extratrees": retval = classifier(modelname, ExtraTreesClassifier(), modelParams)
        if modelname == "adaboost": retval = classifier(modelname, AdaBoostClassifier(), modelParams)
        if modelname == "gbm": retval = classifier(modelname, GradientBoostingClassifier(), modelParams)
        if modelname == "tpot": retval = classifier(modelname, TPOTClassifier(), modelParams)

        #######################################################################
        # External classifiers
        #######################################################################
        if modelname == "lightning": retval = external.extlightning.createLightningClassifier(modelParams)

    ###########################################################################
    # Regression
    ###########################################################################
    if isRegression(problemType):
        if modelname == "linear": retval = classifier(modelname, LinearRegression(), modelParams)
        if modelname == "ridge": retval = classifier(modelname, Ridge(), modelParams)
        if modelname == "lasso": retval = classifier(modelname, Lasso(), modelParams)
        if modelname == "elasticnet": retval = classifier(modelname, ElasticNet(), modelParams)
        if modelname == "omp": retval = classifier(modelname, OrthogonalMatchingPursuit(), modelParams)
        if modelname == "bayesridge": retval = classifier(modelname, BayesianRidge(), modelParams)
        if modelname == "ard": retval = classifier(modelname, ARDRegression(), modelParams)
        if modelname == "sgd": retval = classifier(modelname, SGDRegressor(), modelParams)
        if modelname == "passagg": retval = classifier(modelname, PassiveAggressiveRegressor(), modelParams)
        if modelname == "perceptron": retval = None
        if modelname == "huber": retval = classifier(modelname, HuberRegressor(), modelParams)
        if modelname == "theilsen": retval = classifier(modelname, TheilSenRegressor(), modelParams)
        if modelname == "ransac": retval = classifier(modelname, RANSACRegressor(), modelParams)
        if modelname == "mlp": retval = classifier(modelname, MLPRegressor(), modelParams)
        if modelname == "xgboost": retval = classifier(modelname, XGBRegressor(), modelParams)
        if modelname == "gaussproc": retval = classifier(modelname, GaussianProcessRegressor(), modelParams)
        if modelname == "dtree": retval = classifier(modelname, DecisionTreeRegressor(), modelParams)
        if modelname == "kneighbors": retval = classifier(modelname, KNeighborsRegressor(), modelParams)
        if modelname == "rneighbors": retval = classifier(modelname, RadiusNeighborsRegressor(), modelParams)
        if modelname == "svmlin": retval = classifier(modelname, LinearSVR(), modelParams)
        if modelname == "svmnupoly": retval = classifier(modelname, NuSVR(), modelParams)
        if modelname == "svmnulinear": retval = classifier(modelname, NuSVR(), modelParams)
        if modelname == "svmnusigmoid": retval = classifier(modelname, NuSVR(), modelParams)
        if modelname == "svmnurbf": retval = classifier(modelname, NuSVR(), modelParams)
        if modelname == "svmepspoly": retval = classifier(modelname, SVR(), modelParams)
        if modelname == "svmepslinear": retval = classifier(modelname, SVR(), modelParams)
        if modelname == "svmepssigmoid": retval = classifier(modelname, SVR(), modelParams)
        if modelname == "svmepsrbf": retval = classifier(modelname, SVR(), modelParams)
        if modelname == "rf": retval = classifier(modelname, RandomForestRegressor(), modelParams)
        if modelname == "extratrees": retval = classifier(modelname, ExtraTreesRegressor(), modelParams)
        if modelname == "adaboost": retval = classifier(modelname, AdaBoostRegressor(), modelParams)
        if modelname == "gbm": retval = classifier(modelname, GradientBoostingRegressor(), modelParams)
        if modelname == "isotonic": retval = classifier(modelname, IsotonicRegression(), modelParams)
        if modelname == "earth": retval = classifier(modelname, Earth(), modelParams)
        if modelname == "symbolic": retval = classifier(modelname, SymbolicRegressor(), modelParams)
        if modelname == "tpot": retval = classifier(modelname, TPOTRegressor(), modelParams)

    if retval is None:
        raise ValueError("No model with name {0} was created".format(modelname))
    model = retval.get()
    return model
from pyearth import Earth
import joblib

# Reconstruct the integer year from the digit characters of the first
# command-line argument
y_s = 0
y_s = sys.argv[1]
n = y_s
ans = 0
for i in range(len(n)):
    if n[i].isnumeric():
        ans = ans + int(n[i]) * pow(10, (len(n) - 1) - i)

gdpdata = pd.read_csv(
    "/home/cheeryluck/PycharmProjects/djangoProject2/data1/IndiaGDP.csv",
    header=None)
labels = ['Year', 'GDP']
gdpdata.columns = labels
train, test = train_test_split(gdpdata)
model6 = Earth().fit(train.iloc[:, :1], train.iloc[:, 1:])
ycap6 = model6.predict(test.iloc[:, :1])
# MSE against the held-out GDP column (the target), not the Year input
error = mean_squared_error(test.iloc[:, 1:], ycap6)
model6.predict([[2019]])
joblib.dump(
    model6, '/home/cheeryluck/PycharmProjects/djangoProject2/data1/GDP_Model.sav')
impmodel = joblib.load(
    '/home/cheeryluck/PycharmProjects/djangoProject2/data1/GDP_Model.sav')
print(impmodel.predict([[2019]]))
total_data = pd.concat([
    total_category_data,
    total_numeric_data.clip(total_numeric_data.quantile(0.01).to_dict(),
                            total_numeric_data.quantile(0.99).to_dict(),
                            axis=1)
], axis=1)
print(total_data.shape)
total_data = total_data.fillna(total_data.mean())
print(total_data.head(5))

train_data = total_data[total_data.index < 1460]
test_data = total_data[total_data.index >= 1460]

# Recursive feature elimination with an Earth model as the estimator
rfe = RFE(Earth(), step=15, verbose=2).fit(train_data, train_Y)
validKeys = list(train_data.columns[rfe.support_])
train_data = train_data[validKeys]
test_data = test_data[validKeys]

model = Earth().fit(train_data, train_Y)
predict = model.predict(test_data)
predict = np.exp(predict)  # invert the log transform applied to the target

submission = pd.DataFrame()
submission['Id'] = test_index
submission['SalePrice'] = predict
submission.to_csv(
    "C:\\Users\\hongj\\Desktop\\kaggle\\house_price\\submission.csv",
    index=False)
import numpy as np
import matplotlib.pyplot as plt
from pyearth import Earth

np.random.seed(1)
m = 1000
n = 5

X = np.random.normal(size=(m, n))
# Make X[:,1] binary
X[:, 1] = np.random.binomial(1, .5, size=m)
# The response is a linear function of the inputs
y = 2 * X[:, 0] + 3 * X[:, 1] + np.random.normal(size=m)

# Fit the earth model
model = Earth().fit(X, y)

# Print the model summary, showing linear terms
print(model.summary())

# Plot for both values of X[:,1]
y_hat = model.predict(X)
plt.figure()
plt.plot(X[:, 0], y, 'k.')
plt.plot(X[X[:, 1] == 0, 0], y_hat[X[:, 1] == 0], 'r.', label='$x_1 = 0$')
plt.plot(X[X[:, 1] == 1, 0], y_hat[X[:, 1] == 1], 'b.', label='$x_1 = 1$')
plt.legend(loc='best')
plt.xlabel('$x_0$')
plt.show()
X = np.array(X)
y = np.sin(X) + np.random.normal(size=X.shape[0]) / 10.0

# Different numbers of knots to use as the max_terms parameter of the MARS model
knots = [2, 4, 5, 10]
# Subplot coordinates for the 2x2 grid
axis = [[0, 0], [0, 1], [1, 0], [1, 1]]

# Try different values of the max_degree parameter of the MARS model
for degree in range(1, 5):
    fig, ax = plt.subplots(2, 2, figsize=(10, 10))
    for num_knot in range(4):
        # Define the MARS model with the max_terms and max_degree parameters
        model = Earth(max_terms=knots[num_knot], max_degree=degree, verbose=0)
        # Fit the model on the dataset
        model.fit(X, y)
        # Predict the model output
        y_hat = model.predict(X)
        # Plot the fits
        ax[axis[num_knot][0], axis[num_knot][1]].title.set_text(f"degree = {degree}, knots = {knots[num_knot]}")
        ax[axis[num_knot][0], axis[num_knot][1]].plot(X, y, 'r.')
        ax[axis[num_knot][0], axis[num_knot][1]].plot(X, y_hat, 'b.')
    plt.show()

# Plot the dataset distribution
plt.figure()
A simple example plotting a fit of the absolute value function.
"""
import numpy
import matplotlib.pyplot as plt

from pyearth import Earth

# Create some fake data
numpy.random.seed(2)
m = 1000
n = 10
X = 80 * numpy.random.uniform(size=(m, n)) - 40
y = numpy.abs(X[:, 6] - 4.0) + 1 * numpy.random.normal(size=m)

# Fit an Earth model
model = Earth(max_degree=1)
model.fit(X, y)

# Print the model
print(model.trace())
print(model.summary())

# Plot the model
y_hat = model.predict(X)
plt.figure()
plt.plot(X[:, 6], y, 'r.')
plt.plot(X[:, 6], y_hat, 'b.')
plt.show()
st = 'CPY012'
target, start_p, stop_p, host_path = station_sel(st, mode)
if mode == 'hour':
    n_past, n_future = 24 * 7, 72
elif mode == 'day':
    n_past, n_future = 60, 30

data = df[start_p:stop_p]
data['Day'] = data.index.dayofyear  # add day-of-year feature
# Interpolate from neighbours first; remaining NAs are filled from both directions
data = data.interpolate(limit=300000000, limit_direction='both').astype('float32')

conclude_df = pd.DataFrame()
for n_out in range(1, n_future + 1):
    X, y, xlabels = to_supervise(data, target, n_out)
    criteria = ('rss', 'gcv', 'nb_subsets')
    model = Earth(enable_pruning=True,
                  # max_degree=3,
                  # max_terms=20,
                  minspan_alpha=.5,
                  feature_importance_type=criteria,
                  verbose=True)
    model.fit(X, y, xlabels=xlabels)
    nbsub = model.summary_feature_importances(sort_by='nb_subsets')[:2000].split()[3:83]
    gcv = model.summary_feature_importances(sort_by='gcv')[:2000].split()[3:83]
    rss = model.summary_feature_importances(sort_by='rss')[:2000].split()[3:83]
    rss, gcv, nbsub = toDF(rss), toDF(gcv), toDF(nbsub)
    top20 = pd.concat([rss, gcv, nbsub], ignore_index=True).drop_duplicates('feature')
    top20['timestep'] = n_out  # ADDED: combine all results
    conclude_df = pd.concat([conclude_df, top20], ignore_index=True)
if mode == 'day':
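# Note: toDF is defined elsewhere. A plausible sketch, assuming the 80 tokens
# sliced from summary_feature_importances above are 20 rows of
# "feature rss gcv nb_subsets" (hence the top20 variable name); this is an
# assumption, not the project's implementation:
def toDF(tokens):
    rows = [tokens[i:i + 4] for i in range(0, len(tokens), 4)]
    out = pd.DataFrame(rows, columns=['feature', 'rss', 'gcv', 'nb_subsets'])
    out[['rss', 'gcv', 'nb_subsets']] = out[['rss', 'gcv', 'nb_subsets']].astype(float)
    return out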
A simple example plotting a fit of the sine function.
"""
import numpy
import matplotlib.pyplot as plt

from pyearth import Earth

# Create some fake data
numpy.random.seed(2)
m = 10000
n = 10
X = 80 * numpy.random.uniform(size=(m, n)) - 40
y = 100 * \
    numpy.abs(numpy.sin((X[:, 6]) / 10) - 4.0) + \
    10 * numpy.random.normal(size=m)

# Fit an Earth model
model = Earth(max_degree=3, minspan_alpha=.5)
model.fit(X, y)

# Print the model
print(model.trace())
print(model.summary())

# Plot the model
y_hat = model.predict(X)
plt.plot(X[:, 6], y, 'r.')
plt.plot(X[:, 6], y_hat, 'b.')
plt.show()
from matplotlib import pyplot
from sklearn import preprocessing
from sklearn.feature_extraction import DictVectorizer
from pyearth import Earth

df = pd.read_excel('relay-foods.xlsx', sheet_name='Purchase Data - Full Study')
df['OrderId'] = df['OrderId'].astype('category')
df['CommonId'] = df['CommonId'].astype('category')
df.dtypes

col_names = ['OrderDate', 'PickupDate']
df = df.drop(col_names, axis=1)
y = df['TotalCharges']
df_2 = df[['OrderId', 'UserId', 'PupId']]
#del df['OrderDate']
X = [dict(r.iteritems()) for _, r in df_2.iterrows()]
# Dense output, since pyearth's Earth does not accept sparse matrices
train_fea = DictVectorizer(sparse=False).fit_transform(X)

# Fit an Earth model
model = Earth()
model.fit(train_fea, y)

# Print the model
print(model.trace())
print(model.summary())

# Plot the model: predict on the vectorized features the model was fit on
y_hat = model.predict(train_fea)
def train(object_name, data_dir, output_dir, train_type, classifier_type,
          learned_model=None, debug=False):
    from sklearn import linear_model, tree
    from sklearn.svm import SVR
    from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
    from sklearn.ensemble import AdaBoostRegressor
    if classifier_type == 'Earth':
        from pyearth import Earth
    import numpy as np

    have_graphviz = True
    try:
        import graphviz
    except ImportError:
        have_graphviz = False

    ans = None
    saso_data = load_data_file(object_name, data_dir)

    if train_type == 'gripper_status':
        action_str = 'gs'
        actions = range(CLOSE_ACTION_ID + 1)
        x = []
        y = []
        x_index = []
        for action in actions:
            for sasor in saso_data[action]:
                #x_entry = sasor['touch_prev'] + sasor['init_joint_values']
                x_entry = sasor['next_joint_values']
                x_entry = x_entry + sasor['next_gripper'] + sasor['next_object']
                x_entry.append(sasor['next_object'][0] - sasor['next_gripper'][0])
                x_entry.append(sasor['next_object'][1] - sasor['next_gripper'][1])
                x.append(x_entry)
                x_index.append(sasor['index'])
                if action == CLOSE_ACTION_ID:
                    y.append(1)
                else:
                    y.append(0)  # gripper open

    if train_type == 'pick_success_probability':
        action_str = repr(PICK_ACTION_ID)
        x = []
        y = []
        x_index = []
        for sasor in saso_data[PICK_ACTION_ID]:
            #x_entry = sasor['touch_prev'] + sasor['init_joint_values']
            x_entry = sasor['init_joint_values']
            x_entry = x_entry + sasor['init_gripper'][0:3] + sasor['init_object'][0:3]
            x_entry.append(sasor['init_object'][0] - sasor['init_gripper'][0])
            x_entry.append(sasor['init_object'][1] - sasor['init_gripper'][1])
            x.append(x_entry)
            x_index.append(sasor['index'])
            if sasor['reward'] > 0:
                y.append(1)
            else:
                y.append(0)

    if train_type in ['pick_success_probability', 'gripper_status']:
        if learned_model is not None:
            logistic = learned_model
        else:
            print(classifier_type)
            if classifier_type == 'DTC':
                logistic = DecisionTreeClassifier(criterion='entropy')
            else:
                logistic = linear_model.LogisticRegression(max_iter=400, C=1.0)
            logistic.fit(x, y)
            joblib.dump(logistic, output_dir + '/' + classifier_type + '-' + action_str + '.pkl')
        ans = logistic
        print(logistic.score(x, y))
        print(logistic.get_params())
        print(len(x))
        if classifier_type != 'DTC':
            print(logistic.coef_)
            print(logistic.intercept_)
            yaml_out = {}
            yaml_out['coef'] = logistic.coef_.tolist()[0]
            yaml_out['intercept'] = logistic.intercept_.tolist()[0]
            write_config_in_file(output_dir + '/' + classifier_type + '-' + action_str + ".yaml", yaml_out)
        else:
            print(logistic.feature_importances_)
            #feature_names = ['t1', 't2', 'j1', 'j2']
            # Touch not required when object coordinates are known
            feature_names = ['j1', 'j2']
            feature_names = feature_names + ['gx', 'gy', 'gz', 'gxx', 'gyy', 'gzz', 'gw'][0:3]
            feature_names = feature_names + ['ox', 'oy', 'oz', 'oxx', 'oyy', 'ozz', 'ow'][0:3]
            feature_names = feature_names + ['xrel', 'yrel']
            if have_graphviz:
                dot_data = tree.export_graphviz(logistic, out_file=None,
                                                feature_names=feature_names, filled=True)
                graph = graphviz.Source(dot_data)
                graph.render(output_dir + '/' + classifier_type + '-' + action_str)
            yaml_out = {}
            yaml_out["max_depth"] = logistic.tree_.max_depth
            yaml_out["values"] = logistic.tree_.value
            yaml_out['n_nodes'] = logistic.tree_.node_count
            yaml_out['children_left'] = logistic.tree_.children_left
            yaml_out['children_right'] = logistic.tree_.children_right
            yaml_out['feature'] = logistic.tree_.feature
            yaml_out['threshold'] = logistic.tree_.threshold
            write_config_in_file(output_dir + '/' + classifier_type + '-' + action_str + ".yaml", yaml_out)
        if debug:
            for i in range(0, len(x)):
                y_bar = logistic.predict([x[i]])
                if y_bar != y[i]:
                    print(x_index[i])
                    print(x[i])
                    print(y[i])
                    print(logistic.predict_proba([x[i]]))
                    if classifier_type != 'DTC':
                        print(logistic.decision_function([x[i]]))
                        prob = np.dot(logistic.coef_[0], x[i]) + logistic.intercept_[0]
                        print(prob)
                        prob *= -1
                        prob = np.exp(prob)
                        prob += 1
                        prob = np.reciprocal(prob)
                        print(prob)

    if 'next_state' in train_type:
        actions = range(10)
        # Predictions can be 18: 7 for gripper pose, 7 for object pose,
        # 2 for joint values, 2 for touch values
        predictions = range(NUM_PREDICTIONS)
        train_type_array = train_type.split('_')
        for s in train_type_array:
            if 'action' in s:
                actions = s.split('-')[1:]
            if 'pred' in s:
                predictions = s.split('-')[1:]
        ans = {}
        for action_ in actions:
            action = int(action_)
            x = []
            y = []
            y_c = []
            l_reg = []
            l_reg_c = []
            x_index = []
            for i in range(0, NUM_PREDICTIONS):
                y.append([])
                y_c.append([])
                l_reg.append('')
                l_reg_c.append('')
            for sasor in saso_data[action]:
                if sasor['reward'] > -999:  # discard invalid states
                    x_entry = sasor['init_joint_values']
                    x_entry = x_entry + sasor['init_gripper'][0:3] + sasor['init_object'][0:3]
                    x_entry.append(sasor['init_object'][0] - sasor['init_gripper'][0])
                    x_entry.append(sasor['init_object'][1] - sasor['init_gripper'][1])
                    x.append(x_entry)
                    x_index.append(sasor['index'])
                    for p_ in predictions:
                        p = int(p_)
                        y[p].append(get_prediction_value(sasor, p))
                        y_default = get_default_value(sasor, p)
                        y_c[p].append(is_correct(p, y[p][-1], y_default))
                        """
                        try:
                            check_array(x)
                            check_array(y[p])
                        except:
                            print(x[-1])
                            print(y[p][-1])
                            print(sasor['index'])
                            assert(0 == 1)
                        """
            print(len(x))
            ans[action] = {}
            for p_ in predictions:
                p = int(p_)
                if learned_model is not None:
                    l_reg[p] = learned_model[action][p]
                else:
                    if classifier_type == 'ridge':
                        l_reg[p] = linear_model.Ridge(alpha=0.5, normalize=True)
                    elif classifier_type == 'SVR':
                        l_reg[p] = SVR(epsilon=0.2)
                    elif classifier_type in ['DTR', 'DTRM']:
                        l_reg[p] = DecisionTreeRegressor()
                    elif classifier_type == 'DTC':
                        l_reg[p] = DecisionTreeClassifier()
                    elif classifier_type == 'Earth':
                        l_reg[p] = Earth()
                    elif classifier_type == 'AdaLinear':
                        l_reg[p] = AdaBoostRegressor(linear_model.LinearRegression())
                    else:
                        l_reg[p] = linear_model.LinearRegression()
                    if classifier_type == 'DTRM':
                        l_reg[p].fit(x, np.transpose(np.array(y)))
                    elif classifier_type == 'DTC':
                        l_reg[p].fit(x, y_c[p])
                    else:
                        l_reg[p].fit(x, y[p])
                    joblib.dump(l_reg[p], output_dir + '/' + classifier_type + "-" + repr(action) + "-" + repr(p) + '.pkl')
                ans[action][p] = l_reg[p]
                if classifier_type == 'DTRM':
                    print(repr(action) + " " + repr(p) + " " + repr(l_reg[p].score(x, np.transpose(np.array(y)))))
                elif classifier_type == 'DTC':
                    print(repr(action) + " " + repr(p) + " " + repr(l_reg[p].score(x, y_c[p])))
                else:
                    print(repr(action) + " " + repr(p) + " " + repr(l_reg[p].score(x, y[p])))
                print(l_reg[p].get_params())
                if classifier_type not in ['SVR', 'DTR', 'DTRM', 'AdaLinear', 'DTC']:
                    print(l_reg[p].coef_)
                if classifier_type not in ['DTR', 'DTRM', 'AdaLinear', 'DTC', 'Earth']:
                    print(l_reg[p].intercept_)
                if classifier_type in ['Earth']:
                    for j in range(0, len(x)):
                        predict_earth(l_reg[p], x[j])
                    print(l_reg[p].summary())
                if learned_model is None:
                    if classifier_type in ['DTR', 'DTRM', 'AdaLinear', 'DTC']:
                        print(l_reg[p].feature_importances_)
                        feature_names = ['j1', 'j2']
                        feature_names = feature_names + ['gx', 'gy', 'gz', 'gxx', 'gyy', 'gzz', 'gw'][0:3]
                        feature_names = feature_names + ['ox', 'oy', 'oz', 'oxx', 'oyy', 'ozz', 'ow'][0:3]
                        feature_names = feature_names + ['xrel', 'yrel']
                        if have_graphviz:
                            dot_data = tree.export_graphviz(l_reg[p], out_file=None,
                                                            feature_names=feature_names, filled=True)
                            graph = graphviz.Source(dot_data)
                            graph.render(output_dir + '/' + classifier_type + "-" + repr(action) + "-" + repr(p))
                        yaml_out = {}
                        yaml_out['max_depth'] = l_reg[p].tree_.max_depth
                        yaml_out["values"] = l_reg[p].tree_.value.tolist()
                        yaml_out['n_nodes'] = l_reg[p].tree_.node_count
                        yaml_out['children_left'] = l_reg[p].tree_.children_left.tolist()
                        yaml_out['children_right'] = l_reg[p].tree_.children_right.tolist()
                        yaml_out['feature'] = l_reg[p].tree_.feature.tolist()
                        yaml_out['threshold'] = l_reg[p].tree_.threshold.tolist()
                        write_config_in_file(output_dir + '/' + classifier_type + "-" + repr(action) + "-" + repr(p) + ".yaml", yaml_out)
                    if classifier_type in ['Earth']:
                        yaml_out = get_yaml_earth(l_reg[p])
                        write_config_in_file(output_dir + '/' + classifier_type + "-" + repr(action) + "-" + repr(p) + ".yaml", yaml_out)
                if classifier_type == 'DTRM':
                    i = 0
                    y_bar = l_reg[p].predict([x[i]])
                    print(x_index[i])
                    print(x[i])
                    y_t = np.transpose(np.array(y))
                    print(repr(y_t[i]) + ' Prediction ' + repr(y_bar))
                    break
                if debug:
                    for i in range(0, len(x)):
                        y_bar = l_reg[p].predict([x[i]])
                        if classifier_type == 'DTC':
                            if y_bar != y_c[p][i]:
                                print(x_index[i])
                                print(x[i])
                                print(y_c[p][i])
                                print(y[p][i])
                                print(l_reg[p].predict_proba([x[i]]))
                        else:
                            if is_correct(p, y_bar, y[p][i]) == 0:
                                print(x_index[i])
                                print(x[i])
                                print(repr(y[p][i]) + ' Prediction ' + repr(y_bar))
    return ans
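# Note: get_yaml_earth and predict_earth are project helpers not shown here.
# A hedged sketch of what get_yaml_earth might export, using attributes that
# pyearth's Earth does expose (basis_ and coef_); this is an assumption, not
# the project's implementation:
def get_yaml_earth(model):
    yaml_out = {}
    # Keep only the unpruned basis functions and their fitted coefficients
    yaml_out['basis'] = [str(bf) for bf in model.basis_ if not bf.is_pruned()]
    yaml_out['coefficients'] = model.coef_.ravel().tolist()
    return yaml_out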
def test_score():
    earth = Earth(**default_params)
    model = earth.fit(X, y)
    record = model.pruning_trace()
    rsq = record.rsq(record.get_selected())
    assert_almost_equal(rsq, model.score(X, y))
y = pm.reshape(-1, 1)
print(y.shape)
# Get summary statistics for y
#statistics(y)
## Frequency histogram
#drawHist(y, 'PM2.5', 'Frequency', 'the Frequency of PM2.5')
## Cumulative frequency curve
#drawCumulativeHist(y, 'PM2.5', 'Frequency', 'Curve cumulative of PM2.5')
## Box plot
#drawBox(y, 'PM2.5', 'BOX of PM2.5')

###########################################################
# MARS fit
# 1) Fit an Earth model
criteria = ('rss', 'gcv', 'nb_subsets')
model = Earth(max_degree=2, feature_importance_type=criteria)
model.fit(X, y)  # uses the standardized data

# 2) Print the model results
print(model.trace())
print(model.summary())
print(model.summary_feature_importances(sort_by='gcv'))

# 3) Predicted y
y_hat = model.predict(X)

# Evaluation metrics
#R_2 = R2((y_hat.reshape(-1, 1)), (y.reshape(-1, 1)))
# R^2 measures how closely the fitted y_hat tracks y
R_square = metrics.r2_score((y.reshape(-1, 1)), (y_hat.reshape(-1, 1)))
RMSE = sqrt(metrics.mean_squared_error(y.reshape(-1, 1), y_hat.reshape(-1, 1)))
def test_export_python_function():
    for smooth in (True, False):
        model = Earth(penalty=1, smooth=smooth, max_degree=2).fit(X, y)
        export_model = export_python_function(model)
        for exp_pred, model_pred in zip(model.predict(X), export_model(X)):
            assert_almost_equal(exp_pred, model_pred)
X_train['TotalSF'] = X_train['TotalBsmtSF'] + X_train['1stFlrSF'] + X_train['2ndFlrSF']
X_test['TotalSF'] = X_test['TotalBsmtSF'] + X_test['1stFlrSF'] + X_test['2ndFlrSF']

# Normality check for the target
ax = sns.distplot(y_train)
plt.show()

# Log-transform the dependent variable for normality
y_train = np.log(y_train)
ax = sns.distplot(y_train)
plt.show()

# MARS solution: a 2nd-degree model is needed to capture interactions; the
# penalty and alpha values keep the model simple
model = Earth(max_degree=2,
              penalty=1.0,
              minspan_alpha=0.01,
              endspan_alpha=0.01,
              endspan=5)
model.fit(X_train, y_train)
model.score(X_train, y_train)

#y_pred = model.predict(train["SalePrice"])
y_pred = model.predict(X_test)
y_pred = np.exp(y_pred)  # inverse log transform of the results
)

# Select target and feature dataset(s) --> [target, feature1, feature2, ...]
datasets = [
    Dataset('runoff', database),
    Dataset('runoff', database).normalized(),
    Dataset('temp', database).normalized(),
    Dataset('precip', database).normalized(),
    Dataset('season', database).normalized()
]

# Select lead times for target and features. negative: past / positive: future
leadtimes = [[1, 3], [-4, -1], [-4, -1], [-4, -1], [1, 1]]

# Select model
model_type = Earth(max_degree=10, smooth=True)
#model_type = Lasso(alpha=0.05, normalize=True, max_iter=3000)
#model_type = Regressor(
#    layers=[
#        Layer("Sigmoid", units=5),
#        Layer("Linear", units=1)],
#    learning_rate=0.1,
#    n_iter=1000)

# Set training interval
startyear = DateFormat(1900, 1)
endyear = DateFormat(2005, 36)
training_daterange = DateFormat.decadal_daterange(startyear, endyear)

# Set testing interval
startyear = DateFormat(2006, 1)
y1 = 120 * np.abs(np.sin((X[:, 6]) / 6) - 1.0) + 15 * np.random.normal(size=m)
y2 = 120 * np.abs(np.sin((X[:, 5]) / 6) - 1.0) + 15 * np.random.normal(size=m)
y1 = (y1 - y1.mean()) / y1.std()
y2 = (y2 - y2.mean()) / y2.std()
y_mix = np.concatenate((y1[:, np.newaxis], y2[:, np.newaxis]), axis=1)

alphas = [0.9, 0.8, 0.6, 0.4, 0.2, 0.1]
n_plots = len(alphas)
k = 1
fig = plt.figure(figsize=(10, 15))
for i, alpha in enumerate(alphas):
    # Fit an Earth model, weighting the two outputs by alpha and 1 - alpha
    model = Earth(max_degree=5,
                  minspan_alpha=.05,
                  endspan_alpha=.05,
                  max_terms=10,
                  check_every=1,
                  thresh=0.)
    output_weight = np.array([alpha, 1 - alpha])
    model.fit(X, y_mix, output_weight=output_weight)
    print(model.summary())

    # Plot the model
    y_hat = model.predict(X)
    mse = ((y_hat - y_mix) ** 2).mean(axis=0)
    ax = plt.subplot(n_plots, 2, k)
    ax.set_ylabel("Run {0}".format(i + 1), rotation=0, labelpad=20)
    plt.plot(X[:, 6], y_mix[:, 0], 'r.')
    plt.plot(X[:, 6], model.predict(X)[:, 0], 'b.')
    plt.title("MSE: {0:.3f}, Weight : {1:.1f}".format(mse[0], alpha))