def test_split_node(self):
    """Smoke-test the split-node path: fit a fresh Regressor on the
    module-level training data and print its predictions for x_valid."""
    model = Regressor()
    model.fit(x, y)
    print(model.predict(x_valid))
# NOTE(review): truncated fragment — the opening of the `columns` list lives
# before this chunk; only the last element and the closing bracket are visible.
# What the visible code does: trains `r` on the selected columns, splits
# train/test via `sp.split`, scores with `mt.model_accuracy`, cross-validates a
# second Regressor with a 10-fold score (`mt.set_cross_score`), then predicts
# the holdout set.  `cross_accurace` is presumably a typo for "cross_accuracy"
# — confirm before renaming, other code may reference it.  Left byte-identical
# because the enclosing statement is cut off at the chunk boundary.
'Age_categories_Senior'] target_column = 'Survived'; r.train_machine(train[columns], train[target_column]); holdout = test; all_X = train[columns] all_y = train[target_column] train_x, test_x, train_y, test_y = sp.split(train[columns], train[target_column]); # toPrint = sr.get_train()['Age'].describe(); # print(toPrint) r.train_machine(train_x, train_y); predictions = r.predict(test_x); accuracy = mt.model_accuracy(test_y, predictions); regressor_object = Regressor(); reg = regressor_object.get_regressor(); mt.set_cross_score(reg, all_X, all_y, 10) mt.sort_score(); scores = mt.get_scores(); cross_accurace = mt.get_mean(); regressor_object_1 = Regressor(); regressor_object_1.train_machine(all_X, all_y); prediction = r.predict(holdout[columns]); # back_x = train_x;
Y_train=np.array(Y_train) X_train=np.array(X_train) X_test=np.array(X_test) #### Creation of regressor reg=Regressor() #### Cross validation print "Cross validation ..." #loo = cross_validation.LeaveOneOut(len(y_df)) loo=10 scores = cross_validation.cross_val_score(reg, X_train, Y_train, scoring='mean_squared_error', cv=loo,) print "The score mean of cross validation : " print scores.mean() #### fit print "Fit ..." reg.fit(X_train, Y_train) #### Prediction print "Prediction ..." Y_pred = reg.predict(X_test) #### write the submission print "Write the submission ..." make_submission(dataTest,Y_pred) print "End."
T0 = time() print "load dataset..." X_df_2011 = pd.DataFrame.from_csv("datasets/2011.csv") X_df_2012 = pd.DataFrame.from_csv("datasets/2012.csv") X_df_2013 = pd.DataFrame.from_csv("datasets/2013.csv") X_df = pd.concat([X_df_2011, X_df_2012, X_df_2013], axis=0) print "load dates..." with open("target_dates_1.pkl") as f: dates = pickle.load(f) # date n1677, n3051 and n3451 cause trouble dates = dates.delete([1677, 3051, 3451]) sub = load_submission("data/submission.txt") pred_dates = sub.index fit_dates = load_all_data().index fit_dates = fit_dates.delete(range(18024)) # hack print "make the prediction..." # make prediction reg = Regressor() reg.fit(fit_dates) pred = reg.predict(pred_dates) print "acquire the true value..." target = X_df.loc[dates] print "compute error..." # get the error err = get_error_dfs(pred, target) print "LinExp error: ", err, "run in :", time() - T0, "s"
# NOTE(review): truncated fragment — these are the tail of branch 1 and the
# whole of branch 2 of an `if method_identifier == ...` chain whose opening
# `if` lies before this chunk, and the start of branch 3 (clustering) which is
# cut off after its import.  Branch 2 runs a Regressor, scores it with the
# coefficient of determination, and plots via Visualizer.  Left byte-identical
# because an `elif` cannot stand without its `if`.
# Visualizing the results visualizer = Visualizer() visualizer.plot_classifier_regressor(y_test, y_predicted, method_identifier) print('The accuracy is: ' + str(classifier_accuracy) + ' %') print(algorithm_name) # ---------------------Applying Regression to the data-------------------------- elif method_identifier == 2: from regressor import Regressor regressor = Regressor(algorithm_name) y_predicted = regressor.predict(X_train, y_train, X_test) regressor_score = regressor.get_score(y_test, y_predicted) # Visualizing the results visualizer = Visualizer() visualizer.plot_classifier_regressor(y_test, y_predicted, method_identifier) print('The coefficient of determination is: ' + str(regressor_score)) print(algorithm_name) # ---------------------Clustering the data------------------------------------ elif method_identifier == 3: from clustering import Clustering
# Train a regressor on the 2014 market-price series, report RMSE on the test
# split, and plot the (inverse-scaled) predicted price against the actuals.
min_max_scaler = MinMaxScaler()

df = pd.read_csv("market-price-2014.csv")
# BUG FIX: with inplace=True, DataFrame.drop returns None, so the original
# `df_norm = df.drop(df.columns[0], 1, inplace=True)` bound df_norm to None
# (it was never used).  Drop the first column in place and keep using `df`;
# also pass `axis=` by keyword (positional axis is deprecated in pandas).
df.drop(df.columns[0], axis=1, inplace=True)

data_splitter = DataSplitter(df)
df_train, df_validate, df_test = data_splitter.train_validate_test_split()

# Window the train and validation splits into (X, y) pairs (30-step input,
# 5-step output — TODO confirm the window semantics against DataSplitter).
data_splitter = DataSplitter(df_train)
x_train, y_train = data_splitter.get_XY_sets(min_max_scaler, 30, 5)
data_splitter = DataSplitter(df_validate)
x_validate, y_validate = data_splitter.get_XY_sets(min_max_scaler, 30, 5)

regressor = Regressor(x_train, y_train, x_validate, y_validate).train()

# PREDICT PRICE
test_set = df_test.values
data_splitter = DataSplitter(df_test)
inputs, outputs = data_splitter.get_XY_sets(min_max_scaler, 30, 5)
predicted_price = regressor.predict(inputs)

# RMSE in the scaled space.
x = np.array(outputs).ravel()
y = np.array(predicted_price).ravel()
rmse = sqrt(mean_squared_error(x, y))
print('RMSE: %.3f' % rmse)

# Undo the min-max scaling on the penultimate prediction window for plotting.
predicted_price = min_max_scaler.inverse_transform(
    np.array(predicted_price[-2]).reshape(-1, 1)).tolist()
plotter = Plotter(test_set[-10:-5], predicted_price)
plotter.plot()
# print(test.columns)
# print(train.columns)

# Feature columns used for training and prediction (same set in both frames).
train_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
                 'Embarked_Q', 'Embarked_S', 'Embarked_missing_data']
test_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
                'Embarked_Q', 'Embarked_S', 'Embarked_missing_data']

# print(train[train_columns].to_string())
# Scale train and test together so both use the same fitted scaler.
train[train_columns], test[test_columns] = processor_ms.scale_fit_train_test(
    train[train_columns], test[test_columns])
# print(train[train_columns].to_string())
# print(test[test_columns])

# Regressor
regressor_object_1 = Regressor()
regressor_object_1.train_machine(train[train_columns], train['Survived'])
# CONSISTENCY FIX: index the test frame with its own column list (the two
# lists are identical today, but using test_columns keeps them from drifting).
prediction = regressor_object_1.predict(test[test_columns])
prediction = prediction.astype(int)
print(prediction)

# #################
# SUBMIT ANSWER
# #################
# print(test.columns)
holdout_ids = test["PassengerId"]
sub_df = {
    "PassengerId": holdout_ids,
    "Survived": prediction,
}
ds = Data_Set(sub_df)
ds.to_csv("normalised_submission_test_fitted")
# Backward elimination: repeatedly refit OLS and drop the single least
# significant feature until every remaining p-value is <= 0.05.
max_p_value = 1
non_significant_index = -1
eliminator = None
while max_p_value > 0.05:
    # Skip the delete on the first pass (nothing selected yet).
    if non_significant_index != -1:
        x_train = np.delete(x_train, non_significant_index, 1)
        x_test = np.delete(x_test, non_significant_index, 1)
    eliminator = Back_Elimination()
    eliminator.fit_OLS(y_train, x_train)
    p_values = eliminator.get_p_values()
    max_p_value = np.amax(p_values)
    non_significant_index = list(p_values).index(max_p_value)

"""
LOGISTIC REGRESSION
"""
regressor = Regressor()
regressor.train_machine(x_train, y_train)
prediction = regressor.predict(x_test)
print(prediction)

# #################
# SUBMIT ANSWER
# #################
# print(test.columns);
holdout_ids = df_test['Id']
sub_df = {
    "Id": holdout_ids,
    "Cover_Type": prediction,
}
ds = Data_Set(sub_df)
# BUG FIX: `false` is not a Python name (NameError at runtime) — the
# boolean literal is `False`.
ds.to_csv("submission", index=False)
# Anomaly regressors that were tried previously:
# regressorA = linear_model.RANSACRegressor()
# regressorA = RadiusNeighborsRegressor(radius=1.0)
# regressorA = KNeighborsRegressor(n_neighbors=4)

# Compose the anomaly (A) and secondary (B) meta-regressors around a plain
# linear base model, then fit on the historic data.
regressorB = MetaRegressor([regressorB2])
regressorA = MetaRegressor(
    [regressorA1, regressorA2, regressorA3, regressorA4, regressorA5])
baseRegressor = linear_model.LinearRegression()
regressor = Regressor(regressorA, regressorB, baseRegressor)
regressor.fit(historic_data_set, target_data_set)

# Plot the trained models against the data they were trained on, together
# with least-squares measures (to experiment with different linear models).
predict_base, predict_anomaly, predict_total, predict_dummy = regressor.predict(
    historic_data_set)

plt.figure(1)
plt.subplot(311)
plt.plot(predict_total, label="total")
plt.plot(predict_base, label="base")
plt.plot(predict_anomaly, label="anomaly")
plt.plot(target_data_set, label="target")
plt.plot(predict_dummy, label="dummy")
plt.grid(True)
plt.legend()

# Plot the predicted values (by the model) against the actual prices for that
# week — it is this prediction that we'll feed to the scheduler.
#exp
# Split features/target 50/50, evaluate the Regressor pipeline by RMSE, then
# export the fitted extra-trees feature importances with non-null counts.
df_features = df.drop('target', axis=1)
y = df.target.values
df_train, df_test, y_train, y_test = train_test_split(
    df_features, y, test_size=0.5, random_state=42)

feature_extractor = FeatureExtractor()
model = Regressor()

X_train = feature_extractor.transform(df_train)
model.fit(X_train, y_train)

X_test = feature_extractor.transform(df_test)
y_pred = model.predict(X_test)
print('RMSE = ', np.sqrt(mean_squared_error(y_test, y_pred)))

# Map back to the original columns that survived the pipeline's imputer.
imputer = model.clf.named_steps['imputer']
# BUG FIX: np.int was deprecated in NumPy 1.20 and removed in 1.24; the
# documented replacement for casting is the builtin `int`.
valid_idx = imputer.transform(np.arange(df_train.shape[1])).astype(int)
et = model.clf.named_steps['extratreesregressor']
feature_importances = pd.DataFrame(data=et.feature_importances_,
                                   index=df_train.columns[valid_idx][0])
feature_importances['counts'] = df_train.count()[valid_idx][0]
feature_importances.to_csv('feature_importance.csv')
# NOTE(review): truncated fragment — the first statements are the body of a
# per-category loop whose header (and the binding of `i`) lies before this
# chunk, so the original indentation cannot be reconstructed safely.  Visible
# behavior: fit one Regressor per category, predict each non-empty test
# subset, write the predictions into the 'CSPL_RECEIVED_CALLS' column of each
# subset, then reassemble them with pd.concat ("on réassemble les valeurs de
# prédiction" = "reassemble the prediction values").  Left byte-identical.
print " Train et Predict the categorie : ",i reg=Regressor() reg.fit(X_train_scaled, set_Y_train[i]) #### Cross validation #print "Cross validation ...", i #loo = cross_validation.LeaveOneOut(len(y_df)) #loo=10 #scores = cross_validation.cross_val_score(reg, X_train_scaled, set_Y_train[i], scoring='neg_mean_squared_error', cv=loo,) #print "The score mean of cross validation : ", scores.mean() #score_cv_global.append(scores.mean()) if(len(set_X_test[i])>0): X_test_scaled = scaler.transform(set_X_test[i][features_train]) listPred.append( reg.predict(X_test_scaled)) i=i+1 l=0 i=0 while l<len(set_X_test): if(len(set_X_test[l])>0): set_X_test[l]['CSPL_RECEIVED_CALLS'] = listPred[i] i=i+1 l=l+1 #on réassemble les valeurs de prédiction resultPred= pd.concat(set_X_test)