# Release the intermediate objects that were only needed to build likesMAT /
# agrsARR.  The names are removed in the same order as before, so a missing
# name still raises KeyError at the same point.
for _stale_name in (
        'unqLikesLIDs',
        'profilesDF',
        'profiles',
        'profilesLSo',
        'profilesLS',
        'row',
        'tmpLS',
        'tmpAGE',
        'profsTOlikes',
        'i',
        'tmpIND',
):
    del globals()[_stale_name]
del _stale_name

# Seed NumPy's global RNG before splitting.  np.random.seed() returns None,
# so myRand is None; the name is kept in case later code refers to it.
seed = 7
myRand = np.random.seed(seed)

# Hold out 1500 samples for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    likesMAT, agrsARR, test_size=1500)

# The solver tolerance is the first command-line argument.
myTOL = float(sys.argv[1])

# Fit on the training split only (fitting on the full matrix was disabled):
# mySVM.fit(likesMAT, agrsARR)
mySVM = LinearSVR(tol=myTOL)
mySVM.fit(X_train, y_train)
y_pred = mySVM.predict(X_test)

import math

# Root-mean-squared error on the held-out samples.
myRMSE = math.sqrt(metrics.mean_squared_error(y_test, y_pred))
print("agrs, Linear SVM: ", str(myTOL), " ", myRMSE)

# Model persistence is currently disabled:
# joblib.dump(mySVM, "/Users/jamster/LinearSVM-A-agrs.xz", compress=9)
# impSVM = joblib.load("/Users/jamster/LinearSVM-A-agrs.xz")
def benchmark_regression(X_train, X_test, y_train, y_test, epsilon, delta):
    """Benchmark several regressors under different privacy mechanisms.

    Three families are evaluated and one report row is produced per
    model/hyperparameter combination:

    * "bounded": SGDRegressor, LinearSVR and RandomForestRegressor trained on
      data passed through ``make_ldp``;
    * "integrated": ``regression.LinearRegression(epsilon=epsilon)`` trained
      on the raw training data;
    * "gradient": ``FederatedLearningRegressor`` trained on the raw training
      data.

    Args:
        X_train, y_train: Training features and targets.
        X_test, y_test: Evaluation features and targets (never perturbed).
        epsilon: Privacy parameter forwarded to every mechanism.
        delta: Privacy parameter forwarded to ``make_ldp`` and
            ``FederatedLearningRegressor``.

    Returns:
        pandas.DataFrame with columns ``type``, ``model``,
        ``hyperparameters``, ``epsilon``, ``accuracy`` (the value of
        ``model.score`` on the test data) and ``time`` (seconds elapsed from
        just before data perturbation/fitting until after scoring).
    """
    rows = []

    def record(kind, model, hyperparameters, started):
        # The score is evaluated before the elapsed time is read, so the
        # reported time includes scoring.
        rows.append({
            "type": kind,
            "model": type(model).__name__,
            "hyperparameters": hyperparameters,
            "epsilon": epsilon,
            "accuracy": model.score(X_test, y_test),
            "time": time.time() - started
        })

    def run_bounded(model, hyperparameters, **ldp_options):
        # The timer covers the perturbation step as well as the fit.
        started = time.time()
        noisy_X, noisy_y = make_ldp(X_train, y_train, epsilon, delta,
                                    **ldp_options)
        model.fit(noisy_X, noisy_y)
        record("bounded", model, hyperparameters, started)

    def run_plain(kind, model, hyperparameters):
        started = time.time()
        model.fit(X_train, y_train)
        record(kind, model, hyperparameters, started)

    # SGDRegressor - Local Differential Privacy
    for alpha in [0.01, 0.1, 1.0, 10.0, 100.0]:
        run_bounded(
            SGDRegressor(alpha=alpha, loss='huber', max_iter=1000, tol=1e-3),
            "alpha=%s" % alpha,
            classification=False)

    # LinearSVR - Local Differential Privacy
    for C in [1.0, 10.0, 100.0, 1000.0]:
        run_bounded(LinearSVR(C=C, max_iter=10000),
                    "C=%s" % C,
                    classification=False)

    # RandomForestRegressor - Local Differential Privacy
    # NOTE(review): unlike the two loops above, make_ldp is called here
    # without classification=False, so its default applies. Confirm whether
    # that is intentional for a regression target.
    for n_estimators in [10, 50, 100, 1000]:
        run_bounded(RandomForestRegressor(n_estimators=n_estimators),
                    "n_estimators=%s" % n_estimators)

    # LinearRegression - Integrated
    run_plain("integrated", regression.LinearRegression(epsilon=epsilon), "")

    # FederatedLearningRegressor - Gradient
    for epochs in [8, 16, 32]:
        run_plain(
            "gradient",
            FederatedLearningRegressor(epsilon, delta, epochs=epochs, lr=1e-2),
            "epochs=%s" % epochs)

    return pd.DataFrame(rows)
# Evaluate the model configured just above this section.
regressor(X_train, y_train, X_test, y_test, ['Street'], model)

# Estimators compared in this sweep.  RidgeCV is imported explicitly: the
# original section imported only Ridge but instantiated RidgeCV, which fails
# with NameError unless RidgeCV happens to be imported elsewhere.
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import SGDRegressor

# Each estimator is run with its default hyperparameters, in the original
# order; `model` is left bound to the last estimator (SGDRegressor).
for model in (
        RidgeCV(),
        SVR(),
        MLPRegressor(),
        LinearSVR(),
        RandomForestRegressor(),
        DecisionTreeRegressor(),
        SGDRegressor(),
):
    regressor(X_train, y_train, X_test, y_test, ['Street'], model)

# get number of categories in variables
# Body of one cross-validation fold: rows whose Id appears in column `i` of
# fold_ids form the validation set, the remaining rows the training set.
# The call form of print behaves identically on Python 2 and Python 3.
print("--------------------------------------------")

# NOTE(review): `.ix` no longer exists in pandas >= 1.0. Whether `i` is a
# column label or a position cannot be told from here, so the indexer is left
# unchanged; switch to `.loc` or `.iloc` once that is confirmed.
val_id = fold_ids.ix[:, i].dropna()
idx = train["Id"].isin(list(val_id))
trainingSet = train[~idx]
validationSet = train[idx]

tr_X = np.matrix(trainingSet[feature_names])
tr_Y = np.array(trainingSet["Response"])
val_X = np.matrix(validationSet[feature_names])
val_Y = np.array(validationSet["Response"])

regm = LinearSVR(C=0.06,
                 epsilon=0.45,
                 tol=1e-5,
                 dual=True,
                 verbose=True,
                 random_state=133)
regm.fit(tr_X, tr_Y)
preds = regm.predict(val_X)

# Out-of-fold predictions next to the ground truth for this fold.
df = pd.DataFrame({
    "Id": validationSet["Id"],
    "ground_truth": validationSet["Response"],
    "linsvr_preds": preds
})
# NOTE(review): DataFrame.append was removed in pandas 2.0;
# pd.concat([linsvr_val, df], ignore_index=True) is the replacement if
# linsvr_val is a DataFrame (presumably it is - confirm at its definition).
linsvr_val = linsvr_val.append(df, ignore_index=True)
import pickle

import pandas as pd

# Keep the first 2949 rows and checkpoint them on disk.
df = df.iloc[:2949, :]
df.to_pickle("Final_Data")
# DataFrame has no read_pickle method (the original call raised
# AttributeError); reloading is done through the pandas module function.
df = pd.read_pickle("Final_Data")

# Attach the target: each output_df row writes its average annual count into
# the df row labelled by that row's FIPS value.
for idx, row in output_df.iterrows():
    df.loc[row['FIPS'], 'annual_count_avg'] = row['Average Annual Count']

# Features are every column up to and including 'WATR' (label-based slice).
X = df.loc[:, :'WATR']
y = df['annual_count_avg']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Each score() call below returns R^2 on the held-out split.  The values are
# only displayed when run interactively; as a plain script they are discarded.
from sklearn.svm import LinearSVR

svr = LinearSVR(random_state=0, tol=1e-5).fit(X_train, y_train)
svr.score(X_test, y_test)

from sklearn import svm

# NOTE: this rebinds `svm` from the sklearn module to the fitted SVR
# estimator; the name is kept because later code may rely on it.
svm = svm.SVR().fit(X_train, y_train)
svm.score(X_test, y_test)

from sklearn.svm import NuSVR

nuSVR = NuSVR().fit(X_train, y_train)
nuSVR.score(X_test, y_test)

from sklearn import linear_model

ridge = linear_model.Ridge(alpha=0.5).fit(X_train, y_train)
ridge.score(X_test, y_test)
# Position of the feature with the largest ridge coefficient.
np.argmax(ridge.coef_)
# LightGBM hyperparameters (2**7 == 128).
lgb_params = dict(
    feature_fraction=0.75,
    metric='rmse',
    nthread=1,
    min_data_in_leaf=128,
    bagging_fraction=0.75,
    learning_rate=0.03,
    objective='mse',
    bagging_seed=128,
    num_leaves=128,
    bagging_freq=1,
    verbose=0,
)

svm = LinearSVR(C=1.0, verbose=True)
scaler = StandardScaler()

# Clamp the target into [0, 20] before it is extracted below.
train['target'] = train['target'].clip(0, 20)

print('--------------- Scaling Features --------------')

# Training features: drop the excluded columns, downcast, then fit the scaler
# and scale in a single step.
x_train = scaler.fit_transform(downcast_types(train.drop(to_drop, axis=1)))
y_train = train['target']

# Test features go through the same steps, reusing the scaler statistics
# fitted on the training data.
test.drop(to_drop, axis=1, inplace=True)
test = scaler.transform(downcast_types(test))

# The raw training frame is no longer needed; release it.
del train
gc.collect()
def StandardLinearSVR(epsilon=0.1):
    """Build a pipeline that standardizes features and then fits a LinearSVR.

    Args:
        epsilon: Passed through as the ``epsilon`` argument of ``LinearSVR``.

    Returns:
        An unfitted ``Pipeline`` with the steps ``"std_scaler"``
        (``StandardScaler``) and ``"linearSVR"`` (``LinearSVR``).
    """
    steps = [
        ("std_scaler", StandardScaler()),
        ("linearSVR", LinearSVR(epsilon=epsilon)),
    ]
    return Pipeline(steps)
def default_datasets(carrier, id_airport,
                     data_dir='/Users/sarveshprattipati/Downloads/flight-delays'):
    """Explore the flight-delay data and fit five departure-delay models.

    The function is a notebook turned into a single routine. It

    1. loads ``flights.csv``, ``airports.csv`` and ``airlines.csv`` from
       ``data_dir`` and cleans the date/time columns;
    2. draws the exploratory figures (pie chart, strip plot, count plot,
       bar plot, heat maps);
    3. splits the flights 80/20 at random into train and test sets;
    4. for the flights of ``carrier`` departing from ``id_airport``, fits a
       linear regression and a ridge regression on the mean delay per
       scheduled departure time, and a random forest, a LinearSVR (labelled
       "NN" in the output) and a linear-kernel SVR on the per-flight delays
       (flights delayed by 60 minutes or more are excluded from these three).

    Side effects: prints the train/test MSE of every model, creates matplotlib
    figures, changes matplotlib/seaborn/pandas global settings, silences all
    warnings and writes ``Model_dataset.csv`` to the working directory.

    Args:
        carrier: Airline IATA code used to select the modelling subset.
        id_airport: Origin airport IATA code used to select the modelling
            subset.
        data_dir: Directory containing the three CSV files.

    Returns:
        A 5-tuple with the mean test-set prediction of the linear, ridge,
        random-forest, LinearSVR and SVR models, in that order.
    """
    import datetime
    import os
    import warnings
    import scipy
    import pandas as pd
    import numpy as np
    import seaborn as sns
    import matplotlib as mpl
    import matplotlib.pyplot as plt
    import matplotlib.patches as patches
    from matplotlib.patches import ConnectionPatch
    from collections import OrderedDict
    from matplotlib.gridspec import GridSpec
    from sklearn import metrics, linear_model
    from sklearn.preprocessing import PolynomialFeatures, StandardScaler
    from sklearn.preprocessing import LabelEncoder, OneHotEncoder
    from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
    from scipy.optimize import curve_fit
    from sklearn.metrics import r2_score
    from random import sample
    import matplotlib.patches as mpatches
    from sklearn.linear_model import Ridge
    from sklearn.ensemble import RandomForestRegressor
    from scipy.stats import spearmanr, pearsonr
    # LinearSVR is used by the "NN" section below; import it next to SVR so
    # the function does not depend on a module-level import.
    from sklearn.svm import SVR, LinearSVR

    plt.rcParams["patch.force_edgecolor"] = True
    plt.style.use('fivethirtyeight')
    mpl.rc('patch', edgecolor='dimgray', linewidth=1)
    from IPython.core.interactiveshell import InteractiveShell
    InteractiveShell.ast_node_interactivity = "last_expr"
    pd.options.display.max_columns = 50
    warnings.filterwarnings("ignore")

    df = pd.read_csv(os.path.join(data_dir, 'flights.csv'), low_memory=False)
    print('Dataframe dimensions:', df.shape)
    airports = pd.read_csv(os.path.join(data_dir, 'airports.csv'))
    airlines_names = pd.read_csv(os.path.join(data_dir, 'airlines.csv'))
    abbr_companies = airlines_names.set_index('IATA_CODE')['AIRLINE'].to_dict()
    # The carrier / id_airport arguments are used as given. Earlier revisions
    # overwrote them here with 'AA' / 'DFW', and the heat-map loop below then
    # overwrote `carrier` a second time.

    # ------------------------------------------------------------------
    # 1. Cleaning
    # 1.1 Dates and times
    # ------------------------------------------------------------------
    # YEAR, MONTH and DAY are merged into a single DATE column.
    df['DATE'] = pd.to_datetime(df[['YEAR', 'MONTH', 'DAY']])

    # The time columns are coded as HHMM numbers; the helpers below convert
    # them to datetime.time and merge the take-off time with the flight date.

    def format_heure(chaine):
        """Convert an 'HHMM' number to datetime.time (NaN stays NaN)."""
        if pd.isnull(chaine):
            return np.nan
        if chaine == 2400:
            chaine = 0
        chaine = "{0:04d}".format(int(chaine))
        return datetime.time(int(chaine[0:2]), int(chaine[2:4]))

    def combine_date_heure(x):
        """Combine a (date, time) pair into a datetime.datetime."""
        if pd.isnull(x[0]) or pd.isnull(x[1]):
            return np.nan
        return datetime.datetime.combine(x[0], x[1])

    def create_flight_time(df, col):
        """Combine the DATE column with the HHMM column `col`."""
        liste = []
        for index, cols in df[['DATE', col]].iterrows():
            if pd.isnull(cols[1]):
                liste.append(np.nan)
            elif float(cols[1]) == 2400:
                # 24:00 means midnight of the following day.
                cols[0] += datetime.timedelta(days=1)
                cols[1] = datetime.time(0, 0)
                liste.append(combine_date_heure(cols))
            else:
                cols[1] = format_heure(cols[1])
                liste.append(combine_date_heure(cols))
        return pd.Series(liste)

    df['SCHEDULED_DEPARTURE'] = create_flight_time(df, 'SCHEDULED_DEPARTURE')
    df['DEPARTURE_TIME'] = df['DEPARTURE_TIME'].apply(format_heure)
    df['SCHEDULED_ARRIVAL'] = df['SCHEDULED_ARRIVAL'].apply(format_heure)
    df['ARRIVAL_TIME'] = df['ARRIVAL_TIME'].apply(format_heure)

    # ------------------------------------------------------------------
    # 1.2 Filling factor
    # ------------------------------------------------------------------
    variables_to_remove = [
        'TAXI_OUT', 'TAXI_IN', 'WHEELS_ON', 'WHEELS_OFF', 'YEAR', 'MONTH',
        'DAY', 'DAY_OF_WEEK', 'DATE', 'AIR_SYSTEM_DELAY', 'SECURITY_DELAY',
        'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY', 'DIVERTED',
        'CANCELLED', 'CANCELLATION_REASON', 'FLIGHT_NUMBER', 'TAIL_NUMBER',
        'AIR_TIME'
    ]
    df.drop(variables_to_remove, axis=1, inplace=True)
    df = df[[
        'AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
        'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME', 'DEPARTURE_DELAY',
        'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME', 'ARRIVAL_DELAY',
        'SCHEDULED_TIME', 'ELAPSED_TIME'
    ]]
    # Per-column share of non-missing values (kept from the notebook; it is
    # not used further below).
    missing_df = df.isnull().sum(axis=0).reset_index()
    missing_df.columns = ['variable', 'missing values']
    missing_df['filling factor (%)'] = (
        df.shape[0] - missing_df['missing values']) / df.shape[0] * 100
    # The notebook judged the filling factor good enough to simply drop every
    # row that has a missing value.
    df.dropna(inplace=True)

    # ------------------------------------------------------------------
    # 2. Exploration
    # 2.1 Basic statistical description of airlines
    # ------------------------------------------------------------------
    def get_stats(group):
        """Return min, max, count and mean of a groupby group."""
        return {
            'min': group.min(),
            'max': group.max(),
            'count': group.count(),
            'mean': group.mean()
        }

    global_stats = df['DEPARTURE_DELAY'].groupby(
        df['AIRLINE']).apply(get_stats).unstack()
    global_stats = global_stats.sort_values('count')

    font = {'family': 'normal', 'weight': 'bold', 'size': 15}
    mpl.rc('font', **font)
    # Subset of columns with the airline codes replaced by full names.
    df2 = df.loc[:, ['AIRLINE', 'DEPARTURE_DELAY']]
    df2['AIRLINE'] = df2['AIRLINE'].replace(abbr_companies)
    colors = [
        'royalblue', 'grey', 'wheat', 'c', 'firebrick', 'seagreen',
        'lightskyblue', 'lightcoral', 'yellowgreen', 'gold', 'tomato',
        'violet', 'aquamarine', 'chartreuse'
    ]
    fig = plt.figure(1, figsize=(16, 15))
    gs = GridSpec(2, 1)
    ax1 = fig.add_subplot(gs[0, 0])
    ax2 = fig.add_subplot(gs[1, 0])
    labels = list(global_stats.index)

    # Pie chart of the mean delay at departure (negative means clipped to 0).
    sizes = [max(s, 0) for s in global_stats['mean'].values]
    total_size = sum(sizes)
    # explode and the label loop are sized by the wedges actually drawn
    # rather than by len(abbr_companies), which need not match.
    explode = [0.0 if s < 20000 else 0.01 for s in sizes]
    wedges, texts, autotexts = ax1.pie(
        sizes,
        explode=explode,
        labels=labels,
        colors=colors,
        shadow=False,
        startangle=0,
        autopct=lambda p: '{:.0f}'.format(p * total_size / 100))
    for text in texts:
        text.set_fontsize(14)
    ax1.axis('equal')
    ax1.set_title('Mean delay at origin',
                  bbox={
                      'facecolor': 'midnightblue',
                      'pad': 5
                  },
                  color='w',
                  fontsize=18)

    # Strip plot with all delay values; the colors are reordered to correspond
    # with the pie chart.
    colors = [
        'firebrick', 'gold', 'lightcoral', 'aquamarine', 'c', 'yellowgreen',
        'grey', 'seagreen', 'tomato', 'violet', 'wheat', 'chartreuse',
        'lightskyblue', 'royalblue'
    ]
    ax2 = sns.stripplot(y="AIRLINE",
                        x="DEPARTURE_DELAY",
                        size=4,
                        palette=colors,
                        data=df2,
                        linewidth=0.5,
                        jitter=True)
    plt.setp(ax2.get_xticklabels(), fontsize=14)
    plt.setp(ax2.get_yticklabels(), fontsize=14)
    ax2.set_xticklabels([
        '{:2.0f}h{:2.0f}m'.format(*[int(y) for y in divmod(x, 60)])
        for x in ax2.get_xticks()
    ])
    plt.xlabel('Departure delay',
               fontsize=18,
               bbox={
                   'facecolor': 'midnightblue',
                   'pad': 5
               },
               color='w',
               labelpad=20)
    ax2.yaxis.label.set_visible(False)
    plt.tight_layout(w_pad=3)

    # Delay levels: 0 = on time (<= 5 min), 1 = small delay, 2 = > 45 min.
    def delay_type(x):
        if x > 45:
            return 2
        return 1 if x > 5 else 0

    df['DELAY_LEVEL'] = df['DEPARTURE_DELAY'].apply(delay_type)
    fig = plt.figure(1, figsize=(10, 7))
    ax = sns.countplot(y="AIRLINE", hue='DELAY_LEVEL', data=df)
    # Replace the abbreviations by the full company names.
    labels = [abbr_companies[item.get_text()] for item in ax.get_yticklabels()]
    ax.set_yticklabels(labels)
    plt.setp(ax.get_xticklabels(), fontsize=12, weight='normal', rotation=0)
    plt.setp(ax.get_yticklabels(), fontsize=12, weight='bold', rotation=0)
    ax.yaxis.label.set_visible(False)
    plt.xlabel('Flight count', fontsize=16, weight='bold', labelpad=10)
    L = plt.legend()
    L.get_texts()[0].set_text('on time (t < 5 min)')
    L.get_texts()[1].set_text('small delay (5 < t < 45 min)')
    L.get_texts()[2].set_text('large delay (t > 45 min)')
    plt.show()

    # ------------------------------------------------------------------
    # 2.2 Departure vs arrival delays
    # ------------------------------------------------------------------
    mpl.rcParams.update(mpl.rcParamsDefault)
    mpl.rcParams['hatch.linewidth'] = 2.0
    fig = plt.figure(1, figsize=(11, 6))
    # NOTE(review): `ci=None` is deprecated in recent seaborn releases in
    # favour of `errorbar=None`; left unchanged because the installed version
    # is unknown.
    ax = sns.barplot(x="DEPARTURE_DELAY",
                     y="AIRLINE",
                     data=df,
                     color="lightskyblue",
                     ci=None)
    ax = sns.barplot(x="ARRIVAL_DELAY",
                     y="AIRLINE",
                     data=df,
                     color="r",
                     hatch='///',
                     alpha=0.0,
                     ci=None)
    labels = [abbr_companies[item.get_text()] for item in ax.get_yticklabels()]
    ax.set_yticklabels(labels)
    ax.yaxis.label.set_visible(False)
    plt.xlabel('Mean delay [min] (@departure: blue, @arrival: hatch lines)',
               fontsize=14,
               weight='bold',
               labelpad=10)

    # Mean departure delay per origin airport and airline. The loop variable
    # is deliberately not called `carrier`, so the argument stays intact for
    # the modelling sections.
    airport_mean_delays = pd.DataFrame(pd.Series(
        df['ORIGIN_AIRPORT'].unique()))
    airport_mean_delays.set_index(0, drop=True, inplace=True)
    for airline_code in abbr_companies.keys():
        df1 = df[df['AIRLINE'] == airline_code]
        per_airport = df1['DEPARTURE_DELAY'].groupby(
            df['ORIGIN_AIRPORT']).apply(get_stats).unstack()
        airport_mean_delays[airline_code] = per_airport.loc[:, 'mean']

    identify_airport = airports.set_index('IATA_CODE')['CITY'].to_dict()
    sns.set(context="paper")
    fig = plt.figure(1, figsize=(8, 8))
    # Two side-by-side heat maps: airports 0-49 and airports 50-99.
    for position, rows in ((1, slice(None, 50)), (2, slice(50, 100))):
        ax = fig.add_subplot(1, 2, position)
        subset = airport_mean_delays.iloc[rows, :].rename(
            columns=abbr_companies)
        subset = subset.rename(index=identify_airport)
        sns.heatmap(subset,
                    linewidths=0.01,
                    cmap="Accent",
                    mask=subset.isnull(),
                    vmin=0,
                    vmax=35)
        plt.setp(ax.get_xticklabels(), fontsize=10, rotation=85)
        ax.yaxis.label.set_visible(False)
    fig.text(0.5,
             1.02,
             "Delays: impact of the origin airport",
             ha='center',
             fontsize=18)
    plt.tight_layout()

    # ------------------------------------------------------------------
    # Common class for the model figures
    # ------------------------------------------------------------------
    class Figure_style():
        """Thin wrapper around plt.subplots with a fixed look."""

        def __init__(self, size_x=11, size_y=5, nrows=1, ncols=1):
            sns.set_style("white")
            sns.set_context("notebook",
                            font_scale=1.2,
                            rc={"lines.linewidth": 2.5})
            self.fig, axs = plt.subplots(nrows=nrows,
                                         ncols=ncols,
                                         figsize=(size_x, size_y))
            # Always expose the axes as a 2D array. Reshaping to
            # (nrows, ncols) also covers nrows > 1 and ncols > 1, for which
            # self.axs used to be left undefined.
            self.axs = np.reshape(axs, (nrows, ncols))

        def pos_update(self, ix, iy):
            self.ix, self.iy = ix, iy

        def style(self):
            current = self.axs[self.ix, self.iy]
            current.spines['right'].set_visible(False)
            current.spines['top'].set_visible(False)
            current.yaxis.grid(color='lightgray', linestyle=':')
            current.xaxis.grid(color='lightgray', linestyle=':')
            current.tick_params(axis='both',
                                which='major',
                                labelsize=10,
                                size=5)

        def draw_legend(self, location='upper right'):
            legend = self.axs[self.ix, self.iy].legend(loc=location,
                                                       shadow=True,
                                                       facecolor='g',
                                                       frameon=True)
            legend.get_frame().set_facecolor('whitesmoke')

        def cust_plot(self, x, y, color='b', linestyle='-', linewidth=1,
                      marker=None, label=''):
            # `marker`, when given, is (markerfacecolor, marker, markersize).
            if marker:
                markerfacecolor, marker, markersize = marker[:]
                self.axs[self.ix, self.iy].plot(x,
                                                y,
                                                color=color,
                                                linestyle=linestyle,
                                                linewidth=linewidth,
                                                marker=marker,
                                                label=label,
                                                markerfacecolor=markerfacecolor,
                                                markersize=markersize)
            else:
                self.axs[self.ix, self.iy].plot(x,
                                                y,
                                                color=color,
                                                linestyle=linestyle,
                                                linewidth=linewidth,
                                                label=label)
            self.fig.autofmt_xdate()

        def cust_plot_date(self, x, y, color='lightblue', linestyle='-',
                           linewidth=1, markeredge=False, label=''):
            markeredgewidth = 1 if markeredge else 0
            # Honour the `color` argument (it used to be ignored in favour of
            # a hard-coded 'lightblue', which is still the default).
            self.axs[self.ix, self.iy].plot_date(x,
                                                 y,
                                                 color=color,
                                                 markeredgecolor='grey',
                                                 markeredgewidth=markeredgewidth,
                                                 label=label)

        def cust_scatter(self, x, y, color='lightblue', markeredge=False,
                         label=''):
            markeredgewidth = 1 if markeredge else 0
            self.axs[self.ix, self.iy].scatter(x,
                                               y,
                                               color=color,
                                               edgecolor='grey',
                                               linewidths=markeredgewidth,
                                               label=label)

        def set_xlabel(self, label, fontsize=14):
            self.axs[self.ix, self.iy].set_xlabel(label, fontsize=fontsize)

        def set_ylabel(self, label, fontsize=14):
            self.axs[self.ix, self.iy].set_ylabel(label, fontsize=fontsize)

        def set_xlim(self, lim_inf, lim_sup):
            self.axs[self.ix, self.iy].set_xlim([lim_inf, lim_sup])

        def set_ylim(self, lim_inf, lim_sup):
            self.axs[self.ix, self.iy].set_ylim([lim_inf, lim_sup])

    # ------------------------------------------------------------------
    # 3. Modelling
    # ------------------------------------------------------------------
    # Random 80:20 train/test split (unseeded, so it differs between runs).
    df_train = df.sample(frac=0.8)
    df_test = df.loc[~df.index.isin(df_train.index)]
    df = df_train

    def minutes_since_midnight(t):
        return t.hour * 60 + t.minute

    def get_flight_delays(df, carrier, id_airport, extrem_values=False):
        """Delay statistics per scheduled departure time for one route origin."""
        # Work on a copy so that the column assignments below do not touch a
        # view of the caller's frame.
        df2 = df[(df['AIRLINE'] == carrier)
                 & (df['ORIGIN_AIRPORT'] == id_airport)].copy()
        # Optionally remove extreme values before fitting.
        if extrem_values:
            df2['DEPARTURE_DELAY'] = df2['DEPARTURE_DELAY'].apply(
                lambda x: x if x < 60 else np.nan)
            # The result of dropna used to be discarded, so the NaN markers
            # were never actually removed.
            df2 = df2.dropna(how='any')
        df2.sort_values('SCHEDULED_DEPARTURE', inplace=True)
        df2['schedule_depart'] = df2['SCHEDULED_DEPARTURE'].apply(
            lambda x: x.time())
        test2 = df2['DEPARTURE_DELAY'].groupby(
            df2['schedule_depart']).apply(get_stats).unstack()
        test2.reset_index(inplace=True)
        # The second reset adds an 'index' column; it is kept so that the
        # layout of Model_dataset.csv stays as before.
        test2.reset_index(inplace=True)
        test2['schedule_depart_mnts'] = test2['schedule_depart'].apply(
            minutes_since_midnight)
        return test2

    def create_df(df, carrier, id_airport, extrem_values=False):
        """Per-flight data (delays under 1 h) averaged per departure/arrival slot."""
        df2 = df[(df['AIRLINE'] == carrier)
                 & (df['ORIGIN_AIRPORT'] == id_airport)].copy()
        df2.dropna(how='any', inplace=True)
        df2['weekday'] = df2['SCHEDULED_DEPARTURE'].apply(
            lambda x: x.weekday())
        # Delete delays of one hour or more.
        df2['DEPARTURE_DELAY'] = df2['DEPARTURE_DELAY'].apply(
            lambda x: x if x < 60 else np.nan)
        df2.dropna(how='any', inplace=True)
        # Format the times as minutes since midnight.
        df2['schedule_depart'] = df2['SCHEDULED_DEPARTURE'].apply(
            lambda x: x.time())
        df2['schedule_depart_mnts'] = df2['schedule_depart'].apply(
            minutes_since_midnight)
        df2['schedule_arrivee'] = df2['SCHEDULED_ARRIVAL'].apply(
            minutes_since_midnight)
        # NOTE(review): with pandas >= 2.0 this mean() raises on the
        # non-numeric columns unless numeric_only=True is passed; left as is
        # because the installed pandas version is unknown.
        df3 = df2.groupby(['schedule_depart_mnts', 'schedule_arrivee'],
                          as_index=False).mean()
        return df3

    def to_column_arrays(frame, target):
        """Return (X, y) as (n, 1) arrays, X being the departure minute."""
        clean = frame[[target, 'schedule_depart_mnts']].dropna(how='any',
                                                               axis=0)
        x = np.array(clean['schedule_depart_mnts']).reshape(-1, 1)
        y = np.array(clean[target]).reshape(-1, 1)
        return x, y

    def plot_fit(x, observed, predicted):
        """Plot observed (dotted, squares) against predicted (solid) delays."""
        fig1 = Figure_style(8, 4, 1, 1)
        fig1.pos_update(0, 0)
        fig1.cust_plot(x,
                       observed,
                       color='b',
                       linestyle=':',
                       linewidth=2,
                       marker=('b', 's', 10))
        fig1.cust_plot(x, predicted, color='g', linewidth=3)
        fig1.style()
        fig1.set_ylabel('Delay (minutes)', fontsize=14)
        fig1.set_xlabel('Departure time', fontsize=14)
        # The x values are minutes since midnight, so hours/minutes come from
        # a division by 60 (the ticks used to be divided by 3600 as if they
        # were seconds).
        axis = fig1.axs[fig1.ix, fig1.iy]
        axis.set_xticklabels([
            '{:2.0f}h{:2.0f}m'.format(*divmod(int(tick), 60))
            for tick in axis.get_xticks()
        ])

    # Aggregated (mean delay per departure time) data for linear and ridge.
    train_stats = get_flight_delays(df, carrier, id_airport, False)
    train_stats.to_csv('Model_dataset.csv', sep=',')
    test_stats = get_flight_delays(df_test, carrier, id_airport, False)
    X_L_train, Y_L_train = to_column_arrays(train_stats, 'mean')
    X_L_test, Y_L_test = to_column_arrays(test_stats, 'mean')

    # Per-flight data for the random forest, LinearSVR and SVR models.
    train_flights = create_df(df, carrier, id_airport)
    test_flights = create_df(df_test, carrier, id_airport)
    X_f_train, Y_f_train = to_column_arrays(train_flights, 'DEPARTURE_DELAY')
    X_f_test, Y_f_test = to_column_arrays(test_flights, 'DEPARTURE_DELAY')

    # ------------------------- Linear regression ----------------------
    regr = linear_model.LinearRegression()
    regr.fit(X_L_train, Y_L_train)
    result_L_train = regr.predict(X_L_train)
    print("MSE Linear Train=",
          metrics.mean_squared_error(result_L_train, Y_L_train))
    result_L_test = regr.predict(X_L_test)
    print("MSE Linear Test=",
          metrics.mean_squared_error(result_L_test, Y_L_test))
    plot_fit(X_L_test, Y_L_test, result_L_test)

    # ------------------------- Ridge regression -----------------------
    parameters = [0.2, 1]
    # NOTE(review): the `normalize` argument was removed in scikit-learn 1.2;
    # left unchanged because dropping it would change the fitted model.
    ridgereg = Ridge(alpha=parameters[0], normalize=True)
    poly = PolynomialFeatures(degree=parameters[1])
    ridgereg.fit(poly.fit_transform(X_L_train), Y_L_train)
    result_R_train = ridgereg.predict(poly.transform(X_L_train))
    score_R_train = metrics.mean_squared_error(result_R_train, Y_L_train)
    print('MSE Ridge Train= {}'.format(round(score_R_train, 2)))
    # Evaluate on the held-out data. The test arrays used to be rebuilt from
    # the training arrays, so the "test" score was a second training score.
    result_test = ridgereg.predict(poly.transform(X_L_test))
    score_R_test = metrics.mean_squared_error(result_test, Y_L_test)
    # Report the MSE, as the label says and as every other model does (the
    # square root used to be printed here).
    print('MSE Ridge Test = {}'.format(round(score_R_test, 2)))
    plot_fit(X_L_test, Y_L_test, result_test)

    # ------------------------- Random forest --------------------------
    rf = RandomForestRegressor(n_estimators=100,
                               oob_score=True,
                               random_state=123456)
    rf.fit(X_f_train, Y_f_train)
    predicted_train = rf.predict(X_f_train)
    print('MSE RF Train= {}'.format(
        round(metrics.mean_squared_error(predicted_train, Y_f_train), 2)))
    predicted_test = rf.predict(X_f_test)
    score_RF_test = metrics.mean_squared_error(predicted_test, Y_f_test)
    print(' MSE RF Test = {}'.format(round(score_RF_test, 2)))
    plot_fit(X_f_test, Y_f_test, predicted_test)

    # ------------------------- "NN" (LinearSVR) -----------------------
    # The output labels say NN, but the estimator actually fitted here is a
    # LinearSVR.
    regr = LinearSVR(random_state=0)
    regr.fit(X_f_train, Y_f_train)
    predict_train_NN = regr.predict(X_f_train)
    print('MSE NN Train= {}'.format(
        round(metrics.mean_squared_error(predict_train_NN, Y_f_train), 2)))
    predict_test_NN = regr.predict(X_f_test)
    MSE_NN_test = metrics.mean_squared_error(predict_test_NN, Y_f_test)
    print('MSE NN Test = {}'.format(round(MSE_NN_test, 2)))
    plot_fit(X_f_test, Y_f_test, predict_test_NN)

    # ------------------------- SVM (linear kernel) --------------------
    regr = SVR(kernel='linear')
    regr.fit(X_f_train, Y_f_train)
    predict_train_svm = regr.predict(X_f_train)
    print('MSE svm Train= {}'.format(
        round(metrics.mean_squared_error(predict_train_svm, Y_f_train), 2)))
    predict_test_svm = regr.predict(X_f_test)
    mse_svm_test = metrics.mean_squared_error(predict_test_svm, Y_f_test)
    print('MSE svm Test= {}'.format(round(mse_svm_test, 2)))
    plot_fit(X_f_test, Y_f_test, predict_test_svm)

    return (np.mean(result_L_test), np.mean(result_test),
            np.mean(predicted_test), np.mean(predict_test_NN),
            np.mean(predict_test_svm))
def linear_svr(dataframe,
               target=None,
               drop_features=None,
               without_outliers=False,
               split=0.2):
    """Fit a ``LinearSVR`` on the numeric columns of *dataframe* and score it.

    Object-dtype columns and *drop_features* are removed, NaNs are replaced
    with ``np.nan_to_num`` and the trailing ``split`` fraction of the rows is
    held out (no shuffling) as the test set.

    Args:
        dataframe: pandas DataFrame holding the features and the target.
        target: label of the target column (must survive the column filter).
        drop_features: column label(s) to drop before fitting; ``None``
            (the default) drops nothing.
        without_outliers: accepted for interface compatibility; unused here.
        split: fraction of rows, taken from the end, used as the test set.

    Returns:
        A tuple ``(feature_importance, results, y_result, reg)``:

        * ``feature_importance`` - Series of the fitted coefficients, indexed
          by feature name.
        * ``results`` - DataFrame with one row per cross-validation fold
          (10 rows); scalar metrics are repeated on every row, the two
          ``... scores`` columns hold the per-fold values.
        * ``y_result`` - DataFrame with ``y_test`` and ``y_pred`` columns.
        * ``reg`` - the fitted ``LinearSVR``.

    Raises:
        ValueError: if *split* leaves the training or the test set empty.
    """
    # NOTE: this installs a process-wide warning filter as a side effect.
    warnings.filterwarnings("ignore",
                            category=ConvergenceWarning,
                            message="^Liblinear failed to converge")

    # Remove non-numerical and undesired features from dataframe
    dataframe = dataframe.loc[:, dataframe.dtypes != 'object']
    if drop_features is not None:
        dataframe = dataframe.drop(drop_features, axis=1)

    # Transform data into columns and define target variable
    numerical_features = dataframe.loc[:, dataframe.columns != target]
    X = np.nan_to_num(numerical_features.to_numpy())
    y = np.nan_to_num(dataframe[target].to_numpy())

    # Split the data into training/testing sets (last rows are the test set)
    n_rows = X.shape[0]
    testsplit = round(split * n_rows)
    if testsplit == 0 or testsplit >= n_rows:
        # ``X[:-0]`` is empty, so a zero-sized test split would silently
        # produce an empty *training* set; fail with a clear message instead.
        raise ValueError(
            "split=%r leaves an empty train or test set for %d rows" %
            (split, n_rows))
    X_train, X_test = X[:-testsplit], X[-testsplit:]
    y_train, y_test = y[:-testsplit], y[-testsplit:]

    # Train the linear support-vector regressor
    reg = LinearSVR(random_state=0, tol=1e-5)
    reg.fit(X_train, y_train)
    # LinearSVR.coef_ is 1-D (n_features,): use all of it, not ``coef_[0]``,
    # which is a single scalar that pandas would broadcast to every feature.
    feature_importance = pd.Series(np.ravel(reg.coef_),
                                   index=numerical_features.columns)

    # Prediction with trained model
    y_pred = reg.predict(X_test)

    # Run each cross-validation once and reuse the fold scores.
    cv_r2 = cross_val_score(reg, X, y, cv=10, scoring="r2")
    cv_explained = cross_val_score(reg, X, y, cv=10,
                                   scoring="explained_variance")

    # Build the frame in one go so the scalars are broadcast over the fold
    # rows; assigning scalars to an empty frame column-by-column loses them.
    results = pd.DataFrame({
        'Train mean': np.mean(y_train),
        'Train std': np.std(y_train),
        'Test mean': np.mean(y_test),
        'Test std': np.std(y_test),
        'Prediction mean': np.mean(y_pred),
        'Prediction std': np.std(y_pred),
        'Mean Squared Error': mean_squared_error(y_test, y_pred),
        'Mean Absolute Error': mean_absolute_error(y_test, y_pred),
        'R2 score': r2_score(y_test, y_pred),
        'Explained variance score': explained_variance_score(y_test, y_pred),
        'Cross-val R2 score (mean)': np.mean(cv_r2),
        'Cross-val R2 scores': cv_r2,
        'Cross-val explained_variance score (mean)': np.mean(cv_explained),
        'Cross-val explained_variance scores': cv_explained,
    })

    y_result = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})
    return feature_importance, results, y_result, reg
def run_ensemble_model(X, Y, modeltype='Regression', scoring='', verbose=0):
    """
    Quickly builds and runs multiple models for a clean data set(only numerics).

    Four candidate models are cross-validated and the best one is returned.

    Args:
        X: feature matrix (needs ``len()`` and ``.shape``).
        Y: target values (needs ``.std()`` for the regression branches).
        modeltype: ``'Regression'``, one of ``'TimeSeries'`` /
            ``'Time Series'`` / ``'Time_Series'``; anything else is treated
            as classification.
        scoring: scorer name passed to ``cross_val_score``; ``''`` selects
            ``'neg_mean_squared_error'`` (regression) or ``'f1'``
            (classification). The time-series branch always uses
            ``'neg_mean_squared_error'``.
        verbose: ``1`` prints the per-model and best-model scores.

    Returns:
        ``(bestmodel, bestscore, besttype)`` - the unfitted best estimator,
        its score (RMSE divided by ``Y.std()`` for the regression branches,
        the mean CV score for classification) and its display name.
    """
    seed = 99
    if len(X) <= 100000 or X.shape[1] < 50:
        NUMS = 50
        FOLDS = 3
    else:
        NUMS = 20
        FOLDS = 5
    ## create Voting models
    estimators = []
    if modeltype == 'Regression':
        if scoring == '':
            scoring = 'neg_mean_squared_error'
        scv = ShuffleSplit(n_splits=FOLDS, random_state=seed)
        model5 = LinearRegression()
        results1 = cross_val_score(model5, X, Y, cv=scv, scoring=scoring)
        estimators.append(
            ('Linear Model', model5, np.sqrt(abs(results1.mean()))))
        model6 = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(
            min_samples_leaf=2, max_depth=1, random_state=seed),
                                   n_estimators=NUMS,
                                   random_state=seed)
        results2 = cross_val_score(model6, X, Y, cv=scv, scoring=scoring)
        estimators.append(('Boosting', model6, np.sqrt(abs(results2.mean()))))
        model7 = RidgeCV(alphas=np.logspace(-10, -1, 50), cv=scv)
        results3 = cross_val_score(model7, X, Y, cv=scv, scoring=scoring)
        estimators.append(
            ('Linear Regularization', model7, np.sqrt(abs(results3.mean()))))
        ## Create an ensemble model ####
        ensemble = BaggingRegressor(DecisionTreeRegressor(random_state=seed),
                                    n_estimators=NUMS,
                                    random_state=seed)
        results4 = cross_val_score(ensemble, X, Y, cv=scv, scoring=scoring)
        estimators.append(('Bagging', ensemble, np.sqrt(abs(results4.mean()))))
        if verbose == 1:
            print(
                '\nLinear Model = %0.4f \nBoosting = %0.4f\nRegularization = %0.4f \nBagging = %0.4f'
                % tuple(rmse / Y.std() for _, _, rmse in estimators))
        # Lowest RMSE wins; sort once instead of once per returned field.
        besttype, bestmodel, best_rmse = sorted(estimators,
                                                key=lambda x: x[2],
                                                reverse=False)[0]
        bestscore = best_rmse / Y.std()
        if verbose == 1:
            print(' Best Model = %s with %0.2f Normalized RMSE score\n' %
                  (besttype, bestscore))
    elif modeltype == 'TimeSeries' or modeltype == 'Time Series' or modeltype == 'Time_Series':
        #### This section is for Time Series Models only ####
        # The RMSE computation below assumes an MSE scorer, so any
        # caller-supplied ``scoring`` is overridden in this branch.
        scoring = 'neg_mean_squared_error'
        tscv = TimeSeriesSplit(n_splits=FOLDS)
        model5 = SVR(C=0.1, kernel='rbf', degree=2)
        results1 = cross_val_score(model5, X, Y, cv=tscv, scoring=scoring)
        estimators.append(('SVR', model5, np.sqrt(abs(results1.mean()))))
        model6 = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(
            min_samples_leaf=2, max_depth=1, random_state=seed),
                                   n_estimators=NUMS,
                                   random_state=seed)
        results2 = cross_val_score(model6, X, Y, cv=tscv, scoring=scoring)
        # NOTE(review): label says 'Extra Trees' but the model is AdaBoost;
        # kept as-is because the label is returned to callers as besttype.
        estimators.append(
            ('Extra Trees', model6, np.sqrt(abs(results2.mean()))))
        model7 = LinearSVR(random_state=seed)
        results3 = cross_val_score(model7, X, Y, cv=tscv, scoring=scoring)
        estimators.append(('LinearSVR', model7, np.sqrt(abs(results3.mean()))))
        ## Create an ensemble model ####
        ensemble = BaggingRegressor(DecisionTreeRegressor(random_state=seed),
                                    n_estimators=NUMS,
                                    random_state=seed)
        results4 = cross_val_score(ensemble, X, Y, cv=tscv, scoring=scoring)
        estimators.append(('Bagging', ensemble, np.sqrt(abs(results4.mean()))))
        print('Running multiple models...')
        if verbose == 1:
            print(
                ' Instance Based = %0.4f \n Boosting = %0.4f\n Linear Model = %0.4f \n Bagging = %0.4f'
                % tuple(rmse / Y.std() for _, _, rmse in estimators))
        besttype, bestmodel, best_rmse = sorted(estimators,
                                                key=lambda x: x[2],
                                                reverse=False)[0]
        bestscore = best_rmse / Y.std()
        if verbose == 1:
            print('Best Model = %s with %0.2f Normalized RMSE score\n' %
                  (besttype, bestscore))
            # NOTE(review): assumed to belong to the verbose block - confirm.
            print('Model Results:')
    else:
        if scoring == '':
            scoring = 'f1'
        scv = StratifiedShuffleSplit(n_splits=FOLDS, random_state=seed)
        model5 = LogisticRegression(random_state=seed)
        results1 = cross_val_score(model5, X, Y, cv=scv, scoring=scoring)
        estimators.append(
            ('Logistic Regression', model5, abs(results1.mean())))
        model6 = LinearDiscriminantAnalysis()
        results2 = cross_val_score(model6, X, Y, cv=scv, scoring=scoring)
        estimators.append(
            ('Linear Discriminant', model6, abs(results2.mean())))
        model7 = ExtraTreesClassifier(n_estimators=NUMS,
                                      min_samples_leaf=2,
                                      random_state=seed)
        results3 = cross_val_score(model7, X, Y, cv=scv, scoring=scoring)
        estimators.append(('Bagging', model7, abs(results3.mean())))
        ## Create an ensemble model ####
        ensemble = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(
            random_state=seed, max_depth=1, min_samples_leaf=2),
                                      n_estimators=NUMS,
                                      random_state=seed)
        results4 = cross_val_score(ensemble, X, Y, cv=scv, scoring=scoring)
        estimators.append(('Boosting', ensemble, abs(results4.mean())))
        if verbose == 1:
            print(
                '\nLogistic Regression = %0.4f \nLinear Discriminant = %0.4f \nBagging = %0.4f \nBoosting = %0.4f'
                % tuple(score for _, _, score in estimators))
        # Highest score wins for classification metrics.
        besttype, bestmodel, bestscore = sorted(estimators,
                                                key=lambda x: x[2],
                                                reverse=True)[0]
        if verbose == 1:
            print(' Best Model = %s with %0.2f %s score\n' %
                  (besttype, bestscore, scoring))
    return bestmodel, bestscore, besttype
def trainModel(param, data, features, feature):
    """Evaluate one model configuration over three pre-computed CV runs.

    For each run the train/validation row indices are loaded from
    ``../../data/cv/{train,valid}.run<N>.txt``, the model selected by
    ``param['task']`` is fitted and its validation error is appended to
    ``err_cv``.

    :param param: dict of settings; ``param['task']`` selects the branch and
        the other keys read depend on that branch.
    :param data: DataFrame indexed positionally with ``.iloc``.
    :param features: column labels used as model inputs.
    :param feature: column label(s) used as the prediction target.
    :return: dict with ``'loss'`` (mean of the collected errors),
        ``'attachments'`` and ``'status'`` (presumably a hyperopt result
        dict - TODO confirm).

    NOTE(review): this block uses Python 2 ``print`` statements.
    """
    #we just judge our model
    #so we do not use bagging ,just one loop of CV
    train_feature = features
    pred_label = feature
    feature_valid = ['Ret_PlusOne', 'Ret_PlusTwo', 'Weight_Daily']
    #create CV
    err_cv = []
    std_cv = []
    for run in range(0, 3):
        print "this is run:%d" % (run + 1)
        train_index = loadCVIndex("../../data/cv/train.run%d.txt" % (run + 1))
        test_index = loadCVIndex("../../data/cv/valid.run%d.txt" % (run + 1))
        error_data = data.iloc[test_index]
        X_train = data.iloc[train_index][train_feature]
        X_test = data.iloc[test_index][train_feature]
        Y_train = data.iloc[train_index][pred_label]
        Y_test = data.iloc[test_index][pred_label]
        if param['task'] == 'skl_ridge':
            ridge = Ridge(alpha=param['alpha'], normalize=True)
            ridge.fit(X_train, Y_train)
            pred_value = ridge.predict(X_test)
            # Side effect: coefficients are dumped to ridge.csv on every run.
            pd.DataFrame(ridge.coef_,
                         columns=train_feature).to_csv("ridge.csv")
            pred_value = pd.DataFrame(pred_value, columns=['1', '2'])
            # Despite the name, these are the validation rows.
            train_data = data.iloc[test_index]
            print train_data.shape
            # NOTE(review): normalised with 0.7 here but 0.3 (or not at
            # all) in the other branches - confirm which is intended.
            error_train = Ret_Plus_error(
                pred_value, train_data[feature_valid]) / (40000 * 0.7 * 62)
            print error_train
            variance = 0
            err_cv.append(error_train)
            std_cv.append(variance)
        elif param['task'] == 'skl_lasso':
            lasso = Lasso(alpha=param['alpha'],
                          normalize=True,
                          fit_intercept=True,
                          tol=0.00000000001)
            lasso.fit(X_train, Y_train)
            pred_value = lasso.predict(X_test)
            pred_value = pd.DataFrame(pred_value, columns=['1', '2'])
            train_data = data.iloc[test_index]
            error_train = Ret_Plus_error(pred_value, train_data[feature_valid])
            print error_train
            variance = 0
            err_cv.append(error_train)
            std_cv.append(variance)
        elif param['task'] == 'skl_lr':
            clf = LogisticRegression(C=param['C'])
            clf.fit(X_train, Y_train)
            pred_value = clf.predict(X_test)
            error_train = 1 - accuracy_model(pred_value, Y_test)
            variance = error_train
            err_cv.append(error_train)
            std_cv.append(variance)
        elif param['task'] == 'regression':
            train_data = xgb.DMatrix(X_train, label=np.array(Y_train))
            valid_data = xgb.DMatrix(X_test, label=np.array(Y_test))
            watchlist = [(train_data, 'train'), (valid_data, 'valid')]
            bst = xgb.train(param, train_data, int(param['num_round']),
                            watchlist)
            # Rebuilt without labels for prediction.
            valid_data = xgb.DMatrix(X_test)
            pred_value = bst.predict(valid_data)
            tmp_data = error_data[feature_valid]
            # NOTE(review): loop membership of the bookkeeping lines below is
            # ambiguous in the flattened source; assumed to be per-target.
            for feat in pred_label:
                print tmp_data.shape
                print pred_value.shape
                error_train = Ret_Plus_error_xgb(
                    tmp_data, feat, list(pred_value)) / (40000 * 0.3 * 62)
                variance = 0
                err_cv.append(error_train)
                std_cv.append(variance)
                print error_train
        elif param['task'] == 'class':
            train_data = xgb.DMatrix(X_train, label=Y_train)
            valid_data = xgb.DMatrix(X_test, label=Y_test)
            watchlist = [(train_data, 'train'), (valid_data, 'valid')]
            bst = xgb.train(param, train_data, int(param['num_round']),
                            watchlist)
            valid_data = xgb.DMatrix(X_test)
            pred_value = bst.predict(valid_data)
            error_train = 1 - accuracy_model(pred_value, Y_test)
            variance = 0
            err_cv.append(error_train)
            std_cv.append(variance)
            print error_train
        elif param['task'] == 'skl_LibSVM':
            # NOTE(review): param['kernel'] is tested below but never passed
            # to SVR, so the estimator keeps its default kernel - confirm.
            svr = SVR(epsilon=param['epsilon'],
                      tol=param['tol'],
                      cache_size=param['cache_size'],
                      gamma=param['gamma'])
            # The same estimator is refitted for each of the two targets.
            svr.fit(X_train, Y_train['Ret_PlusOne'])
            pred_value1 = svr.predict(X_test)
            svr.fit(X_train, Y_train['Ret_PlusTwo'])
            pred_value2 = svr.predict(X_test)
            if param['kernel'] == 'linear':
                pd.DataFrame(svr.coef_,
                             columns=train_feature).to_csv("svr.csv")
            pred_value = pd.DataFrame({'1': pred_value1, '2': pred_value2})
            train_data = data.iloc[test_index]
            error_train = Ret_Plus_error(pred_value, train_data[feature_valid])
            # Only the printed value is normalised; the raw error is stored.
            print error_train / (40000 * 0.3 * 62)
            variance = 0
            err_cv.append(error_train)
            std_cv.append(variance)
        elif param['task'] == 'skl_linearSVR':
            print param['epsilon']
            print param['C']
            svr = LinearSVR(C=param['C'],
                            epsilon=param['epsilon'],
                            dual=param['dual'],
                            loss=param['loss'],
                            random_state=param['seed'])
            svr.fit(X_train, Y_train['Ret_PlusOne'])
            pred_value1 = svr.predict(X_test)
            svr.fit(X_train, Y_train['Ret_PlusTwo'])
            pred_value2 = svr.predict(X_test)
            pred_value = pd.DataFrame({'1': pred_value1, '2': pred_value2})
            train_data = data.iloc[test_index]
            error_train = Ret_Plus_error(pred_value, train_data[feature_valid])
            # Only the printed value is normalised; the raw error is stored.
            print error_train / (40000 * 0.3 * 62)
            variance = 0
            err_cv.append(error_train)
            std_cv.append(variance)
    #print "error.train:%f error.test:%f"%(error_train,error)
    error = np.mean(err_cv)
    # NOTE(review): rebinds std_cv to the *mean* of the errors and the value
    # is never used; the 'std' attachment below is the last run's `variance`
    # (unbound if no task branch matched) - confirm intent.
    std_cv = np.mean(err_cv)
    print "error:%f" % (error)
    return {
        'loss': error,
        'attachments': {
            'std': variance
        },
        'status': STATUS_OK
    }
# Show the first rows of the test set before fitting.
print('Header Test Rows')
print(datatest.head())
# Disabled exploration: per-column means of the test set, sorted descending.
#dictvalues ={}
#for coldata in data_new.columns:
#    dictvalues[coldata] = datatest[coldata].mean()
#print('dictvalues values')
#print(dictvalues)
##print('sorted output')
#from operator import itemgetter
#print(sorted(dictvalues.items(), key=itemgetter(1),reverse=True))
# Alternative regressors that were tried are left commented out.
#regr = linear_model.Lasso(alpha=0.1)
regr = LinearSVR(C=1.0, epsilon=0.2)
#regr = RandomForestRegressor()
#regr = AdaBoostRegressor(n_estimators=80)
regr.fit(data_new[features], y)
# NOTE(review): fit uses data_new[features] while predict receives all of
# datatest - assumes datatest already holds exactly those columns; confirm.
predictions = regr.predict(datatest)
print('predictions')
print(predictions)
# Re-read the raw test file so the output rows carry the original columns.
datatest_result = pd.read_csv('test.csv',header=0)
# presumably y was log-transformed before fitting, hence exp() - TODO confirm
datatest_result['loss'] = np.exp(predictions)
header = ["id","loss"]
datatest_result.to_csv("Results_AllState_SVR_81.csv", sep=',', columns = header,index=False)
# Print the distinct values of every column of `data` except the last one.
for col in data.columns[:-1]:
    print(data[col].unique())
def _training_results(data_dict, split_test, k_fold, training_type_list):
    """Execute supervised training for each training algorithm type.

    Options:
        - split_test: decimal number percentages (recommended 0.1 to 0.3)
        - k_fold: integer number (recommended 5 or 10)
        - training_type_list: algorithms options
            ['logistic_regression', 'decision_tree', 'svm_svc_linear',
             'svm_svc_rbf', 'svm_linear_svr', 'multinomial_nb',
             'random-forest', 'kneighbors',
             'stochastic-gradient-descent-log',
             'stochastic-gradient-descent-svm']
        - [OUTPUT] final_results: e.g.
            {'training_test': [{'name': 'Logistic Regression',
                                'accuracy': 0.9531331,
                                'classification_report': '...',
                                'confusion_matrix': '[[4233 216] [ 229 4220]]'}],
             'cross_validation': [{'name': 'Logistic Regression',
                                   'accuracy': 0.9531331, ...}]}

    Entries of ``training_type_list`` that are ``None`` or not one of the
    supported names are reported on stdout and skipped.

    :param data_dict: data passed through unchanged to ``_process_training``
    :param split_test: hold-out fraction passed to ``_process_training``
    :param k_fold: number of folds passed to ``_process_training``
    :param training_type_list: iterable of algorithm names (see above)
    :return [object] final results for each methodology:
    """
    # (display name, model factory) per supported type. The factories are
    # lambdas so that the estimator classes and the module-level settings
    # ``C`` / ``num_neighbors`` are only looked up for the models requested.
    model_factories = {
        'logistic_regression':
            ('Logistic Regression', lambda: LogisticRegression()),
        'decision_tree':
            ('Decision Tree', lambda: DecisionTreeClassifier()),
        'svm_svc_linear':
            ('SVM SVC Linear',
             lambda: SVC(kernel='linear', C=C, verbose=True)),
        'svm_svc_rbf':
            ('SVM SVC RBF', lambda: SVC(kernel='rbf', C=C, verbose=True)),
        'svm_linear_svr':
            ('SVM Linear SVR', lambda: LinearSVR(C=C, verbose=True)),
        'multinomial_nb':
            ('Multinomial Naive Bayes', lambda: MultinomialNB()),
        'random-forest':
            ('Random Forest', lambda: RandomForestClassifier()),
        'kneighbors':
            ('KNN', lambda: KNeighborsClassifier(n_neighbors=num_neighbors)),
        'stochastic-gradient-descent-log':
            ('Stochastic Gradient Descent - Logistic Regression',
             lambda: SGDClassifier(loss='log')),
        'stochastic-gradient-descent-svm':
            ('Stochastic Gradient Descent - Linear SVM',
             lambda: SGDClassifier(loss='hinge')),
    }

    final_results = {'training_test': [], 'cross_validation': []}
    for training_type in training_type_list:
        if training_type not in model_factories:
            # %-formatting also copes with ``None``, which ``+`` did not.
            print('ML not implemented for %s' % training_type)
            continue

        display_name, make_model = model_factories[training_type]
        result_dict = {
            'name': display_name,
            'accuracy': None,
            'classification_report': None,
            'confusion_matrix': None
        }
        training_results = _process_training(data_dict, result_dict,
                                             make_model(), split_test, k_fold)

        # Collect both kinds of result independently: an ``elif`` here would
        # drop the cross-validation entry whenever a train/test entry exists.
        if training_results['training_test'] is not None:
            final_results['training_test'].append(
                training_results['training_test'])
        if training_results['cross_validation'] is not None:
            final_results['cross_validation'].append(
                training_results['cross_validation'])
    return final_results
ridge = TestModel(Ridge(), df_f.drop('timestamp', inplace=False, axis=1), df_t.drop('timestamp', inplace=False, axis=1)) #%% ada = TestModel(AdaBoostRegressor(random_state=6), df_f.drop('timestamp', inplace=False, axis=1), df_t['B_C2H6']) #%% svr = TestModel(SVR(), df_f.drop('timestamp', inplace=False, axis=1), df_t['B_C2H6']) #%% lsvr = TestModel(LinearSVR(), df_f.drop('timestamp', inplace=False, axis=1), df_t['B_C2H6']) #%% #isotonic = TestModel(IsotonicRegression(), df_f.drop('timestamp', inplace=False, axis=1), df_t.drop('timestamp', inplace=False, axis=1)) #%% df_test = pd.read_csv('test_features.csv', parse_dates=['timestamp']) buildDF(df_test) model_lasso = Lasso(random_state=6).fit( df_f.drop('timestamp', inplace=False, axis=1), df_t.drop('timestamp', inplace=False, axis=1)) df_pred_lasso = model_lasso.predict(
# Tuning models and test for all features # Linear Regression linreg = LinearRegression() linreg.fit(X_train, y_train) acc_model(0,linreg,X_train,X_test) print("Done") # Support Vector Machines svr = SVR() svr.fit(X_train, y_train) acc_model(1,svr,X_train,X_test) print("Done") # Linear SVR linear_svr = LinearSVR() linear_svr.fit(X_train, y_train) acc_model(2,linear_svr,X_train,X_test) print("Done") # MLPRegressor mlp = MLPRegressor() param_grid = {'hidden_layer_sizes': [i for i in range(2,20)], 'activation': ['relu'], 'solver': ['adam'], 'learning_rate': ['constant'], 'learning_rate_init': [0.01], 'power_t': [0.5], 'alpha': [0.0001], 'max_iter': [1000], 'early_stopping': [True],
def run_train_all_sklearn(file, fp_name, cv=5, verbose=0, seed=1):
    """Cross-validate a fixed set of regressors on every synergy target.

    For each target key the matching table ``file[k]`` is used both as the
    feature frame and (column ``k``) as the target; column ``k`` itself is
    dropped inside each pipeline. When the final ``'name'`` key is reached
    the collected scores are pickled to disk.

    Args:
        file: mapping from the target keys and ``'name'`` to DataFrames.
            Tables containing a ``drug_row_col`` column are modified in
            place (that column is dropped).
        fp_name: tag embedded in the output pickle file name.
        cv: number of CV folds; also used as ``n_jobs`` for every model
            except ``cbt``.
        verbose: verbosity forwarded to ``ProgIter`` and ``cross_validate``.
        seed: seed for ``np.random.seed``.

    Returns:
        ``defaultdict`` mapping each target key to
        ``{model_name: {'test_pearsonr': ..., 'test_rmse': ...}}``.
    """
    np.random.seed(seed)
    c = defaultdict(list)
    for k in ProgIter([
            'synergy_zip', 'synergy_bliss', 'synergy_loewe', 'synergy_hsa',
            'css_ri', 'name'
    ],
                      verbose=verbose,
                      total=5):
        v = file[k]
        if k != 'name':
            temp = dict(
            )  # for results storage. Assuming that "name" comes last
            if 'drug_row_col' in v.columns:
                v.drop(columns=['drug_row_col'], inplace=True)
            cat_cols = ['cell_line_name']
            categories = [
                v[column].unique() for column in v[cat_cols]
            ]  # manually find all available categories for one-hot

            # pipelines
            encode = Pipeline(steps=[('one-hot-encode',
                                      OneHotEncoder(categories=categories))])
            processor = ColumnTransformer(transformers=[
                ('cat_encoding', encode, cat_cols), ('dropping', 'drop', [k])
            ],
                                          remainder='passthrough')
            # CatBoost gets the raw categorical column (no one-hot encoding).
            catbst = ColumnTransformer(transformers=[('dropping', 'drop', [k])
                                                     ],
                                       remainder='passthrough')

            # regressions
            lr = make_pipeline(processor, linear_model.LinearRegression())
            ridge = make_pipeline(processor, linear_model.Ridge())
            lasso = make_pipeline(processor, linear_model.Lasso())
            elastic = make_pipeline(processor, linear_model.ElasticNet())
            lassolars = make_pipeline(processor, linear_model.LassoLars())
            b_ridge = make_pipeline(processor, linear_model.BayesianRidge())
            kernel = DotProduct() + WhiteKernel()
            gpr = make_pipeline(processor,
                                GaussianProcessRegressor(kernel=kernel))
            linSVR = make_pipeline(processor, LinearSVR())
            hist_gbr = make_pipeline(
                processor,
                HistGradientBoostingRegressor(warm_start=True, max_depth=6))
            rfr = make_pipeline(
                processor,
                RandomForestRegressor(warm_start=True, max_depth=6, n_jobs=3))
            iso = make_pipeline(processor,
                                IsotonicRegression(increasing='auto'))
            xgb = make_pipeline(
                processor, XGBRegressor(tree_method='gpu_hist', max_depth=6))
            cbt = make_pipeline(
                catbst,
                CatBoostRegressor(task_type='GPU',
                                  depth=6,
                                  cat_features=np.array([0]),
                                  verbose=False))
            # NOTE(review): `gpr` is listed twice (second run overwrites the
            # first result) while `xgb` is built but never evaluated - the
            # second `gpr` was possibly meant to be `xgb`; confirm.
            mls = [
                cbt, rfr, gpr, hist_gbr, lr, ridge, lasso, elastic, lassolars,
                b_ridge, gpr, linSVR, iso
            ]
            mls_names = [
                "cbt", "rfr", "gpr", "hist_gbr", "lr", "ridge", "lasso",
                "elastic", "lassolars", "b_ridge", "gpr", "linSVR", "iso"
            ]

            # results
            start = time.time()
            for MODEL, name in zip(mls, mls_names):
                print(f'\n{name}')
                # The GPU CatBoost model is cross-validated in one process.
                n_jobs = 1 if 'cbt' == name else cv
                cv_dict = cross_validate(
                    MODEL,
                    v,
                    v[k],
                    cv=cv,
                    scoring={
                        "pearsonr": pearson,
                        "rmse": rmse
                    },
                    return_train_score=False,
                    verbose=verbose,
                    n_jobs=n_jobs,
                )
                temp[name] = {
                    'test_pearsonr': np.nanmean(cv_dict['test_pearsonr']),
                    'test_rmse': abs(np.nanmean(cv_dict['test_rmse']))
                }
                print(temp[name])
            print(f'{k} took {int(time.time()-start)/60} mins')
            c[k] = temp
        else:
            nm = f'/tf/notebooks/code_for_pub/_logs_as_python_files/{fp_name}_13models_5foldCV_{time.ctime()}.pickle'
            # Use a distinct handle name: rebinding `file` here would break
            # `file[k]` if any key were ever processed after 'name'.
            with open(nm, 'wb') as out_handle:
                pickle.dump(c, out_handle)
            print(f'saving complete to {nm}')
    return c
# Fit a linear support-vector regressor on data.csv and print its parameters.
import pandas as pd
df = pd.read_csv("data.csv")
# pop() removes the "threshold" column from df, so X below no longer has it.
y = df.pop("threshold")
X = df
from sklearn.svm import LinearSVR
svr = LinearSVR(epsilon=0.2)
svr.fit(X, y)
# Learned intercept and per-column weights of the linear model.
print(svr.intercept_)
print(svr.coef_)
def main():
    """Run k-fold regression benchmarks from the command line.

    Reads the input CSV, derives data/labels with
    ``get_data_and_labels_from_raw_inputs`` and evaluates every selected
    regressor on every fold. After each (fold, regressor) run the previous
    output file is copied to a backup and the results CSV is rewritten, so
    partial results survive an interrupted run.

    Command-line arguments:
        input_file: input data csv file.
        -output_file: results csv path (default ``regression_results.csv``).
        -folds: number of folds (int, default 10).
        -seed: random seed for the fold shuffling (int, default 1337).
        -regressors: space-separated regressor names (default: all).

    Raises:
        KeyError: if a requested regressor name is unknown.
    """
    MAX_ITER = 5000
    # Factories, so each (fold, regressor) run gets a fresh estimator.
    regressors = {
        "SVR": lambda: SVR(max_iter=MAX_ITER),
        "SVR_lin": lambda: LinearSVR(max_iter=MAX_ITER),
        "DTR": lambda: DecisionTreeRegressor(),
        "KNN": lambda: KNeighborsRegressor(n_neighbors=10),
        "MLP": lambda: MLPRegressor([256] * 3),
        "MLP_large": lambda: MLPRegressor([1024] * 5),
        "DUMMY": lambda: DummyRegressor()
    }

    parser = ArgumentParser()
    parser.add_argument("input_file", help="Input data csv file")
    parser.add_argument("-output_file",
                        help="Output data csv file",
                        default="regression_results.csv")
    # type=int: without it, values given on the command line arrive as
    # strings and KFold rejects them.
    parser.add_argument("-folds", help="Number of folds", default=10,
                        type=int)
    parser.add_argument("-seed", help="Random seed", default=1337, type=int)
    parser.add_argument("-regressors",
                        help="Regressors to use",
                        default=" ".join(regressors.keys()))
    args = parser.parse_args()

    # opts
    # load data
    # Expected csv format:
    # exp_id representation filename inst_frac feat_frac classifier fold
    # accuracy nr_instances nr_features nr_missing_values mean_kurtosis mean_skewness
    # mean Info_gain Inf_gain_ratio
    input_file = args.input_file
    regressor_names = args.regressors.split()
    num_folds = args.folds
    output_path = args.output_file
    seed = args.seed

    input_data = pd.read_csv(input_file)
    print("Read raw input data with shape:", input_data.shape)

    # Given the input data, potentially aggregate some accuracy values (e.g. over all folds)
    data, labels = get_data_and_labels_from_raw_inputs(input_data)
    print(f"Running regressions on {len(data)} data/label instances.")
    print("Run regression in terms of a single representation? GG clarify!")

    columns = ["id", "fold", "mse", "mae"]
    # Rows are accumulated in a plain list; DataFrame.append was removed in
    # pandas 2.0.
    result_rows = []
    results = pd.DataFrame(result_rows, columns=columns)
    results.to_csv(output_path, index=None)

    # cross-val
    splitter = KFold(num_folds, shuffle=True, random_state=seed)

    # iterate over folds
    for fold_idx, (train_idx, test_idx) in enumerate(splitter.split(data)):
        x_train, y_train = data[train_idx, :], labels[train_idx]
        x_test, y_test = data[test_idx, :], labels[test_idx]

        # iterate over regressors
        for regressor_name in regressor_names:
            model_func = regressors[regressor_name]
            # run the regression
            print(f"Running fold {fold_idx+1}/{num_folds} : {regressor_name}")
            regressor = model_func()
            # NOTE(review): registry keys are upper-case ("DTR"), so this
            # comparison with "dtr" is never true for the built-in names -
            # confirm whether scaling was intended for some regressor.
            mse, mae = run_regression(regressor,
                                      x_train,
                                      y_train,
                                      x_test,
                                      y_test,
                                      mmx_scale=(regressor_name == "dtr"))

            # get a run id, store results
            run_id = f"regressor_{regressor_name}"
            result_rows.append({
                "id": run_id,
                "fold": fold_idx,
                "mse": mse,
                "mae": mae
            })
            results = pd.DataFrame(result_rows, columns=columns)

            # backup a copy
            # NOTE(review): the backup name contains a space before
            # ".backup.csv"; kept unchanged in case other tooling expects it.
            shutil.copyfile(output_path, output_path + " .backup.csv")
            results.to_csv(output_path, index=None)

    print("Done!")
    print("Avg. per classifier:")
    print(results.groupby("id").mean())
y_te = np.zeros([test.shape[0]]) y_pred = np.zeros_like(y_true,dtype='float64') kf = KFold(n_splits=n_kf).split(train) for j,(train_index,test_index) in enumerate(kf): rgs.fit( train[train_index,:], y_true[train_index] ) y_pred[test_index] = rgs.predict( train[test_index] ) y_te += rgs.predict( test ) y_te /= n_kf print( '{0} score {1}'.format(ln,feval(y_true,y_pred))) return y_pred,y_te,feval(y_true,y_pred) train_true = DataFrame(o.get_table('y_train')).to_pandas() rgss = [LinearRegression(), LinearSVR(C=0.01), Ridge(alpha = 1.0)] rgss_name = ['LR','SVR','Ridge',] train_s = [] test_s = [] for i,rn in enumerate(rgss_name): n_kf = 10 y_valid = np.zeros([train_1.shape[0],5]) y_test = np.zeros([test_1.shape[0],5]) for j,ln in enumerate(label_name): y_true = train_true[ln].as_matrix() rgs = rgss[i] y_va,y_te,sc = stack_cell(rgs,n_kf,ln,y_true,(train_1,test_1),(train_2,test_2),(train_3,test_3),(train_4,test_4),(train_5,test_5),(train_6,test_6), (train_7,test_7),(train_8,test_8)) y_valid[:,j] = y_va
def StandardLinearSVR(epsilon=0.1):
    """Build a two-step pipeline: standard scaling followed by a LinearSVR.

    ``epsilon`` is forwarded unchanged to the ``LinearSVR`` step.
    """
    scaling_step = ('std_scaler', StandardScaler())
    regression_step = ('linearSVR', LinearSVR(epsilon=epsilon))
    return Pipeline([scaling_step, regression_step])
t1[i], cmidd( X.iloc[:, i], # feature i y, # target X.iloc[:, int(F[int( m[i])])] # conditionned on selected features )) if t1[i] > sstar: sstar = t1[i] F[k + 1] = i F = np.array(F[F > -100]) F = F.astype(int) t1 = t1[F] regr = make_pipeline(StandardScaler(), LinearSVR(random_state=0, tol=1e-3)) X_train_data = X.iloc[:, F[:10]] regr.fit(X_train_data, y_train_data) y_pred = regr.predict(X_test_data.iloc[:, F[:10]]) print(sum((y_pred - y_test_data)**2)) from sklearn.metrics import mean_squared_error print(mean_squared_error(y_pred, y_test_data)) error_rate = [] for i in range(1, 40): knn = KNeighborsRegressor(n_neighbors=i) knn.fit(X_train_data, y_train_data) pred_i = knn.predict(X_test_data.iloc[:, F[:10]]) error_rate.append(np.mean(pred_i != y_test_data))
# Target for the title-sentiment model.
y1 = train['SentimentTitle']
# Headline feature matrices: the vectorised text stacked column-wise with
# the extra headline columns (converted to sparse form).
train_X_Headline = hstack(
    [train_vect_2_hst, csr_matrix(train_headline.values)])
test_X_Headline = hstack([test_vect_2_hst, csr_matrix(test_headline.values)])
# Target for the headline-sentiment model.
y2 = train['SentimentHeadline']
# Result is discarded; only meaningful as an interactive notebook display.
np.shape(train_X_Title)

#model for sentiment title
X_train, X_test, y_train, y_test = train_test_split(train_X_Title,
                                                    y1,
                                                    test_size=0.20,
                                                    random_state=42)
LSVR1 = LinearSVR(C=0.2)
LSVR1.fit(X_train, y_train)
y_pred1 = LSVR1.predict(X_test)
mae1 = mean_absolute_error(y_pred1, y_test)
# NOTE(review): the value printed under the 'MAE:' label is 1 - MAE, not the
# MAE itself - confirm this is the intended score.
print('MAE:', 1 - mae1)

# Same procedure for the headline-sentiment model; the split variables
# above are reused and overwritten.
X_train, X_test, y_train, y_test = train_test_split(train_X_Headline,
                                                    y2,
                                                    test_size=0.20,
                                                    random_state=42)
LSVR2 = LinearSVR(C=0.1)
LSVR2.fit(X_train, y_train)
y_pred2 = LSVR2.predict(X_test)
# # Regression

# In[22]:

# Noisy linear toy data: y = 4 + 3x + Gaussian noise, 50 samples.
np.random.seed(42)
m = 50
X = 2 * np.random.rand(m, 1)
y = (4 + 3 * X + np.random.randn(m, 1)).ravel()

# In[23]:

from sklearn.svm import LinearSVR

svm_reg = LinearSVR(epsilon=1.5, random_state=42)
svm_reg.fit(X, y)

# In[24]:

# Two regressors that differ only in the width of the epsilon tube.
svm_reg1 = LinearSVR(epsilon=1.5, random_state=42)
svm_reg2 = LinearSVR(epsilon=0.5, random_state=42)
svm_reg1.fit(X, y)
svm_reg2.fit(X, y)


def find_support_vectors(svm_reg, X, y):
    """Return the indices of samples lying on or outside the epsilon tube.

    A sample qualifies when the absolute difference between its target and
    the model's prediction is at least ``svm_reg.epsilon``. The result is
    whatever ``np.argwhere`` yields for that boolean mask.
    """
    residuals = np.abs(y - svm_reg.predict(X))
    return np.argwhere(residuals >= svm_reg.epsilon)
# Exercise 10 P166 # data set housing = fetch_california_housing() X = housing["data"] y = housing["target"] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # scale scaler = StandardScaler() X_train_scaled = scaler.fit_transform(X_train) X_test_scaled = scaler.transform(X_test) # build model lin_svr = LinearSVR(random_state=42) lin_svr.fit(X_train_scaled, y_train) y_pred = lin_svr.predict(X_train_scaled) mse = mean_squared_error(y_train, y_pred) print('LinearSVR MSE: ', mse) # 0.949968822217229 not good print('LinearSVR RMSE: ', np.sqrt(mse)) # grid search the best estimator with SVR() model which can use kernel skill param_distributions = {"gamma": reciprocal(0.001, 0.1), "C": uniform(1, 10)} rnd_search_cv = RandomizedSearchCV(SVR(), param_distributions, n_iter=10, verbose=2, random_state=42) rnd_search_cv.fit(X_train_scaled, y_train) print('best estimator: ', rnd_search_cv.best_estimator_) '''SVR(C=4.745401188473625, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.07969454818643928, kernel='rbf', max_iter=-1, shrinking=True,
AdaBoostRegressor(DecisionTreeRegressor(random_state=13, min_samples_leaf=5), random_state=13, n_estimators=17), "AdaBoostHousing") build_housing(BayesianRidge(), "BayesianRidgeHousing") build_housing(KNeighborsRegressor(), "KNNHousing", with_kneighbors=True) build_housing( MLPRegressor(activation="tanh", hidden_layer_sizes=(26, ), solver="lbfgs", random_state=13, tol=0.001, max_iter=1000), "MLPHousing") build_housing(SGDRegressor(random_state=13), "SGDHousing") build_housing(SVR(), "SVRHousing") build_housing(LinearSVR(random_state=13), "LinearSVRHousing") build_housing(NuSVR(), "NuSVRHousing") # # Anomaly detection # def build_iforest_housing(iforest, name, **pmml_options): mapper = DataFrameMapper([(housing_X.columns.values, ContinuousDomain())]) pipeline = Pipeline([("mapper", mapper), ("estimator", iforest)]) pipeline.fit(housing_X) pipeline = make_pmml_pipeline(pipeline, housing_X.columns.values) pipeline.configure(**pmml_options) store_pkl(pipeline, name + ".pkl") decisionFunction = DataFrame(pipeline.decision_function(housing_X),
# Feature matrix: every column of tpot_data except 'target', as an array.
features = tpot_data.drop('target', axis=1).values
# Fixed random_state makes the train/test partition reproducible.
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Average CV score on the training set was:-0.0028642237563477587
# Steps run in the order listed; the last one (random forest) is the
# final regressor.
exported_pipeline = make_pipeline(
    StackingEstimator(
        estimator=GradientBoostingRegressor(alpha=0.85,
                                            learning_rate=0.01,
                                            loss="lad",
                                            max_depth=2,
                                            max_features=0.15000000000000002,
                                            min_samples_leaf=7,
                                            min_samples_split=7,
                                            n_estimators=100,
                                            subsample=0.4)), MinMaxScaler(),
    StackingEstimator(estimator=LinearSVR(C=1.0,
                                          dual=True,
                                          epsilon=0.0001,
                                          loss="epsilon_insensitive",
                                          tol=1e-05)),
    PolynomialFeatures(degree=2, include_bias=False, interaction_only=False),
    RandomForestRegressor(bootstrap=False,
                          max_features=0.45,
                          min_samples_leaf=6,
                          min_samples_split=3,
                          n_estimators=100))

# Fit on the training partition, then predict the held-out partition.
exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
coef0=1, C=5)))) poly_kernel_svm_clf.fit(X, y) rbf_kernel_svm_clf = Pipeline( (("scaler", StandardScaler()), ("svm_clf", SVC(kernel="rbf", gamma=5, C=0.001)))) rbf_kernel_svm_clf.fit(X, y) """ LinearSVC比SVC快得多(ker nel =“linear”)),特别是如果训练集非常大或者它有很多特征。 如果训练集不太大,则应该尝试高斯RBF内核;它在大多数情况下运作良好。 """ if False: from sklearn.svm import LinearSVR """ epsilon -> street width C large regularization small """ svm_reg = LinearSVR(epsilon=1.5) svm_reg.fit(X, y) """ SVR类是SVC类的回归等价物,LinearSVR类是LinearSVC类的回归等价物。 LinearSVR类与训练集的大小成线性关系(就像LinearSVC类一样),而当训练集变大时SVR类变得太慢(就像SVC类一样) """ from sklearn.svm import SVR svm_poly_reg = SVR(kernel="poly", degree=2, C=100, epsilon=0.1) svm_poly_reg.fit(X, y)
from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import MaxAbsScaler, PolynomialFeatures
from sklearn.svm import LinearSVR
from tpot.builtins import StackingEstimator
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE(review): `pd` and `np` are used below but not imported in this
# snippet - presumably imported earlier in the file; confirm.

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
# 'PATH/TO/DATA/FILE' and 'COLUMN_SEPARATOR' are placeholders to fill in.
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1)
# random_state=None: the train/test partition differs from run to run.
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: -3.633119693298434
# Steps run in the order listed; LinearSVR is the final regressor.
exported_pipeline = make_pipeline(
    # Union of two copies of the input, i.e. every feature appears twice.
    make_union(FunctionTransformer(copy), FunctionTransformer(copy)),
    SelectPercentile(score_func=f_regression, percentile=89), MaxAbsScaler(),
    PolynomialFeatures(degree=2, include_bias=False, interaction_only=False),
    LinearSVR(C=1.0,
              dual=True,
              epsilon=1.0,
              loss="epsilon_insensitive",
              tol=0.0001))

# Fit on the training partition, then predict the held-out partition.
exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
min_child_weight=3, n_estimators=50, n_jobs=1, objective="reg:squarederror", subsample=0.9500000000000001, verbosity=0)), MinMaxScaler(), StackingEstimator(estimator=SGDRegressor(alpha=0.01, eta0=0.01, fit_intercept=False, l1_ratio=0.0, learning_rate="constant", loss="huber", penalty="elasticnet", power_t=0.0)), StackingEstimator(estimator=LinearSVR( C=25.0, dual=True, epsilon=0.1, loss="epsilon_insensitive", tol=0.0001)), FeatureAgglomeration(affinity="l2", linkage="average"), SelectPercentile(score_func=f_regression, percentile=6), StackingEstimator(estimator=ExtraTreesRegressor(bootstrap=False, max_features=0.8, min_samples_leaf=19, min_samples_split=10, n_estimators=400)), ZeroCount(), FeatureAgglomeration(affinity="l2", linkage="complete"), StackingEstimator(estimator=RidgeCV()), RidgeCV()) exported_pipeline.fit(X, y) print(r2_score(y, exported_pipeline.predict(X))) _model = open("Tpot_bestmodel.pkl", "wb") pickle.dump(exported_pipeline, _model)
from sklearn.neighbors import KNeighborsRegressor from xgboost import XGBRegressor models = [ LinearRegression(), Ridge(), # http://www.cnblogs.com/pinard/p/6023000.html Lasso( alpha=0.01, max_iter=10000 ), # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html RandomForestRegressor( ), # https://scikit-learn.org/dev/modules/generated/sklearn.ensemble.RandomForestRegressor.html GradientBoostingRegressor( ), # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html SVR( ), # https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html#sklearn.svm.SVR LinearSVR( ), # https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVR.html ElasticNet( alpha=0.001, max_iter=10000 ), # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html SGDRegressor( max_iter=10000, tol=1e-3 ), # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDRegressor.html BayesianRidge(), # KernelRidge( alpha=0.6, kernel='polynomial', degree=2, coef0=2.5 ), # https://scikit-learn.org/stable/modules/generated/sklearn.kernel_ridge.KernelRidge.html ExtraTreesRegressor( ), # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesRegressor.html XGBRegressor(), AdaBoostRegressor( n_estimators=50