Example #1
import sys
import math

import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVR

# Clear intermediate variables from the global namespace
for name in ('unqLikesLIDs', 'profilesDF', 'profiles', 'profilesLSo',
             'profilesLS', 'row', 'tmpLS', 'tmpAGE', 'profsTOlikes',
             'i', 'tmpIND'):
    del globals()[name]

seed = 7
np.random.seed(seed)  # seed the global RNG; np.random.seed() returns None, so there is nothing to assign
X_train, X_test, y_train, y_test = train_test_split(likesMAT,
                                                    agrsARR,
                                                    test_size=1500)

myTOL = float(sys.argv[1])  # tolerance is passed on the command line
mySVM = LinearSVR(tol=myTOL)
#mySVM.fit(likesMAT, agrsARR)
mySVM.fit(X_train, y_train)

y_pred = mySVM.predict(X_test)
myRMSE = math.sqrt(metrics.mean_squared_error(y_test, y_pred))
print("agrs, Linear SVM:", myTOL, myRMSE)

# joblib.dump(mySVM, "/Users/jamster/LinearSVM-A-agrs.xz", compress=9)

# impSVM = joblib.load("/Users/jamster/LinearSVM-A-agrs.xz")
Example #2
def benchmark_regression(X_train, X_test, y_train, y_test, epsilon, delta):
    report = []

    # SGDRegressor - Local Differential Privacy
    for alpha in [0.01, 0.1, 1.0, 10.0, 100.0]:
        model = SGDRegressor(alpha=alpha,
                             loss='huber',
                             max_iter=1000,
                             tol=1e-3)
        start = time.time()
        X_train_ldp, y_train_ldp = make_ldp(X_train,
                                            y_train,
                                            epsilon,
                                            delta,
                                            classification=False)
        model.fit(X_train_ldp, y_train_ldp)
        report.append({
            "type": "bounded",
            "model": type(model).__name__,
            "hyperparameters": "alpha=%s" % alpha,
            "epsilon": epsilon,
            "accuracy": model.score(X_test, y_test),
            "time": time.time() - start
        })

    # LinearSVR - Local Differential Privacy
    for C in [1.0, 10.0, 100.0, 1000.0]:
        model = LinearSVR(C=C, max_iter=10000)
        start = time.time()
        X_train_ldp, y_train_ldp = make_ldp(X_train,
                                            y_train,
                                            epsilon,
                                            delta,
                                            classification=False)
        model.fit(X_train_ldp, y_train_ldp)
        report.append({
            "type": "bounded",
            "model": type(model).__name__,
            "hyperparameters": "C=%s" % C,
            "epsilon": epsilon,
            "accuracy": model.score(X_test, y_test),
            "time": time.time() - start
        })

    # RandomForestRegressor - Local Differential Privacy
    for n_estimators in [10, 50, 100, 1000]:
        model = RandomForestRegressor(n_estimators=n_estimators)
        start = time.time()
        X_train_ldp, y_train_ldp = make_ldp(X_train, y_train, epsilon, delta,
                                            classification=False)
        model.fit(X_train_ldp, y_train_ldp)
        report.append({
            "type": "bounded",
            "model": type(model).__name__,
            "hyperparameters": "n_estimators=%s" % n_estimators,
            "epsilon": epsilon,
            "accuracy": model.score(X_test, y_test),
            "time": time.time() - start
        })

    # LinearRegression - Integrated
    model = regression.LinearRegression(epsilon=epsilon)
    start = time.time()
    model.fit(X_train, y_train)
    report.append({
        "type": "integrated",
        "model": type(model).__name__,
        "hyperparameters": "",
        "epsilon": epsilon,
        "accuracy": model.score(X_test, y_test),
        "time": time.time() - start
    })

    # FederatedLearningRegressor - Gradient
    for epochs in [8, 16, 32]:
        model = FederatedLearningRegressor(epsilon,
                                           delta,
                                           epochs=epochs,
                                           lr=1e-2)
        start = time.time()
        model.fit(X_train, y_train)
        report.append({
            "type": "gradient",
            "model": type(model).__name__,
            "hyperparameters": "epochs=%s" % epochs,
            "epsilon": epsilon,
            "accuracy": model.score(X_test, y_test),
            "time": time.time() - start
        })

    return pd.DataFrame(report)
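
# Hypothetical usage sketch (assumptions: make_ldp and the estimators used in
# benchmark_regression are importable in this project; the data below is synthetic):
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=1000, n_features=20, noise=0.1, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.25, random_state=0)
report = benchmark_regression(X_tr, X_te, y_tr, y_te, epsilon=1.0, delta=1e-5)
print(report.sort_values("accuracy", ascending=False).head())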
Example #3
regressor(X_train, y_train, X_test, y_test, ['Street'], model)  # model carried over from the preceding cell

from sklearn.linear_model import RidgeCV
model = RidgeCV()
regressor(X_train, y_train, X_test, y_test, ['Street'], model)

from sklearn.svm import SVR
model = SVR()
regressor(X_train, y_train, X_test, y_test, ['Street'], model)

from sklearn.neural_network import MLPRegressor
model = MLPRegressor()
regressor(X_train, y_train, X_test, y_test, ['Street'], model)

from sklearn.svm import LinearSVR
model = LinearSVR()
regressor(X_train, y_train, X_test, y_test, ['Street'], model)

from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
regressor(X_train, y_train, X_test, y_test, ['Street'], model)

from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor()
regressor(X_train, y_train, X_test, y_test, ['Street'], model)

from sklearn.linear_model import SGDRegressor
model = SGDRegressor()
regressor(X_train, y_train, X_test, y_test, ['Street'], model)

# Cross-validated LinearSVR evaluation over pre-defined fold ids
# (the loop header below is reconstructed from context):
for i in range(fold_ids.shape[1]):
    print("--------------------------------------------")

    val_id = fold_ids.iloc[:, i].dropna()
    idx = train["Id"].isin(list(val_id))

    trainingSet = train[~idx]
    validationSet = train[idx]

    tr_X = np.matrix(trainingSet[feature_names])
    tr_Y = np.array(trainingSet["Response"])
    val_X = np.matrix(validationSet[feature_names])
    val_Y = np.array(validationSet["Response"])

    regm = LinearSVR(C=0.06,
                     epsilon=0.45,
                     tol=1e-5,
                     dual=True,
                     verbose=True,
                     random_state=133)

    regm.fit(tr_X, tr_Y)
    preds = regm.predict(val_X)

    df = pd.DataFrame(
        dict({
            "Id": validationSet["Id"],
            "ground_truth": validationSet["Response"],
            "linsvr_preds": preds
        }))

    linsvr_val = pd.concat([linsvr_val, df], ignore_index=True)
df = df.iloc[:2949, :]
df.to_pickle("Final_Data")  # pandas handles pickling itself; no pickle import needed
df = pd.read_pickle("Final_Data")  # read_pickle is a pandas function, not a DataFrame method

for idx, row in output_df.iterrows():
    df.loc[row['FIPS'], 'annual_count_avg'] = row['Average Annual Count']

X = df.loc[:, :'WATR']
y = df['annual_count_avg']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

from sklearn.svm import LinearSVR
svr = LinearSVR(random_state=0, tol=1e-5).fit(X_train, y_train)
svr.score(X_test, y_test)

from sklearn.svm import SVR
svr_rbf = SVR().fit(X_train, y_train)  # avoid shadowing the svm module name
svr_rbf.score(X_test, y_test)

from sklearn.svm import NuSVR
nuSVR = NuSVR().fit(X_train, y_train)
nuSVR.score(X_test, y_test)

from sklearn import linear_model
ridge = linear_model.Ridge(alpha=0.5).fit(X_train, y_train)
ridge.score(X_test, y_test)
np.argmax(ridge.coef_)
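
# np.argmax returns only a positional index; a small follow-up sketch
# (assuming X is the feature DataFrame built above) maps it to a column name:
print(X.columns[np.argmax(ridge.coef_)])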
Example #6
lgb_params = {
    'feature_fraction': 0.75,
    'metric': 'rmse',
    'nthread': 1,
    'min_data_in_leaf': 2**7,
    'bagging_fraction': 0.75,
    'learning_rate': 0.03,
    'objective': 'mse',
    'bagging_seed': 2**7,
    'num_leaves': 2**7,
    'bagging_freq': 1,
    'verbose': 0
}

import gc

from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVR

svm = LinearSVR(C=1.0, verbose=True)
scaler = StandardScaler()

train['target'] = train['target'].clip(0, 20)

print('--------------- Scaling Features --------------')
x_train = train.drop(to_drop, axis=1)
x_train = downcast_types(x_train)
x_train = scaler.fit_transform(x_train)
y_train = train['target']
test.drop(to_drop, axis=1, inplace=True)
test = downcast_types(test)
test = scaler.transform(test)

del train
gc.collect()
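
# Minimal continuation sketch (an assumption: the original snippet stops after
# scaling, so fitting the LinearSVR defined above is shown for completeness):
svm.fit(x_train, y_train)
predictions = svm.predict(test).clip(0, 20)  # clip like the training target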
Example #7
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVR


def StandardLinearSVR(epsilon=0.1):
    """Pipeline that standardizes features before fitting a LinearSVR."""
    return Pipeline([("std_scaler", StandardScaler()),
                     ("linearSVR", LinearSVR(epsilon=epsilon))])
Example #8
def default_datasets(carrier, id_airport):
    # # **Predicting flight delays**

    # In this notebook, we develop a model aimed at predicting flight delays at take-off.

    # During the EDA, we aim to produce good-quality figures.

    # This notebook is composed of three parts:
    # Cleaning
    #   *  Date and Times
    #   *  Missing Values

    # Exploration
    #   * Graphs
    #   * Impact of Departure Vs Arrival Delays

    # Modeling
    # The model is developed for one airport and one airline
    #   * Linear
    #   * Ridge
    #   * Random Forest
    #   * Neural Networks
    #   * SVM

    # In[2]:

    import datetime, warnings, scipy
    import pandas as pd
    import numpy as np
    import seaborn as sns
    import matplotlib as mpl
    import matplotlib.pyplot as plt
    import matplotlib.patches as patches
    from matplotlib.patches import ConnectionPatch
    from collections import OrderedDict
    from matplotlib.gridspec import GridSpec
    from sklearn import metrics, linear_model
    from sklearn.preprocessing import PolynomialFeatures, StandardScaler
    from sklearn.preprocessing import LabelEncoder, OneHotEncoder
    from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
    from scipy.optimize import curve_fit
    from sklearn.metrics import r2_score
    from random import sample
    import matplotlib.patches as mpatches
    from sklearn.linear_model import Ridge
    from sklearn.ensemble import RandomForestRegressor
    from scipy.stats import spearmanr, pearsonr
    from sklearn.svm import SVR, LinearSVR
    plt.rcParams["patch.force_edgecolor"] = True
    plt.style.use('fivethirtyeight')
    mpl.rc('patch', edgecolor='dimgray', linewidth=1)
    from IPython.core.interactiveshell import InteractiveShell
    InteractiveShell.ast_node_interactivity = "last_expr"
    pd.options.display.max_columns = 50
    #get_ipython().magic('matplotlib inline')
    warnings.filterwarnings("ignore")

    # In[2]:

    df = pd.read_csv(
        '/Users/sarveshprattipati/Downloads/flight-delays/flights.csv',
        low_memory=False)
    print('Dataframe dimensions:', df.shape)

    airports = pd.read_csv(
        "/Users/sarveshprattipati/Downloads/flight-delays/airports.csv")

    airlines_names = pd.read_csv(
        '/Users/sarveshprattipati/Downloads/flight-delays/airlines.csv')
    airlines_names

    abbr_companies = airlines_names.set_index('IATA_CODE')['AIRLINE'].to_dict()

    # carrier and id_airport come in as function arguments (e.g. 'AA', 'DFW');
    # hard-coded values here would silently override them.

    # %%

    # # 1. Cleaning

    # # 1.1 Dates and times
    #
    # **YEAR, MONTH, DAY**, is merged into date column

    df['DATE'] = pd.to_datetime(df[['YEAR', 'MONTH', 'DAY']])

    # Moreover, in the **SCHEDULED_DEPARTURE** variable, the hour of take-off is coded as a float where the first two digits indicate the hour and the last two the minutes. This format is not convenient, so I convert it and then merge the take-off hour with the flight date. To proceed with these transformations, I define a few functions:

    # Function that converts the 'HHMM' string to datetime.time
    def format_heure(chaine):
        if pd.isnull(chaine):
            return np.nan
        else:
            if chaine == 2400: chaine = 0
            chaine = "{0:04d}".format(int(chaine))
            heure = datetime.time(int(chaine[0:2]), int(chaine[2:4]))
            return heure

    # Function that combines a date and time to produce a datetime.datetime
    def combine_date_heure(x):
        if pd.isnull(x[0]) or pd.isnull(x[1]):
            return np.nan
        else:
            return datetime.datetime.combine(x[0], x[1])

    # Function that combine two columns of the dataframe to create a datetime format
    def create_flight_time(df, col):
        liste = []
        for index, cols in df[['DATE', col]].iterrows():
            if pd.isnull(cols[1]):
                liste.append(np.nan)
            elif float(cols[1]) == 2400:
                cols[0] += datetime.timedelta(days=1)
                cols[1] = datetime.time(0, 0)
                liste.append(combine_date_heure(cols))
            else:
                cols[1] = format_heure(cols[1])
                liste.append(combine_date_heure(cols))
        return pd.Series(liste)
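
    # Quick sanity checks of the helpers above (illustrative additions):
    # 5.0 encodes 00h05, and 2400 wraps around to midnight.
    assert format_heure(5.0) == datetime.time(0, 5)
    assert format_heure(2400) == datetime.time(0, 0)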

    df['SCHEDULED_DEPARTURE'] = create_flight_time(df, 'SCHEDULED_DEPARTURE')
    df['DEPARTURE_TIME'] = df['DEPARTURE_TIME'].apply(format_heure)
    df['SCHEDULED_ARRIVAL'] = df['SCHEDULED_ARRIVAL'].apply(format_heure)
    df['ARRIVAL_TIME'] = df['ARRIVAL_TIME'].apply(format_heure)
    # __________________________________________________________________________
    # df.loc[:5, ['SCHEDULED_DEPARTURE', 'SCHEDULED_ARRIVAL', 'DEPARTURE_TIME',
    #             'ARRIVAL_TIME', 'DEPARTURE_DELAY', 'ARRIVAL_DELAY']]

    # The content of the **DEPARTURE_TIME** and **ARRIVAL_TIME** variables can be a bit misleading:
    # in the first entry of the dataframe, the scheduled departure is at 0h05 on the 1st of January.
    # ### 1.2 Filling factor
    #
    # Finally, the dataframe is cleaned and a few columns are dropped
    variables_to_remove = [
        'TAXI_OUT', 'TAXI_IN', 'WHEELS_ON', 'WHEELS_OFF', 'YEAR', 'MONTH',
        'DAY', 'DAY_OF_WEEK', 'DATE', 'AIR_SYSTEM_DELAY', 'SECURITY_DELAY',
        'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY', 'DIVERTED',
        'CANCELLED', 'CANCELLATION_REASON', 'FLIGHT_NUMBER', 'TAIL_NUMBER',
        'AIR_TIME'
    ]
    df.drop(variables_to_remove, axis=1, inplace=True)
    df = df[[
        'AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
        'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME', 'DEPARTURE_DELAY',
        'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME', 'ARRIVAL_DELAY', 'SCHEDULED_TIME',
        'ELAPSED_TIME'
    ]]
    # df[:5]

    missing_df = df.isnull().sum(axis=0).reset_index()
    missing_df.columns = ['variable', 'missing values']
    missing_df['filling factor (%)'] = (
        df.shape[0] - missing_df['missing values']) / df.shape[0] * 100
    missing_df.sort_values('filling factor (%)').reset_index(drop=True)

    # The filling factor is quite good (> 97%), so dropping the rows with NAs is a reasonable option
    df.dropna(inplace=True)

    # %%
    # # 2. Exploration
    # # 2.1 Basic statistical description of airlines

    # function computing statistical parameters from a groupby object:
    def get_stats(group):
        return {
            'min': group.min(),
            'max': group.max(),
            'count': group.count(),
            'mean': group.mean()
        }

    global_stats = df['DEPARTURE_DELAY'].groupby(
        df['AIRLINE']).apply(get_stats).unstack()
    global_stats = global_stats.sort_values('count')
    global_stats

    # In[15]:

    # # 2.1 Graphs

    # Pie chart of the mean departure delay per airline, plus a strip plot of all delays

    font = {'family': 'normal', 'weight': 'bold', 'size': 15}
    mpl.rc('font', **font)

    # __________________________________________________________________
    # I extract a subset of columns and redefine the airlines labeling
    df2 = df.loc[:, ['AIRLINE', 'DEPARTURE_DELAY']]
    df2['AIRLINE'] = df2['AIRLINE'].replace(abbr_companies)
    # ________________________________________________________________________
    colors = [
        'royalblue', 'grey', 'wheat', 'c', 'firebrick', 'seagreen',
        'lightskyblue', 'lightcoral', 'yellowgreen', 'gold', 'tomato',
        'violet', 'aquamarine', 'chartreuse'
    ]
    # ___________________________________
    fig = plt.figure(1, figsize=(16, 15))
    gs = GridSpec(2, 1)
    ax1 = fig.add_subplot(gs[0, 0])
    ax2 = fig.add_subplot(gs[1, 0])
    labels = [s for s in global_stats.index]
    # ----------------------------------------
    # Pie chart for mean delay at departure
    # ----------------------------------------
    sizes = global_stats['mean'].values
    sizes = [max(s, 0) for s in sizes]
    explode = [
        0.0 if sizes[i] < 20000 else 0.01 for i in range(len(abbr_companies))
    ]
    patches, texts, autotexts = ax1.pie(
        sizes,
        explode=explode,
        labels=labels,
        colors=colors,
        shadow=False,
        startangle=0,
        autopct=lambda p: '{:.0f}'.format(p * sum(sizes) / 100))
    for i in range(len(abbr_companies)):
        texts[i].set_fontsize(14)
    ax1.axis('equal')
    ax1.set_title('Mean delay at origin',
                  bbox={
                      'facecolor': 'midnightblue',
                      'pad': 5
                  },
                  color='w',
                  fontsize=18)
    # ------------------------------------------------------
    # striplot with all the values for the delays
    # ___________________________________________________________________
    # Defining the colors for correspondance with the pie charts
    colors = [
        'firebrick', 'gold', 'lightcoral', 'aquamarine', 'c', 'yellowgreen',
        'grey', 'seagreen', 'tomato', 'violet', 'wheat', 'chartreuse',
        'lightskyblue', 'royalblue'
    ]
    # ___________________________________________________________________
    ax2 = sns.stripplot(y="AIRLINE",
                        x="DEPARTURE_DELAY",
                        size=4,
                        palette=colors,
                        data=df2,
                        linewidth=0.5,
                        jitter=True)
    plt.setp(ax2.get_xticklabels(), fontsize=14)
    plt.setp(ax2.get_yticklabels(), fontsize=14)
    ax2.set_xticklabels([
        '{:2.0f}h{:2.0f}m'.format(*[int(y) for y in divmod(x, 60)])
        for x in ax2.get_xticks()
    ])
    plt.xlabel('Departure delay',
               fontsize=18,
               bbox={
                   'facecolor': 'midnightblue',
                   'pad': 5
               },
               color='w',
               labelpad=20)
    ax2.yaxis.label.set_visible(False)
    # ________________________
    plt.tight_layout(w_pad=3)

    # Excluding Hawaiian Airlines and Alaska Airlines, which have low mean delays, the mean delay is 11 ± 7 minutes.
    # The second graph shows that even with a mean delay around 11 minutes, some flights can be hours late.

    # In[16]:

    # # 2.1 Graphs

    # Group delays into three levels:
    # 0 = on time (t <= 5 min), 1 = small delay (5 < t <= 45 min), 2 = large delay (t > 45 min)
    delay_type = lambda x: ((0, 1)[x > 5], 2)[x > 45]
    df['DELAY_LEVEL'] = df['DEPARTURE_DELAY'].apply(delay_type)

    fig = plt.figure(1, figsize=(10, 7))
    ax = sns.countplot(y="AIRLINE", hue='DELAY_LEVEL', data=df)

    # We replace the abbreviations by the full names of the companies and set the labels
    labels = [abbr_companies[item.get_text()] for item in ax.get_yticklabels()]
    ax.set_yticklabels(labels)
    plt.setp(ax.get_xticklabels(), fontsize=12, weight='normal', rotation=0)
    plt.setp(ax.get_yticklabels(), fontsize=12, weight='bold', rotation=0)
    ax.yaxis.label.set_visible(False)
    plt.xlabel('Flight count', fontsize=16, weight='bold', labelpad=10)

    # Set the legend
    L = plt.legend()
    L.get_texts()[0].set_text('on time (t < 5 min)')
    L.get_texts()[1].set_text('small delay (5 < t < 45 min)')
    L.get_texts()[2].set_text('large delay (t > 45 min)')
    plt.show()

    # %%

    # # 2.2 Impact of Departure Vs Arrival Delays

    mpl.rcParams.update(mpl.rcParamsDefault)
    mpl.rcParams['hatch.linewidth'] = 2.0

    fig = plt.figure(1, figsize=(11, 6))
    ax = sns.barplot(x="DEPARTURE_DELAY",
                     y="AIRLINE",
                     data=df,
                     color="lightskyblue",
                     ci=None)
    ax = sns.barplot(x="ARRIVAL_DELAY",
                     y="AIRLINE",
                     data=df,
                     color="r",
                     hatch='///',
                     alpha=0.0,
                     ci=None)
    labels = [abbr_companies[item.get_text()] for item in ax.get_yticklabels()]
    ax.set_yticklabels(labels)
    ax.yaxis.label.set_visible(False)
    plt.xlabel('Mean delay [min] (@departure: blue, @arrival: hatch lines)',
               fontsize=14,
               weight='bold',
               labelpad=10)

    # This figure shows that arrival delays are lower than departure delays:
    # departure delays can be partly compensated during the flight.

    # For this project, we therefore estimate departure delays.

    # %%

    # ### 2.2 Visualization of delays at origin airports

    airport_mean_delays = pd.DataFrame(pd.Series(
        df['ORIGIN_AIRPORT'].unique()))
    airport_mean_delays.set_index(0, drop=True, inplace=True)

    for airline_code in abbr_companies.keys():  # avoid shadowing the carrier argument
        df1 = df[df['AIRLINE'] == airline_code]
        test = df1['DEPARTURE_DELAY'].groupby(
            df1['ORIGIN_AIRPORT']).apply(get_stats).unstack()
        airport_mean_delays[airline_code] = test.loc[:, 'mean']

    temp_airports = airports
    identify_airport = temp_airports.set_index('IATA_CODE')['CITY'].to_dict()

    sns.set(context="paper")
    fig = plt.figure(1, figsize=(8, 8))

    ax = fig.add_subplot(1, 2, 1)
    subset = airport_mean_delays.iloc[:50, :].rename(columns=abbr_companies)
    subset = subset.rename(index=identify_airport)
    mask = subset.isnull()
    sns.heatmap(subset,
                linewidths=0.01,
                cmap="Accent",
                mask=mask,
                vmin=0,
                vmax=35)
    plt.setp(ax.get_xticklabels(), fontsize=10, rotation=85)
    ax.yaxis.label.set_visible(False)

    ax = fig.add_subplot(1, 2, 2)
    subset = airport_mean_delays.iloc[50:100, :].rename(columns=abbr_companies)
    subset = subset.rename(index=identify_airport)
    fig.text(0.5,
             1.02,
             "Delays: impact of the origin airport",
             ha='center',
             fontsize=18)
    mask = subset.isnull()
    sns.heatmap(subset,
                linewidths=0.01,
                cmap="Accent",
                mask=mask,
                vmin=0,
                vmax=35)
    plt.setp(ax.get_xticklabels(), fontsize=10, rotation=85)
    ax.yaxis.label.set_visible(False)

    plt.tight_layout()

    # From the above graph, we deduce that:
    #   * American Eagle has large delays
    #   * Delta Air Lines has delays of less than 5 minutes
    #   * a few airports favour late departures, e.g. Denver and Chicago

    # In[32]:

    # Common class for graphs
    class Figure_style():
        # _________________________________________________________________
        def __init__(self, size_x=11, size_y=5, nrows=1, ncols=1):
            sns.set_style("white")
            sns.set_context("notebook",
                            font_scale=1.2,
                            rc={"lines.linewidth": 2.5})
            self.fig, axs = plt.subplots(nrows=nrows,
                                         ncols=ncols,
                                         figsize=(
                                             size_x,
                                             size_y,
                                         ))
            # ________________________________
            # convert self.axs to 2D array
            if nrows == 1:
                self.axs = np.reshape(axs, (1, -1))
            elif ncols == 1:
                self.axs = np.reshape(axs, (-1, 1))
            else:
                self.axs = axs  # already a 2D array of axes

        # _____________________________
        def pos_update(self, ix, iy):
            self.ix, self.iy = ix, iy

        # _______________
        def style(self):
            self.axs[self.ix, self.iy].spines['right'].set_visible(False)
            self.axs[self.ix, self.iy].spines['top'].set_visible(False)
            self.axs[self.ix, self.iy].yaxis.grid(color='lightgray',
                                                  linestyle=':')
            self.axs[self.ix, self.iy].xaxis.grid(color='lightgray',
                                                  linestyle=':')
            self.axs[self.ix, self.iy].tick_params(axis='both',
                                                   which='major',
                                                   labelsize=10,
                                                   size=5)

        # ________________________________________
        def draw_legend(self, location='upper right'):
            legend = self.axs[self.ix, self.iy].legend(loc=location,
                                                       shadow=True,
                                                       facecolor='g',
                                                       frameon=True)
            legend.get_frame().set_facecolor('whitesmoke')

        # _________________________________________________________________________________
        def cust_plot(self,
                      x,
                      y,
                      color='b',
                      linestyle='-',
                      linewidth=1,
                      marker=None,
                      label=''):
            if marker:
                markerfacecolor, marker, markersize = marker[:]
                self.axs[self.ix,
                         self.iy].plot(x,
                                       y,
                                       color=color,
                                       linestyle=linestyle,
                                       linewidth=linewidth,
                                       marker=marker,
                                       label=label,
                                       markerfacecolor=markerfacecolor,
                                       markersize=markersize)
            else:
                self.axs[self.ix, self.iy].plot(x,
                                                y,
                                                color=color,
                                                linestyle=linestyle,
                                                linewidth=linewidth,
                                                label=label)
            self.fig.autofmt_xdate()

        # ________________________________________________________________________
        def cust_plot_date(self,
                           x,
                           y,
                           color='lightblue',
                           linestyle='-',
                           linewidth=1,
                           markeredge=False,
                           label=''):
            markeredgewidth = 1 if markeredge else 0
            self.axs[self.ix,
                     self.iy].plot_date(x,
                                        y,
                                        color=color,
                                        markeredgecolor='grey',
                                        markeredgewidth=markeredgewidth,
                                        label=label)

        # ________________________________________________________________________
        def cust_scatter(self,
                         x,
                         y,
                         color='lightblue',
                         markeredge=False,
                         label=''):
            markeredgewidth = 1 if markeredge else 0
            self.axs[self.ix, self.iy].scatter(x,
                                               y,
                                               color=color,
                                               edgecolor='grey',
                                               linewidths=markeredgewidth,
                                               label=label)
            #

        def set_xlabel(self, label, fontsize=14):
            self.axs[self.ix, self.iy].set_xlabel(label, fontsize=fontsize)

        def set_ylabel(self, label, fontsize=14):
            self.axs[self.ix, self.iy].set_ylabel(label, fontsize=fontsize)

        # ____________________________________
        def set_xlim(self, lim_inf, lim_sup):
            self.axs[self.ix, self.iy].set_xlim([lim_inf, lim_sup])

        # ____________________________________
        def set_ylim(self, lim_inf, lim_sup):
            self.axs[self.ix, self.iy].set_ylim([lim_inf, lim_sup])

    # Sampling the data with 80:20 training and test data set
    df_train = df.sample(frac=0.8)
    df_test = df.loc[~df.index.isin(df_train.index)]
    df = df_train

    # In[37]:
    # Defining dataframe creation function
    ###########################################################################
    def get_flight_delays(df, carrier, id_airport, extrem_values=False):
        df2 = df[(df['AIRLINE'] == carrier)
                 & (df['ORIGIN_AIRPORT'] == id_airport)]
        # _______________________________________
        # remove extreme values before fitting
        if extrem_values:
            df2['DEPARTURE_DELAY'] = df2['DEPARTURE_DELAY'].apply(
                lambda x: x if x < 60 else np.nan)
            df2.dropna(how='any', inplace=True)
        # __________________________________

        df2.sort_values('SCHEDULED_DEPARTURE', inplace=True)
        df2['schedule_depart'] = df2['SCHEDULED_DEPARTURE'].apply(
            lambda x: x.time())
        # ___________________________________________________________________

        test2 = df2['DEPARTURE_DELAY'].groupby(
            df2['schedule_depart']).apply(get_stats).unstack()
        test2.reset_index(inplace=True)
        # ___________________________________

        fct = lambda x: x.hour * 60 + x.minute
        test2['schedule_depart_mnts'] = test2['schedule_depart'].apply(fct)
        return test2

    def create_df(df, carrier, id_airport, extrem_values=False):
        df2 = df[(df['AIRLINE'] == carrier)
                 & (df['ORIGIN_AIRPORT'] == id_airport)]
        df2.dropna(how='any', inplace=True)
        df2['weekday'] = df2['SCHEDULED_DEPARTURE'].apply(
            lambda x: x.weekday())
        # ____________________
        # delete delays > 1h
        df2['DEPARTURE_DELAY'] = df2['DEPARTURE_DELAY'].apply(
            lambda x: x if x < 60 else np.nan)
        df2.dropna(how='any', inplace=True)
        # _________________
        # formating times
        fct = lambda x: x.hour * 60 + x.minute
        df2['schedule_depart'] = df2['SCHEDULED_DEPARTURE'].apply(
            lambda x: x.time())
        df2['schedule_depart_mnts'] = df2['schedule_depart'].apply(fct)
        df2['schedule_arrivee'] = df2['SCHEDULED_ARRIVAL'].apply(fct)
        df3 = df2.groupby(['schedule_depart_mnts', 'schedule_arrivee'],
                          as_index=False).mean()
        return df3

    #
    # In[39]:
    # Linear Regression
    ####### Linear_Train #######

    test2 = get_flight_delays(df, carrier, id_airport, False)
    test2.to_csv('Model_dataset.csv', sep=',')

    test = test2[['mean', 'schedule_depart_mnts']].dropna(how='any', axis=0)
    X_L_train = np.array(test['schedule_depart_mnts'])
    Y_L_train = np.array(test['mean'])
    X_L_train = X_L_train.reshape(len(X_L_train), 1)
    Y_L_train = Y_L_train.reshape(len(Y_L_train), 1)
    regr = linear_model.LinearRegression()
    regr.fit(X_L_train, Y_L_train)
    result_L_train = regr.predict(X_L_train)
    score_L_train = regr.score(X_L_train, Y_L_train)

    # print("R^2 for Linear Train= ",score_L_train)
    print("MSE Linear Train=",
          metrics.mean_squared_error(result_L_train, Y_L_train))

    # The coefficient R^2 is defined as (1 - u/v), where u is the residual sum of squares
    # ((y_true - y_pred) ** 2).sum() and v is the
    # total sum of squares ((y_true - y_true.mean()) ** 2).sum().
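
    # Numeric check of that definition (illustrative addition): the R^2 returned
    # by regr.score() should equal 1 - u/v computed by hand.
    u = ((Y_L_train - result_L_train) ** 2).sum()
    v = ((Y_L_train - Y_L_train.mean()) ** 2).sum()
    assert np.isclose(score_L_train, 1 - u / v)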

    ####### Linear_Test #######
    test2 = get_flight_delays(df_test, carrier, id_airport, False)

    test = test2[['mean', 'schedule_depart_mnts']].dropna(how='any', axis=0)
    X_L_test = np.array(test['schedule_depart_mnts'])
    Y_L_test = np.array(test['mean'])
    X_L_test = X_L_test.reshape(len(X_L_test), 1)
    Y_L_test = Y_L_test.reshape(len(Y_L_test), 1)
    result_L_test = regr.predict(X_L_test)
    score_L_test = regr.score(X_L_test, Y_L_test)

    # print("R^2 for Linear Test= ",score_L_test)
    print("MSE Linear Test=",
          metrics.mean_squared_error(result_L_test, Y_L_test))
    fig1 = Figure_style(8, 4, 1, 1)
    fig1.pos_update(0, 0)
    # fig1.cust_scatter(df1['heure_depart'], df1['DEPARTURE_DELAY'], markeredge = True)
    fig1.cust_plot(X_L_test,
                   Y_L_test,
                   color='b',
                   linestyle=':',
                   linewidth=2,
                   marker=('b', 's', 10))
    fig1.cust_plot(X_L_test, result_L_test, color='g', linewidth=3)
    fig1.style()
    fig1.set_ylabel('Delay (minutes)', fontsize=14)
    fig1.set_xlabel('Departure time', fontsize=14)
    # ____________________________________
    # convert and set the x ticks labels
    fct_convert = lambda x: (int(x / 3600), int(divmod(x, 3600)[1] / 60))
    fig1.axs[fig1.ix, fig1.iy].set_xticklabels([
        '{:2.0f}h{:2.0f}m'.format(*fct_convert(x))
        for x in fig1.axs[fig1.ix, fig1.iy].get_xticks()
    ])

    # In[77]:
    # Ridge Regression
    ####### Ridge_Training #######
    df3 = get_flight_delays(df, carrier, id_airport)
    df3[:5]
    # df1 = df[(df['AIRLINE'] == carrier) & (df['ORIGIN_AIRPORT'] == id_airport)]
    # df1['heure_depart'] =  df1['SCHEDULED_DEPARTURE'].apply(lambda x:x.time())
    # df1['heure_depart'] = df1['heure_depart'].apply(lambda x:x.hour*60+x.minute)
    df3 = df3[['mean', 'schedule_depart_mnts']].dropna(how='any', axis=0)
    X = np.array(df3['schedule_depart_mnts'])
    Y = np.array(df3['mean'])
    X = X.reshape(len(X), 1)
    Y = Y.reshape(len(Y), 1)

    parameters = [0.2, 1]
    ridgereg = Ridge(alpha=parameters[0], normalize=True)
    poly = PolynomialFeatures(degree=parameters[1])
    X_ = poly.fit_transform(X)
    ridgereg.fit(X_, Y)
    result_R_train = ridgereg.predict(X_)
    score_R_train = metrics.mean_squared_error(result_R_train, Y)
    r2_R_train = ridgereg.score(X_, Y)  # score the ridge model, not the earlier linear one
    # print("R^2 for Ridge Train:",r2_R_train )
    print('MSE Ridge Train= {}'.format(round(score_R_train, 2)))

    ####### Ridge_Test #######

    df3 = get_flight_delays(df_test, carrier, id_airport)
    df3[:5]

    test = df3[['mean', 'schedule_depart_mnts']].dropna(how='any', axis=0)
    X_L_test = np.array(test['schedule_depart_mnts'])
    Y_L_test = np.array(test['mean'])
    X_testt = X_L_test.reshape(len(X_L_test), 1)
    Y_testt = Y_L_test.reshape(len(Y_L_test), 1)

    X_ = poly.fit_transform(X_testt)
    result_test = ridgereg.predict(X_)

    score_R_test = metrics.mean_squared_error(result_test, Y_testt)

    r2_ridge_test = r2_score(Y_testt, result_test)
    # print("R^2 for Ridge Test is: ",r2_ridge_test )
    print('MSE Ridge Test = {}'.format(round(score_R_test, 2)))
    # RMSE in minutes: '{:.2f} min'.format(np.sqrt(score_R_test))

    fig1 = Figure_style(8, 4, 1, 1)
    fig1.pos_update(0, 0)
    # fig1.cust_scatter(df1['heure_depart'], df1['DEPARTURE_DELAY'], markeredge = True)
    fig1.cust_plot(X_testt,
                   Y_testt,
                   color='b',
                   linestyle=':',
                   linewidth=2,
                   marker=('b', 's', 10))
    fig1.cust_plot(X_testt, result_test, color='g', linewidth=3)
    fig1.style()
    fig1.set_ylabel('Delay (minutes)', fontsize=14)
    fig1.set_xlabel('Departure time', fontsize=14)
    # ____________________________________
    # convert and set the x ticks labels
    fct_convert = lambda x: (int(x / 3600), int(divmod(x, 3600)[1] / 60))
    fig1.axs[fig1.ix, fig1.iy].set_xticklabels([
        '{:2.0f}h{:2.0f}m'.format(*fct_convert(x))
        for x in fig1.axs[fig1.ix, fig1.iy].get_xticks()
    ])

    # %%
    ###########################################################################
    ####### Random Forest_Train #######
    df4 = create_df(df, carrier, id_airport)
    # X_rf_Train = np.array(df3[['schedule_depart','schedule_arrivee', 'ARRIVAL_DELAY', 'SCHEDULED_TIME','ELAPSED_TIME','weekday']])
    # X_rf_Train = np.hstack((X_rf_Train))
    df4 = df4[['DEPARTURE_DELAY', 'schedule_depart_mnts']].dropna(how='any',
                                                                  axis=0)
    X_rf_Train = np.array(df4['schedule_depart_mnts'])
    Y_rf_Train = np.array(df4['DEPARTURE_DELAY'])

    X_rf_Train = X_rf_Train.reshape(len(X_rf_Train), 1)
    Y_rf_Train = Y_rf_Train.reshape(len(Y_rf_Train), 1)

    rf = RandomForestRegressor(n_estimators=100,
                               oob_score=True,
                               random_state=123456)
    rf.fit(X_rf_Train, Y_rf_Train.ravel())  # ravel to pass a 1-D target

    predicted_train = rf.predict(X_rf_Train)

    test_score = r2_score(Y_rf_Train, predicted_train)
    spearman = spearmanr(Y_rf_Train, predicted_train)
    # pearson = pearsonr(Y_rf_Train, predicted_train)

    # print(f'Out-of-bag R-2 score estimate: {rf.oob_score_:>5.3}')
    # print(f'Test data R-2 score: {test_score:>5.3}')
    # print(f'Test data Spearman correlation: {spearman[0]:.3}')

    # print("R^2 for RF Train:",test_score )
    print('MSE RF Train= {}'.format(
        round(metrics.mean_squared_error(predicted_train, Y_rf_Train), 2)))
    # print(f'Test data Pearson correlation: {pearson[0]:.3}')

    ####### Random Forest_Test #######
    df41 = create_df(df_test, carrier, id_airport)
    # X_rf_Train = np.array(df3[['schedule_depart','schedule_arrivee', 'ARRIVAL_DELAY', 'SCHEDULED_TIME','ELAPSED_TIME','weekday']])
    # X_rf_Train = np.hstack((X_rf_Train))
    df41 = df41[['DEPARTURE_DELAY', 'schedule_depart_mnts']].dropna(how='any',
                                                                    axis=0)
    X_rf_Test = np.array(df41['schedule_depart_mnts'])
    Y_rf_Test = np.array(df41['DEPARTURE_DELAY'])

    X_rf_Test = X_rf_Test.reshape(len(X_rf_Test), 1)
    Y_rf_Test = Y_rf_Test.reshape(len(Y_rf_Test), 1)

    predicted_test = rf.predict(X_rf_Test)

    test_score = r2_score(Y_rf_Test, predicted_test)
    spearman = spearmanr(Y_rf_Test, predicted_test)
    # pearson = pearsonr(Y_rf_Train, predicted_train)

    # print(f'Out-of-bag R-2 score estimate: {rf.oob_score_:>5.3}')
    # print(f'Test data R-2 score: {test_score:>5.3}')
    # print(f'Test data Spearman correlation: {spearman[0]:.3}')

    score_rf_test = r2_score(Y_rf_Test, predicted_test)
    # print("R^2 for RF Test: ", score_rf_test)
    score_RF_test = metrics.mean_squared_error(predicted_test, Y_rf_Test)
    print('MSE RF Test = {}'.format(round(score_RF_test, 2)))

    fig1 = Figure_style(8, 4, 1, 1)
    fig1.pos_update(0, 0)
    # fig1.cust_scatter(df1['heure_depart'], df1['DEPARTURE_DELAY'], markeredge = True)
    fig1.cust_plot(X_rf_Test,
                   Y_rf_Test,
                   color='b',
                   linestyle=':',
                   linewidth=2,
                   marker=('b', 's', 10))
    fig1.cust_plot(X_rf_Test, predicted_test, color='g', linewidth=3)
    fig1.style()
    fig1.set_ylabel('Delay (minutes)', fontsize=14)
    fig1.set_xlabel('Departure time', fontsize=14)
    # ____________________________________
    # convert and set the x ticks labels
    fct_convert = lambda x: (int(x / 3600), int(divmod(x, 3600)[1] / 60))
    fig1.axs[fig1.ix, fig1.iy].set_xticklabels([
        '{:2.0f}h{:2.0f}m'.format(*fct_convert(x))
        for x in fig1.axs[fig1.ix, fig1.iy].get_xticks()
    ])

    # %%
    ###########################################################################
    ####### Neural Network_Train #######

    df5 = create_df(df, carrier, id_airport)
    # X_rf_Train = np.array(df3[['schedule_depart','schedule_arrivee', 'ARRIVAL_DELAY', 'SCHEDULED_TIME','ELAPSED_TIME','weekday']])
    # X_rf_Train = np.hstack((X_rf_Train))
    df5 = df5[['DEPARTURE_DELAY', 'schedule_depart_mnts']].dropna(how='any',
                                                                  axis=0)
    X_nn_Train = np.array(df5['schedule_depart_mnts'])
    Y_nn_Train = np.array(df5['DEPARTURE_DELAY'])

    X_nn_Train = X_nn_Train.reshape(len(X_nn_Train), 1)
    Y_nn_Train = Y_nn_Train.reshape(len(Y_nn_Train), 1)

    # Note: a LinearSVR stands in for the neural network here; the sknn-based
    # classifier below is left commented out.
    regr = LinearSVR(random_state=0)
    #    from sknn.mlp import Classifier, Layer
    #    #regr = LinearSVR(random_state=0)
    #    regr = Classifier(
    #    layers=[
    #        Layer("Rectifier", units=10),
    #        Layer("Linear")],
    #    learning_rate=0.02,
    #    n_iter=5)
    regr.fit(X_nn_Train, Y_nn_Train)

    predict_train_NN = regr.predict(X_nn_Train)

    r2_NN_train = r2_score(Y_nn_Train, predict_train_NN)
    # print("R^2 for NN Train:",r2_NN_train )
    print('MSE NN Train= {}'.format(
        round(metrics.mean_squared_error(predict_train_NN, Y_nn_Train), 2)))

    ####### Neural Network_Test #######
    df51 = create_df(df_test, carrier, id_airport)
    # X_rf_Train = np.array(df3[['schedule_depart','schedule_arrivee', 'ARRIVAL_DELAY', 'SCHEDULED_TIME','ELAPSED_TIME','weekday']])
    # X_rf_Train = np.hstack((X_rf_Train))
    df51 = df51[['DEPARTURE_DELAY', 'schedule_depart_mnts']].dropna(how='any',
                                                                    axis=0)
    X_NN_Test = np.array(df51['schedule_depart_mnts'])
    Y_NN_Test = np.array(df51['DEPARTURE_DELAY'])

    X_NN_Test = X_NN_Test.reshape(len(X_NN_Test), 1)
    Y_NN_Test = Y_NN_Test.reshape(len(Y_NN_Test), 1)

    predict_test_NN = regr.predict(X_NN_Test)

    score_NN_test = r2_score(Y_NN_Test, predict_test_NN)
    # print("R^2 for NN Test: ",score_NN_test )
    MSE_NN_test = metrics.mean_squared_error(predict_test_NN, Y_NN_Test)
    print('MSE NN Test = {}'.format(round(MSE_NN_test, 2)))

    fig1 = Figure_style(8, 4, 1, 1)
    fig1.pos_update(0, 0)
    # fig1.cust_scatter(df1['heure_depart'], df1['DEPARTURE_DELAY'], markeredge = True)
    fig1.cust_plot(X_NN_Test,
                   Y_NN_Test,
                   color='b',
                   linestyle=':',
                   linewidth=2,
                   marker=('b', 's', 10))
    fig1.cust_plot(X_NN_Test, predict_test_NN, color='g', linewidth=3)
    fig1.style()
    fig1.set_ylabel('Delay (minutes)', fontsize=14)
    fig1.set_xlabel('Departure time', fontsize=14)

    # convert and set the x ticks labels
    fct_convert = lambda x: (int(x / 3600), int(divmod(x, 3600)[1] / 60))
    fig1.axs[fig1.ix, fig1.iy].set_xticklabels([
        '{:2.0f}h{:2.0f}m'.format(*fct_convert(x))
        for x in fig1.axs[fig1.ix, fig1.iy].get_xticks()
    ])

    # %%

    ###########################################################################
    ####### SVM_Train #######

    df6 = create_df(df, carrier, id_airport)
    df6 = df6[['DEPARTURE_DELAY', 'schedule_depart_mnts']].dropna(how='any',
                                                                  axis=0)
    X_svm_Train = np.array(df6['schedule_depart_mnts'])
    Y_svm_Train = np.array(df6['DEPARTURE_DELAY'])

    X_svm_Train = X_svm_Train.reshape(len(X_svm_Train), 1)
    Y_svm_Train = Y_svm_Train.reshape(len(Y_svm_Train), 1)

    regr = SVR(kernel='linear')

    regr.fit(X_svm_Train, Y_svm_Train)

    predict_train_svm = regr.predict(X_svm_Train)
    r2_svm_train = r2_score(Y_svm_Train, predict_train_svm)
    # print("R^2 for svm Train:",r2_svm_train )
    print('MSE svm Train= {}'.format(
        round(metrics.mean_squared_error(predict_train_svm, Y_svm_Train), 2)))

    ####### SVM_Test #######
    df61 = create_df(df_test, carrier, id_airport)
    # X_rf_Train = np.array(df3[['schedule_depart','schedule_arrivee', 'ARRIVAL_DELAY', 'SCHEDULED_TIME','ELAPSED_TIME','weekday']])
    # X_rf_Train = np.hstack((X_rf_Train))
    df61 = df61[['DEPARTURE_DELAY', 'schedule_depart_mnts']].dropna(how='any',
                                                                    axis=0)
    X_svm_Test = np.array(df61['schedule_depart_mnts'])
    Y_svm_Test = np.array(df61['DEPARTURE_DELAY'])

    X_svm_Test = X_svm_Test.reshape(len(X_svm_Test), 1)
    Y_svm_Test = Y_svm_Test.reshape(len(Y_svm_Test), 1)

    predict_test_svm = regr.predict(X_svm_Test)

    r2_svm_test = r2_score(Y_svm_Test, predict_test_svm)
    # print("R^2 for svm Test: ",r2_svm_test )
    mse_svm_test = metrics.mean_squared_error(predict_test_svm, Y_svm_Test)
    print('MSE svm Test= {}'.format(round(mse_svm_test, 2)))

    fig1 = Figure_style(8, 4, 1, 1)
    fig1.pos_update(0, 0)
    # fig1.cust_scatter(df1['heure_depart'], df1['DEPARTURE_DELAY'], markeredge = True)
    fig1.cust_plot(X_svm_Test,
                   Y_svm_Test,
                   color='b',
                   linestyle=':',
                   linewidth=2,
                   marker=('b', 's', 10))
    fig1.cust_plot(X_svm_Test, predict_test_svm, color='g', linewidth=3)
    fig1.style()
    fig1.set_ylabel('Delay (minutes)', fontsize=14)
    fig1.set_xlabel('Departure time', fontsize=14)
    # ____________________________________
    # convert and set the x ticks labels
    fct_convert = lambda x: (int(x / 3600), int(divmod(x, 3600)[1] / 60))
    fig1.axs[fig1.ix, fig1.iy].set_xticklabels([
        '{:2.0f}h{:2.0f}m'.format(*fct_convert(x))
        for x in fig1.axs[fig1.ix, fig1.iy].get_xticks()
    ])

    return np.mean(result_L_test), np.mean(result_test), np.mean(
        predicted_test), np.mean(predict_test_NN), np.mean(predict_test_svm)
Example #9
import warnings

import numpy as np
import pandas as pd
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import (explained_variance_score, mean_absolute_error,
                             mean_squared_error, r2_score)
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVR


def linear_svr(dataframe,
               target=None,
               drop_features=[],
               without_outliers=False,
               split=0.2):
    warnings.filterwarnings("ignore",
                            category=ConvergenceWarning,
                            message="^Liblinear failed to converge")

    # Remove non-numerical and undesired features from dataframe
    dataframe = dataframe.loc[:, dataframe.dtypes != 'object']
    dataframe = dataframe.drop(drop_features, axis=1)

    # Transform data into columns and define target variable
    numerical_features = dataframe.loc[:, dataframe.columns != target]
    X = np.nan_to_num(
        numerical_features.to_numpy())  # .reshape(numerical_features.shape)
    y = np.nan_to_num(dataframe[target].to_numpy()
                      )  # .reshape(dataframe[target].shape[0], 1)

    # Split the data into training/testing sets
    testsplit = round(split * X.shape[0])
    X_train = X[:-testsplit]
    X_test = X[-testsplit:]
    y_train = y[:-testsplit]
    y_test = y[-testsplit:]

    # Train linear regression model
    reg = LinearSVR(random_state=0, tol=1e-5)
    reg.fit(X_train, y_train)
    feature_importance = pd.Series(
        reg.coef_,  # one weight per feature (LinearSVR is always linear)
        index=numerical_features.columns)

    # Prediction with trained model
    y_pred = reg.predict(X_test)

    # Collect metrics in a Series: assigning scalar columns to an empty
    # DataFrame yields empty columns, and the length-10 fold-score arrays
    # would not fit its index at all.
    cv_r2 = cross_val_score(reg, X, y, cv=10, scoring="r2")
    cv_ev = cross_val_score(reg, X, y, cv=10, scoring="explained_variance")
    results = pd.Series({
        'Train mean': np.mean(y_train),
        'Train std': np.std(y_train),
        'Test mean': np.mean(y_test),
        'Test std': np.std(y_test),
        'Prediction mean': np.mean(y_pred),
        'Prediction std': np.std(y_pred),
        'Mean Squared Error': mean_squared_error(y_test, y_pred),
        'Mean Absolute Error': mean_absolute_error(y_test, y_pred),
        'R2 score': r2_score(y_test, y_pred),
        'Explained variance score': explained_variance_score(y_test, y_pred),
        'Cross-val R2 score (mean)': cv_r2.mean(),
        'Cross-val R2 scores': cv_r2,
        'Cross-val explained_variance score (mean)': cv_ev.mean(),
        'Cross-val explained_variance scores': cv_ev,
    })

    y_result = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})
    return feature_importance, results, y_result, reg
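
# Hypothetical usage sketch (assumption: df is a pandas DataFrame with a numeric
# 'price' target; the names here are illustrative only):
# importance, results, y_result, model = linear_svr(df, target='price',
#                                                   drop_features=['id'])
# print(results[['R2 score', 'Mean Squared Error']])
# print(importance.sort_values(ascending=False).head())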
Example #10
File: build_ml.py  Project: zneha/Auto_TS
def run_ensemble_model(X, Y, modeltype='Regression', scoring='', verbose=0):
    """
    Quickly builds and runs multiple models for a clean data set(only numerics).
    """
    seed = 99
    if len(X) <= 100000 or X.shape[1] < 50:
        NUMS = 50
        FOLDS = 3
    else:
        NUMS = 20
        FOLDS = 5
    ## create Voting models
    estimators = []
    if modeltype == 'Regression':
        if scoring == '':
            scoring = 'neg_mean_squared_error'
        scv = ShuffleSplit(n_splits=FOLDS, random_state=seed)
        model5 = LinearRegression()
        results1 = cross_val_score(model5, X, Y, cv=scv, scoring=scoring)
        estimators.append(
            ('Linear Model', model5, np.sqrt(abs(results1.mean()))))
        model6 = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(
            min_samples_leaf=2, max_depth=1, random_state=seed),
                                   n_estimators=NUMS,
                                   random_state=seed)
        results2 = cross_val_score(model6, X, Y, cv=scv, scoring=scoring)
        estimators.append(('Boosting', model6, np.sqrt(abs(results2.mean()))))
        model7 = RidgeCV(alphas=np.logspace(-10, -1, 50), cv=scv)
        results3 = cross_val_score(model7, X, Y, cv=scv, scoring=scoring)
        estimators.append(
            ('Linear Regularization', model7, np.sqrt(abs(results3.mean()))))
        ## Create an ensemble model ####
        # estimators_list = [(tuples[0], tuples[1]) for tuples in estimators] # unused
        ensemble = BaggingRegressor(DecisionTreeRegressor(random_state=seed),
                                    n_estimators=NUMS,
                                    random_state=seed)
        results4 = cross_val_score(ensemble, X, Y, cv=scv, scoring=scoring)
        estimators.append(('Bagging', ensemble, np.sqrt(abs(results4.mean()))))
        if verbose == 1:
            print(
                '\nLinear Model = %0.4f \nBoosting = %0.4f\nRegularization = %0.4f \nBagging = %0.4f'
                % (np.sqrt(abs(results1.mean())) / Y.std(),
                   np.sqrt(abs(results2.mean())) / Y.std(),
                   np.sqrt(abs(results3.mean())) / Y.std(),
                   np.sqrt(abs(results4.mean())) / Y.std()))
        besttype = sorted(estimators, key=lambda x: x[2], reverse=False)[0][0]
        bestmodel = sorted(estimators, key=lambda x: x[2], reverse=False)[0][1]
        bestscore = sorted(estimators, key=lambda x: x[2],
                           reverse=False)[0][2] / Y.std()
        if verbose == 1:
            print('    Best Model = %s with %0.2f Normalized RMSE score\n' %
                  (besttype, bestscore))
    elif modeltype == 'TimeSeries' or modeltype == 'Time Series' or modeltype == 'Time_Series':
        #### This section is for Time Series Models only ####
        if scoring == '':
            scoring = 'neg_mean_squared_error'
        tscv = TimeSeriesSplit(n_splits=FOLDS)
        model5 = SVR(C=0.1, kernel='rbf', degree=2)
        results1 = cross_val_score(model5, X, Y, cv=tscv, scoring=scoring)
        estimators.append(('SVR', model5, np.sqrt(abs(results1.mean()))))
        model6 = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(
            min_samples_leaf=2, max_depth=1, random_state=seed),
                                   n_estimators=NUMS,
                                   random_state=seed)
        results2 = cross_val_score(model6, X, Y, cv=tscv, scoring=scoring)
        estimators.append(('Boosting', model6, np.sqrt(abs(results2.mean()))))
        model7 = LinearSVR(random_state=seed)
        results3 = cross_val_score(model7, X, Y, cv=tscv, scoring=scoring)
        estimators.append(('LinearSVR', model7, np.sqrt(abs(results3.mean()))))
        ## Create an ensemble model ####
        # estimators_list = [(tuples[0], tuples[1]) for tuples in estimators] # unused
        ensemble = BaggingRegressor(DecisionTreeRegressor(random_state=seed),
                                    n_estimators=NUMS,
                                    random_state=seed)
        results4 = cross_val_score(ensemble, X, Y, cv=tscv, scoring=scoring)
        estimators.append(('Bagging', ensemble, np.sqrt(abs(results4.mean()))))
        print('Running multiple models...')
        if verbose == 1:
            print(
                '    Instance Based = %0.4f \n    Boosting = %0.4f\n    Linear Model = %0.4f \n    Bagging = %0.4f'
                % (np.sqrt(abs(results1.mean())) / Y.std(),
                   np.sqrt(abs(results2.mean())) / Y.std(),
                   np.sqrt(abs(results3.mean())) / Y.std(),
                   np.sqrt(abs(results4.mean())) / Y.std()))
        besttype = sorted(estimators, key=lambda x: x[2], reverse=False)[0][0]
        bestmodel = sorted(estimators, key=lambda x: x[2], reverse=False)[0][1]
        bestscore = sorted(estimators, key=lambda x: x[2],
                           reverse=False)[0][2] / Y.std()
        if verbose == 1:
            print('Best Model = %s with %0.2f Normalized RMSE score\n' %
                  (besttype, bestscore))
        print('Model Results:')
    else:
        if scoring == '':
            scoring = 'f1'
        scv = StratifiedShuffleSplit(n_splits=FOLDS, random_state=seed)
        model5 = LogisticRegression(random_state=seed)
        results1 = cross_val_score(model5, X, Y, cv=scv, scoring=scoring)
        estimators.append(
            ('Logistic Regression', model5, abs(results1.mean())))
        model6 = LinearDiscriminantAnalysis()
        results2 = cross_val_score(model6, X, Y, cv=scv, scoring=scoring)
        estimators.append(
            ('Linear Discriminant', model6, abs(results2.mean())))
        model7 = ExtraTreesClassifier(n_estimators=NUMS,
                                      min_samples_leaf=2,
                                      random_state=seed)
        results3 = cross_val_score(model7, X, Y, cv=scv, scoring=scoring)
        estimators.append(('Bagging', model7, abs(results3.mean())))
        ## Create an ensemble model ####
        # estimators_list = [(tuples[0], tuples[1]) for tuples in estimators] # unused
        ensemble = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(
            random_state=seed, max_depth=1, min_samples_leaf=2),
                                      n_estimators=NUMS,
                                      random_state=seed)
        results4 = cross_val_score(ensemble, X, Y, cv=scv, scoring=scoring)
        estimators.append(('Boosting', ensemble, abs(results4.mean())))
        if verbose == 1:
            print(
                '\nLogistic Regression = %0.4f \nLinear Discriminant = %0.4f \nBagging = %0.4f \nBoosting = %0.4f'
                % (abs(results1.mean()), abs(results2.mean()),
                   abs(results3.mean()), abs(results4.mean())))
        besttype = sorted(estimators, key=lambda x: x[2], reverse=True)[0][0]
        bestmodel = sorted(estimators, key=lambda x: x[2], reverse=True)[0][1]
        bestscore = sorted(estimators, key=lambda x: x[2], reverse=True)[0][2]
        if verbose == 1:
            print('    Best Model = %s with %0.2f %s score\n' %
                  (besttype, bestscore, scoring))
    return bestmodel, bestscore, besttype
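
# Hypothetical usage sketch (X: numeric-only feature DataFrame, Y: target Series;
# both names are stand-ins for the caller's data):
# best_model, best_score, best_type = run_ensemble_model(X, Y,
#                                                        modeltype='Regression',
#                                                        verbose=1)
# best_model.fit(X, Y)  # refit the winning estimator on the full data set
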
def trainModel(param, data, features, feature):
    # We only evaluate the model here, so we skip bagging and run a single CV loop.

    train_feature = features
    pred_label = feature
    feature_valid = ['Ret_PlusOne', 'Ret_PlusTwo', 'Weight_Daily']
    #create CV
    err_cv = []
    std_cv = []
    for run in range(0, 3):
        print "this is run:%d" % (run + 1)
        train_index = loadCVIndex("../../data/cv/train.run%d.txt" % (run + 1))
        test_index = loadCVIndex("../../data/cv/valid.run%d.txt" % (run + 1))
        error_data = data.iloc[test_index]
        X_train = data.iloc[train_index][train_feature]
        X_test = data.iloc[test_index][train_feature]
        Y_train = data.iloc[train_index][pred_label]
        Y_test = data.iloc[test_index][pred_label]
        if param['task'] == 'skl_ridge':
            ridge = Ridge(alpha=param['alpha'], normalize=True)
            ridge.fit(X_train, Y_train)
            pred_value = ridge.predict(X_test)
            pd.DataFrame(ridge.coef_,
                         columns=train_feature).to_csv("ridge.csv")
            pred_value = pd.DataFrame(pred_value, columns=['1', '2'])
            train_data = data.iloc[test_index]
            print(train_data.shape)
            error_train = Ret_Plus_error(
                pred_value, train_data[feature_valid]) / (40000 * 0.7 * 62)
            print(error_train)
            variance = 0
            err_cv.append(error_train)
            std_cv.append(variance)
        elif param['task'] == 'skl_lasso':
            lasso = Lasso(alpha=param['alpha'],
                          normalize=True,
                          fit_intercept=True,
                          tol=1e-11)
            lasso.fit(X_train, Y_train)
            pred_value = lasso.predict(X_test)
            pred_value = pd.DataFrame(pred_value, columns=['1', '2'])
            train_data = data.iloc[test_index]
            error_train = Ret_Plus_error(pred_value, train_data[feature_valid])
            print(error_train)
            variance = 0
            err_cv.append(error_train)
            std_cv.append(variance)
        elif param['task'] == 'skl_lr':
            clf = LogisticRegression(C=param['C'])
            clf.fit(X_train, Y_train)
            pred_value = clf.predict(X_test)
            error_train = 1 - accuracy_model(pred_value, Y_test)
            variance = error_train
            err_cv.append(error_train)
            std_cv.append(variance)

        elif param['task'] == 'regression':
            train_data = xgb.DMatrix(X_train, label=np.array(Y_train))
            valid_data = xgb.DMatrix(X_test, label=np.array(Y_test))
            watchlist = [(train_data, 'train'), (valid_data, 'valid')]
            bst = xgb.train(param, train_data, int(param['num_round']),
                            watchlist)
            valid_data = xgb.DMatrix(X_test)
            pred_value = bst.predict(valid_data)
            tmp_data = error_data[feature_valid]
            for feat in pred_label:
                print(tmp_data.shape)
                print(pred_value.shape)
                error_train = Ret_Plus_error_xgb(
                    tmp_data, feat, list(pred_value)) / (40000 * 0.3 * 62)
                variance = 0
                err_cv.append(error_train)
                std_cv.append(variance)
                print(error_train)
        elif param['task'] == 'class':
            train_data = xgb.DMatrix(X_train, label=Y_train)
            valid_data = xgb.DMatrix(X_test, label=Y_test)
            watchlist = [(train_data, 'train'), (valid_data, 'valid')]
            bst = xgb.train(param, train_data, int(param['num_round']),
                            watchlist)
            valid_data = xgb.DMatrix(X_test)
            pred_value = bst.predict(valid_data)
            error_train = 1 - accuracy_model(pred_value, Y_test)
            variance = 0
            err_cv.append(error_train)
            std_cv.append(variance)
            print(error_train)
        elif param['task'] == 'skl_LibSVM':

            # pass the kernel through; the coef_ dump below assumes
            # param['kernel'] was actually honored by the model
            svr = SVR(kernel=param['kernel'],
                      epsilon=param['epsilon'],
                      tol=param['tol'],
                      cache_size=param['cache_size'],
                      gamma=param['gamma'])
            svr.fit(X_train, Y_train['Ret_PlusOne'])
            pred_value1 = svr.predict(X_test)
            svr.fit(X_train, Y_train['Ret_PlusTwo'])
            pred_value2 = svr.predict(X_test)
            if param['kernel'] == 'linear':
                pd.DataFrame(svr.coef_,
                             columns=train_feature).to_csv("svr.csv")
            pred_value = pd.DataFrame({'1': pred_value1, '2': pred_value2})
            train_data = data.iloc[test_index]
            error_train = Ret_Plus_error(pred_value, train_data[feature_valid])
            print(error_train / (40000 * 0.3 * 62))
            variance = 0
            err_cv.append(error_train)
            std_cv.append(variance)
        elif param['task'] == 'skl_linearSVR':
            print(param['epsilon'])
            print(param['C'])
            svr = LinearSVR(C=param['C'],
                            epsilon=param['epsilon'],
                            dual=param['dual'],
                            loss=param['loss'],
                            random_state=param['seed'])
            svr.fit(X_train, Y_train['Ret_PlusOne'])
            pred_value1 = svr.predict(X_test)
            svr.fit(X_train, Y_train['Ret_PlusTwo'])
            pred_value2 = svr.predict(X_test)
            pred_value = pd.DataFrame({'1': pred_value1, '2': pred_value2})
            train_data = data.iloc[test_index]
            error_train = Ret_Plus_error(pred_value, train_data[feature_valid])
            print(error_train / (40000 * 0.3 * 62))
            variance = 0
            err_cv.append(error_train)
            std_cv.append(variance)
    #print "error.train:%f error.test:%f"%(error_train,error)
    error = np.mean(err_cv)
    std_cv = np.mean(err_cv)
    print "error:%f" % (error)
    return {
        'loss': error,
        'attachments': {
            'std': std_cv  # report the CV spread, not the last fold's leftover variance
        },
        'status': STATUS_OK
    }
Example #12
print('Header Test Rows')
print(datatest.head())

#dictvalues ={}
#for coldata in data_new.columns:
#    dictvalues[coldata] = datatest[coldata].mean()

#print('dictvalues values')
#print(dictvalues)

##print('sorted output')
#from operator import itemgetter
#print(sorted(dictvalues.items(), key=itemgetter(1),reverse=True))

#regr = linear_model.Lasso(alpha=0.1)
regr = LinearSVR(C=1.0, epsilon=0.2)
#regr = RandomForestRegressor()
#regr = AdaBoostRegressor(n_estimators=80)
regr.fit(data_new[features], y)

predictions = regr.predict(datatest)
print('predictions')
print(predictions)

datatest_result = pd.read_csv('test.csv',header=0)
datatest_result['loss'] = np.exp(predictions)
header = ["id","loss"]
datatest_result.to_csv("Results_AllState_SVR_81.csv", sep=',', columns = header,index=False)

for col in data.columns[:-1]: 
    print(data[col].unique())
Example #13
def _training_results(data_dict, split_test, k_fold, training_type_list):
    """Execute supervised training for each training algorithm type.
    Options:
        - split_test: decimal number percentages (recommended 0.1 to 0.3)
        - k_fold: integer number (recommended 5 or 10)
        - training_type_list: algorithms options
            ['logistic_regression', 'decision_tree', 'svm_svc_linear',
             'svm_svc_rbf', 'svm_linear_svr', 'multinomial_nb',
             'random-forest', 'kneighbors', 'stochastic-gradient-descent-log',
             'stochastic-gradient-descent-svm']
        - [OUTPUT] final_results: e.g.
            {'training_test': [{'name': 'Logistic Regression', 'accuracy': 0.9531331,
                                'classification_report': '
                                     precision    recall  f1-score   support

                                      0.0       0.95      0.95      0.95      4449
                                      1.0       0.95      0.95      0.95      4449

                                avg / total       0.95      0.95      0.95      8898',
                                'confusion_matrix': '
                                      [[4233  216]
                                       [ 229 4220]]'
                                }],
             'cross_validation': [{'name': 'Logistic Regression', 'accuracy': 0.9531331, ...}]}

    :param data_dict:
    :param split_test:
    :param k_fold:
    :param training_type_list:
    :return [object] final results for each methodology:
    """
    final_results = {'training_test': [], 'cross_validation': []}

    for training_type in training_type_list:
        result_dict = {
            'name': '',
            'accuracy': None,
            'classification_report': None,
            'confusion_matrix': None
        }

        training_results = []

        if training_type is not None:
            if training_type == 'logistic_regression':
                result_dict['name'] = 'Logistic Regression'
                model = LogisticRegression()

            elif training_type == 'decision_tree':
                result_dict['name'] = 'Decision Tree'
                model = DecisionTreeClassifier()

            elif training_type == 'svm_svc_linear':
                result_dict['name'] = 'SVM SVC Linear'
                model = SVC(kernel='linear', C=C, verbose=True)

            elif training_type == 'svm_svc_rbf':
                result_dict['name'] = 'SVM SVC RBF'
                model = SVC(kernel='rbf', C=C, verbose=True)

            elif training_type == 'svm_linear_svr':
                result_dict['name'] = 'SVM Linear SVR'
                model = LinearSVR(C=C, verbose=True)

            elif training_type == 'multinomial_nb':
                result_dict['name'] = 'Multinomial Naive Bayes'
                model = MultinomialNB()

            elif training_type == 'random-forest':
                result_dict['name'] = 'Random Forest'
                model = RandomForestClassifier()

            elif training_type == 'kneighbors':
                result_dict['name'] = 'KNN'
                model = KNeighborsClassifier(n_neighbors=num_neighbors)

            elif training_type == 'stochastic-gradient-descent-log':
                result_dict[
                    'name'] = 'Stochastic Gradient Descent - Logistic Regression'
                model = SGDClassifier(loss='log')

            elif training_type == 'stochastic-gradient-descent-svm':
                result_dict[
                    'name'] = 'Stochastic Gradient Descent - Linear SVM'
                model = SGDClassifier(loss='hinge')

            training_results = _process_training(data_dict, result_dict, model,
                                                 split_test, k_fold)
        else:
            print('ML not implemented for ' + str(training_type))

        if training_results['training_test'] is not None:
            final_results['training_test'].append(
                training_results['training_test'])

        if training_results['cross_validation'] is not None:  # not elif: both methodologies are collected
            final_results['cross_validation'].append(
                training_results['cross_validation'])

    return final_results
Example #14
ridge = TestModel(Ridge(), df_f.drop('timestamp', inplace=False, axis=1),
                  df_t.drop('timestamp', inplace=False, axis=1))

#%%

ada = TestModel(AdaBoostRegressor(random_state=6),
                df_f.drop('timestamp', inplace=False, axis=1), df_t['B_C2H6'])

#%%

svr = TestModel(SVR(), df_f.drop('timestamp', inplace=False, axis=1),
                df_t['B_C2H6'])

#%%

lsvr = TestModel(LinearSVR(), df_f.drop('timestamp', inplace=False, axis=1),
                 df_t['B_C2H6'])

#%%

#isotonic = TestModel(IsotonicRegression(), df_f.drop('timestamp', inplace=False, axis=1), df_t.drop('timestamp', inplace=False, axis=1))

#%%

df_test = pd.read_csv('test_features.csv', parse_dates=['timestamp'])
buildDF(df_test)

model_lasso = Lasso(random_state=6).fit(
    df_f.drop('timestamp', inplace=False, axis=1),
    df_t.drop('timestamp', inplace=False, axis=1))
df_pred_lasso = model_lasso.predict(
    df_test.drop('timestamp', inplace=False, axis=1))  # argument assumed; the original is cut off here
Example #15
# Tuning models and test for all features 
# Linear Regression
linreg = LinearRegression()
linreg.fit(X_train, y_train)
acc_model(0,linreg,X_train,X_test)    
print("Done")

# Support Vector Machines
svr = SVR()
svr.fit(X_train, y_train)
acc_model(1,svr,X_train,X_test)
print("Done")

# Linear SVR
linear_svr = LinearSVR()
linear_svr.fit(X_train, y_train)
acc_model(2,linear_svr,X_train,X_test)
print("Done")

# MLPRegressor
mlp = MLPRegressor()
param_grid = {'hidden_layer_sizes': [i for i in range(2,20)],
              'activation': ['relu'],
              'solver': ['adam'],
              'learning_rate': ['constant'],
              'learning_rate_init': [0.01],
              'power_t': [0.5],
              'alpha': [0.0001],
              'max_iter': [1000],
              'early_stopping': [True],
              }  # remaining grid entries are cut off in the original
Example #16
def run_train_all_sklearn(file, fp_name, cv=5, verbose=0, seed=1):

    np.random.seed(seed)
    c = defaultdict(list)

    for k in ProgIter([
            'synergy_zip', 'synergy_bliss', 'synergy_loewe', 'synergy_hsa',
            'css_ri', 'name'
    ],
                      verbose=verbose,
                      total=6):  # six keys, including the trailing 'name'
        v = file[k]

        if k != 'name':
            temp = {}  # results storage; assumes "name" comes last

            if 'drug_row_col' in v.columns:
                v.drop(columns=['drug_row_col'], inplace=True)

            cat_cols = ['cell_line_name']
            categories = [
                v[column].unique() for column in v[cat_cols]
            ]  # manually find all available categories for one-hot

            # pipelines
            encode = Pipeline(steps=[('one-hot-encode',
                                      OneHotEncoder(categories=categories))])
            processor = ColumnTransformer(transformers=[
                ('cat_encoding', encode, cat_cols), ('dropping', 'drop', [k])
            ],
                                          remainder='passthrough')

            catbst = ColumnTransformer(transformers=[('dropping', 'drop', [k])
                                                     ],
                                       remainder='passthrough')

            # regressions
            lr = make_pipeline(processor, linear_model.LinearRegression())
            ridge = make_pipeline(processor, linear_model.Ridge())
            lasso = make_pipeline(processor, linear_model.Lasso())
            elastic = make_pipeline(processor, linear_model.ElasticNet())
            lassolars = make_pipeline(processor, linear_model.LassoLars())
            b_ridge = make_pipeline(processor, linear_model.BayesianRidge())
            kernel = DotProduct() + WhiteKernel()
            gpr = make_pipeline(processor,
                                GaussianProcessRegressor(kernel=kernel))
            linSVR = make_pipeline(processor, LinearSVR())
            hist_gbr = make_pipeline(
                processor,
                HistGradientBoostingRegressor(warm_start=True, max_depth=6))
            rfr = make_pipeline(
                processor,
                RandomForestRegressor(warm_start=True, max_depth=6, n_jobs=3))
            iso = make_pipeline(processor,
                                IsotonicRegression(increasing='auto'))
            xgb = make_pipeline(
                processor, XGBRegressor(tree_method='gpu_hist', max_depth=6))
            cbt = make_pipeline(
                catbst,
                CatBoostRegressor(task_type='GPU',
                                  depth=6,
                                  cat_features=np.array([0]),
                                  verbose=False))

            mls = [
                cbt, rfr, gpr, hist_gbr, lr, ridge, lasso, elastic, lassolars,
                b_ridge, gpr, linSVR, iso
            ]
            mls_names = [
                "cbt", "rfr", "gpr", "hist_gbr", "lr", "ridge", "lasso",
                "elastic", "lassolars", "b_ridge", "gpr", "linSVR", "iso"
            ]

            # results
            start = time.time()
            for MODEL, name in zip(mls, mls_names):
                print(f'\n{name}')
                if 'cbt' == name:
                    n_jobs = 1
                else:
                    n_jobs = cv
                cv_dict = cross_validate(
                    MODEL,
                    v,
                    v[k],
                    cv=cv,
                    scoring={
                        "pearsonr": pearson,
                        "rmse": rmse
                    },
                    return_train_score=False,
                    verbose=verbose,
                    n_jobs=n_jobs,
                )
                temp[name] = {
                    'test_pearsonr': np.nanmean(cv_dict['test_pearsonr']),
                    'test_rmse': abs(np.nanmean(cv_dict['test_rmse']))
                }
                print(temp[name])
            print(f'{k} took {int(time.time()-start)/60} mins')

            c[k] = temp
        else:
            nm = f'/tf/notebooks/code_for_pub/_logs_as_python_files/{fp_name}_13models_5foldCV_{time.ctime()}.pickle'
            with open(nm, 'wb') as file:
                pickle.dump(c, file)
            print(f'saving complete to {nm}')
    return c
import pandas as pd

df = pd.read_csv("data.csv")

y = df.pop("threshold")
X = df

from sklearn.svm import LinearSVR
svr = LinearSVR(epsilon=0.2)
svr.fit(X, y)

print(svr.intercept_)
print(svr.coef_)
def main():

    MAX_ITER = 5000
    regressors = {
        "SVR": lambda: SVR(max_iter=MAX_ITER),
        "SVR_lin": lambda: LinearSVR(max_iter=MAX_ITER),
        "DTR": lambda: DecisionTreeRegressor(),
        "KNN": lambda: KNeighborsRegressor(n_neighbors=10),
        "MLP": lambda: MLPRegressor([256] * 3),
        "MLP_large": lambda: MLPRegressor([1024] * 5),
        "DUMMY": lambda: DummyRegressor()
    }

    parser = ArgumentParser()
    parser.add_argument("input_file", help="Input data csv file")
    parser.add_argument("-output_file",
                        help="Output data csv file",
                        default="regression_results.csv")
    parser.add_argument("-folds", help="Number of folds", default=10)
    parser.add_argument("-seed", help="Random seed", default=1337)
    parser.add_argument("-regressors",
                        help="Regressors to use",
                        default=" ".join(regressors.keys()))

    args = parser.parse_args()

    # opts

    # load data
    # Expected csv format:
    # exp_id	representation	filename	inst_frac	feat_frac	classifier	fold
    # accuracy	nr_instances	nr_features	nr_missing_values	mean_kurtosis	mean_skewness
    # mean	Info_gain	Inf_gain_ratio

    input_file = args.input_file
    regressor_names = args.regressors.split()
    num_folds = args.folds
    output_path = args.output_file
    seed = args.seed

    input_data = pd.read_csv(input_file)
    print("Read raw input data with shape:", input_data.shape)

    # Given the input data, potentially aggregate some accuracy values (e.g. over all folds)
    data, labels = get_data_and_labels_from_raw_inputs(input_data)
    print(f"Running regressions on {len(data)} data/label instances.")

    print("Run regression in terms of a single representation? GG clarify!")

    columns = ["id", "fold", "mse", "mae"]
    results = pd.DataFrame(columns=columns)
    results.to_csv(output_path, index=None)

    # cross-val
    splitter = KFold(num_folds, shuffle=True, random_state=seed)
    # iterate over folds
    for fold_idx, (train_idx, test_idx) in enumerate(splitter.split(data)):
        x_train, y_train = data[train_idx, :], labels[train_idx]
        x_test, y_test = data[test_idx, :], labels[test_idx]
        # iterate over regressors
        for regressor_name in regressor_names:
            model_func = regressors[regressor_name]
            # run the regression
            print(f"Running fold {fold_idx+1}/{num_folds} : {regressor_name}")
            regressor = model_func()
            mse, mae = run_regression(regressor,
                                      x_train,
                                      y_train,
                                      x_test,
                                      y_test,
                                      mmx_scale=(regressor_name == "DTR"))  # keys are uppercase; "dtr" never matched
            # get a run id, store results
            run_id = f"regressor_{regressor_name}"
            results = results.append(
                {
                    "id": run_id,
                    "fold": fold_idx,
                    "mse": mse,
                    "mae": mae
                },
                ignore_index=True)
            # backup a copy
            shutil.copyfile(output_path, output_path + ".backup.csv")
            results.to_csv(output_path, index=None)
    print("Done!")
    print("Avg. per classifier:")
    print(results.groupby("id").mean())
    y_te = np.zeros([test.shape[0]])
    y_pred = np.zeros_like(y_true,dtype='float64')
    kf = KFold(n_splits=n_kf).split(train)
    for j,(train_index,test_index) in enumerate(kf):
        rgs.fit( train[train_index,:], y_true[train_index] )
        y_pred[test_index] = rgs.predict( train[test_index] )
        y_te += rgs.predict( test )
    y_te /= n_kf
    print( '{0} score {1}'.format(ln,feval(y_true,y_pred)))
    return y_pred,y_te,feval(y_true,y_pred)
        

train_true = DataFrame(o.get_table('y_train')).to_pandas()

rgss = [LinearRegression(),
        LinearSVR(C=0.01),
        Ridge(alpha = 1.0)]
rgss_name = ['LR','SVR','Ridge',]

train_s = []
test_s = []
for i,rn in enumerate(rgss_name):
    n_kf = 10
    y_valid = np.zeros([train_1.shape[0],5])
    y_test = np.zeros([test_1.shape[0],5])   
    for j,ln in enumerate(label_name):
        y_true = train_true[ln].values  # .as_matrix() was removed in pandas 1.0
        rgs = rgss[i]
        y_va,y_te,sc = stack_cell(rgs,n_kf,ln,y_true,(train_1,test_1),(train_2,test_2),(train_3,test_3),(train_4,test_4),(train_5,test_5),(train_6,test_6),
                                 (train_7,test_7),(train_8,test_8))
        y_valid[:,j] = y_va
def StandardLinearSVR(epsilon=0.1):
    return Pipeline([('std_scaler', StandardScaler()),
                     ('linearSVR', LinearSVR(epsilon=epsilon))])
Example #21
                    t1[i],
                    cmidd(
                        X.iloc[:, i],  # feature i
                        y,  # target
                        X.iloc[:, int(F[int(
                            m[i])])]  # conditionned on selected features
                    ))
            if t1[i] > sstar:
                sstar = t1[i]
                F[k + 1] = i

F = np.array(F[F > -100])
F = F.astype(int)
t1 = t1[F]

regr = make_pipeline(StandardScaler(), LinearSVR(random_state=0, tol=1e-3))

X_train_data = X.iloc[:, F[:10]]
regr.fit(X_train_data, y_train_data)
y_pred = regr.predict(X_test_data.iloc[:, F[:10]])

print(sum((y_pred - y_test_data)**2))
from sklearn.metrics import mean_squared_error
print(mean_squared_error(y_pred, y_test_data))

error_rate = []
for i in range(1, 40):
    knn = KNeighborsRegressor(n_neighbors=i)
    knn.fit(X_train_data, y_train_data)
    pred_i = knn.predict(X_test_data.iloc[:, F[:10]])
    # mean squared error; the original used pred_i != y_test_data, a
    # classification-style miss rate that is meaningless for regression
    error_rate.append(np.mean((pred_i - y_test_data) ** 2))
Example #22
y1 = train['SentimentTitle']

train_X_Headline = hstack(
    [train_vect_2_hst, csr_matrix(train_headline.values)])
test_X_Headline = hstack([test_vect_2_hst, csr_matrix(test_headline.values)])
y2 = train['SentimentHeadline']

np.shape(train_X_Title)

#model for sentiment title
X_train, X_test, y_train, y_test = train_test_split(train_X_Title,
                                                    y1,
                                                    test_size=0.20,
                                                    random_state=42)

LSVR1 = LinearSVR(C=0.2)
LSVR1.fit(X_train, y_train)

y_pred1 = LSVR1.predict(X_test)
mae1 = mean_absolute_error(y_pred1, y_test)
print('Score (1 - MAE):', 1 - mae1)

X_train, X_test, y_train, y_test = train_test_split(train_X_Headline,
                                                    y2,
                                                    test_size=0.20,
                                                    random_state=42)

LSVR2 = LinearSVR(C=0.1)
LSVR2.fit(X_train, y_train)

y_pred2 = LSVR2.predict(X_test)
Example #23
# # Regression
#

# In[22]:

np.random.seed(42)
m = 50
X = 2 * np.random.rand(m, 1)
y = (4 + 3 * X + np.random.randn(m, 1)).ravel()

# In[23]:

from sklearn.svm import LinearSVR

svm_reg = LinearSVR(epsilon=1.5, random_state=42)
svm_reg.fit(X, y)

# In[24]:

svm_reg1 = LinearSVR(epsilon=1.5, random_state=42)
svm_reg2 = LinearSVR(epsilon=0.5, random_state=42)
svm_reg1.fit(X, y)
svm_reg2.fit(X, y)


def find_support_vectors(svm_reg, X, y):
    y_pred = svm_reg.predict(X)
    off_margin = (np.abs(y - y_pred) >= svm_reg.epsilon)
    return np.argwhere(off_margin)
Example #24
    # Exercise 10 P166

    # data set
    housing = fetch_california_housing()
    X = housing["data"]
    y = housing["target"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # scale
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # build model
    lin_svr = LinearSVR(random_state=42)
    lin_svr.fit(X_train_scaled, y_train)

    y_pred = lin_svr.predict(X_train_scaled)
    mse = mean_squared_error(y_train, y_pred)
    print('LinearSVR MSE: ', mse)  # 0.949968822217229 not good
    print('LinearSVR RMSE: ', np.sqrt(mse))

    # grid search the best estimator with SVR() model which can use kernel skill
    param_distributions = {"gamma": reciprocal(0.001, 0.1), "C": uniform(1, 10)}
    rnd_search_cv = RandomizedSearchCV(SVR(), param_distributions, n_iter=10, verbose=2, random_state=42)
    rnd_search_cv.fit(X_train_scaled, y_train)

    print('best estimator: ', rnd_search_cv.best_estimator_)
    '''SVR(C=4.745401188473625, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
      gamma=0.07969454818643928, kernel='rbf', max_iter=-1, shrinking=True,
Example #25
    AdaBoostRegressor(DecisionTreeRegressor(random_state=13,
                                            min_samples_leaf=5),
                      random_state=13,
                      n_estimators=17), "AdaBoostHousing")
build_housing(BayesianRidge(), "BayesianRidgeHousing")
build_housing(KNeighborsRegressor(), "KNNHousing", with_kneighbors=True)
build_housing(
    MLPRegressor(activation="tanh",
                 hidden_layer_sizes=(26, ),
                 solver="lbfgs",
                 random_state=13,
                 tol=0.001,
                 max_iter=1000), "MLPHousing")
build_housing(SGDRegressor(random_state=13), "SGDHousing")
build_housing(SVR(), "SVRHousing")
build_housing(LinearSVR(random_state=13), "LinearSVRHousing")
build_housing(NuSVR(), "NuSVRHousing")

#
# Anomaly detection
#


def build_iforest_housing(iforest, name, **pmml_options):
    mapper = DataFrameMapper([(housing_X.columns.values, ContinuousDomain())])
    pipeline = Pipeline([("mapper", mapper), ("estimator", iforest)])
    pipeline.fit(housing_X)
    pipeline = make_pmml_pipeline(pipeline, housing_X.columns.values)
    pipeline.configure(**pmml_options)
    store_pkl(pipeline, name + ".pkl")
    decisionFunction = DataFrame(pipeline.decision_function(housing_X),
Example #26
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Average CV score on the training set was: -0.0028642237563477587
exported_pipeline = make_pipeline(
    StackingEstimator(
        estimator=GradientBoostingRegressor(alpha=0.85,
                                            learning_rate=0.01,
                                            loss="lad",
                                            max_depth=2,
                                            max_features=0.15000000000000002,
                                            min_samples_leaf=7,
                                            min_samples_split=7,
                                            n_estimators=100,
                                            subsample=0.4)), MinMaxScaler(),
    StackingEstimator(estimator=LinearSVR(C=1.0,
                                          dual=True,
                                          epsilon=0.0001,
                                          loss="epsilon_insensitive",
                                          tol=1e-05)),
    PolynomialFeatures(degree=2, include_bias=False, interaction_only=False),
    RandomForestRegressor(bootstrap=False,
                          max_features=0.45,
                          min_samples_leaf=6,
                          min_samples_split=3,
                          n_estimators=100))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #27
                                            coef0=1,
                                            C=5))))
    poly_kernel_svm_clf.fit(X, y)

    rbf_kernel_svm_clf = Pipeline(
        (("scaler", StandardScaler()), ("svm_clf",
                                        SVC(kernel="rbf", gamma=5, C=0.001))))
    rbf_kernel_svm_clf.fit(X, y)
    """
    LinearSVC比SVC快得多(ker nel =“linear”)),特别是如果训练集非常大或者它有很多特征。
    如果训练集不太大,则应该尝试高斯RBF内核;它在大多数情况下运作良好。
    
    """

if False:
    from sklearn.svm import LinearSVR
    """
    epsilon -> street width
    C large regularization small
    
    """
    svm_reg = LinearSVR(epsilon=1.5)
    svm_reg.fit(X, y)
    """
    SVR类是SVC类的回归等价物,LinearSVR类是LinearSVC类的回归等价物。 
    LinearSVR类与训练集的大小成线性关系(就像LinearSVC类一样),而当训练集变大时SVR类变得太慢(就像SVC类一样)
    """
    from sklearn.svm import SVR

    svm_poly_reg = SVR(kernel="poly", degree=2, C=100, epsilon=0.1)
    svm_poly_reg.fit(X, y)
from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import MaxAbsScaler, PolynomialFeatures
from sklearn.svm import LinearSVR
from tpot.builtins import StackingEstimator
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: -3.633119693298434
exported_pipeline = make_pipeline(
    make_union(FunctionTransformer(copy), FunctionTransformer(copy)),
    SelectPercentile(score_func=f_regression, percentile=89), MaxAbsScaler(),
    PolynomialFeatures(degree=2, include_bias=False, interaction_only=False),
    LinearSVR(C=1.0,
              dual=True,
              epsilon=1.0,
              loss="epsilon_insensitive",
              tol=0.0001))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #29
                                             min_child_weight=3,
                                             n_estimators=50,
                                             n_jobs=1,
                                             objective="reg:squarederror",
                                             subsample=0.9500000000000001,
                                             verbosity=0)), MinMaxScaler(),
    StackingEstimator(estimator=SGDRegressor(alpha=0.01,
                                             eta0=0.01,
                                             fit_intercept=False,
                                             l1_ratio=0.0,
                                             learning_rate="constant",
                                             loss="huber",
                                             penalty="elasticnet",
                                             power_t=0.0)),
    StackingEstimator(estimator=LinearSVR(
        C=25.0, dual=True, epsilon=0.1, loss="epsilon_insensitive",
        tol=0.0001)), FeatureAgglomeration(affinity="l2", linkage="average"),
    SelectPercentile(score_func=f_regression, percentile=6),
    StackingEstimator(estimator=ExtraTreesRegressor(bootstrap=False,
                                                    max_features=0.8,
                                                    min_samples_leaf=19,
                                                    min_samples_split=10,
                                                    n_estimators=400)),
    ZeroCount(), FeatureAgglomeration(affinity="l2", linkage="complete"),
    StackingEstimator(estimator=RidgeCV()), RidgeCV())
exported_pipeline.fit(X, y)

print(r2_score(y, exported_pipeline.predict(X)))

_model = open("Tpot_bestmodel.pkl", "wb")
pickle.dump(exported_pipeline, _model)
Example #30
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

models = [
    LinearRegression(),
    Ridge(),  # http://www.cnblogs.com/pinard/p/6023000.html
    Lasso(
        alpha=0.01, max_iter=10000
    ),  # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html
    RandomForestRegressor(
    ),  # https://scikit-learn.org/dev/modules/generated/sklearn.ensemble.RandomForestRegressor.html
    GradientBoostingRegressor(
    ),  # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html
    SVR(
    ),  # https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html#sklearn.svm.SVR
    LinearSVR(
    ),  # https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVR.html
    ElasticNet(
        alpha=0.001, max_iter=10000
    ),  # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html
    SGDRegressor(
        max_iter=10000, tol=1e-3
    ),  # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDRegressor.html
    BayesianRidge(),
    KernelRidge(
        alpha=0.6, kernel='polynomial', degree=2, coef0=2.5
    ),  # https://scikit-learn.org/stable/modules/generated/sklearn.kernel_ridge.KernelRidge.html
    ExtraTreesRegressor(
    ),  # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesRegressor.html
    XGBRegressor(),
    AdaBoostRegressor(
        n_estimators=50