Example No. 1
from sklearn import linear_model

def helper(X_train, Y_train):
    # `alpha` is expected to be defined in the enclosing scope of this snippet
    reg = linear_model.ElasticNet(alpha=alpha)
    reg.fit(X_train, Y_train)
    return reg.predict  # return the bound predict method of the fitted model
Example No. 2
def model_selector(model):
    # Ye good ol ugly if-elif switch to choose the model
    if model == "linear":
        regr = linear_model.LinearRegression(n_jobs=-1)

    elif model == "lasso":
        regr = linear_model.Lasso(random_state=17)

    elif model == "elasticnet" or model == "elastic":
        regr = linear_model.ElasticNet(random_state=17)

    elif model == "bayesian":
        regr = linear_model.BayesianRidge()

    elif model == "decision tree regressor" or model == "dtr":
        regr = tree.DecisionTreeRegressor(max_depth=8, min_samples_leaf=17, random_state=17)

    elif model == "tweedie regressor 0" or model == "normal distribution":
        regr = linear_model.TweedieRegressor(power=0)

    elif model == "tweedie regressor 1" or model == "poisson distribution":
        regr = linear_model.TweedieRegressor(power=1)

    elif model == "extra trees regressor" or model == "etr":
        regr = ensemble.ExtraTreesRegressor(max_depth=8, min_samples_leaf=17, random_state=17)

    elif model == "random forest regressor" or model == "rfr":
        regr = ensemble.RandomForestRegressor(n_estimators=500, oob_score=True, random_state=17, n_jobs=-1)

    elif model == "adaboost extra trees" or model == "boost et":
        regr = AdaBoostRegressor(ensemble.ExtraTreesRegressor(max_depth=8, min_samples_leaf=17, random_state=17),
                                 n_estimators=500, random_state=17)

    elif model == "k neighbours" or model == "k neighbor":
        regr = neighbors.KNeighborsRegressor(n_jobs=-1)

    elif model == "gradient boosting regressor" or model == "gbr":
        regr = ensemble.GradientBoostingRegressor(random_state=17)

    elif model == "voting":
        clf1 = linear_model.LinearRegression(n_jobs=-1)
        clf2 = ensemble.RandomForestRegressor(max_depth=8, min_samples_leaf=17, random_state=17, n_jobs=-1)
        clf3 = ensemble.GradientBoostingRegressor(random_state=17)
        regr = ensemble.VotingRegressor(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], n_jobs=-1)

    elif model == "logistic":
        regr = linear_model.LogisticRegression(max_iter=250, random_state=17, n_jobs=-1)

    elif model == "gaussian":
        regr = GaussianNB()

    elif model == "decision tree classifier" or model == "dtc":
        regr = tree.DecisionTreeClassifier(max_depth=8, min_samples_leaf=17, random_state=17)

    elif model == "extra tree classifier" or model == "etc":
        regr = ensemble.ExtraTreesClassifier(max_depth=8, min_samples_leaf=17, random_state=17)

    elif model == "random forest classifier" or model == "rfc":
        regr = ensemble.RandomForestClassifier(max_depth=8, min_samples_leaf=17, random_state=17)

    elif model == "linear svc":
        regr = svm.LinearSVC(random_state=17)

    elif model == "k neighbour classifier" or model == "k neighbor classifier":
        regr = neighbors.KNeighborsClassifier(n_jobs=-1, n_neighbors=2)

    elif model == "svc":
        regr = svm.SVC(kernel="rbf", probability=True, random_state=17)

    return regr
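The "ugly if-elif switch" above can also be expressed as a dictionary of factories. The sketch below is not from the original source and only covers a few of the entries, purely to illustrate the pattern; the function and registry names are made up.

from sklearn import linear_model, ensemble

# Hypothetical registry-based alternative to the if-elif switch (illustrative only)
MODEL_FACTORIES = {
    "linear": lambda: linear_model.LinearRegression(n_jobs=-1),
    "lasso": lambda: linear_model.Lasso(random_state=17),
    "elasticnet": lambda: linear_model.ElasticNet(random_state=17),
    "rfr": lambda: ensemble.RandomForestRegressor(n_estimators=500, oob_score=True,
                                                  random_state=17, n_jobs=-1),
}
# aliases point at the same factory
MODEL_FACTORIES["elastic"] = MODEL_FACTORIES["elasticnet"]
MODEL_FACTORIES["random forest regressor"] = MODEL_FACTORIES["rfr"]

def model_selector_from_registry(model):
    try:
        return MODEL_FACTORIES[model]()
    except KeyError:
        raise ValueError("Unknown model: {!r}".format(model))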
Example No. 3
# Generate sample data
n_samples_train, n_samples_test, n_features = 75, 150, 500
np.random.seed(0)
coef = np.random.randn(n_features)
coef[50:] = 0.0  # only the first 50 features affect the target
X = np.random.randn(n_samples_train + n_samples_test, n_features)
y = np.dot(X, coef)

# Split train and test data
X_train, X_test = X[:n_samples_train], X[n_samples_train:]
y_train, y_test = y[:n_samples_train], y[n_samples_train:]

###############################################################################
# Compute train and test errors
alphas = np.logspace(-5, 1, 60)
enet = linear_model.ElasticNet(l1_ratio=0.7)
train_errors = list()
test_errors = list()
for alpha in alphas:
    enet.set_params(alpha=alpha)
    enet.fit(X_train, y_train)
    train_errors.append(enet.score(X_train, y_train))
    test_errors.append(enet.score(X_test, y_test))

i_alpha_optim = np.argmax(test_errors)
alpha_optim = alphas[i_alpha_optim]
print("Optimal regularization parameter : %s" % alpha_optim)

# Estimate the coef_ on full data with optimal regularization parameter
enet.set_params(alpha=alpha_optim)
coef_ = enet.fit(X, y).coef_
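The snippet stops after refitting on the full data. A minimal sketch (not part of the original) of how the collected scores could be plotted against alpha, assuming matplotlib is available:

import matplotlib.pyplot as plt

# train_errors/test_errors above actually hold R^2 scores returned by .score()
plt.semilogx(alphas, train_errors, label="Train")
plt.semilogx(alphas, test_errors, label="Test")
plt.axvline(alpha_optim, color="k", linestyle="--", label="Optimum on test")
plt.xlabel("alpha")
plt.ylabel("R^2 score")
plt.legend(loc="lower left")
plt.show()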
Example No. 4
def lr_model(df):
    # dist_lambda=lambda x: distance.euclidean(x,(-122.3380,47.6075))
    # location=list(zip(df["longitude"],df["latitude"]))
    # distances=list(map(dist_lambda,location))
    # # distance from Seattle art museum
    # df['distances']=distances
    df = df[np.abs(df['price'] - df['price'].mean()) <= (3 *
                                                         df['price'].std())]

    numerical_columns = [
        "host_response_rate", "host_listings_count", "latitude", "longitude",
        "accommodates", "bathrooms", "bedrooms", "beds", "square_feet",
        "guests_included", "extra_people", "minimum_nights", "maximum_nights",
        "availability_30", "availability_60", "availability_90",
        "availability_365", "number_of_reviews", "review_scores_rating",
        "review_scores_accuracy", "review_scores_cleanliness",
        "review_scores_checkin", "review_scores_communication",
        "review_scores_location", "review_scores_value",
        "calculated_host_listings_count", "reviews_per_month", "distances"
    ]
    categorical_columns = [
        "host_is_superhost", "host_neighbourhood", "host_verifications",
        "host_has_profile_pic", "host_identity_verified", "neighbourhood",
        "neighbourhood_cleansed", "neighbourhood_group_cleansed",
        "is_location_exact", "property_type", "room_type", "calendar_updated",
        "has_availability", "requires_license", "instant_bookable",
        "cancellation_policy", "require_guest_profile_picture",
        "require_guest_phone_verification", "bed_type", "amenities"
    ]
    # std = StandardScaler()
    # data=df[numerical_columns].to_numpy()
    # std.fit(data)
    # df1 = pd.DataFrame(data)
    # df1.columns=numerical_columns
    # df2=df[categorical_columns]
    # df3=df['price']
    # df=pd.concat([df1,df2],axis=1,sort=False)
    # df['price']=df3
    # X = df[categorical_columns + numerical_columns]
    # y = df['price']
    X = df[categorical_columns + numerical_columns]
    y = df['price']
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.25,
                                                        random_state=1)

    categorical_pipe = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
        ('TruncatedSVD',
         TruncatedSVD(n_components=30, n_iter=7, random_state=42))
    ])
    numerical_pipe = Pipeline([('imputer',
                                SimpleImputer(missing_values=np.nan,
                                              strategy='median')),
                               ('outliers', RobustScaler()),
                               ('pca', decomposition.PCA(n_components=25))])
    preprocessing = ColumnTransformer([
        ('cat', categorical_pipe, categorical_columns),
        ('num', numerical_pipe, numerical_columns)
    ])

    slr = Pipeline([('preprocess', preprocessing),
                    ('classifier',
                     linear_model.ElasticNet(alpha=0.1, l1_ratio=0.5))])

    slr.fit(X_train, y_train)

    y_train_pred = slr.predict(X_train)
    y_test_pred = slr.predict(X_test)

    print("===testing accuracy===")
    # The mean squared error
    print('Mean squared error: %.2f' % mean_squared_error(y_test, y_test_pred))
    # The coefficient of determination: 1 is perfect prediction
    print('Coefficient of determination: %.2f' % r2_score(y_test, y_test_pred))

    fig, ax = plt.subplots()
    n = len(y_test)

    ax.scatter(y_test.values,
               y_test_pred,
               c='tab:blue',
               label='tab:blue',
               alpha=0.3,
               edgecolors='none')
    plt.xlabel('real price')
    plt.ylabel('predicted price')
    print("===training accuracy===")
    # The mean squared error
    print('Mean squared error: %.2f' %
          mean_squared_error(y_train, y_train_pred))
    # The coefficient of determination: 1 is perfect prediction
    print('Coefficient of determination: %.2f' %
          r2_score(y_train, y_train_pred))

    fig, ax = plt.subplots()
    n = len(y_train)

    ax.scatter(y_train.values,
               y_train_pred,
               c='tab:blue',
               label='tab:blue',
               alpha=0.3,
               edgecolors='none')
    plt.xlabel('real price')
    plt.ylabel('predicted price')
    plt.show()
    return slr
Example No. 5
def fucking_paul(stock, log, Kin, Din, save_max, max_len, bitchCunt,
                 tradeCost):
    arr = []
    buy = []
    sell = []
    diff = []
    perc = []
    cumld = []
    kar = []
    dar = []
    kar1 = []
    dar1 = []
    Kvl = np.zeros(2)
    Dvl = Kvl
    s1ar = []
    s2ar = []
    shortDiff = []
    cuml = 1.0
    mdd = 0
    position = 0
    stopLoss = False
    bull = 0
    shit = 0
    maxP = 0
    minP = 0

    for i, closeData in enumerate(stock):
        model = linear_model.ElasticNet(alpha=29.91)
        arr.append(closeData)
        trainX, trainY = create_dataset(arr, Din)
        if i > Kin:
            rarr = np.reshape(arr[-Din:], [1, -1])
            model.fit(trainX, trainY)
            p = model.predict(rarr)
            if ((p > closeData) and position == -1) or position == 0:
                buy.append(closeData * (1 + tradeCost))
                bull += 1
                position = 1
            elif ((p < closeData) and position == 1) or position == 0:
                sell.append(closeData * (1 - tradeCost))
                maxP = 0
                shit += 1
                position = -1
            if position == 1 and closeData > maxP:
                maxP = closeData
            elif position == -1 or position == 0:
                maxP = closeData
            if position == -1 and closeData < minP:
                minP = closeData
            elif position == 1 or position == 0:
                minP = closeData
            #DYNAMIC BITCHCUNT DISTANCE IN DEVELOPMENT
            #WILL BE BASED ON ANALYSIS OF VARIANCE, AND CORRELATION WITH ENVIRONMENTAL VOLATILITY
            if (closeData < (maxP * (1 - bitchCunt)) and position == 1):
                sell.append(closeData * (1 - tradeCost))
                maxP = 0
                shit += 1
                position = -1
            if (closeData > (minP * (1 + bitchCunt)) and position == -1):
                buy.append(closeData * (1 + tradeCost))
                shit += 1
                position = 1
    if position == 1:
        sell.append(stock[len(stock) - 1])
        shit += 1
    for i in range(bull):
        diff.append(sell[i] - buy[i])
        if i < bull - 1:
            shortDiff.append(sell[i] - buy[i + 1])
    for i in range(bull):
        perc.append(diff[i] / buy[i])
    for i in range(bull - 1):
        perc[i] += shortDiff[i] / sell[i]
    for i in range(bull):
        cumld.append(cuml)
        cuml += cuml * perc[i]

    for i in range(len(cumld)):
        if i > 1:
            peak = max(cumld[:i])
            trough = min(cumld[i:])
            dd = (peak - trough) / peak
            if dd > mdd:
                mdd = dd

    print("tik:", log, "cuml:", cuml)

    if cuml > save_max and len(perc) <= max_len:
        write_that_shit(log, Kin, Din, perc, cuml, mdd, bitchCunt)
        print(
            f'\tYEEEEEE BOIIIIS: Kin: {Kin} Din: {Din} BitchCunt: {bitchCunt} MDD = {mdd} LEN = {len(perc)} CUML = {cuml}'
        )
        # DONT F*****G MOVE/INDENT WRITE_THAT_SHIT!!!!
        # if mdd < 0.5:
        #     plot(cumld)
        # plot2(s1ar, s2ar)
    return cuml
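The loop above relies on a create_dataset helper that is not shown in the snippet. A hypothetical sketch consistent with how it is called (windows of length Din as features, the following close as the target):

import numpy as np

def create_dataset(series, look_back):
    # Hypothetical reconstruction, not taken from the original source:
    # sliding windows of `look_back` closes as X, the next close as y.
    X, y = [], []
    for j in range(len(series) - look_back):
        X.append(series[j:j + look_back])
        y.append(series[j + look_back])
    return np.array(X), np.array(y)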
Example No. 6
alphas = [
    0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10, 20, 50, 100, 200, 500, 1000
]
myML.plotML.PlotParam_Score(X_train,
                            X_test,
                            Y_train,
                            Y_test,
                            "linear_model.Lasso()",
                            logX=True,
                            alpha=alphas)

# ---ElasticNet
data = myML.DataPre.load_datasets(mode="diabetes")
X_train, X_test, Y_train, Y_test = myML.DataPre.train_test_split(
    data.data, data.target, test_size=0.25, random_state=0)
from sklearn import linear_model
regr = linear_model.ElasticNet().fit(X_train, Y_train)
myML.LinearModel.showModelTest(regr, X_test, Y_test)
# test alpha and rhos
alphas = np.logspace(-1, 1)
rhos = np.linspace(0.01, 1)
myML.plotML.PlotParam_Score(X_train,
                            X_test,
                            Y_train,
                            Y_test,
                            "linear_model.ElasticNet()",
                            drawParam=2,
                            plot3D=True,
                            alpha=alphas,
                            l1_ratio=rhos)

# ---Logistic regression
# Generate sample data
n_samples_train, n_samples_test, n_features = 75, 150, 500
np.random.seed(0)
coef = np.random.randn(n_features)
coef[50:] = 0.0  # only the first 50 features affect the target
X = np.random.randn(n_samples_train + n_samples_test, n_features)
y = np.dot(X, coef)

# Split train and test data
X_train, X_test = X[:n_samples_train], X[n_samples_train:]
y_train, y_test = y[:n_samples_train], y[n_samples_train:]

# #############################################################################
# Compute train and test errors
alphas = np.logspace(-5, 1, 60)
enet = linear_model.ElasticNet(l1_ratio=0.7, max_iter=10000)
train_errors = list()
test_errors = list()
for alpha in alphas:
    enet.set_params(alpha=alpha)
    enet.fit(X_train, y_train)
    train_errors.append(enet.score(X_train, y_train))
    test_errors.append(enet.score(X_test, y_test))

i_alpha_optim = np.argmax(test_errors)
alpha_optim = alphas[i_alpha_optim]
print("Optimal regularization parameter : %s" % alpha_optim)

# Estimate the coef_ on full data with optimal regularization parameter
enet.set_params(alpha=alpha_optim)
coef_ = enet.fit(X, y).coef_
Example No. 8
warnings.filterwarnings('ignore')

save_file = False
enable_stacking = True
enable_add_result = False
enable_lstm = False
lstm_file = 'result_Dec_16_11-06-09_LSTM_batched_1.89311.csv'
data_file = ["data/new_mixed_machine{}.csv".format(i) for i in range(1, 7)]
dl_norm = [DataFrameMix(data_file[i]) for i in range(6)]
# dl_norm = [DataLoader(g, DataConfig('norm')) for g in range(6)]

svr = svm.SVR(C=200, gamma=0.001)
regr = lm.Ridge(alpha=1300.0)
lsr = lm.Lasso(alpha=0.035)
enr = lm.ElasticNet(alpha=0.45, l1_ratio=0.5)
krr = kernel_ridge.KernelRidge(kernel='polynomial')
gbr = ensemble.GradientBoostingRegressor(loss='huber',
                                         max_features='sqrt',
                                         n_estimators=400)
rfr = ensemble.RandomForestRegressor(n_estimators=90)
xgbr = xgb.XGBRegressor(booster='gbtree',
                        gamma=0.001,
                        max_depth=3,
                        min_child_weight=2,
                        n_estimators=100)
xgblr = xgb.XGBRegressor(booster='gblinear', n_estimators=300, gamma=0.0001)
lgbr = lgb.LGBMRegressor(num_leaves=3,
                         min_data_in_leaf=11,
                         max_bin=55,
                         learning_rate=0.05,
Example No. 9
            lowest_test_rmse_lasso = test_rmse_lasso
            bestmask_lasso = masks[i]
            bestalpha_lasso = alpha
    print('Alpha = ' + str(alpha) + ' is done!')
print('Combination is: ' + str(bestmask_lasso))
print('Lowest test RMSE with Lasso Regularizer is ' +
      str(lowest_test_rmse_lasso) + ', alpha = ' + str(bestalpha_lasso))
print('-' * 20)

lambda1_list = [1e-4, 1e-3, 1e-2]
lambda2_list = [1e-2, 1e-1, 1, 1e1, 1e2, 1e3]
lowest_test_rmse_elasticnet = 1
for lambda1 in lambda1_list:
    for lambda2 in lambda2_list:
        elasticnet = linear_model.ElasticNet(alpha=(lambda1 + lambda2),
                                             l1_ratio=lambda1 /
                                             (lambda1 + lambda2))
        for i in range(32):
            enc = pre.OneHotEncoder(categorical_features=masks[i])
            data_enc_noshuffle = enc.fit_transform(data_noshuffle)
            if True in masks[i]:
                data_enc_noshuffle = data_enc_noshuffle.toarray().astype(int)
            else:
                data_enc_noshuffle = data_enc_noshuffle.astype(int)
            data_enc_noshuffle_folds = kfold(data_enc_noshuffle)
            _, test_rmse_elasticnet = cv(data_enc_noshuffle_folds,
                                         target_noshuffle_folds, elasticnet)
            if test_rmse_elasticnet < lowest_test_rmse_elasticnet:
                lowest_test_rmse_elasticnet = test_rmse_elasticnet
                bestmask_elasticnet = masks[i]
                bestlambda1 = lambda1
Example No. 10

print("=====================================")
print("== Scaler + Elastic-net regression ==")
print("=====================================")

alphas = [.0001, .001, .01, .1, 1, 10, 100, 1000] 
l1_ratio = [.1, .5, .9]

print("----------------------------")
print("-- Parallelize outer loop --")
print("----------------------------")

enet = Pipeline([
    ('standardscaler', preprocessing.StandardScaler()),
    ('enet', lm.ElasticNet(max_iter=10000)),
])
param_grid = {'enet__alpha':alphas ,
              'enet__l1_ratio':l1_ratio}
enet_cv = GridSearchCV(enet, cv=5,  param_grid=param_grid)
%time scores = cross_val_score(estimator=enet_cv, X=X, y=y, cv=5, n_jobs=-1)
print("Test r2:%.2f" % scores.mean())

print("-----------------------------------------------")
print("-- Parallelize outer loop + built-in CV      --")
print("-- Remark: scaler is only done on outer loop --")
print("-----------------------------------------------")

enet_cv = Pipeline([
    ('standardscaler', preprocessing.StandardScaler()),
    ('enet', lm.ElasticNetCV(max_iter=10000, l1_ratio=l1_ratio, alphas=alphas)),
Example No. 11
def createHealthForecasterModels():
    import pickle
    ## Aggregate relevant data for ML:
    data = createDataTable()
    fixedFactors = [
        'Age', 'Sex', 'Urban', 'ENT', 'OBGYN', 'Old_age_midLife_syndrome',
        'alcohol_poisoning', 'dermatological', 'digestive', 'endocrine',
        'heart', 'hematological', 'infectious_parasitic', 'injury',
        'muscular_rheumatological', 'neurological', 'noDiagnosis', 'noReport',
        'other', 'pyschiatric', 'respiratory', 'sexualDysfunction', 'tumor',
        'unknown', 'urinary', 'High_BP', 'Diabetes', 'Heart_attack',
        'Internal_bleeding', 'Pregnant', 'Height'
    ]
    fixedFactorIdxs = [
        list(data.columns).index(varName) for varName in fixedFactors
    ]

    lifestyleFactors = [
        'Smoker', 'Cups_water_daily', 'Alcohol_frequency', 'Weight', 'Kcal',
        'Carbs', 'Fat', 'Protein', 'Activity_level', 'Daily_screen_time',
        'Hours_of_sleep'
    ]
    lifestyleFactorIdxs = [
        list(data.columns).index(varName) for varName in lifestyleFactors
    ]
    responseVariables = [
        'Insulin', 'Triglycerides', 'HDL_C', 'LDL_C', 'Urea', 'Uric_acid',
        'APO_A', 'Lipoprotein_A', 'High_sensitivity_CRP', 'Creatinine',
        'APO_B', 'Mg', 'Ferritin', 'Hemoglobin', 'White_blood_cell',
        'Red_blood_cell', 'Platelet', 'Glucose_field', 'HbA1c',
        'Total_protein', 'Albumin', 'Glucose', 'Total_cholestorol',
        'Alanine_AT', 'Transferrin', 'Transferrin_receptor', 'Systol',
        'Diastol'
    ]

    responseVariableIdxs = [
        list(data.columns).index(varName) for varName in responseVariables
    ]

    fatRelatedIdxs = [
        responseVariables.index('APO_A'),
        responseVariables.index('Lipoprotein_A'),
        responseVariables.index('HDL_C'),
        responseVariables.index('LDL_C'),
        responseVariables.index('APO_B'),
        responseVariables.index('Triglycerides'),
        responseVariables.index('Total_cholestorol')
    ]
    gluRelatedIdxs = [
        responseVariables.index('Insulin'),
        responseVariables.index('HbA1c'),
        responseVariables.index('Glucose')
    ]

    inputFeatures = fixedFactors + lifestyleFactors
    X = data[inputFeatures].to_numpy()
    Y = data[responseVariables].to_numpy()

    # Y_zscore = (Y-np.mean(Y,axis=0))/np.std(Y,axis=0)

    # X_Train, X_Test, Y_Train, Y_Test, cv = shuffleAndSplit(X, Y, test_size=.2, n_splits=5)
    # X_Train, X_Test, Y_Train_zscore, Y_Test_zscore, cv = shuffleAndSplit(X, Y_zscore, test_size=.2, n_splits=5)

    ## Create a second model to predict weight:

    # fixedFactors2 = ['age', 'sex', 'urban', 'ENT', 'OBGYN', 'Old_age_midLife_syndrome', 'alcohol_poisoning',
    #                 'dermatological', 'digestive', 'endocrine', 'heart', 'hematological', 'infectious_parasitic', 'injury',
    #                 'muscular_rheumatological', 'neurological', 'noDiagnosis', 'noReport', 'other', 'pyschiatric', 'respiratory',
    #                 'sexualDysfunction', 'tumor', 'unknown', 'urinary', 'highBP', 'diabetes', 'heart_attack', 'internal_bleeding',
    #                 'pregnant','height']
    # fixedFactorIdxs2 = [list(data.columns).index(varName) for varName in fixedFactors]

    # lifestyleFactors2 = ['smoker', 'cups_water_daily', 'Alcohol_frequency', 'kcal', 'carbo', 'fat', 'protn', 'Activity_level', 'Daily_screen_time', 'Hours_of_sleep']
    # lifestyleFactorIdxs2 = [list(data.columns).index(varName) for varName in lifestyleFactors]
    # responseVariables2 = ['weight']
    # responseVariableIdxs2 = [list(data.columns).index(varName) for varName in responseVariables2]

    # inputFeatures2 = fixedFactors2+lifestyleFactors2

    # X2 = data[fixedFactors2 + lifestyleFactors2].to_numpy()
    # Y2 = data[responseVariables2].to_numpy()

    # X_Train2, X_Test2, Y_Train2, Y_Test2, cv = shuffleAndSplit(X2, Y2, test_size=.2, n_splits=5)

    models = dict(
        ols=linear_model.LinearRegression(),
        lasso=linear_model.Lasso(alpha=0.75),
        ridge=linear_model.Ridge(alpha=0.75),
        elastic=linear_model.ElasticNet(alpha=0.1, l1_ratio=0.75),
        randomForest=ensemble.RandomForestRegressor(
            random_state=0,
            max_features='auto',
            min_samples_leaf=50,  #max_depth = 3,
            n_estimators=200))

    # Also define models to predict z_score Target Matrix
    # models_zscore = dict(ols=linear_model.LinearRegression(),
    #              lasso=linear_model.Lasso(alpha=.5),
    #              ridge=linear_model.Ridge(alpha=.5),
    #              elastic=linear_model.ElasticNet(alpha=.5, l1_ratio=0.5),
    #              randomForest = ensemble.RandomForestRegressor(random_state=0,
    #                                                           max_features = 'auto',
    #                                                           min_samples_leaf = 10,
    #                                                           n_estimators = 200)

    # weightModel = dict(ols=linear_model.LinearRegression(),
    #             lasso=linear_model.Lasso(alpha=.5),
    #             ridge=linear_model.Ridge(alpha=.5),
    #             elastic=linear_model.ElasticNet(alpha=.5, l1_ratio=0.5),
    #             randomForest = ensemble.RandomForestRegressor(random_state=0,
    #                                                            max_features = 'auto',
    #                                                            min_samples_leaf = 10,
    #                                                            n_estimators = 200))
    # print('Training trainedWeightBPModels')
    # trainedWeightModels = {}
    # for name, mdl in weightModel.items():
    #     print('Training ' + str(name) + '...')
    #     trainedWeightModels.update({name : mdl.fit(X2,Y2.ravel())})
    # print('finished')

    # Train models
    print('Training trainedModels')
    trainedModels = {}
    for name, mdl in models.items():
        print('Training ' + str(name) + '...')
        trainedModels.update({name: mdl.fit(X, Y)})
    print('finished')
    # pickle.dump([trainedModels, trainedWeightModels, inputFeatures, responseVariables, inputFeatures2, responseVariables2], open("models.p", "wb"))
    pickle.dump([trainedModels, inputFeatures, responseVariables, data],
                open("models.p", "wb"))

    # return trainedModels, trainedWeightModels, inputFeatures, responseVariables, inputFeatures2, responseVariables2
    return trainedModels, inputFeatures, responseVariables
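A minimal usage sketch (not from the original source) for the pickle written above, assuming createHealthForecasterModels() has already run in the same directory:

import pickle

with open("models.p", "rb") as f:
    trainedModels, inputFeatures, responseVariables, data = pickle.load(f)

# Predict every response variable for the first subject with the elastic-net model
x0 = data[inputFeatures].to_numpy()[:1]
prediction = trainedModels["elastic"].predict(x0)[0]
print(dict(zip(responseVariables, prediction)))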
Example No. 12
dict_log_alpha["svm"] = 1e-4
dict_log_alpha["svr"] = np.array([1e-2, 1e-2])
# Set models to be tested
models = {}
models["lasso"] = Lasso(estimator=None)
models["enet"] = ElasticNet(estimator=None)
models["wLasso"] = WeightedLasso(estimator=None)
models["logreg"] = SparseLogreg(estimator=None)
models["svm"] = SVM(estimator=None)
models["svr"] = SVR(estimator=None)

custom_models = {}
custom_models["lasso"] = Lasso(estimator=celer.Lasso(
    warm_start=True, fit_intercept=False))
custom_models["enet"] = ElasticNet(
    estimator=linear_model.ElasticNet(warm_start=True, fit_intercept=False))
custom_models["logreg"] = SparseLogreg(
    estimator=celer.LogisticRegression(warm_start=True, fit_intercept=False))

# Compute "ground truth" with cvxpylayer
dict_cvxpy_func = {
    'lasso': lasso_cvxpy,
    'enet': enet_cvxpy,
    'wLasso': weighted_lasso_cvxpy,
    'logreg': logreg_cvxpy,
    'svm': svm_cvxpy,
    'svr': svr_cvxpy
    }

dict_vals_cvxpy = {}
dict_grads_cvxpy = {}
Example No. 13
    ycols = ['PRICE']
    xcols = list(set(df.columns) - set(ycols))
    X = df.loc[:, xcols].values
    y = np.ravel(df.loc[:, ycols].values)

    # specify cross-validation
    k = 10  # number of folds
    cvsplitter = ms.KFold(n_splits=k, shuffle=True,
                          random_state=0)  # cross-validation splitter

    # specify all models
    models = list()
    models.append(('LR', sl.LinearRegression()))
    models.append(('RIDGE', sl.Ridge()))
    models.append(('LASSO', sl.Lasso()))
    models.append(('EN', sl.ElasticNet()))
    models.append(('KNN', neighbors.KNeighborsRegressor()))
    models.append(('CART', tree.DecisionTreeRegressor()))
    models.append(('SVM', svm.SVR()))

    # fit and compute scores
    scoring = 'neg_mean_squared_error'
    algs = list()
    scores = list()
    for entry in models:
        score = -1 * ms.cross_val_score(
            entry[1], X, y, cv=cvsplitter, scoring=scoring)
        scores.append(score)
        algs.append(entry[0])
        #print('{0} - {1:.4f} - {2:.4f}'.format(entry[0], np.mean(score), np.std(score, ddof = 1)))
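A sketch (not in the original snippet) of how the collected fold scores could be compared visually, assuming matplotlib is available in the surrounding script:

    import matplotlib.pyplot as plt

    fig, ax = plt.subplots()
    ax.boxplot(scores, labels=algs)
    ax.set_ylabel('MSE ({}-fold CV)'.format(k))
    plt.show()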
Example No. 14
def get_model_obj(modelType, n_clusters=None, **kwargs):
    if modelType == 'knn':
        from sklearn.neighbors import KNeighborsClassifier
        # 6 seems to give the best trade-off between accuracy and precision
        knn = KNeighborsClassifier(n_neighbors=6, **kwargs)
        return knn
    elif modelType == 'gaussianNB':
        from sklearn.naive_bayes import GaussianNB
        gnb = GaussianNB(**kwargs)
        return gnb

    elif modelType == 'multinomialNB':
        from sklearn.naive_bayes import MultinomialNB
        # TODO: figure out how to configure binomial distribution
        mnb = MultinomialNB(**kwargs)
        return mnb

    elif modelType == 'bernoulliNB':
        from sklearn.naive_bayes import BernoulliNB
        bnb = BernoulliNB(**kwargs)
        return bnb

    elif modelType == 'randomForest':
        from sklearn.ensemble import RandomForestClassifier
        rfc = RandomForestClassifier(random_state=234, **kwargs)
        return rfc

    elif modelType == 'svm':
        from sklearn.svm import SVC
        svc = SVC(random_state=0, probability=True, **kwargs)
        return svc

    elif modelType == 'LinearRegression':
        #assert column, "Column name required for building a linear model"
        #assert dataframe[column].shape == target.shape
        from sklearn import linear_model
        l_reg = linear_model.LinearRegression(**kwargs)
        return l_reg

    elif modelType == 'RidgeRegression':
        from sklearn.linear_model import Ridge
        if not kwargs:
            kwargs = {'alpha': 0.5}
        ridge_reg = Ridge(**kwargs)
        return ridge_reg

    elif modelType == 'RidgeRegressionCV':
        from sklearn import linear_model
        if not kwargs:
            kwargs = {'alphas': [0.1, 1.0, 10.0]}
        ridge_cv_reg = linear_model.RidgeCV(**kwargs)
        return ridge_cv_reg

    elif modelType == 'LassoRegression':
        from sklearn import linear_model
        if not kwargs:
            kwargs = {'alpha': 0.1}
        lasso_reg = linear_model.Lasso(**kwargs)
        return lasso_reg

    elif modelType == 'ElasticNetRegression':
        from sklearn.metrics import r2_score
        from sklearn import linear_model
        if not kwargs:
            kwargs = {'alpha': 0.1, 'l1_ratio': 0.7}
        enet_reg = linear_model.ElasticNet(**kwargs)
        return enet_reg

    elif modelType == 'LogisticRegression':
        from sklearn.linear_model import LogisticRegression
        log_reg = LogisticRegression(random_state=123, **kwargs)
        return log_reg

    elif modelType == 'RANSACRegression':
        from sklearn.linear_model import LinearRegression, RANSACRegressor
        ransac_model = RANSACRegressor(LinearRegression())
        return ransac_model

    elif modelType == 'kde':
        from sklearn.neighbors.kde import KernelDensity
        kde = KernelDensity(kernel='gaussian', bandwidth=0.2, **kwargs)
        return kde

    elif modelType == 'AR':
        import statsmodels.api as sm
        # fit an AR model and forecast
        ar_fitted = sm.tsa.AR(dataframe).fit(maxlag=9,
                                             method='mle',
                                             disp=-1,
                                             **kwargs)
        #ts_forecast = ar_fitted.predict(start='2008', end='2050')
        return ar_fitted

    elif modelType == 'SARIMAX':
        mod = sm.tsa.statespace.SARIMAX(df.riders,
                                        trend='n',
                                        order=(0, 1, 0),
                                        seasonal_order=(1, 1, 1, 12),
                                        **kwargs)
        return mod

    elif modelType == 'sgd':
        # Online classifiers http://scikit-learn.org/stable/auto_examples/linear_model/plot_sgd_comparison.html
        from sklearn.linear_model import SGDClassifier
        sgd = SGDClassifier(**kwargs)
        return sgd

    elif modelType == 'perceptron':
        from sklearn.linear_model import Perceptron
        perceptron = Perceptron(**kwargs)
        return perceptron

    elif modelType == 'xgboost':
        import xgboost as xgb
        xgbm = xgb.XGBClassifier(**kwargs)
        return xgbm

    elif modelType == 'baseNN':
        from keras.models import Sequential
        from keras.layers import Dense
        # create model
        model = Sequential()
        assert args.get('inputParams', None)
        assert args.get('outputParams', None)
        model.add(Dense(inputParams))
        model.add(Dense(outputParams))
        if args.get('compileParams'):
            # Compile model
            model.compile(
                compileParams
            )  # loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model

    elif modelType == 'lightGBMRegression':
        from pylightgbm.models import GBMRegressor
        lgbm_lreg = GBMRegressor(num_iterations=100,
                                 early_stopping_round=10,
                                 num_leaves=10,
                                 min_data_in_leaf=10)
        return lgbm_lreg

    elif modelType == 'lightGBMBinaryClass':
        from pylightgbm.models import GBMClassifier
        lgbm_bc = GBMClassifier(metric='binary_error', min_data_in_leaf=1)
        return lgbm_bc

    # Clustering models
    elif modelType == 'KMeans':
        assert n_clusters, "Number of clusters argument mandatory"
        cluster_callable = KMeans
        # seed of 10 for reproducibility.
        clusterer = cluster_callable(n_clusters=n_clusters, random_state=10)
        return clusterer

    elif modelType == 'dbscan':
        if not n_clusters:
            logging.warn(
                "Number of clusters irrelevant for cluster type : %s" %
                (modelType))
        cluster_callable = DBSCAN
        clusterer = cluster_callable(eps=0.5)
        return clusterer

    elif modelType == 'affinity_prop':
        if not n_clusters:
            logging.warn(
                "Number of clusters irrelevant for cluster type : %s" %
                (modelType))
        clusterer = AffinityPropagation(damping=.9, preference=-200)
        return clusterer
    elif modelType == 'spectral':
        assert n_clusters, "Number of clusters argument mandatory"
        clusterer = SpectralClustering(n_clusters=n_clusters,
                                       eigen_solver='arpack',
                                       affinity="nearest_neighbors")
        return clusterer
    elif modelType == 'birch':
        if not n_clusters:
            logging.warn(
                "Number of clusters irrelevant for cluster type : %s" %
                (modelType))
        clusterer = Birch(n_clusters=2)
        return clusterer

    elif modelType == 'agglomerativeCluster':
        # connectivity matrix for structured Ward
        connectivity = kneighbors_graph(dataframe,
                                        n_neighbors=10,
                                        include_self=False)
        # make connectivity symmetric
        connectivity = 0.5 * (connectivity + connectivity.T)
        clusterer = AgglomerativeClustering(n_clusters=cluster,
                                            linkage='ward',
                                            connectivity=connectivity)
        return clusterer

    elif modelType == 'meanShift':
        # estimate bandwidth for mean shift
        bandwidth = cluster.estimate_bandwidth(dataframe, quantile=0.3)
        clusterer = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
        return clusterer

    elif modelType == 'gmm':
        from sklearn import mixture
        gmm = mixture.GaussianMixture(n_components=5, covariance_type='full')
        return gmm

    elif modelType == 'dgmm':
        from sklearn import mixture
        dgmm = mixture.BayesianGaussianMixture(n_components=5,
                                               covariance_type='full')
        return dgmm

    else:
        raise ValueError('Unknown model type: see utils.py for available')
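A minimal usage sketch for get_model_obj (not from the original source; the data below is synthetic and the hyperparameters are arbitrary):

import numpy as np

X_demo = np.random.rand(100, 4)
y_demo = X_demo @ np.array([1.0, -2.0, 0.5, 0.0]) + 0.1 * np.random.rand(100)

enet = get_model_obj('ElasticNetRegression', alpha=0.05, l1_ratio=0.5)
enet.fit(X_demo, y_demo)
print(enet.coef_)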
Example No. 15
        # SVM
        regression(svm.SVR(kernel="rbf")),
        regression(svm.NuSVR(kernel="rbf")),
        classification_binary(svm.SVC(kernel="rbf", **SVC_PARAMS)),
        classification_binary(svm.SVC(kernel="linear", **SVC_PARAMS)),
        classification_binary(svm.SVC(kernel="poly", degree=2, **SVC_PARAMS)),
        classification_binary(svm.SVC(kernel="sigmoid", **SVC_PARAMS)),
        classification_binary(svm.NuSVC(kernel="rbf", **SVC_PARAMS)),
        classification(svm.SVC(kernel="rbf", **SVC_PARAMS)),
        classification(svm.NuSVC(kernel="rbf", **SVC_PARAMS)),

        # Linear Regression
        regression(linear_model.LinearRegression()),
        regression(linear_model.HuberRegressor()),
        regression(linear_model.ElasticNet(random_state=RANDOM_SEED)),
        regression(linear_model.ElasticNetCV(random_state=RANDOM_SEED)),
        regression(linear_model.TheilSenRegressor(random_state=RANDOM_SEED)),
        regression(linear_model.Lars()),
        regression(linear_model.LarsCV()),
        regression(linear_model.Lasso(random_state=RANDOM_SEED)),
        regression(linear_model.LassoCV(random_state=RANDOM_SEED)),
        regression(linear_model.LassoLars()),
        regression(linear_model.LassoLarsCV()),
        regression(linear_model.LassoLarsIC()),
        regression(linear_model.OrthogonalMatchingPursuit()),
        regression(linear_model.OrthogonalMatchingPursuitCV()),
        regression(linear_model.Ridge(random_state=RANDOM_SEED)),
        regression(linear_model.RidgeCV()),
        regression(linear_model.BayesianRidge()),
        regression(linear_model.ARDRegression()),
Example No. 16
# In[89]:

# ridge regression,
rreg = linear_model.Ridge()
rreg.fit(X, Y)
Y_p_r = rreg.predict(X_t)
print("Ridge Regression")
print("Mean squared error: %.2f" % mean_squared_error(Y_t, Y_p_r))
print("Mean absolute error: %.2f" % mean_absolute_error(Y_t, Y_p_r))

# In[90]:

# lasso regression
lareg = linear_model.Lasso()
lareg.fit(X, Y)
Y_p_l = lareg.predict(X_t)
print("Lasso Regression")
print("Mean squared error: %.2f" % mean_squared_error(Y_t, Y_p_l))
print("Mean absolute error: %.2f" % mean_absolute_error(Y_t, Y_p_l))

# In[93]:

# elasticnet regression
enreg = linear_model.ElasticNet()
enreg.fit(X, Y)
Y_p_en = enreg.predict(X_t)
print("ElasticNet Regression")
print("Mean squared error: %.2f" % mean_squared_error(Y_t, Y_p_en))
print("Mean absolute error: %.2f" % mean_absolute_error(Y_t, Y_p_en))
Example No. 17
        train(algos, X_train, y_train)
        errorList = errors(algos, X_test, y_test)
        g = 0
        for algo in algos:
            results[g, b] = errorList[algo]
            g = g + 1
        b = b + 1
    finalResult = {}
    g = 0
    for algo in algos:
        finalResult[algo] = np.mean(results[g, :])
        g = g + 1
    return finalResult

algos = {
    'elasticNet': linear_model.ElasticNet(alpha=0.9, l1_ratio=.1)
}

data = pullAndMerge('SP500', 'TWEXB', 'CPIHOSNS', 'A191RL1Q225SBEA', 'PSAVERT', 'T10YFF', 'DGS10', 'BAMLH0A0HYM2', 'T10Y2Y', 'FEDFUNDS', 'USROA', 'USROE', 'USSTHPI', 'STLFSI', 'NCBCMDPMVCE', 'MPRIME', 'CILACBQ158SBOG', 'INTDSRUSM193N', 'TERMCBAUTO48NS', 'TOTLL', 'BAMLH0A0HYM2EY', 'BAMLC0A0CM', 'RU3000VTR', 'TOTBKCR')

print(data.head())

dates = data['DATE']
num = data['SP500']

volatility = getFinalVolt(data['SP500'], regDepth=200, smoothDepth=200)[400:]

indicators = data.drop(['DATE', 'SP500'], axis=1).to_numpy()[400:]  # .as_matrix() was removed in recent pandas

X, y = setLag(indicators, volatility, lag=90)
Example No. 18
    async def ensemble_score(self, target, model_type, websocket, app_id, user_id, exclude_list=None):
        await websocket.send_text(json.dumps({"type": "message", "data": "Importing different models", "percent": 5 }))
        all_estimators = {
            'categorical': {
          
                "gradient_boost_c": {
                    "name": "Gradient Boosting Classifier",
                    "model": ensemble.GradientBoostingClassifier(),
                    "cost_fn": ['Accuracy', 'AUC', 'Recall', 'Precision', 'Cohen Kappa']
                },
                "rf_c": {
                    "name": "RandomForest Classifier",
                    "model": ensemble.RandomForestClassifier(),
                    "cost_fn": ['Accuracy', 'AUC', 'Recall', 'Precision', 'Cohen Kappa']
                },
                "dt_c": {
                    "name": "Decision Tree Classifier",
                    "model": tree.DecisionTreeClassifier(),
                    "cost_fn": ['Accuracy', 'AUC', 'Recall', 'Precision', 'Cohen Kappa']
                },
                "ada_c": {
                    "name": "Ada Boosting",
                    "model": ensemble.AdaBoostClassifier(),
                    "cost_fn": ['Accuracy', 'AUC', 'Recall', 'Precision', 'Cohen Kappa']
                },
                "lda_c": {
                    "name": "Linear Discriminant Analysis",
                    "model": discriminant_analysis.LinearDiscriminantAnalysis(),
                    "cost_fn": ['Accuracy', 'AUC', 'Recall', 'Precision', 'Cohen Kappa']
                },
                "ridge_c": {
                    "name": "Ridge Classifier",
                    "model": linear_model.RidgeClassifier(),
                    "cost_fn": ['Accuracy', 'AUC', 'Recall', 'Precision', 'Cohen Kappa']
                },
                "logistic_c": {
                    "name": "Logistic Regression",
                    "model": linear_model.LogisticRegression(solver='lbfgs',class_weight='balanced', max_iter=10000),
                    "cost_fn": ['Accuracy', 'AUC', 'Recall', 'Precision', 'Cohen Kappa']
                },
                "knn_c": {
                    "name": "K Neighbours Classifier",
                    "model": neighbors.KNeighborsClassifier(),
                    "cost_fn": ['Accuracy', 'AUC', 'Recall', 'Precision', 'Cohen Kappa']
                },
                "qda_c": {
                    "name": "Quadratic Discriminant Analysis",
                    "model": discriminant_analysis.QuadraticDiscriminantAnalysis(),
                    "cost_fn": ['Accuracy', 'AUC', 'Recall', 'Precision', 'Cohen Kappa']
                }

            },
            'numerical': {
                "linear_r": {
                    "name": "Linear Regression",
                    "model": linear_model.LinearRegression(),
                    "cost_fn": ['MAE', 'MSE', 'RMSE', 'R2', 'RMSLE', 'MAPE']
                },
                "lasso_r": {
                    "name": "Lasso Regression",
                    "model": linear_model.Lasso(),
                    "cost_fn": ['MAE', 'MSE', 'RMSE', 'R2', 'RMSLE', 'MAPE']
                },
                "ridge_r": {
                    "name": "Ridge Regression",
                    "model": linear_model.Ridge(),
                    "cost_fn": ['MAE', 'MSE', 'RMSE', 'R2', 'RMSLE', 'MAPE']
                },
                "enet_r": {
                    "name": "Elastic Net",
                    "model": linear_model.ElasticNet(),
                    "cost_fn": ['MAE', 'MSE', 'RMSE', 'R2', 'RMSLE', 'MAPE']
                },
                "least_angle_r": {
                    "name": "Least Angle Regression",
                    "model": linear_model.Lars(),
                    "cost_fn": ['MAE', 'MSE', 'RMSE', 'R2', 'RMSLE', 'MAPE']
                },
                "lasso_least_angle_r": {
                    "name": "Lasso Least Angle Regression",
                    "model": linear_model.LassoLars(),
                    "cost_fn": ['MAE', 'MSE', 'RMSE', 'R2', 'RMSLE', 'MAPE']
                },
            
                "gradient_boost_r": {
                    "name": "Gradient Boosting Regression",
                    "model": ensemble.GradientBoostingRegressor(),
                    "cost_fn": ['MAE', 'MSE', 'RMSE', 'R2', 'RMSLE', 'MAPE']
                },
                "rf_r": {
                    "name": "RandomForest Regression",
                    "model": ensemble.RandomForestRegressor(),
                    "cost_fn": ['MAE', 'MSE', 'RMSE', 'R2', 'RMSLE', 'MAPE']
                },
                "dt_r": {
                    "name": "Decision Tree Regression",
                    "model": tree.DecisionTreeRegressor(),
                    "cost_fn": ['MAE', 'MSE', 'RMSE', 'R2', 'RMSLE', 'MAPE']
                },
                "ada_r": {
                    "name": "Ada Boosting Regression",
                    "model": ensemble.AdaBoostRegressor(),
                    "cost_fn": ['MAE', 'MSE', 'RMSE', 'R2', 'RMSLE', 'MAPE']
                },
        
                "svm_linear_r": {
                    "name": "SVM - Linear Kernel",
                    "model": svm.SVR(),
                    "cost_fn": ['MAE', 'MSE', 'RMSE', 'R2', 'RMSLE', 'MAPE']
                },
                "knn_r": {
                    "name": "K Neighbours Classifier",
                    "model": neighbors.KNeighborsRegressor(),
                    "cost_fn": ['MAE', 'MSE', 'RMSE', 'R2', 'RMSLE', 'MAPE']
                },
                "naive_r": {
                    "name": "Naive Bayes",
                    "model": linear_model.BayesianRidge(),
                    "cost_fn": ['MAE', 'MSE', 'RMSE', 'R2', 'RMSLE', 'MAPE']
                }
            }
        }

        await websocket.send_text(json.dumps({"type": "message", "data": model_type, "percent": 6 }))
        
        if exclude_list is None:
            exclude_list = []  # avoid a mutable default argument accumulating across calls
        exclude_list.append(target)
        await websocket.send_text(json.dumps({"type": "message", "data": 'Selecting Target', "percent": 7 }, cls=NpEncoder)) 

        for col in self.columns:
            await websocket.send_text(json.dumps({"type": "message", "data": 'Detecting dates..', "percent": 8 }, cls=NpEncoder)) 
            if self[col].dtype == 'object':
                try:
                    self[col] = pd.to_datetime(self[col])
                    await websocket.send_text(json.dumps({"type": "message", "data": 'Convert objects to dates..', "percent": 9 }, cls=NpEncoder)) 
                except ValueError:
                    pass


        # Exclude all columns which are given in list
        X = self[self.columns.difference(exclude_list)]
        features = X.columns        

        await websocket.send_text(json.dumps({"type": "message", "data": 'Selecting Features', "percent": 10  }, cls=NpEncoder))

        if model_type=='categorical':
            y = self[target].astype('int')
            estimators = all_estimators[model_type]
            await websocket.send_text(json.dumps({"type": "message", "data": 'Classification models initiating', "percent": 11  })) 

        else:
            y = self[target].astype('float')
            estimators = all_estimators[model_type]
            await websocket.send_text(json.dumps({"type": "message", "data": 'Regression models initiating', "percent": 11 })) 

        counter = 10
        # skf = model_selection.StratifiedKFold(n_splits=2, random_state=None)
        await websocket.send_text(json.dumps({"type": "message", "data": 'Cross validation initiated', "percent": counter })) 

        model_summary = []
        for index, model in enumerate(estimators.keys()):

            logger.info(estimators[model]['name'] + ' estimator initiating')
            score_auc = np.empty((0,0))
            score_recall = np.empty((0,0))
            score_acc = np.empty((0,0))
            score_precision = np.empty((0,0))
            score_kappa = np.empty((0,0))
            score_f1 = np.empty((0,0))
            score_mcc = np.empty((0,0))
            score_mae = np.empty((0,0))
            score_mse = np.empty((0,0))
            score_rmse = np.empty((0,0))
            score_r2 = np.empty((0,0))
            score_rmsle = np.empty((0,0))
            score_mape = np.empty((0,0))
     

            start = time.time()
            # # for train_index, test_index in skf.split(X, y):
            counter = counter + index
            # X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            # y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, train_size=.3) 

            await websocket.send_text(json.dumps({"type": "message", "data":  estimators[model]['name'] + ' estimator initiating', "percent": counter })) 
            clf = estimators[model]['model']
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)

            logger.info(estimators[model]['name'] + ' estimator completed')

            if model_type == 'categorical':

                # score_auc = np.append(score_auc, metrics.roc_auc_score(y_pred, y_test, multi_class="ovo"))
                score_acc = np.append(score_acc, metrics.accuracy_score(y_pred, y_test))
                score_recall = np.append(score_recall, metrics.recall_score(y_pred, y_test, average=None))
                score_precision = np.append(score_precision, metrics.precision_score(y_pred, y_test, average=None))
                score_kappa = np.append(score_kappa, metrics.cohen_kappa_score(y_pred, y_test))
                score_mcc = np.append(score_mcc, metrics.matthews_corrcoef(y_pred, y_test))
                score_f1 = np.append(score_f1, metrics.f1_score(y_pred, y_test, average='micro'))

            if model_type == 'numerical':
                score_acc = np.append(score_acc, metrics.explained_variance_score(y_test, y_pred))
                score_mae = np.append(score_mae, metrics.mean_absolute_error(y_test, y_pred))
                score_mse = np.append(score_mse, metrics.mean_squared_error(y_test, y_pred))
                score_rmse = np.append(score_rmse, metrics.mean_squared_error(y_test, y_pred, squared=False))
                score_r2 = np.append(score_r2, metrics.r2_score(y_test, y_pred))
                score_rmsle = np.append(score_rmsle, np.sqrt(np.mean(
                    np.square(np.log1p(y_test.ravel() - y_test.ravel().min() + 1) - np.log1p(y_pred.ravel() - y_pred.ravel().min() + 1)))))
                # print("RMSE", y_test)
                # print("RMSE", y_test.shape)
                # print("RMSE2", y_pred.shape)
                # print("Predicted", y_pred)
            
            
            await websocket.send_text(json.dumps({"type": "message", "data": estimators[model]['name'] + ' accuracy', "percent": counter })) 
            end = time.time()

            summary = {}
            for col in features:
                summary[col] = {
                    "min": X[col].min(),
                    "max": X[col].max(),
                    "default": X[col].iloc[0],
                    "dtype": str(X[col].dtype)
                }


            if model_type == 'categorical':
                temp = {
                    "name": estimators[model]['name'],
                    "type": model_type,
                    "scores": score_acc,
                    "accuracy": score_acc.mean(),
                    "time_elasped": (end - start),
                    # "auc": score_auc.mean(),
                    'f1': score_f1.mean(),
                    "recall": score_recall.mean(),
                    "precision": score_precision.mean(),
                    "kappa": score_kappa.mean(),
                    "features": summary,
                    "target": target
                }
                
                # app_id, model_id, name, score)
                await websocket.send_text(json.dumps({"type": "message", "data": estimators[model]['name'] + ' saving into cloud', "percent": counter }, cls=NpEncoder)) 
                model_id, model_path = await self.save_model_to_s3(clf, app_id, user_id, estimators[model]['name'], score_acc.mean(), json.dumps(temp, cls=NpEncoder))
                temp['model_id'] = model_id
                model_summary.append(temp)

            else:
                temp = {
                    "name": estimators[model]['name'],
                    "type": model_type,
                    "scores": score_acc,
                    "accuracy": score_acc.mean(),
                    "time_elasped": (end - start),
                    "mae": score_mae.mean(),
                    "mse": score_mse.mean(),
                    "rmse": score_rmse.mean(),
                    "rmsle": score_rmsle.mean(),
                    "r2": score_r2.mean(),
                    "features": summary,
                    "target": target
                }

                await websocket.send_text(json.dumps({"type": "message", "data": estimators[model]['name'] + ' saving into cloud', "percent": counter }, cls=NpEncoder)) 
                model_id, model_path = await self.save_model_to_s3(clf, app_id, user_id, estimators[model]['name'], score_acc.mean(), json.dumps(temp, cls=NpEncoder))
                temp['model_id'] = model_id
                model_summary.append(temp)
        return model_summary
Example No. 19
# Excluding outliers from data
train = train[train['visitors_pool_total'] < 3500]


# Extract validation data from training data
def create_validation_data(training_data):
    train_validation, test_validation = train_test_split(training_data,
                                                         test_size=0.3)
    return train_validation, test_validation


# Creating elastic net object (alpha = 0.5, l1_ratio = 0.8, max_iter=1000000,
# tol=0.0000001) and predictors
model = linear_model.ElasticNet(alpha=0.5,
                                l1_ratio=0.8,
                                fit_intercept=True,
                                normalize=False,
                                max_iter=1000000,
                                tol=0.0000001)
predictors = list(train.columns.values)
predictors.remove('date')
predictors.remove('visitors_pool_total')

# Calculating error (RMSE) for the model
rmse_group = list()
model_counter = 0
for iter in xrange(CROSS_VALIDATION_ITER):
    model_counter += 1
    train_validation, test_validation = create_validation_data(train)
    model.fit(train_validation[predictors],
              train_validation['visitors_pool_total'])
Example No. 20
image_paths = lookup.values()
mr = get_images_df(file_paths=image_paths, mask=standard_mask)
image_ids = [int(os.path.basename(x).split(".")[0]) for x in image_paths]
mr.index = image_ids

# What we can do is generate a predicted image for a particular set of concepts (e.g., for a left-out image)
# by simply multiplying the concept vector by the regression parameters at each voxel. Then we can use the
# Mitchell trick of asking whether two left-out images can be accurately classified by matching them with
# the two predicted images.

regression_params = pandas.DataFrame(0, index=mr.columns, columns=concepts)

print "Training voxels..."
for voxel in mr.columns:
    train = mr.index
    Y = mr.loc[train, voxel].tolist()
    Xtrain = X.loc[train, :]
    # Use regularized regression
    clf = linear_model.ElasticNet(alpha=0.1)
    clf.fit(Xtrain, Y)
    regression_params.loc[voxel, :] = clf.coef_.tolist()

regression_params.to_csv(output_file, sep="\t")

# GENERATE BRAIN IMAGES FOR REGRESSION PARAMS
image_folder = "%s/regression_param_images" % (update)
if not os.path.exists(image_folder):
    os.mkdir(image_folder)

for concept in regression_params.columns.tolist():
    data = regression_params[concept].tolist()
    empty_nii = numpy.zeros(standard_mask.shape)
    empty_nii[standard_mask.get_data() != 0] = data
    nii = nibabel.Nifti1Image(empty_nii, affine=standard_mask.get_affine())
Example No. 21
def elasticRegression(train, trainLable, testData):
    clf = linear_model.ElasticNet(alpha=0.6, l1_ratio=0.5)
    clf.fit(train, trainLable)
    predict = clf.predict(testData)
    return predict
    y_train,
    cv=10,
    scoring=metrics.make_scorer(rmse))
crossValidate.get('fit_time').mean()
crossValidate.get('score_time').mean()
crossValidate.get('test_score').mean()
crossValidate.get('train_score').mean()

#Calculate the sum of the squared values of the weights (the L2 penalty).
#It shrinks the parameters, therefore it is mostly used to prevent multicollinearity.
#It reduces the model complexity by coefficient shrinkage.
ridge_estimator = linear_model.Ridge()
ridge_grid = {'alpha': [1.0], 'max_iter': [50]}
ridgeModel = fit_model(ridge_estimator, ridge_grid, X_train1, y_train)
coef = ridgeModel.coef_
intercept = ridgeModel.intercept_

#LASSO: Least Absolute Shrinkage Selector Operator
#Calculate the sum of the Absolute values of the weights, called L1.
#Nullifies low important features with 0 co-efficient
#It is generally used when we have a large number of features, because it automatically does feature selection.
lasso_estimator = linear_model.Lasso()
lasso_grid = {'alpha': [0.5, 0.9]}
lassoModel = fit_model(lasso_estimator, lasso_grid, X_train1, y_train)
coef = lassoModel.coef_
intercept = lassoModel.intercept_

#ElasticNet is a combination of Ridge & LASSO
enet_estimator = linear_model.ElasticNet()
enet_grid = {'alpha': [0.1], 'l1_ratio': [0.3]}
fit_model(enet_estimator, enet_grid, X_train1, y_train)
Example No. 23
# TODO: sample weights: 'class_weight' and 'sample_weight' can be used
''' RUNNING OPTIONS FOR MODELS '''
visualizeCGI = True
numAutoLabel = 0  # worse than label spreading?
folds = 5
scaler = preprocessing.StandardScaler(copy=False)
# scaler = preprocessing.MinMaxScaler(copy=False)

regressors = [
    dummy.DummyRegressor(),
    svm.LinearSVR(),
    svm.SVR(kernel='rbf', gamma='scale'),
    linear_model.Ridge(),
    linear_model.Lasso(),
    linear_model.ElasticNet(),
    ensemble.RandomForestRegressor(),
    ensemble.GradientBoostingRegressor(),
    ensemble.AdaBoostRegressor()
]

classifiers = [
    dummy.DummyClassifier(),
    svm.LinearSVC(),
    svm.SVC(kernel='rbf', gamma='scale'),
    neighbors.KNeighborsClassifier(),
    ensemble.RandomForestClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.AdaBoostClassifier()
]
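# A sketch (not part of the original snippet) of how these lists are typically consumed:
# scale the features and cross-validate each candidate regressor in turn.
from sklearn import model_selection, pipeline

def compare_regressors(X, y):
    for reg in regressors:
        pipe = pipeline.make_pipeline(preprocessing.StandardScaler(), reg)
        scores = model_selection.cross_val_score(pipe, X, y, cv=folds, scoring='r2')
        print(type(reg).__name__, round(scores.mean(), 3))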
Exemplo n.º 24
0
def elastic_net(data, y, x_val, y_val, i):
    from sklearn import linear_model
    reg = linear_model.ElasticNet(random_state=i)
    reg.fit(data, y)
    # reg.score predicts internally, so a separate predict call is not needed here
    return reg.score(x_val, y_val)  # R^2 on the validation set
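# A quick usage sketch (X_data/y_data are placeholders, not from the original snippet).
# Note that ElasticNet's random_state only has an effect when selection='random',
# so varying i mostly serves as a bookkeeping index.
from sklearn.model_selection import train_test_split
x_tr, x_va, y_tr, y_va = train_test_split(X_data, y_data, random_state=0)
print(elastic_net(x_tr, y_tr, x_va, y_va, i=0))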
Exemplo n.º 25
0
noise_sd = 10
X, y, coef = datasets.make_regression(n_samples=50,
                                      n_features=100,
                                      noise=noise_sd,
                                      n_informative=2,
                                      random_state=42,
                                      coef=True)

# Use this to tune the noise parameter such that snr < 5
print("SNR:", np.std(np.dot(X, coef)) / noise_sd)

# param grid over alpha & l1_ratio
param_grid = {'alpha': 10.**np.arange(-3, 3), 'l1_ratio': [.1, .5, .9]}

# Wrap ElasticNet in a grid search over alpha and l1_ratio
model = GridSearchCV(lm.ElasticNet(max_iter=10000), param_grid, cv=5)

# 1) Biased usage: fit on all the data, omitting the outer CV loop
model.fit(X, y)
print("Train r2:%.2f" % metrics.r2_score(y, model.predict(X)))
print(model.best_params_)

# 2) User-made outer CV, useful to extract fold-specific information
cv = KFold(n_splits=5, shuffle=True, random_state=42)  # modern API; the old KFold(n, n_folds=...) form is gone
y_test_pred = np.zeros(len(y))
y_train_pred = np.zeros(len(y))
alphas = list()

for train, test in cv.split(X):
    X_train, X_test = X[train, :], X[test, :]
    y_train, y_test = y[train], y[test]
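    # The loop body is cut off in this excerpt; a sketch of what it would typically do,
    # given the arrays initialized above: refit the grid search on the training fold,
    # store out-of-fold predictions, and record the selected alpha.
    model.fit(X_train, y_train)
    y_test_pred[test] = model.predict(X_test)
    y_train_pred[train] = model.predict(X_train)
    alphas.append(model.best_params_['alpha'])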
Exemplo n.º 26
0
        format='pkl')
    fifth_features = ngrams.load_data(
        '../datasets/grams_dict2002-2016/5grams_feature_relative.pkl',
        format='pkl')

    # List of models to run
    model_zoo = Counter()
    # model_zoo['OLS'] = linear_model.LinearRegression()
    model_zoo['OLS'] = linear_model.RidgeCV(
        np.array([10, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001]))
    model_zoo['PLS'] = cross_decomposition.PLSRegression(n_components=200)
    model_zoo['RF-30'] = RandomForestRegressor(max_features="sqrt",
                                               n_estimators=30)
    model_zoo['RF-log50'] = RandomForestRegressor(max_features="log2",
                                                  n_estimators=50)
    model_zoo['Elastic Net'] = linear_model.ElasticNet(alpha=0.1, l1_ratio=0.7)

    # choosing number of grams
    features_list = [
        uni_feature, bi_features, tri_features, forth_features, fifth_features
    ]

    # process and stack ngram features, make it numpy array if PLS will be used
    ngrams.stack_processed_features(train_data,
                                    test_data,
                                    judge_year_index,
                                    features_list,
                                    to_array=True)

    # Run various models
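    # The excerpt stops here; running the zoo would typically look something like the
    # sketch below (the train/test variable names are assumptions, not from the source):
    for name, estimator in model_zoo.items():
        estimator.fit(train_features, train_targets)
        print(name, estimator.score(test_features, test_targets))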
# keep only subjects above AGE (drop the youngsters) to build the "old" group
youngsters = np.where(y_t1 <= AGE)
y_old = np.delete(y_t1, youngsters)
data_old = np.delete(X, youngsters, axis=0)
data_old = np.delete(data_old, np.where(areas_notdeg2 == 1), axis=1)

# keep only subjects at or below AGE (drop the adults) to build the "young" group
adults = np.where(y_t1 > AGE)
y_young = np.delete(y_t1, adults)
data_young = np.delete(X, adults, axis=0)
data_young = np.delete(data_young, np.where(areas_notdeg2 == 1), axis=1)

#prepare a range of parameters to test
alphas = np.array([1])
l1_ratio = np.array([0.9])
#We just use ElasticNet's defaults here (no extra normalization); you could GridSearchCV a wider grid if you wanted
model = linear_model.ElasticNet()
grid = GridSearchCV(estimator=model,
                    param_grid=dict(alpha=alphas, l1_ratio=l1_ratio))
grid.fit(data_old, y_old)

# we want ~30 features, so override the grid-search result with a sparser setting
grid.best_estimator_.alpha = 0.01
grid.best_estimator_.l1_ratio = .9999
lm = linear_model.ElasticNet(alpha=grid.best_estimator_.alpha,
                             l1_ratio=grid.best_estimator_.l1_ratio)
# collect the fitted coefficients over NUM_REPS random train/test splits
error_old_tot = np.empty((data_old.shape[1], NUM_REPS))
for idx in range(NUM_REPS):
    X_train, X_test, y_train, y_test_old = train_test_split(data_old, y_old)
    lm.fit(X_train, y_train)
    error_old_tot[:, idx] = lm.coef_
# average coefficient per feature across the repetitions
mean_pred_old = np.nanmean(error_old_tot, axis=1)
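# A sketch (not in the original excerpt) of turning the averaged coefficients into the
# "30 features" mentioned above: rank features by mean coefficient magnitude.
nonzero_features = np.where(np.abs(mean_pred_old) > 0)[0]
top30 = np.argsort(np.abs(mean_pred_old))[::-1][:30]
print("nonzero features:", len(nonzero_features), "top-30 indices:", top30)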
Exemplo n.º 28
0
def calculate_spot_features(spot_id, all_input, all_params):
    bm_data, meta_data, db_path, NN_OUTPUT, scramble_seed = all_input
    alpha, radius, l1_ratio = all_params
    
    conn = sqlite3.connect(db_path)
    conn.text_factory = str

    c = conn.cursor()
    sel_cmd = ('SELECT {coi1} FROM {tn1} WHERE {coi2}=={0}').format(spot_id, 
           coi1='cell_id',coi2='spot_id', tn1='cells')
    
    c.execute(sel_cmd)
    results = c.fetchall()
    cell_id =  np.asarray([results[i][0] for i in xrange(len(results))])
    spot_bm = bm_data[cell_id - 1, :] # biomarker values of the spot            
    spot_meta = meta_data[cell_id -1, :] # x,y coordinates of each cell
    
    sel_cmd = ('SELECT {coi1} FROM {tn1} WHERE {coi2}=={0}').format(spot_id, 
           coi1='spot_name',coi2='spot_id', tn1='spots')
    
    c.execute(sel_cmd)
    results = c.fetchall()
    spot_name =  results[0][0]
    
    # get nearest neighbors
    spot_xy = spot_meta[:,:2]
    tree = KDTree(spot_xy)
    ind, dist = tree.query_radius(spot_xy, r = radius, return_distance = True)
    
    # if scramble
    if scramble_seed:
        np.random.seed(scramble_seed)
        spot_bm = spot_bm[np.random.permutation(np.arange(len(cell_id))),:]
        output_name = os.path.join(NN_OUTPUT,spot_name +  '_seed_' + str(scramble_seed) + '.mat')
    else:
        output_name = os.path.join(NN_OUTPUT,spot_name + '.mat')
    if os.path.isfile(output_name):
        return 1 
    #num_non_zeros_nn = [] # number of contributing neighbors
    entropies = [] # entropy
    angle_std = [] # std
    neighbor_id = [] # id of neighbor
    residuals = [] # residuals at each cells
    coeff_neighbors = [] # coefficient of neighbor each cell
    pmf_neighbors = []
    
    for j in xrange(spot_bm.shape[0]):
        clf = linear_model.ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
        
        cell_bm = spot_bm[j,:].reshape(1,-1) # the cell in question
        cell_xy = spot_xy[j,:]
        
        nb_bm = spot_bm[ind[j][dist[j]!=0], :] # eliminate itself
        nb_xy = spot_xy[ind[j][dist[j]!=0], :]
        
        curr_neigh_ind =   ind[j][dist[j]!=0]
        neighbor_id.append(curr_neigh_ind)      

        if np.min(nb_bm.shape) > 0:
            clf.fit(nb_bm.T,cell_bm.T)
            pred = clf.predict(nb_bm.T).reshape(1,-1)
            resid_biomarkers = np.square(cell_bm - pred)
            residuals.append(np.sqrt(resid_biomarkers.sum())/cell_bm.shape[1])
            coeff_neighbors.append(clf.coef_)
            
            nb_diff_xy = nb_xy - cell_xy
            nb_diff_xy = nb_diff_xy/np.linalg.norm(nb_diff_xy,axis = 1).reshape(-1,1) # normalize the vector
            
            if len(clf.coef_) >= 2: # and (np.linalg.norm(clf.coef_) > 0):
                nb_angles = [0]
                for k in xrange(len(clf.coef_)-1):
                    a = np.dot(nb_diff_xy[0,:],nb_diff_xy[k+1,:])
                    a = np.sign(a)*np.minimum(1,np.abs(a))
                    nb_angles.append(math.acos(a))
                nb_angles = [(nb_angles[i] + np.pi/100) % (2*np.pi) for i in xrange(len(nb_angles))]  # wrap into [0, 2*pi); the unparenthesized '% 2*np.pi' binds as '(x % 2)*pi'
                bin_vec = np.linspace(0, 2*np.pi, num=36)
                nb_bin = [(bin_vec -nb_angles[i] >=0).nonzero()[0][0] for i in range(len(nb_angles))]
                pk = np.zeros(36)
                for i in xrange(len(nb_angles)):
                    pk[nb_bin[i]] = pk[nb_bin[i]] + np.abs(clf.coef_[i])
                
                pmf_neighbors.append(np.array(pk))
                cell_entropy = scipy.stats.entropy(pk + 1e-6)
                if np.isinf(cell_entropy):
                    mat_dict={'passed': 0, 'spot_meta':spot_meta,'entropies':entropies,
                            'residuals':residuals,'angle_std':angle_std,'pmf':pmf_neighbors,
                            'coeff_neigh':coeff_neighbors, 'neighbor_id':neighbor_id
                           }
                    scipy.io.savemat(output_name, mat_dict)  # use output_name so scrambled runs keep their seed suffix
                    return 0
                entropies.append(cell_entropy)
                coef_ = np.abs(clf.coef_)/np.sum(np.abs(clf.coef_))
                cos_mean_resultant = np.sum([coef_[i]*math.cos(nb_angles[i]) for i in range(len(coef_))])
                sin_mean_resultant = np.sum([coef_[i]*math.sin(nb_angles[i]) for i in range(len(coef_))])
                mean_resultant = np.sqrt(cos_mean_resultant**2+sin_mean_resultant**2)
                cell_angle_std = np.sqrt(2*(1-mean_resultant))
                if np.isnan(cell_angle_std):
                    mat_dict={'passed': 0, 'spot_meta':spot_meta,'entropies':entropies,
                            'residuals':residuals,'angle_std':angle_std,'pmf':pmf_neighbors,
                            'coeff_neigh':coeff_neighbors, 'neighbor_id':neighbor_id
                           }
                    scipy.io.savemat(output_name, mat_dict)                    
                    return 0
                angle_std.append(cell_angle_std)
            else:
                pmf_neighbors.append([0])
                entropies.append(0)
                angle_std.append(0)
        else:
            #print spot_name, j, nb_bm.shape                   
            residuals.append(0)
            entropies.append(0)
            angle_std.append(0)
            pmf_neighbors.append([])
            coeff_neighbors.append([])
    
    mat_dict={'passed': 1, 'spot_meta':spot_meta,'entropies':entropies,
              'residuals':residuals,'angle_std':angle_std,'pmf':pmf_neighbors,
              'coeff_neigh':coeff_neighbors, 'neighbor_id':neighbor_id}
    scipy.io.savemat(output_name, mat_dict) 
    
    return 1
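# Usage sketch: the driver code is not part of this excerpt, so the inputs below are
# assumptions that only mirror how the two tuples are unpacked at the top of the function.
all_input = (bm_data, meta_data, db_path, NN_OUTPUT, scramble_seed)
all_params = (0.1, 50, 0.5)  # alpha, neighborhood radius (same units as spot_xy), l1_ratio
status = calculate_spot_features(spot_id=1, all_input=all_input, all_params=all_params)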
Exemplo n.º 29
0
    def __init__(
        self,
        method,
        yrange,
        params,
        i=0
    ):  #TODO: yrange doesn't currently do anything. Remove or do something with it!
        self.algorithm_list = [
            'PLS',
            'GP',
            'OLS',
            'OMP',
            'Lasso',
            'Elastic Net',
            'Ridge',
            'Bayesian Ridge',
            'ARD',
            'LARS',
            'LASSO LARS',
            'SVR',
            'KRR',
        ]
        self.method = method
        self.outliers = None
        self.ransac = False

        print(params)
        if self.method[i] == 'PLS':
            self.model = PLSRegression(**params[i])

        if self.method[i] == 'OLS':
            self.model = linear.LinearRegression(**params[i])

        if self.method[i] == 'OMP':
            # check whether to do CV or not
            self.do_cv = params[i]['CV']
            # create a temporary set of parameters
            params_temp = copy.copy(params[i])
            # Remove CV parameter
            params_temp.pop('CV')
            if self.do_cv is False:
                self.model = linear.OrthogonalMatchingPursuit(**params_temp)
            else:
                if 'precompute' in params[i]:
                    params_temp.pop('precompute')
                self.model = linear.OrthogonalMatchingPursuitCV(**params_temp)

        if self.method[i] == 'LASSO':
            # create a temporary set of parameters
            params_temp = copy.copy(params[i])
            # check whether to do CV or not
            try:
                self.do_cv = params[i]['CV']
                # Remove CV parameter
                params_temp.pop('CV')
            except:
                self.do_cv = False

            if self.do_cv is False:
                self.model = linear.Lasso(**params_temp)
            else:
                params_temp.pop('alpha')
                self.model = linear.LassoCV(**params_temp)

        if self.method[i] == 'Elastic Net':
            params_temp = copy.copy(params[i])
            try:
                self.do_cv = params[i]['CV']
                params_temp.pop('CV')
            except:
                self.do_cv = False

            if self.do_cv is False:
                self.model = linear.ElasticNet(**params_temp)
            else:
                params_temp['l1_ratio'] = [.1, .5, .7, .9, .95, .99, 1]
                self.model = linear.ElasticNetCV(**params_temp)

        if self.method[i] == 'Ridge':
            # create a temporary set of parameters
            params_temp = copy.copy(params[i])
            try:
                # check whether to do CV or not
                self.do_cv = params[i]['CV']

                # Remove CV parameter
                params_temp.pop('CV')
            except:
                self.do_cv = False

            if self.do_cv:
                self.model = linear.RidgeCV(**params_temp)
            else:
                self.model = linear.Ridge(**params_temp)

        if self.method[i] == 'BRR':
            self.model = linear.BayesianRidge(**params[i])

        if self.method[i] == 'ARD':
            self.model = linear.ARDRegression(**params[i])

        if self.method[i] == 'LARS':
            # create a temporary set of parameters
            params_temp = copy.copy(params[i])
            try:
                # check whether to do CV or not
                self.do_cv = params[i]['CV']

                # Remove CV parameter
                params_temp.pop('CV')
            except:
                self.do_cv = False

            if self.do_cv is False:
                self.model = linear.Lars(**params_temp)
            else:
                self.model = linear.LarsCV(**params_temp)

        if self.method[i] == 'LASSO LARS':
            model = params[i]['model']
            params_temp = copy.copy(params[i])
            params_temp.pop('model')

            if model == 0:
                self.model = linear.LassoLars(**params_temp)
            elif model == 1:
                self.model = linear.LassoLarsCV(**params_temp)
            elif model == 2:
                self.model = linear.LassoLarsIC(**params_temp)
            else:
                print("Something went wrong, \'model\' should be 0, 1, or 2")

        if self.method[i] == 'SVR':
            self.model = svm.SVR(**params[i])

        if self.method[i] == 'KRR':
            self.model = kernel_ridge.KernelRidge(**params[i])

        if self.method[i] == 'GP':
            # get the method for dimensionality reduction and the number of components
            self.reduce_dim = params[i]['reduce_dim']
            self.n_components = params[i]['n_components']
            # create a temporary set of parameters
            params_temp = copy.copy(params[i])
            # Remove parameters not accepted by Gaussian Process
            params_temp.pop('reduce_dim')
            params_temp.pop('n_components')
            self.model = GaussianProcess(**params_temp)
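# Usage sketch: the enclosing class name is not shown in this excerpt (it is called
# "regression" below purely for illustration). method and params are parallel lists,
# so an Elastic Net model could be built like this:
model = regression(method=['Elastic Net'],
                   yrange=[0, 100],
                   params=[{'alpha': 0.1, 'l1_ratio': 0.5, 'CV': False}])
model.model.fit(X_train, y_train)  # X_train / y_train are placeholders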
Exemplo n.º 30
0
# In[8]:

# features_df = df[['sqft_living','bathrooms', 'sqft_living15', 'grade', 'bedrooms', 'floors', 'waterfront', \
#                   'view', 'sqft_above', 'sqft_basement', 'sqft_lot15', 'lat', 'is_renovated_in_last_10_years']]
features_df = df.drop('price', axis=1)
features_df = SelectPercentile(percentile=75).fit(
    features_df, df.price).transform(features_df)
features_df = StandardScaler().fit(features_df).transform(features_df)
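# Note (not from the original cell): SelectPercentile defaults to f_classif, an ANOVA
# F-test meant for classification labels. Since price is continuous, f_regression is
# the usual choice; an alternative selection step might look like this
# (features_df_alt is a name introduced here for illustration):
from sklearn.feature_selection import f_regression
features_df_alt = SelectPercentile(f_regression, percentile=75).fit_transform(
    df.drop('price', axis=1), df.price)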

# In[9]:

x_train, x_test, y_train, y_test = train_test_split(features_df, df.price)

# In[15]:

linear_regr = linear_model.ElasticNet(
    alpha=0.001, max_iter=5000)  # RandomForestRegressor(n_estimators = 75) #
model = linear_regr.fit(x_train, y_train)
predictions = model.predict(x_test)

# In[17]:

plt.rcParams["figure.figsize"] = (15, 12)  # set before plotting so the new size takes effect
plt.scatter(y_test, predictions)
plt.show()

# In[19]:

print("Mean squared error: %.3f" % mean_squared_error(y_test, predictions))
print("Mean absolute error: %.3f" % mean_absolute_error(y_test, predictions))
print('Variance score: %.3f' % r2_score(y_test, predictions))