def calibrate(self, data, dt):
    '''
    Parameters
    ----------
    data : 1-D array
        Historical time series with equally spaced observations.
    dt : float
        Time step between two consecutive data points.
    '''
    # Fit the AR(1) relation data[t+1] = a * data[t] + b by linear regression.
    reg = linreg()
    reg.fit(data[:-1].reshape(-1, 1), data[1:])
    a = reg.coef_[0]
    b = reg.intercept_
    # One-step-ahead predictions, seeded with the first observation.
    predict = [data[0]]
    for i in range(1, len(data)):
        predict.append(a * predict[i - 1] + b)
    predict = np.array(predict)
    ssr = ((predict - data) ** 2).mean()
    self.parameters = {
        'long term mean': b / dt,
        'reversion speed': -np.log(a) / dt,
        'volatility': ssr * ((-2 * np.log(a) / dt / (1 - a ** 2)) ** .5)
    }
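# Hedged usage sketch: the class wrapping calibrate() is not shown above, so rather than guess
# its name this standalone snippet applies the same AR(1) regression recipe to a toy
# mean-reverting series (assumes numpy as np and LinearRegression as linreg, as elsewhere in
# these snippets). The seed, dt, and series below are illustrative only.
def _ou_calibration_demo():
    rng = np.random.default_rng(0)
    dt = 1 / 252
    x = np.empty(2000)
    x[0] = 1.0
    for t in range(1, len(x)):
        # toy Ornstein-Uhlenbeck-like update: pull toward 1.0 plus noise
        x[t] = x[t - 1] + 2.0 * (1.0 - x[t - 1]) * dt + 0.1 * np.sqrt(dt) * rng.standard_normal()
    reg = linreg().fit(x[:-1].reshape(-1, 1), x[1:])   # fit x[t+1] = a * x[t] + b
    a, b = reg.coef_[0], reg.intercept_
    print('slope a:', a, 'intercept b:', b)
    print('implied reversion speed -ln(a)/dt:', -np.log(a) / dt)

# _ou_calibration_demo()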
def fit_linear_model2(X, y, results, keys, num_cv=5, verbose=False, plot_results=False):
    X = pp.scale(X)
    clf = []
    R2 = []
    coef = []
    prob = []
    score = []
    group_keys = []
    # Now do cross-validation to estimate accuracy
    if num_cv > 1:
        kf = KFold(n_splits=num_cv)
        for train, test in kf.split(X):
            X_train, X_test = X[train], X[test]
            y_train, y_test = y[train], y[test]
            results_test, keys_test = results[test], keys[test]
            clf_temp2 = linreg(fit_intercept=False)
            clf_temp2.fit(X_train, y_train)
            pred = clf_temp2.predict(X_test)
            clf.append(clf_temp2)
            R2.append(clf_temp2.score(X_test, y_test))
            coef.append(clf_temp2.coef_)
            prob.append(diff_to_prob(pred))
            score.append(lossFx(results_test, pred))
            group_keys.append(keys_test)
    else:
        clf_temp2 = linreg(fit_intercept=False)
        clf_temp2.fit(X, y)
        pred = clf_temp2.predict(X)
        clf = clf_temp2
        R2 = clf_temp2.score(X, y)
        coef = clf_temp2.coef_
        prob = diff_to_prob(pred)
        score = lossFx(results, pred)
        group_keys = keys
    if num_cv > 1:
        return clf, R2, score, coef, prob, kf, group_keys
    else:
        return clf, R2, score, coef, prob, group_keys
def run_var_list(new_vars, loansData):
    "run fit and predict with a new variable list"
    train_df, train_y, test_df, test_y = load_data(loansData, new_vars)
    train_X, my_scaler = scale_train_data(train_df)
    test_X = scale_test_data(my_scaler, test_df)
    regr = linreg()
    regr.fit(train_X, train_y)
    sort_coefs(list(train_df.columns), regr.coef_, regr.intercept_)
    cross_validate(regr, train_X, train_y, cv=10, print_out=True)
    score = regr.score(train_X, train_y)
    print('Regression fit R^2 score %.4f' % score)
    pscore = regr.score(test_X, test_y)
    print('Regression predict R^2 score %.4f' % pscore)
def main():
    game_data = retrieve_mysql_data()
    db_cursor.close()
    nbadb.close()
    sum_correct = 0
    sum_total = 0
    sum_correct_outcome = 0
    veg_avgs = np.zeros(shape=20)
    avgs = np.zeros(shape=20)
    for i in range(0, 20):
        np.random.seed(i)
        np.random.shuffle(game_data)
        x_data = game_data[:, :-2]
        y_data = game_data[:, -2]
        vegas_data = game_data[:, -1]
        kf = KFold(n_splits=5, shuffle=False)
        for train_index, test_index in kf.split(x_data):
            train_x_raw, test_x_raw = x_data[train_index], x_data[test_index]
            train_y, test_y = y_data[train_index], y_data[test_index]
            X_train = standardize_add_bias(train_x_raw, train_x_raw)
            X_test = standardize_add_bias(test_x_raw, train_x_raw)
            reg = linreg().fit(X_train, train_y)
            y_exp = X_test.dot(reg.coef_) + reg.intercept_
            for pred, act, veg in zip(np.nditer(y_exp), np.nditer(test_y),
                                      np.nditer(vegas_data[test_index])):
                sum_total += 1
                if pred > veg and act > veg:
                    sum_correct += 1
                elif pred < veg and act < veg:
                    sum_correct += 1
                if pred > 0 and act > 0:
                    sum_correct_outcome += 1
                elif pred < 0 and act < 0:
                    sum_correct_outcome += 1
            veg_avgs[i] = np.mean(np.abs(vegas_data[test_index] - test_y))
            avgs[i] = np.mean(np.abs(np.round(y_exp * 2) / 2 - test_y))
    print("Average deviation from actual point spread: {:.3f}".format(np.mean(avgs)))
    print("Vegas deviation from actual point spread: {:.3f}".format(np.mean(veg_avgs)))
    print("Percentage of correct spread predictions: {}".format(sum_correct / sum_total))
    print("Percentage of correct game predictions: {}".format(sum_correct_outcome / sum_total))
def test_model(X, Y):
    _l = linreg()
    scores = []
    for i in range(0, 10):
        X_train, X_test, y_train, y_test = train_test_split(
            X,
            preprocessing.MinMaxScaler((0, 1)).fit_transform(Y.reshape(-1, 1)),
            test_size=0.2)
        model = _l.fit(X_train, y_train)
        predictions = model.predict(X_test)
        scores.append(model.score(X_test, y_test))
    print('Averaged model score over 10 iterations: ' + str(sum(scores) / 10))
def first_reversal(SessDict):
    rzone_early = slice(25 - 6, 32 - 6)
    rzone_late = slice(35 - 6, 42 - 6)
    RZONE_LICKS = {}
    slopes = np.zeros([len(SessDict.keys()), ])
    for m, (mouse, days) in enumerate(SessDict.items()):
        print(mouse)
        LR, LICKS, SPEED = [], [], []
        transition_trials = []
        early_rzone_licks = []
        for i, day in enumerate(days[:2]):
            for sess_ind, session in enumerate(day):
                sess = TwoPUtils.sess.Session(basedir_VR=basedir_VR, mouse=mouse,
                                              date=session['date'], scene=session['scene'],
                                              session=session['session'], VR_only=True,
                                              prompt_for_keys=False)
                sess.align_VR_to_2P()
                # get LR value for each trial
                lr_trial = get_LR_trial(sess)
                # make position-binned lick rates and speed
                sess.add_timeseries(licks=sess.vr_data['lick']._values,
                                    speed=sess.vr_data['dz']._values)
                sess.add_pos_binned_trial_matrix(('licks', 'speed'), 't', min_pos=6, max_pos=43,
                                                 bin_size=1, mat_only=True)
                licks_rz_early = sess.trial_matrices['licks'][:, rzone_early].mean(axis=-1)
                if i == 0 and sess_ind == 0:
                    baseline = np.mean(licks_rz_early[lr_trial == -1])
                else:
                    licks = licks_rz_early[lr_trial == -1] / baseline
                    licks[np.isnan(licks)] = 0
                    early_rzone_licks.append(licks)
        # f, ax = plt.subplots()
        early_rzone_licks = np.concatenate(early_rzone_licks)
        lr = linreg().fit(np.arange(40)[:, np.newaxis], early_rzone_licks[:40])
        slopes[m] = lr.coef_
        # ax.plot(early_rzone_licks)
        # ax.plot(sp.ndimage.filters.gaussian_filter1d(early_rzone_licks, 5))
        RZONE_LICKS[mouse] = early_rzone_licks
    return RZONE_LICKS, slopes
def main(): "main program" loansData = read_data() numeric_vars = get_numeric_vars() train_df, train_y, test_df, test_y = load_data(loansData, numeric_vars) print("train_df head\n", train_df[:3]) print("train_y head\n", train_y[:3]) plotdir = make_plotdir() # add scaling train_X, my_scaler = scale_train_data(train_df) test_X = scale_test_data(my_scaler, test_df) regr = linreg() regr.fit(train_X, train_y) # print('regr methods', dir(regr)) # print('columns', list(train_df.columns), 'Intercept') # print('coefs', regr.coef_, regr.intercept_) coefs = sort_coefs(list(train_df.columns), regr.coef_, regr.intercept_) fitpts = regr.predict(train_X) plot_predict_scatter(plotdir, "train", fitpts, train_y) cross_validate(regr, train_X, train_y, cv=10, print_out=True) score = regr.score(train_X, train_y) print('Regression fit R^2 score %.4f' % score) pred = regr.predict(test_X) # pscore = sum(np.array(test_y) == pred) # need np.tol.diff pscore = sum(np.abs(test_y - pred)) / len(test_y) print('Regression predict diff average %.4f' % pscore) # pscore = np.sqrt(sum( (test_y - pred)*(test_y - pred) )) pscore = regr.score(test_X, test_y) print('Regression predict R^2 score %.4f' % pscore) plot_predict_scatter(plotdir, "test", pred, test_y) # try fit with fewer top variables: 5, 4, 3, 2 for top in range(5, 1, -1): new_vars = get_top_vars(coefs, top) print('new_vars', new_vars) run_var_list(new_vars, loansData)
def std2kappa(std, Kbel, Kup):
    """The standard deviation transformer:
    Transform a given standard deviation value into the kappa value of the von Mises
    distribution. For this purpose, a linear-regression interpolation is done over a given
    kappa interval. An exception is raised if the coefficient of determination (R^2) between
    kappa and standard deviation is smaller than 0.99, or if the estimated kappa falls outside
    the interpolation interval. See also kappa_investigation.py.

    Parameters
    ----------
    std: float. The standard deviation value to be transformed into a kappa value.
    Kbel: float. The lower limit of the kappa interpolation interval.
    Kup: float. The upper limit of the kappa interpolation interval.

    Returns
    -------
    est: float. The kappa value for the given standard deviation,
        i.e. model.intercept_ + std * model.coef_.
    """
    # Distribution variable spanning -pi..+pi (0 in the middle), roughly 100 bins per radian.
    x = np.linspace(-np.pi, np.pi, num=int(100 * 2 * np.pi) + 1)
    # The kappa interval for interpolation, spanning from Kbel to Kup with 2001 bins in total.
    kInt = np.linspace(Kbel, Kup, 2001)
    distCom = []  # von Mises distributions for each kappa in kInt, normalized by total area
    stdCom = []   # standard deviations of the distributions in distCom, sqrt(sum(x**2 * y))
    for i in range(0, len(kInt)):
        distCom.append(1 / (2 * np.pi) * np.e ** (kInt[i] * np.cos(x - 0)))
        distCom[i] = distCom[i] / sum(distCom[i])
        stdCom.append(np.sqrt(sum(x ** 2 * distCom[i])))
    # Linear regression of kappa on the standard deviation (in degrees);
    # np.reshape(-1, 1) turns the array into the n x 1 matrix sklearn expects.
    stds_deg = np.asarray(np.rad2deg(stdCom)).reshape(-1, 1)
    model = linreg().fit(stds_deg, kInt)
    r2 = model.score(stds_deg, kInt)  # coefficient of determination R^2 of the fit
    if r2 < 0.99:  # make sure the fit quality is sufficient
        raise Exception("The fit is not good enough. R^2=%s" % r2)
    # model.intercept_ is the kappa for std=0 (extrapolation only, not to be taken seriously);
    # model.coef_ is the slope by which kappa changes per degree of std.
    est = float(model.intercept_ + std * model.coef_[0])
    if not (Kbel < est < Kup):  # make sure the estimated kappa lies inside the interpolation interval
        raise Exception("WARNING! The estimated Kappa is not in the given interval. "
                        "Interval=[%s,%s], estimated Kappa=%s" % (Kbel, Kup, est))
    return est
    # this function is useful to estimate the Kcent in colmod (colclass.py)!
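# Hedged usage sketch for std2kappa(): the std value below is illustrative only. It asks for
# the von Mises kappa matching a 65-degree standard deviation, interpolating kappa over
# [1, 1.5] (the same interval used in the thesis plot further below); numpy as np and
# LinearRegression as linreg are assumed to be imported, as in the other snippets.
kappa_est = std2kappa(65, 1, 1.5)
print(kappa_est)  # should lie strictly between 1 and 1.5, otherwise std2kappa raises an exception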
### ages and net_worths need to be reshaped into 2D numpy arrays
### second argument of reshape command is a tuple of integers: (n_rows, n_columns)
### by convention, n_rows is the number of data points
### and n_columns is the number of features
ages = numpy.reshape(numpy.array(ages), (len(ages), 1))
net_worths = numpy.reshape(numpy.array(net_worths), (len(net_worths), 1))

# sklearn.cross_validation was removed in newer scikit-learn versions; use model_selection
from sklearn.model_selection import train_test_split
ages_train, ages_test, net_worths_train, net_worths_test = train_test_split(
    ages, net_worths, test_size=0.1, random_state=42)

### fill in a regression here! Name the regression object reg so that
### the plotting code below works, and you can see what your regression looks like
from sklearn.linear_model import LinearRegression as linreg
reg = linreg()
reg.fit(ages_train, net_worths_train)
print("Slope: ", reg.coef_)
print("Score: ", reg.score(ages_test, net_worths_test))

try:
    plt.plot(ages, reg.predict(ages), color="blue")
except NameError:
    pass
plt.scatter(ages, net_worths)
plt.show()
def train(self):
    _l = linreg()
    self.model = _l.fit(self.X, self.Y)
y = data.iloc[:, -1].values

# split the dataset (train : test)
# library: sklearn, module: model_selection, class: train_test_split
from sklearn.model_selection import train_test_split as tts
x_train, x_test, y_train, y_test = tts(x, y, test_size=0.3, random_state=3)

# algorithm selection: linear regression
# library: sklearn, module: linear_model, class: LinearRegression
from sklearn.linear_model import LinearRegression as linreg
model_linreg = linreg()

# train the model
model_linreg.fit(x_train, y_train)

# test the model: predict outputs
y_pred = model_linreg.predict(x_test)

# checking accuracy (R^2 score on the test set)
accuracy = model_linreg.score(x_test, y_test)
print('Linear regression accuracy:', accuracy)

# visualisation: heatmap
import seaborn as sb
def dff(f, trial_starts, teleports, f_neu=None, neuropil_method=None,
        bleedthrough_ts=None, neu_bleedthrough_ts=None):
    '''
    Compute dF/F for an ROI fluorescence matrix, restricted to within-trial timepoints,
    with optional bleedthrough correction and neuropil correction.
    '''
    f_ = np.zeros(f.shape) * np.nan  # keep only the fluorescence on each trial
    if f_neu is not None:
        f_neu_ = np.zeros(f_neu.shape) * np.nan
    for i, (start, stop) in enumerate(zip(trial_starts.tolist(), teleports.tolist())):
        f_[:, start - 1:stop - 1] = f[:, start - 1:stop - 1]
        if f_neu is not None:
            f_neu_[:, start - 1:stop - 1] = f_neu[:, start - 1:stop - 1]

    # green channel bleedthrough correction: regress the green channel out of the red channel.
    # For each cell, predict red from green, subtract the prediction to get the residual,
    # and add back the intercept, so the red signal becomes residual + intercept.
    nanmask = ~np.isnan(f_[0, :])
    if bleedthrough_ts is not None:
        for cell in range(f_.shape[0]):
            lr = linreg().fit(bleedthrough_ts[cell:cell + 1, nanmask].T, f_[cell, nanmask])  # linear regression from scikit-learn
            f_[cell, nanmask] = f_[cell, nanmask] - lr.predict(bleedthrough_ts[cell:cell + 1, nanmask].T) + lr.intercept_
    if f_neu is not None and neu_bleedthrough_ts is not None:
        for cell in range(f_neu_.shape[0]):
            lr = linreg().fit(neu_bleedthrough_ts[cell:cell + 1, nanmask].T, f_neu_[cell, nanmask])
            f_neu_[cell, nanmask] = f_neu_[cell, nanmask] - lr.predict(neu_bleedthrough_ts[cell:cell + 1, nanmask].T) + lr.intercept_

    # once bleedthrough is subtracted, do neuropil correction
    if neuropil_method == 'subtract':
        f_ -= .7 * f_neu_
    elif neuropil_method == 'regress':
        raise NotImplementedError
    elif neuropil_method is None:
        pass

    # Calculate baseline
    flow = sp.ndimage.filters.gaussian_filter(f_[:, nanmask], [0., 15])  # cut out ITIs and smooth signal
    flow = sp.ndimage.filters.minimum_filter1d(flow, int(500 * 15))  # minimum filter, taking min val over 15 sec
    flow = sp.ndimage.filters.maximum_filter1d(flow, int(500 * 15))  # max filter with same window (dilation)

    # to get deltaF/F: subtract the baseline from the signal and divide by abs(baseline)
    # (the baseline can sometimes end up negative because of the regression)
    dff = np.zeros(f_.shape) * np.nan
    dff[:, nanmask] = (f_[:, nanmask] - flow) / np.abs(flow)

    # Smooth the deltaF/F transients by 2 time bins
    for i, (start, stop) in enumerate(zip(trial_starts.tolist(), teleports.tolist())):
        dff[:, start - 1:stop - 1] = sp.ndimage.filters.gaussian_filter1d(dff[:, start - 1:stop - 1], 2, axis=1)
    return dff
def dff_dual(F_red, Fneu_red, F_green, Fneu_green, trial_starts, teleport_starts,
             method_red='regress', method_green='regress'):
    '''
    Calculate dF/F for two channels, red and green, regressing green from red.

    inputs:
        F_red: ROI fluorescence for the red channel
        Fneu_red: neuropil fluorescence for the red channel
        F_green: ROI fluorescence for the green channel
        Fneu_green: neuropil fluorescence for the green channel
        trial_starts: timeseries of trial start indices
        teleport_starts: timeseries of teleport start indices
        method_red, method_green: 'regress' or 'subtract' - how to correct for neuropil
    outputs:
        dFF_red
        dFF_green
    '''
    F = np.zeros(F_red.shape) * np.nan     # red
    F2 = np.zeros(F_green.shape) * np.nan  # green
    # keep only the fluorescence on each trial
    Fneu = np.zeros(F_red.shape) * np.nan
    Fneu2 = np.zeros(F_green.shape) * np.nan
    for i, (start, stop) in enumerate(zip(trial_starts.tolist(), teleport_starts.tolist())):
        F[:, start - 1:stop - 1] = F_red[:, start - 1:stop - 1]
        F2[:, start - 1:stop - 1] = F_green[:, start - 1:stop - 1]
        Fneu[:, start - 1:stop - 1] = Fneu_red[:, start - 1:stop - 1]
        Fneu2[:, start - 1:stop - 1] = Fneu_green[:, start - 1:stop - 1]

    # green channel bleedthrough correction: regress the green channel out of the red channel.
    # For each cell, predict red from green, subtract the prediction to get the residual,
    # and add back the intercept, so the red signal becomes residual + intercept.
    nanmask = ~np.isnan(F[0, :])
    F2_ = np.copy(F2)
    F_ = np.copy(F)
    for cell in range(F.shape[0]):
        lr = linreg().fit(F2[cell:cell + 1, nanmask].T, F[cell, nanmask])  # linear regression from scikit-learn
        F[cell, nanmask] = F[cell, nanmask] - lr.predict(F2[cell:cell + 1, nanmask].T) + lr.intercept_
        F_[cell, nanmask] = (F_[cell, nanmask] - lr.predict(F2[cell:cell + 1, nanmask].T)
                             - .7 * lr.predict(Fneu[cell:cell + 1, nanmask].T) + lr.intercept_)
        lr = linreg().fit(Fneu2[cell:cell + 1, nanmask].T, Fneu[cell, nanmask])
        Fneu[cell, nanmask] = Fneu[cell, nanmask] - lr.predict(Fneu2[cell:cell + 1, nanmask].T) + lr.intercept_
        # regress out the F2 neuropil from F2
        lr = linreg().fit(Fneu2[cell:cell + 1, nanmask].T, F2_[cell, nanmask])
        F2_[cell, nanmask] = F2_[cell, nanmask] - .7 * lr.predict(Fneu2[cell:cell + 1, nanmask].T) + lr.intercept_

    # once bleedthrough is subtracted, do neuropil correction on both channels
    if method_red == 'subtract':
        F -= .7 * Fneu
    elif method_red == 'regress':
        F = F_
    elif method_red is None:
        pass

    if method_green == 'subtract':
        F2 -= .7 * Fneu2  # subtraction, GRABDA
    elif method_green == 'regress':
        F2 = F2_          # regression, GRABDA
    elif method_green is None:
        pass

    # Calculate baseline for chan 1
    Flow = sp.ndimage.filters.gaussian_filter(F[:, nanmask], [0., 15])  # cut out ITIs and smooth signal
    Flow = sp.ndimage.filters.minimum_filter1d(Flow, int(500 * 15))  # minimum filter, taking min val over 15 sec
    Flow = sp.ndimage.filters.maximum_filter1d(Flow, int(500 * 15))  # max filter with same window (dilation)

    # to get deltaF/F: subtract the baseline from the signal and divide by abs(baseline)
    # (the baseline can sometimes end up negative because of the regression)
    # -- red dff --
    dFF_red = np.zeros(F_red.shape) * np.nan
    dFF_red[:, nanmask] = (F[:, nanmask] - Flow) / np.abs(Flow)

    # Calculate baseline for chan 2
    Flow = sp.ndimage.filters.gaussian_filter(F2[:, nanmask], [0., 15])
    Flow = sp.ndimage.filters.minimum_filter1d(Flow, int(500 * 15))
    Flow = sp.ndimage.filters.maximum_filter1d(Flow, int(500 * 15))

    # -- green dff --
    dFF_green = np.zeros(F_green.shape) * np.nan
    dFF_green[:, nanmask] = (F2[:, nanmask] - Flow) / np.abs(Flow)

    # Smooth the deltaF/F transients by 2 time bins
    for i, (start, stop) in enumerate(zip(trial_starts.tolist(), teleport_starts.tolist())):
        dFF_red[:, start - 1:stop - 1] = sp.ndimage.filters.gaussian_filter1d(dFF_red[:, start - 1:stop - 1], 2, axis=1)
        dFF_green[:, start - 1:stop - 1] = sp.ndimage.filters.gaussian_filter1d(dFF_green[:, start - 1:stop - 1], 2, axis=1)
    return dFF_red, dFF_green
# Start training the three different models (with multithreading support)
size_model = None
message_readability_model = None
message_length_model = None
if "knn" in sys.argv:
    size_model = KNN(n_neighbors=n_neighbors, n_jobs=8)
    message_readability_model = KNN(n_neighbors=n_neighbors, n_jobs=8)
    message_length_model = KNN(n_neighbors=n_neighbors, n_jobs=8)
elif "svc" in sys.argv or "svm" in sys.argv:
    size_model = SVC(C=C)
    message_readability_model = SVC(C=C)
    message_length_model = SVC(C=C)
else:
    size_model = linreg()
    message_readability_model = linreg()
    message_length_model = linreg()

size_model.fit(scaled_size_features, size_output)
message_readability_model.fit(scaled_message_features, readability_output)
message_length_model.fit(scaled_message_features, length_output)

predicted_length = message_length_model.predict(scaled_message_features)
predicted_readability = message_readability_model.predict(scaled_message_features)
predicted_size = size_model.predict(scaled_size_features)

# Test the models for accuracy
if "knn" in sys.argv:
    print("Accuracy for message length with knn k=" + str(n_neighbors) +
""" Same plot but for a smaller kappa interval (kappa=1,1.5) This plot is used in the thesis! """ kInt=np.linspace(1,1.5,2001) distCom=[] stdCom=[] for i in range(0,len(kInt)): distCom.append(1/(2*np.pi)*np.e**(kInt[i]*np.cos(x-0))) distCom[i]=distCom[i]/sum(distCom[i]) stdCom.append(np.sqrt(sum(x**2*distCom[i]))) ax2=fig.add_subplot(1,2,2) ax2.set_ylabel("kappa") ax2.set_xlabel("standard deviation") ax2.plot(np.rad2deg(stdCom),kInt,color="black") #Setting a smaller kappa interval causes the relationship to be approximately linear. By this way, linear regression between Kappa and std can #be done to find out the Kappa value of an std value which we would like to have. """ Fitting a linear regression line to std von Mises and Kappa in interval [0.5;1.5] The function is transferred to supplementary_functions.py """ model=linreg().fit(np.asarray(np.rad2deg(stdCom)).reshape(-1,1),kInt)#creating the linear regression model, x value has to be transposed in advance! model.score(np.asarray(np.rad2deg(stdCom)).reshape(-1,1),kInt)#Returns the coefficient of determination R^2 of the prediction. #0.9978434393176431 is just perfect! model.intercept_#3.5869855951879352 is the intercept (kappa for stdvM=0), dont take the value serious model.coef_#-0.03539763 is coefficient, by which x value decays. # IMPORTANT: this regression is useful if and only if kappa is between 0.5 and 1.5, as the fit is done in that interval!
]]
y = a['SalePrice']
X1 = b[[
    'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt',
    'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath',
    'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
    'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch',
    '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold'
]]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=23)
#scalar.fit(X_train)
#X_train_scaled = scalar.transform(X_train)
#X_test_scaled = scalar.transform(X_test)
# note: alpha is only meaningful if linreg aliases a regularized model such as Ridge;
# plain LinearRegression does not accept it
lr = linreg(alpha=20.0).fit(X_train, y_train)
#print('Coefficient: ', lr.coef_)
#print('Intercept: ', lr.intercept_)
print('R-squared score (training): {:.3f}'.format(lr.score(X_train, y_train)))
print('R-squared score (test): {:.3f}'.format(lr.score(X_test, y_test)))
print(lr.predict(X1))
def polynomial_reg(degree=2, **kwargs):
    """Pipeline regression model: polynomial features followed by linear regression."""
    return make_pipeline(polynom(degree), linreg(**kwargs))
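# Hedged usage sketch for polynomial_reg(): assumes polynom and linreg alias
# sklearn.preprocessing.PolynomialFeatures and sklearn.linear_model.LinearRegression, that
# make_pipeline comes from sklearn.pipeline, and that numpy is imported as np; the toy data
# below is illustrative only.
rng = np.random.default_rng(42)
X_toy = rng.uniform(-3, 3, size=(200, 1))
y_toy = 0.5 * X_toy[:, 0] ** 3 - X_toy[:, 0] + rng.normal(scale=0.5, size=200)

cubic = polynomial_reg(degree=3)   # PolynomialFeatures(3) -> LinearRegression()
cubic.fit(X_toy, y_toy)
print(cubic.score(X_toy, y_toy))   # in-sample R^2 of the cubic fit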
print(pd.value_counts(titanic["Embarked"].values, sort=False))
# "S" is the most common value -> chosen as the default for missing values
titanic["Embarked"] = titanic["Embarked"].fillna("S")

# 4) Replace the Embarked characters with numeric codes
titanic.loc[titanic["Embarked"] == "S", "Embarked"] = 0  # 'S' -> 0
titanic.loc[titanic["Embarked"] == "C", "Embarked"] = 1  # 'C' -> 1
titanic.loc[titanic["Embarked"] == "Q", "Embarked"] = 2  # 'Q' -> 2

# input columns used for predictions:
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

# Initialize the algorithm
algo_linreg = linreg()

# Generate cross-validation folds;
# returns row indices for the corresponding train and test sets
kf = KFold(titanic.shape[0], n_folds=3, random_state=1)

# Make the predictions
predictions = []
for train, test in kf:
    # Predictors used on the train fold
    train_predictors = titanic[predictors].iloc[train, :]
    # Target/goal used to train the algorithm
    train_target = titanic["Survived"].iloc[train]
    # Train the algorithm with the predictors and target
    # .fit(x input, y output)