def predict(self, train_x, train_y, test_x, parameter):
    # self.fit(parameter,train_x,train_y)
    # return self.clf.predict(train_x),self.clf.predict(test_x)
    self.fit(parameter, train_x, np.log1p(train_y))
    train_predict = np.expm1(self.clf.predict(train_x))
    test_predict = np.expm1(self.clf.predict(test_x))
    return train_predict, test_predict
def _gpinv(p, k, sigma):
    """Inverse Generalized Pareto distribution function"""
    x = np.full_like(p, np.nan)
    if sigma <= 0:
        return x
    ok = (p > 0) & (p < 1)
    if np.all(ok):
        if np.abs(k) < np.finfo(float).eps:
            x = - np.log1p(-p)
        else:
            x = np.expm1(-k * np.log1p(-p)) / k
        x *= sigma
    else:
        if np.abs(k) < np.finfo(float).eps:
            x[ok] = - np.log1p(-p[ok])
        else:
            x[ok] = np.expm1(-k * np.log1p(-p[ok])) / k
        x *= sigma
        x[p == 0] = 0
        if k >= 0:
            x[p == 1] = np.inf
        else:
            x[p == 1] = - sigma / k
    return x
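# Illustrative sanity check, not part of the original function: for k != 0 the branch
# above computes sigma * ((1 - p)**(-k) - 1) / k, which matches scipy's generalized
# Pareto quantile function with shape c = k and the same scale (assumes scipy is installed).
from scipy.stats import genpareto

p_check = np.array([0.1, 0.5, 0.9, 0.99])
k_check, sigma_check = 0.3, 2.0
assert np.allclose(_gpinv(p_check, k_check, sigma_check),
                   genpareto.ppf(p_check, c=k_check, scale=sigma_check))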
def hyperbolic_ratio(a, b, sa, sb):
    '''
    Return ratio of hyperbolic functions to allow extreme variations of arguments.

    Parameters
    ----------
    a, b : array-like
        arguments vectors of the same size
    sa, sb : scalar integers
        defining the hyperbolic function used, i.e.,
        f(x,1)=cosh(x), f(x,-1)=sinh(x)

    Returns
    -------
    r : ndarray
        f(a,sa)/f(b,sb), ratio of hyperbolic functions of same size as a and b

    Examples
    --------
    >>> x = [-2,0,2]
    >>> hyperbolic_ratio(x,1,1,1)   # gives r=cosh(x)/cosh(1)
    array([ 2.438107  ,  0.64805427,  2.438107  ])
    >>> hyperbolic_ratio(x,1,1,-1)  # gives r=cosh(x)/sinh(1)
    array([ 3.20132052,  0.85091813,  3.20132052])
    >>> hyperbolic_ratio(x,1,-1,1)  # gives r=sinh(x)/cosh(1)
    array([-2.35040239,  0.        ,  2.35040239])
    >>> hyperbolic_ratio(x,1,-1,-1) # gives r=sinh(x)/sinh(1)
    array([-3.08616127,  0.        ,  3.08616127])
    >>> hyperbolic_ratio(1,x,1,1)   # gives r=cosh(1)/cosh(x)
    array([ 0.41015427,  1.54308063,  0.41015427])
    >>> hyperbolic_ratio(1,x,1,-1)  # gives r=cosh(1)/sinh(x)
    array([-0.42545906,         inf,  0.42545906])
    >>> hyperbolic_ratio(1,x,-1,1)  # gives r=sinh(1)/cosh(x)
    array([ 0.3123711 ,  1.17520119,  0.3123711 ])
    >>> hyperbolic_ratio(1,x,-1,-1) # gives r=sinh(1)/sinh(x)
    array([-0.32402714,         inf,  0.32402714])

    See also
    --------
    tran
    '''
    ak, bk, sak, sbk = np.atleast_1d(a, b, np.sign(sa), np.sign(sb))
    # old call
    # return exp(ak-bk)*(1+sak*exp(-2*ak))/(1+sbk*exp(-2*bk))
    # TODO: Does not always handle division by zero correctly
    signRatio = np.where(sak * ak < 0, sak, 1)
    signRatio = np.where(sbk * bk < 0, sbk * signRatio, signRatio)

    bk = np.abs(bk)
    ak = np.abs(ak)

    num = np.where(sak < 0, expm1(-2 * ak), 1 + exp(-2 * ak))
    den = np.where(sbk < 0, expm1(-2 * bk), 1 + exp(-2 * bk))
    iden = np.ones(den.shape) * inf
    ind = np.flatnonzero(den != 0)
    iden.flat[ind] = 1.0 / den[ind]
    val = np.where(num == den, 1, num * iden)
    return signRatio * exp(ak - bk) * val  # ((sak+exp(-2*ak))/(sbk+exp(-2*bk)))
def numpy_sweep(start_frequency=20.0,
                stop_frequency=20000.0,
                phase=0.0,
                interval=(0, 1.0),
                sampling_rate=48000.0,
                length=2 ** 16):
    """A pure NumPy implementation of the ExponentialSweep for benchmarking.

    See the ExponentialSweep class for documentation of the parameters.
    """
    # allocate shared memory for the channels
    array = sharedctypes.RawArray(ctypes.c_double, length)
    channels = numpy.frombuffer(array, dtype=numpy.float64).reshape((1, length))
    # generate the sweep
    start, stop = sumpf_internal.index(interval, length)
    sweep_offset = float(start / sampling_rate)
    sweep_duration = (stop - start) / sampling_rate
    frequency_ratio = stop_frequency / start_frequency
    l = sweep_duration / math.log(frequency_ratio)
    a = 2.0 * math.pi * start_frequency * l
    t = numpy.linspace(-sweep_offset, (length - 1) / sampling_rate - sweep_offset, length)
    array = t
    array /= l
    numpy.expm1(array, out=array)
    array *= a
    array += phase
    numpy.sin(array, out=channels[0, :])
    # fake store some additional values, because these values are actually stored in the constructor of the sweep
    _ = start_frequency * frequency_ratio ** (-sweep_offset / sweep_duration)                        # noqa: F841
    _ = start_frequency * frequency_ratio ** ((sweep_duration - sweep_offset) / sweep_duration)      # noqa: F841
    return sumpf.Signal(channels=channels, sampling_rate=sampling_rate, offset=0, labels=("Sweep",))
def myRMSPE_xg(yhat, y):
    y = np.expm1(y.get_label())
    yhat = np.expm1(yhat)
    r = myRMSPE(yhat, y)
    return "rmspe", r
def updateParams(self):
    self.pop.sort(key=op.attrgetter('f'))
    self.pSigma = np.dot(1.0 - self.cSigma, self.pSigma) + np.dot(
        np.sqrt(self.cSigma * (2.0 - self.cSigma) * self.muEff),
        sum(np.dot(self.rankWeight[i], self.pop[i].z) for i in range(self.popsize)))
    rate = np.linalg.norm(self.pSigma) / self.expectationChiDistribution

    if rate >= 1.0:
        wsum = 0
        for i in range(self.popsize):
            self.weight[i] = self.hatWeight[i] * np.expm1(self.alpha * np.linalg.norm(self.pop[i].z) + 1.0)
            wsum += self.weight[i]
        for i in range(self.popsize):
            self.weight[i] = self.weight[i] / wsum - 1.0 / self.popsize
    else:
        self.weight = self.rankWeight

    if rate >= 1.0:
        self.etaB = self.etaBMove
        self.etaSigma = self.etaSigmaMove
    elif rate >= 0.1:
        self.etaB = self.etaBStag
        self.etaSigma = self.etaSigmaStag
    else:
        self.etaB = self.etaBConv
        self.etaSigma = self.etaSigmaConv

    GDelta = sum(np.dot(self.weight[i], self.pop[i].z) for i in range(self.popsize))
    GMu = sum(self.weight[i] * (np.outer(self.pop[i].z, self.pop[i].z) - np.eye(self.dim))
              for i in range(self.popsize))
    GSigma = np.trace(GMu) / self.dim
    GB = GMu - GSigma * np.eye(self.dim)

    self.mu += self.etaMu * self.sigma * np.dot(self.B, GDelta)
    self.sigma *= (np.expm1(0.5 * self.etaSigma * GSigma) + 1.0)
    # matrix exponential of the B update; scipy.linalg.expm3 was removed in SciPy 1.0,
    # linalg.expm computes the same quantity
    self.B = np.dot(self.B, linalg.expm(0.5 * self.etaB * GB))
def Ridge_model(train_linear, test_linear):
    ridgecv = RidgeCV(alphas=np.logspace(-5, 4, 400))
    ridgecv.fit(train_linear_fea, train_linear_tar)
    ridgecv_score = ridgecv.score(train_linear_fea, train_linear_tar)
    ridgecv_alpha = ridgecv.alpha_
    print("Best alpha : ", ridgecv_alpha, "Score: ", ridgecv_score)
    coef = pd.Series(ridgecv.coef_, index=x_train.columns).sort_values(ascending=False)

    start = time.time()
    ridge = Ridge(normalize=True)
    ridge.set_params(alpha=ridgecv_alpha, max_iter=10000)
    # ridge.set_params(alpha=6,max_iter = 10000)
    ridge.fit(x_train, y_train)
    end = time.time()
    mean_squared_error(y_test, ridge.predict(x_test))
    coef_ridge = pd.Series(ridgecv.coef_, index=x_train.columns).sort_values(ascending=False)
    evaluate(ridge, x_test, y_test, x_train, y_train)
    print('Time elapsed: %.4f seconds' % (end - start))

    y_ridge_predict = ridge.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line = x_line
    plt.scatter(real_train_tar, np.expm1(y_ridge_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')

    test_prediction_ridge = np.expm1(ridge.predict(test_linear))
    write_pkl(ridgecv_alpha, '/Users/vickywinter/Documents/NYC/Machine Learning Proj/Pickle/ridge_params.pkl')
    return test_prediction_ridge
def _frank(M, N, alpha):
    if N < 2:
        raise ValueError('Dimensionality Argument [N] must be an integer >= 2')
    elif N == 2:
        u1 = uniform.rvs(size=M)
        p = uniform.rvs(size=M)
        if abs(alpha) > math.log(sys.float_info.max):
            u2 = (u1 < 0).astype(int) + np.sign(alpha) * u1  # u1 or 1-u1
        elif abs(alpha) > math.sqrt(np.spacing(1)):
            u2 = -1 * np.log((np.exp(-alpha * u1) * (1 - p) / p + np.exp(-alpha)) /
                             (1 + np.exp(-alpha * u1) * (1 - p) / p)) / alpha
        else:
            u2 = p
        U = np.column_stack((u1, u2))
    else:
        # Algorithm 1 described in both the SAS Copula Procedure, as well as the
        # paper: "High Dimensional Archimedean Copula Generation Algorithm"
        if alpha <= 0:
            raise ValueError('For N>=3, alpha >0 in Frank Copula')
        U = np.empty((M, N))
        for ii in range(0, M):
            p = -1.0 * np.expm1(-1 * alpha)
            if p == 1:
                # boundary case protection
                p = 1 - np.spacing(1)
            v = logser.rvs(p, size=1)
            # sample N independent uniform random variables
            x_i = uniform.rvs(size=N)
            t = -1 * np.log(x_i) / v
            U[ii, :] = -1.0 * np.log1p(np.exp(-t) * np.expm1(-1.0 * alpha)) / alpha
    return U
def __init__(self, daily_returns, benchmark_daily_returns, risk_free_rate, days, period=DAILY):
    assert(len(daily_returns) == len(benchmark_daily_returns))

    self._portfolio = daily_returns
    self._benchmark = benchmark_daily_returns
    self._risk_free_rate = risk_free_rate
    self._annual_factor = _annual_factor(period)
    self._daily_risk_free_rate = self._risk_free_rate / self._annual_factor

    self._alpha = None
    self._beta = None
    self._sharpe = None
    self._return = np.expm1(np.log1p(self._portfolio).sum())
    self._annual_return = (1 + self._return) ** (365 / days) - 1
    self._benchmark_return = np.expm1(np.log1p(self._benchmark).sum())
    self._benchmark_annual_return = (1 + self._benchmark_return) ** (365 / days) - 1
    self._max_drawdown = None
    self._volatility = None
    self._annual_volatility = None
    self._benchmark_volatility = None
    self._benchmark_annual_volatility = None
    self._information_ratio = None
    self._sortino = None
    self._tracking_error = None
    self._annual_tracking_error = None
    self._downside_risk = None
    self._annual_downside_risk = None
    self._calmar = None
    self._avg_excess_return = None
def rmspe_xg(y_hat, y):
    y = np.expm1(y.get_label())
    w = ToWeight(y)
    y_hat = np.expm1(y_hat)
    score = np.sqrt(np.mean(((y - y_hat) * w) ** 2))
    return "rmspe", score
def test_write_subregion_to_file(self, machine_timestep, dt, size_in, tau_ref,
                                 tau_rc, size_out, probe_spikes, vertex_slice,
                                 vertex_neurons):
    # Check that the region is correctly written to file
    region = lif.SystemRegion(size_in, size_out, machine_timestep,
                              tau_ref, tau_rc, dt, probe_spikes)

    # Create the file
    fp = tempfile.TemporaryFile()

    # Write to it
    region.write_subregion_to_file(fp, vertex_slice)

    # Read back and check that the values are sane
    fp.seek(0)
    values = fp.read()
    assert len(values) == region.sizeof()

    (n_in, n_out, n_n, m_t, t_ref, dt_over_t_rc,
     rec_spikes, i_dims) = struct.unpack_from("<8I", values)
    assert n_in == size_in
    assert n_out == size_out
    assert n_n == vertex_neurons
    assert m_t == machine_timestep
    assert t_ref == int(tau_ref // dt)
    assert (
        tp.value_to_fix(-np.expm1(-dt / tau_rc)) * 0.9 <
        dt_over_t_rc <
        tp.value_to_fix(-np.expm1(-dt / tau_rc)) * 1.1
    )
    assert (probe_spikes and rec_spikes != 0) or (not probe_spikes and rec_spikes == 0)
    assert i_dims == 1
def predict(self, train_x, train_y, test_x, parameter, times=5,
            validation_indexs=None, type='regression'):
    print parameter['model'] + " predict starting"
    train_preds = np.zeros((times, len(train_x)))
    test_preds = np.zeros((times, len(test_x)))
    for time in xrange(times):
        validation_indexs = genIndexKFold(train_x, 10)
        test_pred = np.zeros((len(validation_indexs), len(test_x)))
        train_pred = np.zeros((len(train_x)))
        for i, (train_ind, test_ind) in enumerate(validation_indexs):
            clf = model_select(parameter)
            print "Fold", i
            X_train = train_x[train_ind]
            Y_train = np.log1p(train_y[train_ind])
            X_test = train_x[test_ind]
            Y_test = train_y[test_ind]
            clf.fit(X_train, Y_train)
            test_pred[i][:] = np.expm1(clf.predict(test_x))
            train_pred[test_ind] = np.expm1(clf.predict(X_test))
            print evaluation_functions.evaluate_function(Y_test, train_pred[test_ind], 'rmsle')
        train_preds[time] = train_pred
        test_preds[time] = np.mean(test_pred, axis=0)
    return np.mean(train_preds, axis=0), np.mean(test_preds, axis=0)
def expm1(a, b):
    print((numba.typeof(a)))
    print((numba.typeof(np.expm1(a))))
    # result = a**2 + b**2
    # print "... :)"
    # print np.expm1(result), "..."
    return np.expm1(a**2) + b
def inverse_transform(self, X):
    if self.columns:
        for column in self.columns:
            X[column] = np.expm1(X[column])
        return X
    else:
        return np.expm1(X)
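# Hedged usage sketch, not part of the original transformer: the inverse above undoes a
# log1p forward transform, so a round trip on a toy DataFrame returns the original values.
# The column name 'price' is illustrative only.
import pandas as pd

_df = pd.DataFrame({'price': [0.0, 9.0, 99.0]})
_transformed = np.log1p(_df[['price']])
_restored = np.expm1(_transformed)
assert np.allclose(_restored['price'].values, _df['price'].values)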
def output(modelObj):
    test = modelObj.test
    file = modelObj.outputFile
    model = modelObj.model

    # remove id column
    test['label'] = test['label'].astype(int)
    week10 = test[test['Semana'] == 10]
    week11 = test[test['Semana'] == 11]

    week10['pred'] = np.expm1(model.predict(week10.values[:, :-1]))
    file.write('id,Demanda_uni_equil\n')
    temp = week10[['label', 'pred']]
    # DataFrame.to_csv takes `sep`, not `delimiter`
    temp.to_csv(file, index=False, sep=',', header=False)
    '''
    week10['Semana'] = week10['Semana'] + 1
    week10 = week10[['Cliente_ID', 'Producto_ID', 'Semana', 'pred']]
    week10 = week10.groupby(by=['Cliente_ID', 'Producto_ID', 'Semana'], as_index=False).mean()
    week11 = pd.merge(week11, week10, on=['Cliente_ID', 'Producto_ID', 'Semana'], how='left')
    week11['l1'] = week11['pred']
    del week11['pred']
    temp = week11[['l1','l2','l3','l4','l5']]
    temp = temp.fillna(0)
    week11['lagVar'] = np.var(temp, axis=1)
    week11['newProduct'] = np.sum(temp, axis=1) == 0
    week11['newProduct'].replace(False, 0, inplace=True)
    week11['newProduct'].replace(True, 1, inplace=True)
    '''
    #week11['lagSum'] = week11['l1'] + week11['l2'] + week11['l3'] + week11['l4'] + week11['l5']
    #week11['lagAvg'] = week11['lagSum'] / 5
    week11['pred'] = np.expm1(model.predict(week11.values[:, :-1]))
    temp = week11[['label', 'pred']]
    temp.to_csv(file, index=False, sep=',', header=False)
    file.flush()
    return test.shape[0]
def predict(self, train_x, train_y, test_x, parameter, times=1,
            validation_indexs=None, type='regression'):
    print parameter['model'] + " predict starting"
    train_preds = np.zeros((times, len(train_x)))
    test_preds = np.zeros((times, len(test_x)))
    for time in xrange(times):
        logging.info("time {}".format(str(time)))
        validation_indexs = genIndexKFold(train_x, 5)
        test_pred = np.zeros((len(validation_indexs), len(test_x)))
        train_pred = np.zeros((len(train_x)))
        for i, (train_ind, test_ind) in enumerate(validation_indexs):
            clf = model_select(parameter)
            logging.info("start time:{} Fold:{}".format(str(time), str(i)))
            print "start time:{} Fold:{}".format(str(time), str(i))
            X_train = train_x[train_ind]
            Y_train = np.log1p(train_y[train_ind])
            X_test = train_x[test_ind]
            Y_test = train_y[test_ind]
            clf.fit(X_train, Y_train)
            test_pred[i][:] = np.expm1(clf.predict(test_x))
            train_pred[test_ind] = np.expm1(clf.predict(X_test))
            evaluation = evaluate_function(Y_test, train_pred[test_ind], 'rmsle')
            logging.info("time:{} Fold:{} evaluation:{}".format(str(time), str(i), str(evaluation)))
        train_preds[time] = train_pred
        test_preds[time] = np.mean(test_pred, axis=0)
    print train_preds, test_preds
    return np.mean(train_preds, axis=0), np.mean(test_preds, axis=0)
def predict(self, trains_x, train_y, tests_x, parameters, times=10, isFile=True, foldername="blend-dir"):
    """
    Ensemble many features and regression models.

    :params train_X: dictionary for training
    :params train_y: target vector
    """
    # parameter_get
    test_data_sample = tests_x.values()[0]

    if not os.path.exists(foldername):
        os.makedirs(foldername)

    skf = None
    kfold_file = foldername + "/kfold_index.pkl"
    if os.path.exists(kfold_file):
        skf = pickle.load(open(kfold_file, "r"))
    else:
        skf = KFold(n=len(train_y), n_folds=times, shuffle=True)
        pickle.dump(skf, open(kfold_file, "w"))

    blend_train = np.zeros((len(train_y), len(parameters)))
    blend_test = np.zeros((len(test_data_sample), len(parameters)))

    for j, parameter in enumerate(parameters):
        train_x = trains_x[parameter['data']]
        test_x = tests_x[parameter['data']]
        blend_test_tmp = np.zeros((len(test_data_sample), len(parameters)))
        # file path check
        for i, (train_index, valid_index) in enumerate(skf):
            clf = model_select(parameter['parameter'])
            train = train_x[train_index]
            train_valid_y = train_y[train_index]
            kfold_filepath = "./" + foldername + "/parameter_{}_kfold_{}.pkl".format(j, i)
            if os.path.exists(kfold_filepath):
                # reuse cached fold predictions instead of predicting with an unfitted model
                blend_train_prediction, blend_test_prediction = pickle.load(open(kfold_filepath, "r"))
            else:
                clf.fit(train, np.log1p(train_valid_y))
                blend_train_prediction = np.expm1(clf.predict(train))
                blend_test_prediction = np.expm1(clf.predict(test_x))
                pickle.dump((blend_train_prediction, blend_test_prediction), open(kfold_filepath, "w"))
            blend_train[train_index, j] = blend_train_prediction
            blend_test_tmp[:, i] = blend_test_prediction
        blend_test[:, j] = blend_test_tmp.mean(1)

    # Blending Model
    bclf = LassoCV(n_alphas=100, alphas=None, normalize=True, cv=5,
                   fit_intercept=True, max_iter=10000, positive=True)
    bclf.fit(blend_train, train_y)
    y_test_predict = bclf.predict(blend_test)
    return y_test_predict
def testBijectiveAndFinite(self):
    bijector = tfb.Weibull(scale=20., concentration=2., validate_args=True)
    x = np.linspace(1., 8., num=10).astype(np.float32)
    y = np.linspace(
        -np.expm1(-1 / 400.),
        -np.expm1(-16),
        num=10).astype(np.float32)
    bijector_test_util.assert_bijective_and_finite(
        bijector, x, y, eval_func=self.evaluate, event_ndims=0, rtol=1e-3)
def merge_predict(model1, model2, test_data):
    # Combine the predictions of two separately trained models.
    # The input models predict in the log domain; the result is returned
    # in the original domain (via expm1).
    p1 = np.expm1(model1.predict(test_data))
    p2 = np.expm1(model2.predict(test_data))
    p_total = (p1 + p2)
    return p_total
def test_lasagne_regression(self):
    x, y = self.make_data_set()
    print len(x), y
    neural_network = mlc.model.LasagneNeuralNetwork.NeuralNetwork(
        problem_type="regression", batch_size=100, epochs=1000,
        layer_number=[100, 100, 100], dropout_layer=[0.0, 0.0, 0.0])
    neural_network.fit(x, np.log1p(y), valid=True, evaluate_function="mean_squared_loss")
    print np.expm1(neural_network.predict(x))
def process_xgb():
    col, train, test, test_ref = load_data()
    print(train.shape, test.shape, test_ref.shape)

    params = {
        'colsample_bytree': 0.055,
        'colsample_bylevel': 0.4,
        'gamma': 1.5,
        'learning_rate': 0.01,
        'max_depth': 5,
        'objective': 'reg:linear',
        'booster': 'gbtree',
        'min_child_weight': 10,
        'n_estimators': 1800,
        'reg_alpha': 0,
        'reg_lambda': 0,
        'eval_metric': 'rmse',
        'subsample': 0.7,
        'silent': True,
        'seed': 7,
    }

    folds = 20
    full_score = 0.0
    xg_test = xgb.DMatrix(test[col])
    use_regressor = True
    use_regressor = False
    for fold in range(folds):
        x1, x2, y1, y2 = model_selection.train_test_split(
            train[col], np.log1p(train.target.values), test_size=0.0010, random_state=fold)

        if use_regressor:
            p = params
            model = xgb.XGBRegressor(colsample_bytree=p['colsample_bytree'],
                                     colsample_bylevel=p['colsample_bylevel'],
                                     gamma=p['gamma'],
                                     learning_rate=p['learning_rate'],
                                     max_depth=p['max_depth'],
                                     objective=p['objective'],
                                     booster=p['booster'],
                                     min_child_weight=p['min_child_weight'],
                                     n_estimators=p['n_estimators'],
                                     reg_alpha=p['reg_alpha'],
                                     reg_lambda=p['reg_lambda'],
                                     eval_metric=p['eval_metric'],
                                     subsample=p['subsample'],
                                     silent=1,
                                     n_jobs=-1,
                                     early_stopping_rounds=100,
                                     random_state=7,
                                     nthread=-1)
            model.fit(x1, y1)
            score = np.sqrt(mean_squared_error(y2, model.predict(x2)))
            test['target'] += np.expm1(model.predict(test[col]))
        else:
            xg_valid = xgb.DMatrix(x2, label=y2)
            xg_train = xgb.DMatrix(x1, label=y1)
            model = xgb.train(params, xg_train, params['n_estimators'])
            score = np.sqrt(mean_squared_error(y2, model.predict(xg_valid)))
            test['target'] += np.expm1(model.predict(xg_test))

        print('Fold', fold, 'Score', score)
        full_score += score

    full_score /= folds
    print('Full score', full_score)

    test['target'] /= folds
    test.loc[test_ref.target > 0, 'target'] = test_ref[test_ref.target > 0].target.values
    test[['ID', 'target']].to_csv('subxgb.csv', index=False)

    explain = False
    #explain = True
    if explain and not use_regressor:
        print(eli5.format_as_text(eli5.explain_weights(model, top=200)))
def expm1(x):
    """
    Calculate exp(x) - 1
    """
    if isinstance(x, UncertainFunction):
        mcpts = np.expm1(x._mcpts)
        return UncertainFunction(mcpts)
    else:
        return np.expm1(x)
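# Why expm1 instead of exp(x) - 1 (illustrative check, not from the original module):
# for tiny x, exp(x) rounds to 1.0 and the subtraction cancels away the significant digits,
# while np.expm1 keeps the leading terms x + x**2/2 exactly.
_x = 1e-12
print(np.exp(_x) - 1.0)   # only a few correct digits survive the cancellation
print(np.expm1(_x))       # 1.0000000000005e-12, accurate to full precision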
def testBijectiveAndFinite(self):
    with self.cached_session():
        bijector = Weibull(
            scale=20., concentration=2., validate_args=True)
        x = np.linspace(1., 8., num=10).astype(np.float32)
        y = np.linspace(
            -np.expm1(-1 / 400.),
            -np.expm1(-16),
            num=10).astype(np.float32)
        assert_bijective_and_finite(bijector, x, y, event_ndims=0, rtol=1e-3)
def root_mean_squared_percetage_error(y_true, y_pred):
    """
    Root mean squared percentage error regression loss
    (inputs are in the log domain and are mapped back with expm1)
    """
    y_true = np.expm1(y_true)
    y_pred = np.expm1(y_pred)
    w = ToWeight(y_true)
    output_errors = np.mean(((y_true - y_pred) * w) ** 2)
    return float(np.sqrt(output_errors))
def predictsale(request):
    if 'dt' in request.POST:
        path1 = default_storage.open('mysite\\train.csv')
        path2 = default_storage.open('mysite\\test.csv')
        train_data = pd.read_csv(path1, parse_dates=[0])
        test_data = pd.read_csv(path2, parse_dates=[0])

        dt = request.POST['dt']
        d = parse(dt)
        test_data['day'] = d.day
        test_data['month'] = d.month
        test_data['year'] = d.year
        test_data['hour'] = d.hour
        test_data['season'] = int(request.POST['season'])
        test_data['temp'] = float(request.POST['temp'])
        test_data['atemp'] = float(request.POST['atemp'])
        test_data['humidity'] = int(request.POST['humidity'])
        test_data['windspeed'] = float(request.POST['windspeed'])

        weather_condition = request.POST['weather']
        if weather_condition in ('Clear', 'Partly Cloudy', 'Very Hot'):
            test_data['weather'] = 1
        if weather_condition in ('Mostly Cloudy', 'Cloudy', 'Hazy', 'Chance of Showers', 'Chance of Rain'):
            test_data['weather'] = 2
        if weather_condition in ('Very Cold', 'Showers', 'Rain', 'Chance of a Thunderstorm', 'Flurries',
                                 'Chance of Snow Showers', 'Snow Showers', 'Chance of Snow'):
            test_data['weather'] = 3
        if weather_condition in ('Foggy', 'Blowing Snow', 'Thunderstorm', 'Snow', 'Ice Pellets',
                                 'Chance of Ice Pellets', 'Blizzard'):
            test_data['weather'] = 4

        dt_train = pd.DatetimeIndex(train_data['datetime'])
        train_data['year'] = dt_train.year
        train_data['month'] = dt_train.month
        train_data['hour'] = dt_train.hour
        train_data['day'] = dt_train.day

        for colum in ['casual', 'registered', 'count']:
            train_data['log-' + colum] = train_data[colum].apply(lambda x: np.log1p(x))

        attrib = ['year', 'month', 'day', 'hour', 'season', 'weather',
                  'temp', 'atemp', 'humidity', 'windspeed']
        # fit separate models for the casual and registered counts; refitting a single
        # estimator would leave only the last fit available for both predictions
        gbr_casual = ensemble.GradientBoostingRegressor(n_estimators=80, learning_rate=.05,
                                                        max_depth=10, min_samples_leaf=20)
        gbr_registered = ensemble.GradientBoostingRegressor(n_estimators=80, learning_rate=.05,
                                                            max_depth=10, min_samples_leaf=20)
        casual_pred = gbr_casual.fit(train_data[attrib].values, train_data['log-casual'].values)
        registered_pred = gbr_registered.fit(train_data[attrib].values, train_data['log-registered'].values)

        total = np.expm1(casual_pred.predict(test_data[attrib])) + np.expm1(registered_pred.predict(test_data[attrib]))
        print("sale :", total)
        return render(request, 'predictsale.html', {'total_sale': int(total[0]), 'date': dt})
    else:
        return render(request, 'predictsale.html')
def xgboost_validset_submission():
    params = {"objective": "reg:linear",
              "eta": 0.3,
              "max_depth": 10,
              "subsample": 0.7,
              "colsample_bytree": 0.7,
              "silent": 1,
              "seed": 1301
              }
    num_boost_round = 300

    # need to split for a small validation set
    X_train_xgb, X_valid_xgb = train_test_split(train, test_size=0.012)
    y_train_xgb = np.log1p(X_train_xgb.Sales)
    y_valid_xgb = np.log1p(X_valid_xgb.Sales)
    dtrain = xgb.DMatrix(X_train_xgb[feature_names], y_train_xgb)
    dvalid = xgb.DMatrix(X_valid_xgb[feature_names], y_valid_xgb)

    watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
    gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist,
                    early_stopping_rounds=100, feval=c.rmspe_xg, verbose_eval=True)

    print("Validating")
    y_pred = gbm.predict(xgb.DMatrix(X_valid_xgb[feature_names]))
    error = c.rmspe(X_valid_xgb.Sales.values, np.expm1(y_pred))
    print('RMSPE: {:.6f}'.format(error))

    print("Make predictions on the test set")
    dtest = xgb.DMatrix(test[feature_names])
    test_probs = gbm.predict(dtest)

    # Make Submission
    result = pd.DataFrame({"Id": test["Id"], 'Sales': np.expm1(test_probs)})
    result.to_csv("xgboost_10_submission.csv", index=False)

    # XGB feature importances
    # Based on https://www.kaggle.com/mmueller/
    # liberty-mutual-group-property-inspection-prediction/
    # xgb-feature-importance-python/code
    ceate_feature_map(feature_names)
    importance = gbm.get_fscore(fmap='xgb.fmap')
    importance = sorted(importance.items(), key=operator.itemgetter(1))

    df = pd.DataFrame(importance, columns=['feature', 'fscore'])
    df['fscore'] = df['fscore'] / df['fscore'].sum()

    featp = df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(6, 10))
    plt.title('XGBoost Feature Importance')
    plt.xlabel('relative importance')
    fig_featp = featp.get_figure()
    fig_featp.savefig('feature_importance_xgb.png', bbox_inches='tight', pad_inches=1)
def xgbFull(X_train, y_train, X_test):
    params = {}
    params["objective"] = "reg:linear"
    params["eta"] = 0.02
    params["min_child_weight"] = 6
    params["subsample"] = 0.7
    params["scale_pos_weight"] = 0.8
    params["silent"] = 1
    params["max_depth"] = 8
    params["max_delta_step"] = 2

    plst = list(params.items())
    xgtest = xgb.DMatrix(X_test)

    y1 = np.log1p(y_train)
    y2 = np.power(y_train, 1 / 16.0)

    num_rounds = 1000
    print(num_rounds)
    xgtrain = xgb.DMatrix(X_train, label=y1)
    m1 = xgb.train(plst, xgtrain, num_rounds)
    p1 = m1.predict(xgtest)
    p1 = np.expm1(p1)

    num_rounds = 2000
    print(num_rounds)
    xgtrain = xgb.DMatrix(X_train, label=y2)
    m2 = xgb.train(plst, xgtrain, num_rounds)
    p2 = m2.predict(xgtest)
    p2 = np.power(p2, 16.0)

    num_rounds = 3000
    print(num_rounds)
    xgtrain = xgb.DMatrix(X_train, label=y1)
    m3 = xgb.train(plst, xgtrain, num_rounds)
    p3 = m3.predict(xgtest)
    p3 = np.expm1(p3)

    num_rounds = 4000
    print(num_rounds)
    xgtrain = xgb.DMatrix(X_train, label=y2)
    m4 = xgb.train(plst, xgtrain, num_rounds)
    p4 = m4.predict(xgtest)
    p4 = np.power(p4, 16.0)

    return p1, p2, p3, p4
def save_predictions_per_store(output_dir, train_set, train_features, valid_set, valid_features, model):
    print ">> SAVING PREDICTIONS PER STORE"
    train = pd.DataFrame(train_set)
    train["PredSales"] = np.expm1(model.predict(xgb.DMatrix(train_features)))
    valid = pd.DataFrame(valid_set)
    valid["PredSales"] = np.expm1(model.predict(xgb.DMatrix(valid_features)))
    train = train.iloc[::-1]
    valid = valid.iloc[::-1]
    for store in train.Store.unique():
        df = train[train.Store == store].append(valid[valid.Store == store])
        output_path = path.join(output_dir, "store_%s.csv" % store)
        df[["Store", "Open", "Promo", "Date", "Sales", "PredSales"]].to_csv(output_path, index=False)
def fit(self, x_train, y_train):
    batchsize = self.batchsize
    np.random.seed(self.seed)

    if self.cuda:
        cuda.get_device(0).use()
        self.model.to_gpu()
        xp = cuda.cupy
    else:
        xp = np
    self.xp = xp

    if self.split != 0.0:
        x_train_data, x_valid_data, y_train_data, y_valid_data = train_test_split(
            x_train, y_train, test_size=self.split, random_state=self.seed)
        print "train size:{} test_size:{}".format(len(x_train_data), len(y_valid_data))
        data = np.array(x_train_data, dtype=np.float32)
        valid_data = np.array(x_valid_data, dtype=np.float32)
        target = np.array(y_train_data, dtype=np.float32).reshape((len(data), 1))
        valid_target = np.array(y_valid_data, dtype=np.float32).reshape((len(valid_data), 1))
    else:
        data = np.array(x_train, dtype=np.float32)
        target = np.array(self.convert(y_train), dtype=np.float32).reshape((len(data), 1))

    optimizer = optimizers.Adam()
    optimizer.setup(self.model)

    N = len(data)
    for epoch in xrange(self.epochs):
        print "epoch:", epoch
        perm = np.random.permutation(N)
        sum_loss = 0.0
        sum_original_loss = 0.0
        cnt = 0
        for i in xrange(0, N, batchsize):
            x = chainer.Variable(xp.asarray(data[perm[i:i + batchsize]]), volatile="off")
            t = chainer.Variable(xp.asarray(target[perm[i:i + batchsize]], dtype=np.float32), volatile="off")
            optimizer.update(self.model, x, t)
            sum_original_loss += float(self.model.loss.data) * len(t.data)
            cnt += 1

        if evaluate_function != None:
            prediction = self.predict(valid_data)
            loss = evaluate_function(np.expm1(valid_target), np.expm1(prediction), self.evaluate_function_name)
            sum_loss = loss

        print "original train loss:{}".format(sum_original_loss / N)
        print "train_loss:{}".format(sum_loss)
def int_linexp0(a, b, u0, u1, g, x0):
    """ This is the integral in [a, b] of u(x) * exp(g * (x0 - x)) * x
    assuming that u is linear with u({a, b}) = {u0, u1}."""

    # Since u(x) is linear, we calculate separately the coefficients
    # of degree 0 and 1 which, after multiplying by the x in the integrand
    # correspond to 1 and 2

    # The expressions involve the following exponentials that are problematic:
    # expa = np.exp(g * (-a + x0))
    # expb = np.exp(g * (-b + x0))
    # The problems come with small g: in that case, the exp() rounds to 1
    # and neglects the order 1 and 2 terms that are required to cancel the
    # 1/g**2 and 1/g**3 below. The solution is to rewrite the expressions
    # as functions of expm1(x) = exp(x) - 1, which is guaranteed to be accurate
    # even for small x.
    expm1a = np.expm1(g * (-a + x0))
    expm1b = np.expm1(g * (-b + x0))

    ag = a * g
    bg = b * g
    ag1 = ag + 1
    bg1 = bg + 1

    g2 = g * g
    g3 = g2 * g

    # These are the expressions as functions of expa/expb
    # A1 = ( expa * ag1
    #       - expb * bg1) / g2
    # A2 = (expa * (2 * ag1 + ag * ag) -
    #       expb * (2 * bg1 + bg * bg)) / g3

    A1 = (expm1a * ag1 + ag
          - expm1b * bg1 - bg) / g2

    A2 = (expm1a * (2 * ag1 + ag * ag) + ag * (ag + 2) -
          expm1b * (2 * bg1 + bg * bg) - bg * (bg + 2)) / g3

    # The factors multiplying each coefficient can be obtained by
    # the interpolation formula of u(x) = c0 + c1 * x
    c0 = (a * u1 - b * u0) / (a - b)
    c1 = (u0 - u1) / (a - b)

    r = c0 * A1 + c1 * A2

    # Where either F0 or F1 is 0 we return 0
    return np.where(np.isnan(r), 0.0, r)
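# Illustrative cross-check, not part of the original module: the closed form above should
# agree with direct numerical quadrature of u(x) * exp(g * (x0 - x)) * x, even for the
# small-g regime that motivates the expm1 rewrite (assumes scipy is installed).
from scipy.integrate import quad

_a, _b, _u0, _u1, _g, _x0 = 1.0, 2.0, 0.5, 1.5, 1e-4, 0.0
_c0 = (_a * _u1 - _b * _u0) / (_a - _b)
_c1 = (_u0 - _u1) / (_a - _b)
_numeric, _ = quad(lambda x: (_c0 + _c1 * x) * np.exp(_g * (_x0 - x)) * x, _a, _b)
print(int_linexp0(_a, _b, _u0, _u1, _g, _x0), _numeric)  # should match closely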
for c, dtype in zip(test.columns, test.dtypes):
    if dtype == np.float64:
        test[c] = test[c].astype(np.float32)

train_x = train.drop(['air_store_id', 'visit_date', 'visitors'], axis=1)
train_y = np.log1p(train['visitors'].values)

print(train_x.shape, train_y.shape)
test_x = test.drop(['id', 'air_store_id', 'visit_date', 'visitors'], axis=1)

# parameter tuning of xgboost
# start from default setting
boost_params = {'eval_metric': 'rmse'}
xgb0 = xgb.XGBRegressor(
    max_depth=8,
    learning_rate=0.01,
    n_estimators=10000,
    objective='reg:linear',
    gamma=0,
    min_child_weight=1,
    subsample=1,
    colsample_bytree=1,
    scale_pos_weight=1,
    seed=27,
    **boost_params)

xgb0.fit(train_x, train_y)
predict_y = xgb0.predict(test_x)

test['visitors'] = np.expm1(predict_y)
test[['id', 'visitors']].to_csv(
    'xgb0_submission.csv', index=False, float_format='%.3f')  # LB0.495
def _cdf(self, x, p):
    k = floor(x)
    return -expm1(log1p(-p) * k)
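# Quick check, not from the original distribution class: the expression above is a
# numerically stable form of the geometric CDF 1 - (1 - p)**floor(x); writing it as
# -expm1(k * log1p(-p)) avoids cancellation when p is tiny.
_p, _k = 1e-9, 1000.0
_stable = -np.expm1(np.log1p(-_p) * _k)
_naive = 1.0 - (1.0 - _p) ** _k
print(_stable, _naive)  # the naive form loses several digits for tiny p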
def _stats(self, lambda_):
    mu = 1 / (exp(lambda_) - 1)
    var = exp(-lambda_) / (expm1(-lambda_))**2
    g1 = 2 * cosh(lambda_ / 2.0)
    g2 = 4 + 2 * cosh(lambda_)
    return mu, var, g1, g2
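# Sanity check of the mean/variance formulas above (illustrative, not from the original
# class): the distribution with pmf (1 - exp(-lambda)) * exp(-lambda * k), k = 0, 1, 2, ...
# has mean 1/(exp(lambda) - 1) and variance exp(-lambda)/expm1(-lambda)**2.
_lam = 0.7
_k = np.arange(0, 200)
_pmf = -np.expm1(-_lam) * np.exp(-_lam * _k)    # (1 - e^-lam) * e^(-lam k)
_mean = np.sum(_k * _pmf)
_var = np.sum((_k - _mean) ** 2 * _pmf)
print(_mean, 1 / np.expm1(_lam))                    # should agree
print(_var, np.exp(-_lam) / np.expm1(-_lam) ** 2)   # should agree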
# In[27]:

def rmsle(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))


# In[28]:

model_xgb.fit(train, y_train)
joblib.dump(model_xgb, 'xgboost_model.joblib')
xgb_train_pred = model_xgb.predict(train)
xgb_pred = np.expm1(model_xgb.predict(test))


# In[29]:

print('RMSLE score on train data:')
print(rmsle(y_train, xgb_train_pred * 0.10))


# In[30]:

# Example XGBoost = 1/(0.1177)
"bagging_fraction": 0.4, "bagging_freq": 1, "feature_fraction": 0.68, "lambda_l1": 10, } evals_result = {} model_lgb = lgbm.train(params, lgtrain, 5000, valid_sets=[lgval], early_stopping_rounds=100, verbose_eval=50, evals_result=evals_result) lg_preds = pd.DataFrame(np.expm1(model_lgb.predict(x_submit))) lg_preds.insert(0, "ID", ids.values) lg_preds.columns = ["ID", "target"] lg_preds.to_csv("submit.csv", index=False) grouped = train.groupby('target') consolidated = pd.DataFrame(columns=train.columns[1:, ]) print(len(grouped)) i = 0 for name, group in grouped: if i % 50 == 0: print("XXXX") print(i) print("XXXX") consolidated = consolidated.append(group.mean(), ignore_index=True)
def neuron_and_output_weights(self, current):
    # reduce all refractory times by dt
    self.refractory_time -= self.dt

    # compute effective dt for each neuron, based on remaining time.
    # note that refractory times that have completed midway into this
    # timestep will be given a partial timestep
    delta_t = (self.dt - self.refractory_time).clip(0, self.dt)

    # update voltage using discretized lowpass filter
    # since v(t) = v(0) + (J - v(0))*(1 - exp(-t/tau)) assuming
    # J is constant over the interval [t, t + dt)
    #print(self.voltage.shape)
    #print(current.shape)
    #print(delta_t.shape)
    #print(self.tau_rc.shape)
    self.voltage -= (current - self.voltage) * np.expm1(-delta_t / self.tau_rc)
    self.voltage[self.voltage < 0] = 0

    # this is only needed if we're doing learning
    self.learning_activity *= (1 - self.learning_scale)

    output = np.zeros(self.n_outputs)

    for i in range(self.n_neurons):
        # determine which neurons spiked this time step
        # NOTE: this will be very sparse, since few neurons spike at once
        if self.voltage[i] > 1:
            # compute when during the timestep the spike happened
            log_result = np.log1p(-(self.voltage[i] - 1) / (current[i] - 1))
            t_spike = self.dt + self.tau_rc * log_result
            # use this time to set the refractory_time accurately
            self.refractory_time[i] = self.tau_ref + t_spike

            # set spiked voltages to zero, and rectify negative voltages to zero
            self.voltage[i] = 0

            # do the low-pass filter needed for learning
            self.learning_activity[i] += self.learning_scale

            # handle the output connection weights
            output += self.decoders[:, i]

            '''
            if self.obj_id == 1 and self.time <= 10 and i == 10:
                print("time: "+str(self.time)+", log result: "+str(log_result))
                print("time: "+str(self.time)+", t spike: "+str(t_spike*1000))
            '''
    '''
    if self.obj_id == 1:
        if self.debug_count == 0:
            for i in range(90,100):
                print(self.voltage[i])
            #print("current: "+str(self.current))
            self.debug_count = 1
    '''
    '''
    if self.obj_id == 1:
        #print(current[90])
        print(output)
    '''
    return output
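# Illustrative check of the expm1-based voltage update above (not part of the original
# class): v_new = v - (J - v) * expm1(-dt / tau) is algebraically the exact zero-order-hold
# solution v_new = J + (v - J) * exp(-dt / tau) of dv/dt = (J - v) / tau.
_v, _J, _dt, _tau = 0.2, 1.5, 0.001, 0.02
_v_update = _v - (_J - _v) * np.expm1(-_dt / _tau)
_v_exact = _J + (_v - _J) * np.exp(-_dt / _tau)
print(_v_update, _v_exact)  # identical up to rounding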
def expected2(a):
    return np.sum(np.expm1(a) + np.ceil(a + 0.5) * np.rint(a + 1.5))
def numpy_math2(a):
    sum = 0.0
    for i in range(a.shape[0]):
        sum += np.expm1(a[i]) + np.ceil(a[i] + 0.5) * np.rint(a[i] + 1.5)
    return sum
def compound(r):
    """
    returns the result of compounding the set of returns in r
    """
    return np.expm1(np.log1p(r).sum())
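# Illustrative check, not from the original module: compounding via expm1/log1p is the
# same as the plain product form, but keeps precision when the individual returns are tiny.
_r = np.array([0.01, -0.02, 0.03])
print(compound(_r))           # ~0.019494
print(np.prod(1 + _r) - 1)    # same value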
def inst_to_ann(r):
    """
    Convert an instantaneous interest rate to an annual interest rate
    """
    return np.expm1(r)
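# Hedged usage example, not from the original module: np.log1p is the inverse mapping,
# so a continuously compounded rate round-trips through expm1/log1p.
_short_rate = 0.03                      # instantaneous (continuously compounded) rate
_annual = inst_to_ann(_short_rate)      # exp(0.03) - 1 ~= 0.030455
_back = np.log1p(_annual)               # ~= 0.03
print(_annual, _back)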
# Validate
#------------------------------------------------------------------------------------------#
# Submit
logger.info('Making submission...')
y_test = np.array(test_pred).transpose()
df_preds = pd.DataFrame(
    y_test, index=df_2017.index,
    columns=pd.date_range("2017-08-16", periods=16)
).stack().to_frame("unit_sales")
df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)

submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
submission.to_csv('../submit/T016_tmp.csv', float_format='%.4f', index=None)

####### PZ, Check overall result
print("SUM =", submission.unit_sales.sum())
print("MEAN =", submission.unit_sales.mean())

#------------------------------------------------------------------------------------------#
df_prev = submission
df_sub = pd.read_csv('../input/sub_zero3m.csv')
t_new = pd.merge(df_prev, df_sub, on=['id'], how='left')
t_new['unit_sales'] = t_new.unit_sales_y.combine_first(t_new.unit_sales_x)
if dtype == np.float64:
    test[c] = test[c].astype(np.float32)

train_x = train.drop(['air_store_id', 'visit_date', 'visitors'], axis=1)
train_y = np.log1p(train['visitors'].values)
test_x = test.drop(['id', 'air_store_id', 'visit_date', 'visitors'], axis=1)

print("\n [1] Data preprocessed")

print("\n [2]: Linear regression..")
reg = LinearRegression()
reg.fit(train_x, train_y)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
pred = reg.predict(test_x)
test['visitors[linear]'] = np.expm1(pred)
print("      Done!")

print("\n [3]: Decision tree..")
reg = DecisionTreeRegressor(max_depth=20)
reg.fit(train_x, train_y)
DecisionTreeRegressor(criterion='mse', max_depth=20, max_features=None,
                      max_leaf_nodes=None, min_impurity_split=1e-07,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0,
                              metric='rmse', is_training_metric=True, max_bin=55,
                              bagging_fraction=0.8, verbose=-1, bagging_freq=5,
                              feature_fraction=0.9)

score = rmsle_cv(model_xgb)
print("Xgboost score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
score = rmsle_cv(model_lgb)
print("LGBM score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

averaged_models = AveragingModels(models=(model_xgb, model_lgb))
score = rmsle_cv(averaged_models)
print("averaged score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

averaged_models.fit(train.values, y_train)
pred = np.expm1(averaged_models.predict(test.values))
ensemble = pred

sub = pd.DataFrame()
sub['ID'] = test_ID
sub['target'] = ensemble
sub.to_csv('submission.csv', index=False)

#Xgboost score: 1.3582 (0.0640)
#LGBM score: 1.3437 (0.0519)
#averaged score: 1.3431 (0.0586)

#Xgboost score: 1.3566 (0.0525)
#LGBM score: 1.3477 (0.0497)
#averaged score: 1.3438 (0.0516)

#Xgboost score: 1.3540 (0.0621)
      np.sqrt(mean_squared_error(df_submission.price.values, preds)))
del submission_keras, df_submission
gc.collect()

submission_preds_df = pd.DataFrame(models_predictions)

if split > 0:
    print('ENSEMBLE MEAN SCORE :',
          np.sqrt(mean_squared_error(sub_price, submission_preds_df.mean(axis=1))))
    print(' ')
    from sklearn.linear_model import LinearRegression
    lr = LinearRegression()
    lr.fit(submission_preds_df.values, sub_price)
    preds = lr.predict(submission_preds_df.values)
    print('ENSEMBLE LR SCORE :', np.sqrt(mean_squared_error(sub_price, preds)))
    print(lr.coef_)

if split == -1:
    mysubmission = pd.DataFrame()
    mysubmission['test_id'] = submission_idx
    preds = np.expm1(submission_preds_df.mean(axis=1))
    preds[preds < 3] = 3
    preds[preds > 1000] = 1000
    mysubmission['price'] = preds
    mysubmission.to_csv('mean.csv', index=False)
    print(mysubmission.shape)