def group_log_means_predict_manual(df_train, df_test, vali): start_time = time.time() all_mean = df_train['log_Demanda_uni_equil'].mean() P_mean = df_train.groupby(by=['short_name'])['log_Demanda_uni_equil'].mean() C_mean = df_train.groupby(by=['Cliente_ID'])['log_Demanda_uni_equil'].mean() PA_mean = df_train.groupby(by=['short_name', 'Agencia_ID'])['log_Demanda_uni_equil'].mean() PR_mean = df_train.groupby(by=['short_name', 'Ruta_SAK'])['log_Demanda_uni_equil'].mean() PCA_mean = df_train.groupby(by=['short_name', 'Cliente_ID', 'Agencia_ID'])['log_Demanda_uni_equil'].mean() print 'mean calculating time=', time.time()-start_time start_time = time.time() if not vali: df_test['Demanda_uni_equil']=np.apply_along_axis((lambda x:log_means_pred_demand_manual_func(x,\ P_mean, C_mean, PA_mean, PR_mean, PCA_mean, all_mean)), 1, df_test.values) df_test.to_csv('output/'+'manual_group_log_mean_'+ \ str(datetime.datetime.now().strftime('%Y-%m-%d-%H-%M'))+'.csv', \ columns=['id','Demanda_uni_equil'], index=False) else: # global pred_demand, true_demand pred_demand = np.apply_along_axis((lambda x:log_means_pred_demand_manual_func(x, \ P_mean, C_mean, PA_mean, PR_mean, PCA_mean, all_mean)), 1, df_test[labels].values) true_demand = df_test['Demanda_uni_equil'].values RMSLE = np.sqrt(MSE(np.log1p(pred_demand), np.log1p(true_demand))) print 'RMSLE=', RMSLE print 'predicting time=', time.time()-start_time
def run_cv(self, num_round, params): ''' Using FoldTubeID split, loop over CV to get RMSLE for each split params is a list of parameters for XGBoost. After finishing CV, run score() to get the results ''' self.pred = [] self.real = [] if len(params) == 0: raise ValueError('Please read in parameters') for tr, te in self.cv: self.train = self.trainset.loc[tr,:].copy() self.test = self.trainset.loc[te,:].copy() # Randomize and set seed # np.random.permutation(len(trainp1)) np.random.seed(1) self.train = self.train.iloc[np.random.permutation(len(self.train))] np.random.seed(2) self.test = self.test.iloc[np.random.permutation(len(self.test))] y_real = np.array(self.test.iloc[:,-1]) # Section for training multi-models if you like y_pred_xgb = xgboost_model(self.train, self.test, num_round, params) y_pred = y_pred_xgb self.pred += [y_pred] self.real += [y_real] self.rmsle_score += [np.sqrt(mean_squared_error(np.log1p(y_real), np.log1p(y_pred)))] print '===========================================================' print 'Finished Cross-validation' print '==========================================================='
def _gpinv(p, k, sigma):
    """Inverse Generalized Pareto distribution function"""
    x = np.full_like(p, np.nan)
    if sigma <= 0:
        return x
    ok = (p > 0) & (p < 1)
    if np.all(ok):
        if np.abs(k) < np.finfo(float).eps:
            x = - np.log1p(-p)
        else:
            x = np.expm1(-k * np.log1p(-p)) / k
        x *= sigma
    else:
        if np.abs(k) < np.finfo(float).eps:
            x[ok] = - np.log1p(-p[ok])
        else:
            x[ok] = np.expm1(-k * np.log1p(-p[ok])) / k
        x *= sigma
        x[p == 0] = 0
        if k >= 0:
            x[p == 1] = np.inf
        else:
            x[p == 1] = - sigma / k
    return x
def transform_log(series, robust=True):
    """Perform element-wise logarithm transformation in a numerical series.

    Parameters
    ----------
    series : pandas.Series
        series to transform
    robust : bool
        True - handle negative and zero values properly
        False - transform negative value to nan, zero to -inf

    Returns
    -------
    log_series : pandas.Series
        ANOTHER series consisting of the transformed values
    """
    # TODO: support log10
    # TODO: separate log1p and log explicitly
    if not isinstance(series, pd.Series):
        raise TypeError("argument 'series' is NOT 'pandas.Series' type")
    if not is_numerical_type(series):
        raise ValueError("value type of argument 'series' is NOT numerical")
    if robust:
        return series.apply(lambda x: np.log1p(x) if x >= 0 else -np.log1p(-x))
    else:
        return series.apply(np.log)
def transform(self, X):
    if self.columns:
        for column in self.columns:
            X[column] = np.log1p(X[column])
        return X
    else:
        return np.log1p(X)
def __init__(self, past, future, features = None): """Create a training pattern. Parameters: past -- past feature vectors as a tensor of shape [P, V] where P is past days and V is the vectors/day future -- future feature vectors as a tensor of [F, V] where F is future days and V is the vectors/day features -- a sequence of feature names to use where None means use all features """ # calculate training input from past features past_subfeatures = [[self._subfeatures(vector, features) for vector in vectors] for vectors in past] self._input = numpy.array( [list(util.flatten(vectors)) for vectors in past_subfeatures]) # calculate training output from future volatility future_returns = numpy.log1p( [[vector.ret for vector in vectors] for vectors in future]) self._output = numpy.std(future_returns, axis = 0, ddof = 1)\ * numpy.sqrt(252) # calculate past returns for forecasts self._past_returns = numpy.log1p( [[vector.ret for vector in vectors] for vectors in past])
def compute_weights(data, Nlive):
    """Returns log_ev, log_wts for the log-likelihood samples in data,
    assumed to be a result of nested sampling with Nlive live points."""
    start_data = concatenate(([float('-inf')], data[:-Nlive]))
    end_data = data[-Nlive:]
    log_wts = zeros(data.shape[0])
    log_vols_start = cumsum(ones(len(start_data) + 1) * log1p(-1. / Nlive)) - log1p(-1. / Nlive)
    log_vols_end = np.zeros(len(end_data))
    log_vols_end[-1] = np.NINF
    log_vols_end[0] = log_vols_start[-1] + np.log1p(-1.0 / Nlive)
    for i in range(len(end_data) - 1):
        log_vols_end[i + 1] = log_vols_end[i] + np.log1p(-1.0 / (Nlive - i))
    log_likes = concatenate((start_data, end_data, [end_data[-1]]))
    log_vols = concatenate((log_vols_start, log_vols_end))
    log_ev = log_integrate_log_trap(log_likes, log_vols)
    log_dXs = logsubexp(log_vols[:-1], log_vols[1:])
    log_wts = log_likes[1:-1] + log_dXs[:-1]
    log_wts -= log_ev
    return log_ev, log_wts
def exp1(): train,y,test,idx = get_data_1() train = np.log1p(train.astype(float)) test = np.log1p(test.astype(float)) scaler = StandardScaler().fit(train) train = scaler.transform(train) test = scaler.transform(test) mtrain = pd.read_csv('meta_features_train.csv') mtest = pd.read_csv('meta_features_test.csv') scaler2 = StandardScaler().fit(mtrain) mtrain = scaler2.transform(mtrain) mtest = scaler2.transform(mtest) train = np.column_stack((train,mtrain)) test = np.column_stack((test,mtest)) rtrain_nn,rtest_nn = nn_features(train,y,test,model=build_nn2,random_state=1,n_folds=5,early_stop=50) rtrain_nn_total = rtrain_nn rtest_nn_total = rtest_nn for i in range(9): rand_seed = i*113+9201 rtrain_nn,rtest_nn = nn_features(train,y,test,model=build_nn2,random_state=rand_seed,n_folds=5,early_stop=50) rtrain_nn_total += rtrain_nn rtest_nn_total += rtest_nn pd.DataFrame(data=rtrain_nn_total).to_csv('rtrain_nn_last.csv',index=False) pd.DataFrame(data=rtest_nn_total).to_csv('rtest_nn_last.csv',index=False) pd.DataFrame(data=rtrain_nn_total/10).to_csv('rtrain_nn_final.csv',index=False) pd.DataFrame(data=rtest_nn_total/10).to_csv('rtest_nn_final.csv',index=False)
def my_logaddexp(a, b):
    tmp = a - b
    return np.select(
        [a == b, tmp > 0, tmp <= 0],
        [a + 0.69314718055994529,  # log(2): log(e^a + e^a) = a + log(2)
         a + np.log1p(np.exp(-tmp)),
         b + np.log1p(np.exp(tmp))],
        default=tmp)
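# Hedged usage sketch (not from the original source): my_logaddexp is expected to
# agree with numpy's built-in logaddexp; the sample values below are illustrative only.
import numpy as np

a = np.array([0.0, -1.0, 5.0])
b = np.array([0.0, 2.0, -3.0])
assert np.allclose(my_logaddexp(a, b), np.logaddexp(a, b))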
def keras_cv(self, params): """ Using FoldTubeID split, loop over CV to get RMSLE for each split params is a list of parameters for Keras Neural Networks. After finishing CV, run score() to get the results """ self.pred = [] self.real = [] if len(params) == 0: raise ValueError("Please read in parameters") for tr, te in self.cv: self.train = self.trainset.loc[tr, :].copy() self.test = self.trainset.loc[te, :].copy() # Randomize and set seed # np.random.permutation(len(trainp1)) np.random.seed(1) self.train = self.train.iloc[np.random.permutation(len(self.train))] np.random.seed(2) self.test = self.test.iloc[np.random.permutation(len(self.test))] y_real = np.array(self.test.iloc[:, -1]) # Section for training multi-models if you like y_pred = keras_model(self.train, self.test, params) self.pred += [y_pred] self.real += [y_real] self.rmsle_score += [np.sqrt(mean_squared_error(np.log1p(y_real), np.log1p(y_pred)))] print "===========================================================" print "Finished Keras Cross-validation" print "==========================================================="
def getval(self, keys):
    array = tuple(keys)
    array = np.unique(array)
    array_length = len(array)
    rsmleValues_array = []
    count = 0
    maxValue = np.amax(array)
    minValue = np.amin(array)
    for i in array:
        count = 0
        for j in array:
            count = count + (np.log1p(i) - np.log1p(j))**2
        rsmleValues_array.append(np.sqrt(count / array_length))
    count = -1
    index = 0
    min_error_value = rsmleValues_array[0]
    for val in rsmleValues_array:
        count += 1
        if val < min_error_value:
            min_error_value = val
            index = count
    demand = array[index]
    return demand
def logsum_pair(logx, logy):
    """
    Return log(x+y), avoiding arithmetic underflow/overflow.

    logx: log(x)
    logy: log(y)

    Rationale:
        x + y    = e^logx + e^logy
                 = e^logx (1 + e^(logy-logx))
        log(x+y) = logx + log(1 + e^(logy-logx))  (1)

    Likewise,
        log(x+y) = logy + log(1 + e^(logx-logy))  (2)

    The computation of the exponential overflows earlier and is less precise
    for big values than for small values. Due to the presence of logy-logx
    (resp. logx-logy), (1) is preferred when logx > logy and (2) is preferred
    otherwise.
    """
    if logx == logzero():
        return logy
    elif logx > logy:
        return logx + np.log1p(np.exp(logy - logx))
    else:
        return logy + np.log1p(np.exp(logx - logy))
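# Hedged usage sketch (assumption: logzero() returns -inf or an equivalent sentinel,
# as implied by the guard above): log(0.2 + 0.3) recovered from log(0.2) and log(0.3).
import numpy as np

assert np.isclose(logsum_pair(np.log(0.2), np.log(0.3)), np.log(0.5))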
def __init__(self, daily_returns, benchmark_daily_returns, risk_free_rate, days, period=DAILY): assert(len(daily_returns) == len(benchmark_daily_returns)) self._portfolio = daily_returns self._benchmark = benchmark_daily_returns self._risk_free_rate = risk_free_rate self._annual_factor = _annual_factor(period) self._daily_risk_free_rate = self._risk_free_rate / self._annual_factor self._alpha = None self._beta = None self._sharpe = None self._return = np.expm1(np.log1p(self._portfolio).sum()) self._annual_return = (1 + self._return) ** (365 / days) - 1 self._benchmark_return = np.expm1(np.log1p(self._benchmark).sum()) self._benchmark_annual_return = (1 + self._benchmark_return) ** (365 / days) - 1 self._max_drawdown = None self._volatility = None self._annual_volatility = None self._benchmark_volatility = None self._benchmark_annual_volatility = None self._information_ratio = None self._sortino = None self._tracking_error = None self._annual_tracking_error = None self._downside_risk = None self._annual_downside_risk = None self._calmar = None self._avg_excess_return = None
def pdf(self, x: Array, log=False):
    n, d = x.shape
    theta = self.params
    ok = valid_rows_in_u(x)
    log_pdf = np.repeat(np.nan, n)
    if not ok.any():
        return log_pdf
    elif theta == 0:
        log_pdf[ok] = 0
        return log_pdf

    lu = np.log(x).sum(1)
    t = self.ipsi(x).sum(1)

    if theta < 0:  # dim == 2
        pos_t = t < 1
        log_pdf = np.log1p(theta) - (1 + theta) * lu - (d + 1 / theta) * np.log1p(-t)
        log_pdf[~ok] = np.nan
        log_pdf[ok & ~pos_t] = -np.inf
    else:
        p = np.log1p(theta * np.arange(1, d)).sum()
        log_pdf = p - (1 + theta) * lu - (d + 1 / theta) * np.log1p(t)

    return log_pdf if log else np.exp(log_pdf)
def fit(self, X, y):
    N = len(y)
    # num of happy tweets
    N_1 = np.sum(y)
    # num of sad tweets
    N_0 = N - N_1
    # ratio of happy/sad tweet
    Pi_0 = (N_0 + 2 / N)
    Pi_1 = (N_1 + 2 / N)
    # output is an array, N_jc[0] is the count
    # of how many 'obamas' when happy/sad
    N_j0 = (1 - y) * X
    N_j1 = y * X
    Theta_j0 = (N_j0 + 1) / (N_0 + 2)
    Theta_j1 = (N_j1 + 1) / (N_1 + 2)
    logpi = [np.log(Pi_0), np.log(Pi_1)]
    self.logpi = np.array(logpi)
    self.logtheta = np.array([np.log(Theta_j0), np.log(Theta_j1)])
    self.log1theta = np.array([np.log1p(-1 * Theta_j0), np.log1p(-1 * Theta_j1)])
    save_params(self, 'params')
def log_likelihood_state(params, sender, time):
    # params = [theta, A, alpha, delta, epsilon, sigma]
    tol = 1e-24
    theta = float(params[0])
    alpha = float(params[2])
    if min(theta, alpha) < 0:
        ll = -float('inf')
    else:
        (S, X, SX, m1, m2, N) = sufficient_statistics(sender, time)
        if theta < 0:
            theta = 0
        if 1 - theta + tol < 0:
            theta = 1
        puu = alpha * np.log(theta + tol)
        pvv = alpha * np.log(1 - theta + tol)
        puv = np.log1p(-np.exp(alpha * np.log(theta + tol)))
        pvu = np.log1p(-np.exp(alpha * np.log(1 - theta + tol)))
        try:
            ll = (N[0] * puu + N[1] * puv + N[2] * pvu + N[3] * pvv)
        except:
            print 'll error: theta = %s, alpha = %s' % (theta, alpha)
            ll = 0
    return -ll  # take negative for minimization
def _logistic(X, y, w):
    """Compute the logistic function of the data: sum(sigmoid(yXw))

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Design matrix.

    y : ndarray, shape (n_samples,)
        Target / response vector. Each entry must be +1 or -1.

    w : ndarray, shape (n_features,)
        Unmasked, ravelized input map.

    Returns
    -------
    energy : float
        Energy contribution due to logistic data-fit term.
    """
    z = np.dot(X, w[:-1]) + w[-1]
    yz = y * z
    idx = yz > 0
    out = np.empty_like(yz)
    out[idx] = np.log1p(np.exp(-yz[idx]))
    out[~idx] = -yz[~idx] + np.log1p(np.exp(yz[~idx]))
    out = out.sum()
    return out
def stitch(record1, record2): seq1 = array([record1.seq.tostring()]) seq2 = array([reverse_complement(record2.seq.tostring())]) seq1.dtype = '|S1' seq2.dtype = '|S1' quals1 = array(record1.letter_annotations['phred_quality']) quals2 = array(record2.letter_annotations['phred_quality'][::-1]) log10p_consensus_1 = log1p(-power(10, -quals1 / 10.)) / log(10) log10p_consensus_2 = log1p(-power(10, -quals2 / 10.)) / log(10) log10p_error_1 = -log10(3) - (quals1 / 10.) log10p_error_2 = -log10(3) - (quals2 / 10.) min_overlap = 1 max_overlap = max(len(record1), len(record2)) overlaps = {} for overlap in range(1, max_overlap): s1 = seq1[-overlap:] s2 = seq2[:overlap] q1 = quals1[-overlap:] q2 = quals2[:overlap] lpc1 = log10p_consensus_1[-overlap:] lpc2 = log10p_consensus_2[:overlap] lpe1 = log10p_error_1[-overlap:] lpe2 = log10p_error_2[:overlap] consensus = choose(q1 < q2, [s1, s2]) score = sum(choose(consensus == s1, [lpe1, lpc1])) + sum(choose(consensus == s2, [lpe2, lpc2])) + len(consensus) * log10(4) * 2 # last term is null hypothesis, p=1/4 consensus.dtype = '|S%i' % len(consensus) overlaps[overlap] = (consensus[0],score) return overlaps
def ndcg_at_k( rating_true, rating_pred, col_user=DEFAULT_USER_COL, col_item=DEFAULT_ITEM_COL, col_rating=DEFAULT_RATING_COL, col_prediction=DEFAULT_PREDICTION_COL, relevancy_method="top_k", k=DEFAULT_K, threshold=DEFAULT_THRESHOLD, ): """Normalized Discounted Cumulative Gain (nDCG). Info: https://en.wikipedia.org/wiki/Discounted_cumulative_gain Args: rating_true (pd.DataFrame): True DataFrame rating_pred (pd.DataFrame): Predicted DataFrame col_user (str): column name for user col_item (str): column name for item col_rating (str): column name for rating col_prediction (str): column name for prediction relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold'] k (int): number of top k items per user threshold (float): threshold of top items per user (optional) Returns: float: nDCG at k (min=0, max=1). """ df_hit, df_hit_count, n_users = merge_ranking_true_pred( rating_true=rating_true, rating_pred=rating_pred, col_user=col_user, col_item=col_item, col_rating=col_rating, col_prediction=col_prediction, relevancy_method=relevancy_method, k=k, threshold=threshold, ) if df_hit.shape[0] == 0: return 0.0 # calculate discounted gain for hit items df_dcg = df_hit.copy() # relevance in this case is always 1 df_dcg["dcg"] = 1 / np.log1p(df_dcg["rank"]) # sum up discount gained to get discount cumulative gain df_dcg = df_dcg.groupby(col_user, as_index=False).agg({"dcg": "sum"}) # calculate ideal discounted cumulative gain df_ndcg = pd.merge(df_dcg, df_hit_count, on=[col_user]) df_ndcg["idcg"] = df_ndcg["actual"].apply( lambda x: sum(1 / np.log1p(range(1, min(x, k) + 1))) ) # DCG over IDCG is the normalized DCG return (df_ndcg["dcg"] / df_ndcg["idcg"]).sum() / n_users
def Devroye(N, dvc, delta):
    tol = 1e-5
    prev_result = 0.5
    result = np.sqrt((4 * prev_result * (1 + prev_result + np.log1p(4 / delta) + np.log1p(N) * 2 * dvc)) / (2 * N))
    while abs(result - prev_result) > tol:
        prev_result = result
        result = np.sqrt((4 * prev_result * (1 + prev_result + np.log1p(4 / delta) + np.log1p(N) * 2 * dvc)) / (2 * N))
    return result
def preprocess(df):
    df = create_datetime_features(df)
    df['period'] = df.datetime.map(calculate_period)
    if 'count' in df.columns:
        df['log_count'] = np.log1p(df['count'])
        df['log_registered'] = np.log1p(df['registered'])
        df['log_casual'] = np.log1p(df['casual'])
    return df
def rmsle(pred, ans):
    """
    [list of ints], [list of ints] -> float

    Calculate the RMS Log Error between a set of predictions, and their
    corresponding answers.
    """
    no_samps = float(len(pred))
    err = math.sqrt(
        1.0 / no_samps *
        np.sum((np.log1p(np.float64(pred)) - np.log1p(np.float64(ans)))**2.0))
    return err
def rmsle(y_true, y_pred):
    loss_sum = 0
    loss_count = 0
    for t, p in zip(y_true.values, y_pred):
        loss_sum += (np.log1p(t[0]) - np.log1p(p))**2
        loss_count += 1
    return np.sqrt(loss_sum / loss_count)
def test_correct(self):
    self.assertAllClose(
        self.evaluate(tfp.vi.modified_gan(self._logu)),
        np.log1p(self._u) - self._logu)

    self.assertAllClose(
        self.evaluate(tfp.vi.modified_gan(self._logu, self_normalized=True)),
        np.log1p(self._u) - self._logu + 0.5 * (self._u - 1))
def evalerror(preds, dtrain):
    labels = dtrain.get_label()
    n = len(labels)
    preds = np.log1p(np.power(preds, 16.0))
    labels = np.log1p(np.power(labels, 16.0))
    delta_error = (preds - labels)
    error_metric = np.sqrt((pow(np.linalg.norm(delta_error), 2)) / n)
    return 'error', error_metric
def test_correct(self):
    with self.test_session():
        self.assertAllClose(
            cd.modified_gan(self._logu).eval(),
            np.log1p(self._u) - self._logu)

        self.assertAllClose(
            cd.modified_gan(self._logu, self_normalized=True).eval(),
            np.log1p(self._u) - self._logu + 0.5 * (self._u - 1))
def var_transform_log(df, var_name):
    col = np.array(df[var_name])
    if col.min() >= 0:
        col_log = np.log1p(col)
        df[var_name + "_Log"] = col_log
    elif col.max() <= 0:
        col_log = np.log1p(-col)
        df[var_name + "_NegLog"] = col_log
    return df
def test_run(fn, features, type): """ load dataset, build feature set, and do learning Parameters ---------- fn: file name of dataset features: a list of list, each of which is a feature list for different models type: str for indicating feature set Returns ------- predictions and feature-engineered dataset are saved to files """ np.set_printoptions(precision=4) print('test_run ' + type) df = load_data(fn) check_df(df) df = feature_engineering(df) print(df.columns) # print(df.head()) # print(df.groupby(['peak_hr'])['cnt'].agg(sum)) y_pred_list = [] for i, est in enumerate(( DecisionTreeRegressor(min_samples_split=20), ExtraTreesRegressor(n_estimators=100, max_depth=None, min_samples_split=1, random_state=1234), RandomForestRegressor(n_estimators=1000, max_depth=15, random_state=1234, min_samples_split=3, n_jobs=-1), GradientBoostingRegressor(n_estimators=150, max_depth=10, random_state=0, min_samples_leaf=20, learning_rate=0.1, subsample=0.7, loss='ls'), svm.SVR(C=30) )): # print(features[i]) df, X_train, X_test, y_train, y_test, y_train_cas, y_test_cas, y_train_reg, y_test_reg, time_test = split_data(df, features=features[i]) y_pred, mse = predict_evaluate(est, X_train, y_train, X_test, y_test) est_name = str(est).split('(')[0] print(type, est_name, np.round(mse, 4)) """ feature importance if est_name != 'SVR': # print out feature importance sfi = sorted([(x[0], float('%.4f'%x[1])) for x in zip(features[i], est.feature_importances_)], key=lambda x: x[1], reverse=True) print(sfi) print([x[0] for x in sfi]) """ y_pred_list.append([est_name, mse, y_pred]) # blending models y_pred_blend = np.log1p(.2*(np.exp(y_pred_list[2][2])-1) + .8*(np.exp(y_pred_list[3][2])-1)) print(type+' blending: 0.2*'+y_pred_list[2][0]+' + 0.8*'+y_pred_list[3][0], metrics.mean_squared_error(y_test, y_pred_blend).round(4)) y_pred_blend = np.log1p(.3*(np.exp(y_pred_list[1][2])-1) + .7*(np.exp(y_pred_list[3][2])-1)) print(type+' blending: 0.3*'+y_pred_list[1][0]+' + 0.7*'+y_pred_list[3][0], metrics.mean_squared_error(y_test, y_pred_blend).round(4)) y_pred_blend = np.log1p(.3*(np.exp(y_pred_list[3][2])-1) + .7*(np.exp(y_pred_list[4][2])-1)) print(type+ ' blending: 0.2*'+y_pred_list[3][0]+' + 0.8*'+y_pred_list[4][0], metrics.mean_squared_error(y_test, y_pred_blend).round(4)) y_pred_blend = np.log1p(.6*(np.exp(y_pred_list[3][2])-1) + .4*(np.exp(y_pred_list[4][2])-1)) print(type+ ' blending: 0.6*'+y_pred_list[3][0]+' + 0.4*'+y_pred_list[4][0], metrics.mean_squared_error(y_test, y_pred_blend).round(4)) dff = pd.DataFrame({'datetime': time_test[:, 0], 'mnth': time_test[:, 1], 'hr': time_test[:, 2], 'cnt': np.expm1(y_test), 'prediction': y_pred_blend}) dff.to_csv('../output/prediction_blended.csv', index = False, columns=['datetime', 'mnth', 'hr', 'cnt', 'prediction']) print('blended predictions saved in ../output/prediction_blended.csv') df.to_csv('../data/hour_ext.csv') print('extended dataset saved in ../data/hour_ext.csv')
def inspect_zeros(trainer, filedir, inspect=None, FIGWIDTH=FIGWIDTH, FIGHEIGHT=FIGHEIGHT): '''Produce side-by-side log histograms.''' plt.close() complete = [] D = trainer.now.copy() if not inspect: inspect = D.columns.tolist() save_this_directory = filedir + '/{}'.format(trainer.name) save_this_here = save_this_directory + '/zeros' try: os.mkdir(filedir) except: pass try: os.mkdir(save_this_directory) except: pass try: os.mkdir(save_this_here) except: pass for feature in inspect: print('Inspect {} for Zeros'.format(feature)) plt.close() for x in inspect: if x != feature and (x, feature) not in complete: compare = (x, feature) complete += [compare] try: fig, axs = plt.subplots(figsize=(FIGWIDTH, FIGHEIGHT)) np.log1p(D[D[feature] == 0][x]).hist(bins=30, label ='{} == 0'.format(feature), normed=True) np.log1p(D[D[feature] > 0][x]).hist(bins=30, label ='{} > 0'.format(feature), normed=True).legend(loc='upper right') t = "Log {} | {} = Zero.".format(x, feature) plt.title(t) axs.grid(False) doc = '{}/{}.png'.format(save_this_here,'inspect_{}_when_{}_zero'.format(x, feature)) plt.tight_layout() plt.savefig(doc) except: plt.close('all') pass plt.close() fig, axs = plt.subplots(figsize=(FIGWIDTH, FIGHEIGHT)) tag = '{}_pairplot_when_zero'.format(feature) t = "General Distribution | {} = Zero.".format(feature) doc = '{}/{}.png'.format(save_this_here, tag) try: g = sns.pairplot(data=D[D[feature]==0][inspect].dropna(), hue = trainer.target, palette="Set1", ax=axs) plt.title(t) plt.tight_layout() fig.savefig(doc) except: plt.close('all') pass plt.close('all')
def _calc_Jeff(inds, l_x, J):
    """
    Coupling between two indices
    """
    x, y = inds / l_x, inds % l_x
    dist = np.abs(x[1:] - x[:-1]) + np.abs(y[1:] - y[:-1])
    res = np.tanh(J) ** dist
    res = .5 * np.log1p(res) - .5 * np.log1p(-res)
    return res
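# Hedged note (not from the original source): the final two lines compute
# 0.5*log(1+res) - 0.5*log(1-res), i.e. arctanh(res), so the effective coupling
# equals np.arctanh(np.tanh(J) ** dist) written in a log1p-stable form.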
def highly_variable_genes(adata, min_disp=None, max_disp=None, min_mean=None, max_mean=None, n_top_genes=None, n_bins=20, flavor='seurat', binning_method='equal_width', subset=False, inplace=True): """Annotate highly variable genes [Satija15]_ [Zheng17]_. Expects logarithmized data. Depending on `flavor`, this reproduces the R-implementations of Seurat [Satija15]_ and Cell Ranger [Zheng17]_. The normalized dispersion is obtained by scaling with the mean and standard deviation of the dispersions for genes falling into a given bin for mean expression of genes. This means that for each bin of mean expression, highly variable genes are selected. Parameters ---------- adata : :class:`~anndata.AnnData` The annotated data matrix of shape `n_obs` × `n_vars`. Rows correspond to cells and columns to genes. min_mean : `float`, optional (default: 0.0125) If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the normalized dispersions are ignored. max_mean : `float`, optional (default: 3) If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the normalized dispersions are ignored. min_disp : `float`, optional (default: 0.5) If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the normalized dispersions are ignored. max_disp : `float`, optional (default: `None`) If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the normalized dispersions are ignored. n_top_genes : `int` or `None`, optional (default: `None`) Number of highly-variable genes to keep. n_bins : `int`, optional (default: 20) Number of bins for binning the mean gene expression. Normalization is done with respect to each bin. If just a single gene falls into a bin, the normalized dispersion is artificially set to 1. You'll be informed about this if you set `settings.verbosity = 4`. flavor : `{'seurat', 'cell_ranger'}`, optional (default: 'seurat') Choose the flavor for computing normalized dispersion. In their default workflows, Seurat passes the cutoffs whereas Cell Ranger passes `n_top_genes`. binning_method : `{'equal_width', 'equal_frequency'}`, optional (default: 'equal_width') Choose the binning method for the means. In `equal_width`, each bin covers the same width. For `equal_frequency`, each bin has an equal number of genes. subset : `bool`, optional (default: `False`) Inplace subset to highly-variable genes if `True` otherwise merely indicate highly variable genes. inplace : `bool`, optional (default: `True`) Whether to place calculated metrics in `.var` or return them. Returns ------- :class:`~numpy.recarray`, `None` Depending on `inplace` returns calculated metrics (:class:`~numpy.recarray`) or updates `.var` with the following fields * `highly_variable` - boolean indicator of highly-variable genes * `means` - means per gene * `dispersions` - dispersions per gene * `dispersions_norm` - normalized dispersions per gene Notes ----- This function replaces :func:`~scanpy.pp.filter_genes_dispersion`. 
""" logg.msg('extracting highly variable genes', r=True, v=4) if not isinstance(adata, AnnData): raise ValueError( '`pp.highly_variable_genes` expects an `AnnData` argument, ' 'pass `inplace=False` if you want to return a `np.recarray`.') if n_top_genes is not None and not all([ min_disp is None, max_disp is None, min_mean is None, max_mean is None ]): logg.info('If you pass `n_top_genes`, all cutoffs are ignored.') if min_disp is None: min_disp = 0.5 if min_mean is None: min_mean = 0.0125 if max_mean is None: max_mean = 3 X = np.expm1(adata.X) if flavor == 'seurat' else adata.X mean, var = materialize_as_ndarray(_get_mean_var(X)) # now actually compute the dispersion mean[mean == 0] = 1e-12 # set entries equal to zero to small value dispersion = var / mean if flavor == 'seurat': # logarithmized mean as in Seurat dispersion[dispersion == 0] = np.nan dispersion = np.log(dispersion) mean = np.log1p(mean) # all of the following quantities are "per-gene" here df = pd.DataFrame() df['mean'] = mean df['dispersion'] = dispersion if flavor == 'seurat': if binning_method == 'equal_width': df['mean_bin'] = pd.cut(df['mean'], bins=n_bins) elif binning_method == 'equal_frequency': df['mean_bin'] = pd.qcut(df['mean'], q=n_bins, duplicates='drop') else: raise ValueError( '`binning_method` needs to be "equal_width" or "equal_frequency"' ) disp_grouped = df.groupby('mean_bin')['dispersion'] disp_mean_bin = disp_grouped.mean() disp_std_bin = disp_grouped.std(ddof=1) # retrieve those genes that have nan std, these are the ones where # only a single gene fell in the bin and implicitly set them to have # a normalized disperion of 1 one_gene_per_bin = disp_std_bin.isnull() gen_indices = np.where( one_gene_per_bin[df['mean_bin'].values])[0].tolist() if len(gen_indices) > 0: logg.msg( 'Gene indices {} fell into a single bin: their ' 'normalized dispersion was set to 1.\n ' 'Decreasing `n_bins` will likely avoid this effect.'.format( gen_indices), v=4) # Circumvent pandas 0.23 bug. Both sides of the assignment have dtype==float32, # but there’s still a dtype error without “.value”. disp_std_bin[one_gene_per_bin.values] = disp_mean_bin[ one_gene_per_bin.values].values disp_mean_bin[one_gene_per_bin.values] = 0 # actually do the normalization df['dispersion_norm'] = (( df['dispersion'].values # use values here as index differs - disp_mean_bin[df['mean_bin'].values].values) / disp_std_bin[df['mean_bin'].values].values) elif flavor == 'cell_ranger': from statsmodels import robust df['mean_bin'] = pd.cut( df['mean'], np.r_[-np.inf, np.percentile(df['mean'], np.linspace(10, 100, n_bins - 1)), np.inf]) disp_grouped = df.groupby('mean_bin')['dispersion'] disp_median_bin = disp_grouped.median() # the next line raises the warning: "Mean of empty slice" with warnings.catch_warnings(): warnings.simplefilter('ignore') disp_mad_bin = disp_grouped.apply(robust.mad) df['dispersion_norm'] = ( np.abs(df['dispersion'].values - disp_median_bin[df['mean_bin'].values].values) / disp_mad_bin[df['mean_bin'].values].values) else: raise ValueError('`flavor` needs to be "seurat" or "cell_ranger"') dispersion_norm = df['dispersion_norm'].values.astype('float32') if n_top_genes is not None: dispersion_norm = dispersion_norm[~np.isnan(dispersion_norm)] dispersion_norm[::-1].sort( ) # interestingly, np.argpartition is slightly slower disp_cut_off = dispersion_norm[n_top_genes - 1] gene_subset = np.nan_to_num( df['dispersion_norm'].values) >= disp_cut_off logg.msg( 'the {} top genes correspond to a normalized dispersion cutoff of'. 
format(n_top_genes, disp_cut_off), v=5, ) else: max_disp = np.inf if max_disp is None else max_disp dispersion_norm[np.isnan(dispersion_norm)] = 0 # similar to Seurat gene_subset = np.logical_and.reduce(( mean > min_mean, mean < max_mean, dispersion_norm > min_disp, dispersion_norm < max_disp, )) logg.msg(' finished', time=True, v=4) if inplace or subset: logg.hint('added\n' ' \'highly_variable\', boolean vector (adata.var)\n' ' \'means\', float vector (adata.var)\n' ' \'dispersions\', float vector (adata.var)\n' ' \'dispersions_norm\', float vector (adata.var)') adata.var['highly_variable'] = gene_subset adata.var['means'] = df['mean'].values adata.var['dispersions'] = df['dispersion'].values adata.var['dispersions_norm'] = df['dispersion_norm'].values.astype( 'float32', copy=False) if subset: adata._inplace_subset_var(gene_subset) else: arrays = (gene_subset, df['mean'].values, df['dispersion'].values, df['dispersion_norm'].values.astype('float32', copy=False)) dtypes = [ ('highly_variable', np.bool_), ('means', 'float32'), ('dispersions', 'float32'), ('dispersions_norm', 'float32'), ] return np.rec.fromarrays(arrays, dtype=dtypes)
def _cdf(self, x, p):
    k = floor(x)
    return -expm1(log1p(-p) * k)
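# Hedged check (assumption: this _cdf implements the geometric-style CDF
# 1 - (1-p)**floor(x), written with expm1/log1p for numerical stability).
import numpy as np

p = 0.5
assert np.isclose(-np.expm1(np.log1p(-p) * np.floor(3.0)), 1 - (1 - p) ** 3)  # = 0.875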
def _ppf(self, q, lambda_):
    vals = ceil(-1.0 / lambda_ * log1p(-q) - 1)
    vals1 = (vals - 1).clip(self.a, np.inf)
    temp = self._cdf(vals1, lambda_)
    return np.where(temp >= q, vals1, vals)
def std_income(df: pd.DataFrame) -> None:
    """Change "$84,835.00 " to float; then get log1p"""
    df[cst.H_INCOME] = np.log1p(
        df[cst.H_INCOME].map(lambda s: float(s[1:-1].replace(',', ''))
                             if isinstance(s, str) else np.nan))
    df[cst.H_INCOME].fillna(df[cst.H_INCOME].mean(), inplace=True)
def update_map(region, color_var, size_var, map_layout_data): print(region, color_var, size_var) if region: trd_selection = trd[trd.CONJ.isin(region)] color_norm = trd_selection[color_var].clip( 0, trd_selection[color_var].quantile(0.9)) if size_var == 'FIX_SIZE': size_norm = 10 trd_selection['FIX_SIZE'] = trd_selection[color_var] else: trd_max = trd_selection[size_var].quantile(0.95) size_norm = np.log1p(trd_selection[size_var] / trd_max) * 30 size_norm.clip(6, 25, inplace=True) info = trd_selection.FIC.map('<b>Frec Corte:</b> {:,.2f}'.format) + \ trd_selection.DIC.map('<br><b>Dur Corte:</b> {:,.2f}'.format) + \ trd_selection.ENE_12.map('<br><b>Consumo:</b> {:,.2f}'.format) map_data = [ go.Scattermapbox( lat=trd_selection.lat, lon=trd_selection.lon, text=info, hoverinfo='text', mode='markers', marker=dict(size=size_norm, color=color_norm, colorscale='RdBu', showscale=True, opacity=0.7), ) ] side_graph = dcc.Graph( figure=go.Figure(data=[ go.Line(x=trd_selection[color_var], y=trd_selection[size_var], mode='markers', name='Correlacion') ], layout=go.Layout( title='Correlacion Entre Variables', margin=dict(l=20, t=50, b=20, r=20), ))) if map_layout_data: print(map_layout_data) print(map_layout_data.keys()) if 'mapbox.center' in map_layout_data.keys(): # Lock Camera Position cam_lat = float(map_layout_data['mapbox.center']['lat']) cam_lon = float(map_layout_data['mapbox.center']['lon']) cam_zoom = float(map_layout_data['mapbox.zoom']) map_layout.mapbox.center.lat = cam_lat map_layout.mapbox.center.lon = cam_lon map_layout.mapbox.zoom = cam_zoom else: map_data = [go.Scattermapbox(lat=[], lon=[], mode='markers')] side_graph = [] return dict(data=map_data, layout=map_layout), side_graph
params = { "objective": "reg:linear", "booster": "gbtree", "eta": 0.3, "max_depth": 10, "subsample": 0.9, "colsample_bytree": 0.7, "silent": 1, "seed": 1301 } num_boost_round = 300 print("Train a XGBoost model") X_train, X_valid = train_test_split(train, test_size=0.5, random_state=10) y_train = np.log1p(X_train.unit_sales) y_valid = np.log1p(X_valid.unit_sales) dtrain = xgb.DMatrix(X_train[features], y_train) dvalid = xgb.DMatrix(X_valid[features], y_valid) watchlist = [(dtrain, 'train'), (dvalid, 'eval')] model_xgb = xgb.train(params, dtrain, num_boost_round, evals=watchlist, \ early_stopping_rounds=20, verbose_eval=True) create_feature_map(features) importance = model_xgb.get_fscore(fmap='xgb.fmap') print(importance) #------------------------------------------------------------------------------------- #Load test #test = valid
# 1.找出最接近的norm分布曲线 sns.distplot(train['SalePrice'], fit=norm) plt.title('SalePrice before normalized') (mu, sigma) = norm.fit(train['SalePrice']) print('正态化之前房价的分布拟合:') print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma)) plt.legend( ['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)], loc='best') plt.show() # 2.用QQ图判断数据是否为正态分布,蓝点和红线越重合就越符合正态分布 fig, ax = plt.subplots(1, 1, figsize=(12, 8)) stats.probplot(train['SalePrice'], plot=ax) plt.show() # 对房价取log让它趋近于正态分布 train['SalePrice'] = np.log1p(train['SalePrice']) # 3.变换后的房价曲线 sns.distplot(train['SalePrice'], fit=norm) plt.title('SalePrice after normalized') (mu, sigma) = norm.fit(train['SalePrice']) print('正态化之后房价的分布拟合:') print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma)) plt.legend( ['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)], loc='best') plt.show() # 先合并找出缺失比例最多的前20名 n_train = train.shape[0]
# print missing_data.head(20) df_train = train_df.drop((missing_data[missing_data['Total'] > 250]).index, 1) # print df_train.isnull().sum().max() #deleting points df_train.sort_values(by='GrLivArea', ascending=False)[:2] df_train = df_train.drop(df_train[df_train['Id'] == 1299].index) df_train = df_train.drop(df_train[df_train['Id'] == 524].index) # concat函数相当于拼接,拼接方式是增加行数,不增加列数 all_data = pd.concat((df_train.loc[:, 'MSSubClass':'SaleCondition'], test_df.loc[:, 'MSSubClass':'SaleCondition'])) #log transform the target: df_train["SalePrice"] = np.log1p(df_train["SalePrice"]) size_mapping = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1} size_mapping2 = {'Ex': 6, 'Gd': 5, 'TA': 4, 'Fa': 3, 'Po': 2, 'NA': 1} all_data['ExterQual'] = all_data['ExterQual'].map(size_mapping) all_data['ExterCond'] = all_data['ExterCond'].map(size_mapping) all_data['BsmtQual'] = all_data['BsmtQual'].map(size_mapping2) all_data['BsmtCond'] = all_data['BsmtCond'].map(size_mapping2) all_data['HeatingQC'] = all_data['HeatingQC'].map(size_mapping) all_data['KitchenQual'] = all_data['KitchenQual'].map(size_mapping) all_data['GarageQual'] = all_data['GarageQual'].map(size_mapping2) all_data['GarageCond'] = all_data['GarageCond'].map(size_mapping2) all_data = pd.get_dummies(all_data)
def param_var(self, alpha):
    import scipy.stats as stats
    log_return = np.log1p(self._portfolio)
    mean = np.mean(log_return)
    std = np.std(log_return)
    return np.expm1(-stats.norm(mean, std).ppf(alpha))
def log_alpha(values):
    vmin = values.min()
    alpha = np.log1p(values - vmin)
    return alpha / alpha.max() * 0.9 + 0.1
def root_mean_squared_logarithmic_error(true, pred):
    return np.sqrt(mean_squared_error(np.log1p(true), np.log1p(pred)))
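# Hedged usage sketch (not from the original source): for non-negative inputs this
# helper should agree with sklearn's mean_squared_log_error(..., squared=False);
# the sample values mirror the docstring example of mean_squared_log_error further below.
import numpy as np
from sklearn.metrics import mean_squared_error, mean_squared_log_error

y_true = [3, 5, 2.5, 7]
y_pred = [2.5, 5, 4, 8]
assert np.isclose(root_mean_squared_logarithmic_error(y_true, y_pred),
                  mean_squared_log_error(y_true, y_pred, squared=False))  # both ~= 0.199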
y_train = target.iloc[mask[0]].astype('float32').values.reshape(-1, 1) X_test = train.iloc[mask[1]].drop(['tube_assembly_id'], axis=1) y_test = target.iloc[mask[1]].astype('float32').values.reshape(-1, 1) X_train_ = preprocess.fit_transform( X_train ).astype('float32').values X_test_ = preprocess.transform( X_test ).astype('float32').values X_val_ = preprocess.transform( X_val ).astype('float32').values test_ = preprocess.transform( test ).astype('float32').values X_train_, y_train_ = sklearn.utils.shuffle(X_train_, y_train, random_state=random_state) directory = os.path.join(cwd, 'stage2', name, '%i'%(iparam), fold_outer_dir, fold_inner_dir) reg = sklearn.clone(nnet_cater3) # reg.fit(X_train_, y_train_, X_test_, y_test) # reg.fit(X_train_, np.log1p(y_train_), X_test_, np.log1p(y_test)) reg.fit(X_train_, np.log1p(y_train_)) y_pred_test = np.expm1( reg.predict( X_test_ ) ) print('RMSLE (%5s)= %.5f'%( epoch_save_range[-1], root_mean_squared_logarithmic_error( y_test, y_pred_test ) )) print(''.join(['-']*90)) print('refit on train_val set') X_train_val_ = preprocess.fit_transform( X_train_val ) X_val_ = preprocess.transform( X_val ) X_train_val_, y_train_val_ = sklearn.utils.shuffle(X_train_val_, y_train_val, random_state=random_state) directory = os.path.join(cwd, 'stage2', name, '%i', fold_outer_dir, refit_train_val_dir) # os.makedirs(directory) nn_params = {} nn_params.update(core_params) nn_params.update(params)
def plot_series(self, omic1=OMIC.transcriptomic, omic2=OMIC.proteomic, var_names1='auto', var_names2='auto', log1=True, log2=True, fontsize=10, title='', return_figure=False): r""" Plot lines of 2 OMICs sorted in ascending order of `omic1` """ import seaborn as sns ## prepare omic1 = OMIC.parse(omic1) omic2 = OMIC.parse(omic2) omic1_ids = self.get_var_indices(omic1) omic2_ids = self.get_var_indices(omic2) if isinstance(var_names1, string_types) and var_names1 == 'auto': var_names1 = omic1.markers if isinstance(var_names2, string_types) and var_names2 == 'auto': var_names2 = omic2.markers ## filtering variables ids1 = [] ids2 = [] for v1, v2 in zip(var_names1, var_names2): i1 = omic1_ids.get(v1, None) i2 = omic2_ids.get(v2, None) if i1 is not None and i2 is not None: ids1.append(i1) ids2.append(i2) assert len(ids1) > 0, \ (f"No variables found for omic1={omic1} var1={var_names1} " f"and omic2={omic2} var2={var_names2}") x1 = self.get_omic(omic1)[:, ids1] x2 = self.get_omic(omic2)[:, ids2] if log1: x1 = np.log1p(x1) if log2: x2 = np.log1p(x2) names1 = self.get_var_names(omic1)[ids1] names2 = self.get_var_names(omic2)[ids2] n_series = len(names1) ### prepare the plot colors = sns.color_palette(n_colors=2) fig = plt.figure(figsize=(12, n_series * 4)) for idx in range(n_series): y1 = x1[:, idx] y2 = x2[:, idx] order = np.argsort(y1) ax = plt.subplot(n_series, 1, idx + 1) ## the second series ax.plot(y1[order], linewidth=1.8, color=colors[0], label=f"{omic1.name}-{names1[idx]}") ax.set_ylabel(f"{'log' if log1 else 'raw'}-{omic1.name}-{names1[idx]}", color=colors[0]) ax.set_xlabel(f"Cell in ascending order of {omic1.name}") ax.tick_params(axis='y', colors=colors[0], labelcolor=colors[0]) ax.grid(False) ## the second series ax = ax.twinx() ax.plot(y2[order], linestyle='--', alpha=0.88, linewidth=1.2, color=colors[1]) ax.set_ylabel(f"{'log' if log1 else 'raw'}-{omic2.name}-{names2[idx]}", color=colors[1]) ax.tick_params(axis='y', colors=colors[1], labelcolor=colors[1]) ax.grid(False) ### finalize the figure style if len(title) > 0: plt.suptitle(title, fontsize=fontsize + 2) with catch_warnings_ignore(UserWarning): plt.tight_layout(rect=[0., 0.02, 1., 0.98]) if return_figure: return fig return self.add_figure(f'series_{omic1.name}_{omic2.name}', fig)
def excess_return_rate(self):
    if self._excess_return_rate is None:
        self._excess_return_rate = np.expm1(np.log1p(self._excess_portfolio).sum())
    return self._excess_return_rate
def plot_correlation_scatter(self, omic1=OMIC.transcriptomic, omic2=OMIC.proteomic, var_names1='auto', var_names2='auto', is_marker_pairs=True, log1=True, log2=True, max_scatter_points=200, top=3, bottom=3, title='', return_figure=False): r""" Mapping from omic1 to omic2 Arguments: omic1, omic2 : instance of OMIC. With `omic1` represent the x-axis, and `omic2` represent the y-axis. var_names1 : list of all variable name for `omic1` """ omic1 = OMIC.parse(omic1) omic2 = OMIC.parse(omic2) if isinstance(var_names1, string_types) and var_names1 == 'auto': var_names1 = omic1.markers if isinstance(var_names2, string_types) and var_names2 == 'auto': var_names2 = omic2.markers if var_names1 is None or var_names2 is None: is_marker_pairs = False max_scatter_points = int(max_scatter_points) # get all correlations corr = self.get_correlation(omic1, omic2) corr_map = {(x[0], x[1]): (0 if np.isnan(x[2]) else x[2], 0 if np.isnan(x[3]) else x[3]) for x in corr} om1_names = self.get_var_names(omic1) om2_names = self.get_var_names(omic2) om1_idx = {j: i for i, j in enumerate(om1_names)} om2_idx = {j: i for i, j in enumerate(om2_names)} # extract the data and normalization X1 = self.numpy(omic1) library = np.sum(X1, axis=1, keepdims=True) library = discretizing(library, n_bins=10, strategy='quantile').ravel() if log1: s = np.sum(X1, axis=1, keepdims=True) X1 = np.log1p(X1 / s * np.median(s)) X2 = self.numpy(omic2) if log2: s = np.sum(X2, axis=1, keepdims=True) X2 = np.log1p(X2 / s * np.median(s)) ### getting the marker pairs all_pairs = [] # coordinate marker pairs if is_marker_pairs: pairs = [(i1, i2) for i1, i2 in zip(var_names1, var_names2) if i1 in om1_idx and i2 in om2_idx] var_names1 = [i for i, _ in pairs] var_names2 = [i for _, i in pairs] # filter omic2 if var_names2 is not None: var_names2 = [i for i in var_names2 if i in om2_names] else: var_names2 = om2_names assert len(var_names2) > 0, \ (f"None of the variables {var_names2} is contained in variable list " f"of OMIC {omic2.name}") nrow = len(var_names2) # filter omic1 if var_names1 is not None: var_names1 = [i for i in var_names1 if i in om1_names] ncol = len(var_names1) assert len(var_names1) > 0, \ (f"None of the variables {var_names1} is contained in variable list " f"of OMIC {omic1.name}") for name2 in var_names2: for name1 in var_names1: all_pairs.append((om1_idx[name1], om2_idx[name2])) else: # top and bottom correlation pairs top = int(top) bottom = int(bottom) ncol = top + bottom # pick all top and bottom of omic1 coordinated to omic2 for name in var_names2: i2 = om2_idx[name] pairs = sorted( [[sum(corr_map[(i1, i2)]), i1] for i1 in range(len(om1_names))]) for _, i1 in pairs[-top:][::-1] + pairs[:bottom][::-1]: all_pairs.append((i1, i2)) ### downsampling scatter points if max_scatter_points > 0: ids = np.random.permutation(len(X1))[:max_scatter_points] else: ids = np.arange(len(X1), dtype=np.int32) ### plotting fig = plt.figure(figsize=(ncol * 2, nrow * 2 + 2), dpi=80) for i, pair in enumerate(all_pairs): ax = plt.subplot(nrow, ncol, i + 1) p, s = corr_map[pair] idx1, idx2 = pair x1 = X1[:, idx1] x2 = X2[:, idx2] crow = i // ncol ccol = i % ncol if is_marker_pairs: color = 'salmon' if crow == ccol else 'blue' else: color = 'salmon' if ccol < top else 'blue' vs.plot_scatter(x=x1[ids], y=x2[ids], color=color, ax=ax, size=library[ids], size_range=(6, 30), legend_enable=False, linewidths=0., cbar=False, alpha=0.3) # additional title for first column ax.set_title(f"{om1_names[idx1]}\n$p={p:.2g}$ $s={s:.2g}$", fontsize=8) # beginning of every 
column if i % ncol == 0: ax.set_ylabel(f"{om2_names[idx2]}", fontsize=8, weight='bold') ## big title plt.suptitle(f"[x:{omic1.name}_y:{omic2.name}]{title}", fontsize=10) fig.tight_layout(rect=[0.0, 0.02, 1.0, 0.98]) ### store and return if return_figure: return fig self.add_figure( f"corr_{omic1.name}{'log' if log1 else 'raw'}_" f"{omic2.name}{'log' if log2 else 'raw'}", fig) return self
#for optimal_model in optimal_models: # print(optimal_model.best_params_) """ Model Selection, Ensembling and Local Validation Result We set parameters here again so that we do not have to rerun the above cell. Hyper parameter tuning is time costly. """ n = round(len(train) * 0.012) X_train = train[n:] X_valid = train[:n] y_train = labels[n:] y_valid = labels[:n] y_train = np.log1p(np.array(y_train, dtype=np.int32)) y_valid = np.log1p(np.array(y_valid, dtype=np.int32)) ## XGBoost Results using optimal parameters selected above params = { "objective": "reg:linear", "booster": "gbtree", "eta": 0.3, "max_depth": 8, "subsample": 0.8, "colsample_bytree": 0.7, "silent": 1, "seed": 3244, "n_estimators": 1000 }
# Isaac Li # 1.25.2018 import time import numpy as np from sklearn.model_selection import KFold from sklearn.metrics import mean_squared_error import function train, test = function.read_file() train["血糖"] = np.log1p(train["血糖"]) train, test = function.add_column(train, test, sqrt=True) train, test = function.transform(train, test) print('\n\nStart...') t0, mses = time.time(), [] train_preds, test_preds = np.zeros(train.shape[0]), np.zeros( (test.shape[0], 5)) predictors = [f for f in test.columns if f not in ['血糖']] kf = KFold(n_splits=5, shuffle=True, random_state=520) for i, (train_index, test_index) in enumerate(kf.split(train)): print(' .{}/5.'.format(i + 1)) train_feat1, train_feat2 = train.iloc[train_index], train.iloc[test_index] gbm = function.settings.model_xgb.fit(train_feat1[predictors], train_feat1['血糖']) predict = gbm.predict(train_feat2[predictors]) base, power, minimum = 1.7, 1, 7 predict = np.expm1(predict)
def mean_squared_log_error(y_true, y_pred, *, sample_weight=None, multioutput="uniform_average", squared=True): """Mean squared logarithmic error regression loss. Read more in the :ref:`User Guide <mean_squared_log_error>`. Parameters ---------- y_true : array-like of shape (n_samples,) or (n_samples, n_outputs) Ground truth (correct) target values. y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs) Estimated target values. sample_weight : array-like of shape (n_samples,), default=None Sample weights. multioutput : {'raw_values', 'uniform_average'} or array-like of shape \ (n_outputs,), default='uniform_average' Defines aggregating of multiple output values. Array-like value defines weights used to average errors. 'raw_values' : Returns a full set of errors when the input is of multioutput format. 'uniform_average' : Errors of all outputs are averaged with uniform weight. squared : bool, default=True If True returns MSLE (mean squared log error) value. If False returns RMSLE (root mean squared log error) value. Returns ------- loss : float or ndarray of floats A non-negative floating point value (the best value is 0.0), or an array of floating point values, one for each individual target. Examples -------- >>> from sklearn.metrics import mean_squared_log_error >>> y_true = [3, 5, 2.5, 7] >>> y_pred = [2.5, 5, 4, 8] >>> mean_squared_log_error(y_true, y_pred) 0.039... >>> mean_squared_log_error(y_true, y_pred, squared=False) 0.199... >>> y_true = [[0.5, 1], [1, 2], [7, 6]] >>> y_pred = [[0.5, 2], [1, 2.5], [8, 8]] >>> mean_squared_log_error(y_true, y_pred) 0.044... >>> mean_squared_log_error(y_true, y_pred, multioutput='raw_values') array([0.00462428, 0.08377444]) >>> mean_squared_log_error(y_true, y_pred, multioutput=[0.3, 0.7]) 0.060... """ y_type, y_true, y_pred, multioutput = _check_reg_targets( y_true, y_pred, multioutput) check_consistent_length(y_true, y_pred, sample_weight) if (y_true < 0).any() or (y_pred < 0).any(): raise ValueError("Mean Squared Logarithmic Error cannot be used when " "targets contain negative values.") return mean_squared_error( np.log1p(y_true), np.log1p(y_pred), sample_weight=sample_weight, multioutput=multioutput, squared=squared, )
def load_adnimerge(remove_outliers=False, outcome="ADAS13"): adni = pd.read_csv(_ADNIMERGE_PATH, low_memory=False) baseline = adni.sort_values( by=["PTID", "M"]).groupby("PTID").first().set_index("RID") assert baseline.index.is_unique baseline.loc[:, "PTGENDER"] = baseline.loc[:, "PTGENDER"] #.replace({"Female": 0, "Male": 1}) baseline.loc[:, "EDU-ATTAIN"] = pd.cut( baseline.PTEDUCAT, bins=[0, 12, 16, np.infty], labels=["less_or_equal_12", "12-16", "more_than_16"], right=True, ) LOG.info("\n%s\n", baseline.loc[:, "EDU-ATTAIN"].value_counts()) log_cols = ["PTAU", "TAU"] for col, series in baseline.loc[:, log_cols].iteritems(): baseline.loc[:, col] = np.log1p(series.values) features_sum = {} for col in Volumes.VOLUMES_LR: features_sum[col] = baseline.loc[:, [f"Left-{col}", f"Right-{col}"]].sum( axis=1) # Sum all CC volumes features_sum["CC"] = baseline.loc[:, Volumes.VOLUMES_CC].sum(axis=1) features_sum["Ventricle"] = baseline.loc[:, Volumes.VOLUMES_VENTRICLE].sum( axis=1) for col in Volumes.THICKNESS: features_sum[col] = baseline.loc[:, [f"lh_{col}", f"rh_{col}"]].mean( axis=1) features_sum = pd.DataFrame.from_dict(features_sum) mri_features = pd.concat( (baseline.loc[:, Volumes.VOLUMES_SINGLE], features_sum), axis=1).dropna(axis=0) sd = mri_features.std(ddof=1) assert (sd > 1e-6).all(), "features with low variance:\n{}".format( sd[sd <= 1e-6]) eTIV = mri_features.loc[:, "eTIV"] mri_features.drop("eTIV", axis=1, inplace=True) mri_features.loc[:, Volumes.VOLUMES_LR] = mri_features.loc[:, Volumes. VOLUMES_LR].div( eTIV, axis=0) if remove_outliers: mri_features = drop_outliers(mri_features) is_atn = baseline.loc[:, "ATN_status"].isin( ["A+/T+/N+", "A+/T+/N-", "A+/T-/N-"]) has_outcome = baseline.loc[:, outcome].notnull() positive_outcome = baseline.loc[:, outcome] > 0 LOG.info("Dropping %d with missing or zero %s\n", baseline.shape[0] - positive_outcome.sum(), outcome) data = baseline.loc[is_atn & has_outcome & positive_outcome, :] y = data.loc[:, outcome].round(0).astype(int) LOG.info("\n%s\n", data.loc[:, "ATN_status"].value_counts()) csf_features = ['ABETA', 'TAU', 'PTAU'] demo_features = [ 'IMAGEUID', 'COLPROT', 'SITE', 'AGE', 'PTGENDER', 'PTEDUCAT', 'EDU-ATTAIN' ] features = pd.concat( (data.loc[:, demo_features], data.loc[:, "ATN_status"], data.loc[:, csf_features], mri_features), axis=1, join="inner") assert features.notnull().all().all() assert y.notnull().all() return features, y
params = {"objective": "reg:linear", "booster": "gbtree", "eta": 0.3, "max_depth": 10, "subsample": 0.9, "colsample_bytree": 0.7, "silent": 1, "seed": 1301 } num_boost_round = 300 print("Train a XGBoost model") X_train, X_valid = train_test_split(train, test_size=0.012, random_state=10) print(X_train.columns) y_train = np.log1p(X_train.Sales) y_valid = np.log1p(X_valid.Sales) dtrain = xgb.DMatrix(X_train[features], y_train) dvalid = xgb.DMatrix(X_valid[features], y_valid) watchlist = [(dtrain, 'train'), (dvalid, 'eval')] gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, \ early_stopping_rounds=100, feval=rmspe_xg, verbose_eval=True) print("Validating") yhat = gbm.predict(xgb.DMatrix(X_valid[features])) error = rmspe(X_valid.Sales.values, np.expm1(yhat)) print('RMSPE: {:.6f}'.format(error)) print("Make predictions on the test set") dtest = xgb.DMatrix(test[features])
def _softsign_ildj_before_reduction(self, y):
    """Inverse log det jacobian, before being reduced."""
    return -2. * np.log1p(-np.abs(y))
'TA': 0, 'Fa': 1, 'Po': 1 }) train_new = alldata[alldata['SalePrice'].notnull()] test_new = alldata[alldata['SalePrice'].isnull()] numeric_features = [ f for f in train_new.columns if train_new[f].dtype != object ] skewed = train_new[numeric_features].apply( lambda x: skew(x.dropna().astype(float))) skewed = skewed[skewed > 0.75] skewed = skewed.index train_new[skewed] = np.log1p(train_new[skewed]) test_new[skewed] = np.log1p(test_new[skewed]) del test_new['SalePrice'] scaler = StandardScaler() scaler.fit(train_new[numeric_features]) scaled = scaler.transform(train_new[numeric_features]) for i, col in enumerate(numeric_features): train_new[col] = scaled[:, i] numeric_features.remove('SalePrice') scaled = scaler.fit_transform(test_new[numeric_features]) for i, col in enumerate(numeric_features): test_new[col] = scaled[:, i]
def _logsf(self, x, p):
    k = floor(x)
    return k * log1p(-p)
test, sub3, how='left', on='ID', ) from scipy.sparse import csr_matrix, vstack train = train.replace(0, np.nan) test = test.replace(0, np.nan) train = pd.concat((train, test), axis=0, ignore_index=True) test['target'] = 0.0 folds = 5 for fold in range(folds): x1, x2, y1, y2 = model_selection.train_test_split(train[col], np.log1p( train.target.values), test_size=0.20, random_state=fold) params = { 'learning_rate': 0.02, 'max_depth': 7, 'boosting': 'gbdt', 'objective': 'regression', 'metric': 'rmse', 'is_training_metric': True, 'feature_fraction': 0.9, 'bagging_fraction': 0.8, 'bagging_freq': 5, 'seed': fold } model = lgb.train(params,
print() # conserve memory #del train_df #return pandas_frame return train_df # Get data pandas_train = pd.read_csv( 'C:/Users/Soomin/Google Drive/01. MSBA/03. Summer 2017/Machine Learning/Project/House Prices/train.csv' ) pandas_train.shape # Log transform the target for official scoring pandas_train.SalePrice = np.log1p(pandas_train.SalePrice) y = pandas_train.SalePrice # << Preprocessing >> # Lotfrontage temp = pandas_train.groupby('Neighborhood', as_index=False)['LotFrontage'].median() temp = temp.rename(columns={"LotFrontage": "LotFrontage2"}) pandas_train = pd.merge(pandas_train, temp, how='left', on='Neighborhood') pandas_train['LotFrontage'][pandas_train['LotFrontage'].isnull( )] = pandas_train['LotFrontage2'][pandas_train['LotFrontage'].isnull()] pandas_train = pandas_train.drop('LotFrontage2', axis=1) # Alley pandas_train["Alley"].fillna("None", inplace=True)
def logsubexp(x, y):
    assert all(x >= y), 'cannot take log of negative number %s - %s' % (str(x), str(y))
    return x + log1p(-exp(y - x))
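# Hedged usage sketch (assumption: log1p/exp here are the numpy ufuncs, imported
# star-style as elsewhere in this file): log(5 - 3) recovered from log(5) and log(3).
import numpy as np

x = np.log(np.array([5.0]))
y = np.log(np.array([3.0]))
assert np.isclose(logsubexp(x, y), np.log(2.0))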
def read_floor_ys(self, output_dim, include_floor_number=None, only_biggest_floor=False, sorted_xs=False, upscale_xs_factor=1, xs_from_biggest_floor=False, floor_always_positive=False, verbose=0): """ :param int output_dim: :param bool include_floor_number: :param bool only_biggest_floor: :param bool sorted_xs: this is useful for plotting (dump-dataset --type plot). Otherwise you probably do not want this, because if your output_dim < len(xs), you might miss important information. Except with upscale_xs_factor, where this again probably makes sense. :param float|int upscale_xs_factor: :param bool xs_from_biggest_floor: False is old behavior, but probably you want to use this (only relevant if not only_biggest_floor) :param bool floor_always_positive: :param int verbose: :return: float values in [-1,1], shape (time,dim) :rtype: numpy.ndarray """ if only_biggest_floor: assert include_floor_number in (None, False) include_floor_number = False if include_floor_number is None: include_floor_number = True floor_multipliers = [] floor_xs = [] floor_xs_upscaled = [] while True: name, channel, data = self.read_entry() if name == "floor1_unpack multiplier": assert len(data) == 1 floor_multipliers.append(data[0]) if name == "floor1_unpack xs": if sorted_xs: data = sorted(data) floor_xs.append(numpy.array(data)) if upscale_xs_factor != 1: import scipy.ndimage data_upscaled = scipy.ndimage.zoom(numpy.array( data, dtype="float32"), zoom=upscale_xs_factor, order=1, mode="nearest") data_upscaled = numpy.round(data_upscaled).astype("int32") assert data_upscaled.shape[0] == len( data) * upscale_xs_factor floor_xs_upscaled.append(data_upscaled) if name == "finish_setup": break assert len(floor_multipliers) == len(floor_xs) > 0 res_float = numpy.zeros((500, output_dim), dtype="float32") num_floors = len(floor_xs) biggest_floor_idx = max(range(num_floors), key=lambda i: len(floor_xs[i])) dim = output_dim if include_floor_number: dim -= 1 if verbose: if verbose >= 5: for i in range(num_floors): print( "Floor %i/%i, multiplier %i, xs: %r" % (i + 1, num_floors, floor_multipliers[i], floor_xs[i])) print( "Biggest floor: %i, len(xs) = %i" % (biggest_floor_idx + 1, len(floor_xs[biggest_floor_idx]))) if dim > len(floor_xs[biggest_floor_idx]): print("Warning: Dim = %i > len(biggest floor xs) = %i" % (dim, len(floor_xs[biggest_floor_idx]))) recent_floor_number = None frame_num = 0 offset_dim = 0 while True: try: name, channel, data = self.read_entry() except EOFError: break if name == "floor_number": recent_floor_number = data[0] assert 0 <= recent_floor_number < len(floor_xs) xs = None factor = None if recent_floor_number is not None: if only_biggest_floor and recent_floor_number != biggest_floor_idx: continue xs = floor_xs_upscaled if floor_xs_upscaled else floor_xs if xs_from_biggest_floor: xs = xs[biggest_floor_idx] if biggest_floor_idx != recent_floor_number: max_big_x = max(floor_xs[biggest_floor_idx]) max_cur_x = max(floor_xs[recent_floor_number]) factor = int(round( float(max_big_x) / float(max_cur_x))) xs = xs // factor xs = numpy.clip(xs, 0, len(data) - 1) else: xs = xs[recent_floor_number] if name in {"floor1 ys", "floor1 final_ys"}: assert recent_floor_number is not None if only_biggest_floor and recent_floor_number != biggest_floor_idx: continue assert len(data) == len(floor_xs[recent_floor_number]) # values [0..255] data_int = numpy.array( data[:dim], dtype="float32") * floor_multipliers[recent_floor_number] if floor_always_positive: # values [0,1.0] data_float = data_int.astype("float32") / 255.0 else: # 
values [-1.0,1.0] data_float = (data_int.astype("float32") - 127.5) / 127.5 frame_float = numpy.zeros((output_dim, ), dtype="float32") offset_dim = 0 if include_floor_number: frame_float[0] = (recent_floor_number + 1.0) / num_floors - 0.5 # (-0.5,0.5) offset_dim = 1 frame_float[offset_dim:offset_dim + data_float.shape[0]] = data_float if frame_num >= res_float.shape[0]: res_float = numpy.concatenate( [res_float, numpy.zeros_like(res_float)], axis=0) res_float[frame_num] = frame_float frame_num += 1 elif name == "floor1 floor": assert recent_floor_number is not None data = numpy.array(data)[xs] # values [0..255] (data is already with multiplier) data_int = numpy.array(data[:dim], dtype="float32") if floor_always_positive: # values [0,1.0] data_float = data_int.astype("float32") / 255.0 else: # values [-1.0,1.0] data_float = (data_int.astype("float32") - 127.5) / 127.5 frame_float = numpy.zeros((output_dim, ), dtype="float32") offset_dim = 0 if include_floor_number: frame_float[0] = (recent_floor_number + 1.0) / num_floors - 0.5 # (-0.5,0.5) offset_dim = 1 frame_float[offset_dim:offset_dim + data_float.shape[0]] = data_float offset_dim += data_float.shape[0] if frame_num >= res_float.shape[0]: res_float = numpy.concatenate( [res_float, numpy.zeros_like(res_float)], axis=0) res_float[frame_num] = frame_float frame_num += 1 elif name == "after_residue": assert recent_floor_number is not None if offset_dim == 0: # no floor before, can happen for some continue assert frame_num > 0 # had floor before assert output_dim >= offset_dim # Could use xs, but instead, this seems more interesting. idxs = numpy.arange(start=0, stop=len(data), step=1) if factor: idxs = idxs // factor # Some hardcoded hyper params here... data = numpy.array(data)[idxs] data = numpy.log1p(numpy.abs(data)) * 0.1 import scipy.ndimage data = scipy.ndimage.zoom(data, zoom=0.5) data = data[:output_dim - offset_dim] res_float[frame_num - 1, offset_dim:offset_dim + data.shape[0]] = data offset_dim = 0 return res_float[:frame_num]
def log1p(obj):
    obj = to_dual(obj)
    return Dual(np.log1p(obj.re), obj.im / (1 + obj.re))
def normalizing(frame):
    normalised_dset = np.log1p(frame)
    return normalised_dset
def read_residue_ys(self, output_dim, scale=1.0, clip_abs_max=None, log1p_abs_space=False, sorted_xs=False, ignore_xs=False, floor_base_factor=1): """ :param int output_dim: :param float scale: :param float clip_abs_max: :param bool log1p_abs_space: :param float floor_base_factor: :param bool sorted_xs: this is useful for plotting (dump-dataset --type plot). Otherwise you probably do not want this, because if your output_dim < len(xs), you might miss important information. :param bool ignore_xs: :return: float values in [-1,1], shape (time,dim) :rtype: numpy.ndarray """ floor_multipliers = [] floor_xs = [] while True: name, channel, data = self.read_entry() if name == "floor1_unpack multiplier": assert len(data) == 1 floor_multipliers.append(data[0]) if name == "floor1_unpack xs": if sorted_xs: data = sorted(data) floor_xs.append(numpy.array(data, dtype="int32")) if name == "finish_setup": break assert len(floor_multipliers) == len(floor_xs) > 0 res_float = numpy.zeros((500, output_dim), dtype="float32") num_floors = len(floor_xs) biggest_floor_idx = max(range(num_floors), key=lambda i: len(floor_xs[i])) recent_floor_number = None frame_num = 0 floor_base = None while True: try: name, channel, data = self.read_entry() except EOFError: break if name == "floor_number": recent_floor_number = data[0] assert 0 <= recent_floor_number < len(floor_xs) idxs = None if recent_floor_number is not None: if ignore_xs: idxs = numpy.arange(start=0, stop=len(data), step=1)[:output_dim] else: idxs = floor_xs[recent_floor_number][:output_dim] # We might be just at the edge (e.g. idx==512 and len(data)==512). idxs = numpy.clip(idxs, 0, len(data) - 1) if name == "floor1 floor": assert recent_floor_number is not None if recent_floor_number != biggest_floor_idx: continue data = numpy.array(data)[idxs] # values [0..255] (data is already with multiplier) data_int = numpy.array(data, dtype="float32") # values [0.0,1.0] data_float = (data_int.astype("float32")) / 255.0 floor_base = data_float if name == "after_residue": assert recent_floor_number is not None if recent_floor_number != biggest_floor_idx: continue data_float = numpy.array(data, dtype="float32") selected_data = data_float[idxs] if not ignore_xs: assert len(selected_data) == len( floor_xs[recent_floor_number]) assert isinstance(selected_data, numpy.ndarray) if log1p_abs_space: selected_data = numpy.log1p(numpy.abs(selected_data)) if floor_base is not None: if log1p_abs_space: selected_data += floor_base * floor_base_factor else: selected_data *= numpy.exp( (floor_base - 1.0) * floor_base_factor) if scale != 1: selected_data *= scale if clip_abs_max is not None and clip_abs_max > 0: selected_data = numpy.clip(selected_data, -clip_abs_max, clip_abs_max) frame_float = numpy.zeros((output_dim, ), dtype="float32") frame_float[0:selected_data.shape[0]] = selected_data if frame_num >= res_float.shape[0]: res_float = numpy.concatenate( [res_float, numpy.zeros_like(res_float)], axis=0) res_float[frame_num] = frame_float frame_num += 1 return res_float[:frame_num]