def mean_absolute_error(y_true, y_pred):
    """
    Mean absolute error and its standard deviation.

    If you need only mean absolute error, use
    :func:`sklearn.metrics.mean_absolute_error`

    Parameters
    ----------
    y_true : array, shape(n_samples,)
        Ground truth scores
    y_pred : array, shape(n_samples,)
        Predicted scores

    Returns
    -------
    mean : float
        mean of absolute errors
    stdev : float
        standard deviation of absolute errors
    """

    # check inputs
    assert_all_finite(y_true)
    y_true = as_float_array(y_true)
    assert_all_finite(y_pred)
    y_pred = as_float_array(y_pred)
    check_consistent_length(y_true, y_pred)

    # calculate errors
    errs = np.abs(y_true - y_pred)
    mean = np.nanmean(errs)
    stdev = np.nanstd(errs)

    return mean, stdev
def set_x_d(self, treatment_var):
    """
    Select the active treatment variable in the multiple-treatment case.

    The chosen column is stored as ``self._d``; the covariate columns
    (optionally extended by the remaining treatment columns) are stored as
    ``self._X``.

    Parameters
    ----------
    treatment_var : str
        Active treatment variable that will be set to d.

    Raises
    ------
    TypeError
        If ``treatment_var`` is not a ``str``.
    ValueError
        If ``treatment_var`` is not listed in ``self.d_cols``.
    """
    if not isinstance(treatment_var, str):
        raise TypeError(
            'treatment_var must be of str type. '
            f'{str(treatment_var)} of type {str(type(treatment_var))} was passed.'
        )
    if treatment_var not in self.d_cols:
        raise ValueError('Invalid treatment_var. '
                         f'{treatment_var} is not in d_cols.')

    if not self.use_other_treat_as_covariate:
        covariate_cols = self.x_cols
    else:
        # note that the following line needs to be adapted in case an
        # intersection of x_cols and d_cols is allowed
        # (see https://github.com/DoubleML/doubleml-for-py/issues/83)
        covariate_cols = self.x_cols + self.d_cols
        covariate_cols.remove(treatment_var)

    # the treatment column must always be finite
    treatment_data = self.data.loc[:, treatment_var]
    assert_all_finite(treatment_data)

    covariate_data = self.data.loc[:, covariate_cols]
    if self.force_all_x_finite:
        allow_nan = self.force_all_x_finite == 'allow-nan'
        assert_all_finite(covariate_data, allow_nan=allow_nan)

    # only assign once every check has passed
    self._d = treatment_data
    self._X = covariate_data
def fit(self, X, y=None):
    """
    Saves the `r` vector (log-ratio vector) that will be applied during
    transformation

    Parameters
    ----------
    X: array-like of shape = [n_samples, n_features]
        Feature matrix representing the vectorized input. A CSR matrix is
        converted to a dense array before validation.
    y: optional
        Stored unchanged as ``self.y_``.

    Returns
    -------
    self
    """
    # checks: densify CSR input, then let check_array validate it
    X = check_array(X.toarray() if isinstance(X, sparse.csr_matrix) else X)
    if not isinstance(X, (np.ndarray, sparse.csr_matrix)):
        raise TypeError(
            "data type of X must be dense or sparse array; type = {}".format(
                type(X)))
    assert_all_finite(X)

    # get p, not_p
    _p, _not_p = self._get_p_not_p(X)

    # get r
    self._r = self._get_r(_p, _not_p)

    # ensure is_fitted
    self.X_ = X
    self.y_ = y
    return self
def score_predictor_report(y_true, y_pred, disp=True):
    """
    Report brief summary of prediction performance

    * mean absolute error
    * root mean squared error
    * number of data
    * mean and standard dev. of true scores
    * mean and standard dev. of predicted scores

    Parameters
    ----------
    y_true : array, shape(n_samples,)
        Ground truth scores
    y_pred : array, shape(n_samples,)
        Predicted scores
    disp : bool, optional, default=True
        if True, print report (as JSON, to standard error)

    Returns
    -------
    stats : dict
        brief summary of prediction performance
    """

    # validate both score arrays before computing anything
    assert_all_finite(y_true)
    y_true = as_float_array(y_true)
    assert_all_finite(y_pred)
    y_pred = as_float_array(y_pred)
    check_consistent_length(y_true, y_pred)

    # error measures; the MSE is clipped at zero before taking the root
    mae = skm.mean_absolute_error(y_true, y_pred)
    mse = skm.mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(np.maximum(mse, 0.))

    stats = {
        'mean absolute error': mae,
        'root mean squared error': rmse,
        'n_samples': y_true.size,
        'true': {'mean': np.mean(y_true), 'stdev': np.std(y_true)},
        'predicted': {'mean': np.mean(y_pred), 'stdev': np.std(y_pred)},
    }

    # display statistics
    if disp:
        report = json.dumps(stats, sort_keys=True, indent=4,
                            separators=(',', ': '), ensure_ascii=False)
        print(report, file=sys.stderr)

    return stats
def _binary_clf_curve(y_true, y_score):
    """
    Count false and true positives at each distinct score threshold.

    Parameters
    ----------
    y_true : array
        True labels; entries equal to 1 are treated as positive.
    y_score : array
        Scores, same length as ``y_true``.

    Returns
    -------
    fps : array
        False-positive counts for each threshold.
    tps : array
        True-positive counts for each threshold.
    thresholds : array
        Distinct score values, in decreasing order.
    """
    check_consistent_length(y_true, y_score, None)
    y_true = column_or_1d(y_true)
    y_score = column_or_1d(y_score)
    assert_all_finite(y_true)
    assert_all_finite(y_score)

    # boolean mask of the positive samples
    is_positive = (y_true == 1)

    # order samples by decreasing score (stable sort, then reversed)
    order = np.argsort(y_score, kind="mergesort")[::-1]
    sorted_score = y_score[order]
    sorted_positive = is_positive[order]

    # scores usually contain many ties: keep the positions where the sorted
    # score changes, plus the last position to close the curve
    change_idxs = np.where(np.diff(sorted_score))[0]
    cut_idxs = np.r_[change_idxs, sorted_positive.size - 1]

    # cumulative positives at each cut; the rest of the samples seen so far
    # are false positives
    tps = stable_cumsum(sorted_positive)[cut_idxs]
    fps = 1 + cut_idxs - tps
    return fps, tps, sorted_score[cut_idxs]
def item_finder_report(y_true, y_pred, disp=True):
    """
    Report brief summary of prediction performance

    * AUC
    * number of data
    * mean and standard dev. of true scores
    * mean and standard dev. of predicted scores

    Parameters
    ----------
    y_true : array, shape(n_samples,)
        Ground truth scores
    y_pred : array, shape(n_samples,)
        Predicted scores
    disp : bool, optional, default=True
        if True, print report (as JSON, to standard error)

    Returns
    -------
    stats : dict
        brief summary of prediction performance

    Raises
    ------
    ValueError
        If the true scores are not binary.
    """

    # validate inputs
    assert_all_finite(y_true)
    if not is_binary_score(y_true):
        raise ValueError('True scores must be binary')
    y_true = as_float_array(y_true)
    assert_all_finite(y_pred)
    y_pred = as_float_array(y_pred)
    check_consistent_length(y_true, y_pred)

    # descriptive statistics
    true_summary = {'mean': np.mean(y_true), 'stdev': np.std(y_true)}
    pred_summary = {'mean': np.mean(y_pred), 'stdev': np.std(y_pred)}
    stats = {
        'n_samples': y_true.size,
        'true': true_summary,
        'predicted': pred_summary,
    }

    # the AUC is only computed when both 0 and 1 occur in the true scores
    if is_binary_score(y_true, allow_uniform=False):
        stats['area under the curve'] = skm.roc_auc_score(y_true, y_pred)

    # display statistics
    if disp:
        report = json.dumps(stats, sort_keys=True, indent=4,
                            separators=(',', ': '), ensure_ascii=False)
        print(report, file=sys.stderr)

    return stats
def fit(self, X, y=None):
    """Compute the Deterministic Shared Response Model

    Parameters
    ----------
    X : list of 2D arrays, element i has shape=[voxels_i, samples]
        Each element in the list contains the fMRI data of one subject.

    y : not used

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If there are fewer than two subjects, fewer samples than
        ``self.features``, or subjects with differing sample counts.
    """
    logger.info('Starting Deterministic SRM')

    # at least two subjects are required
    n_subjects = len(X)
    if n_subjects <= 1:
        raise ValueError("There are not enough subjects "
                         "({0:d}) to train the model.".format(n_subjects))

    # the first subject fixes the expected number of samples (TRs)
    n_trs = X[0].shape[1]
    if n_trs < self.features:
        raise ValueError(
            "There are not enough samples to train the model with "
            "{0:d} features.".format(self.features))

    # every subject must be finite and share that number of samples
    for subject_data in X:
        assert_all_finite(subject_data)
        if subject_data.shape[1] != n_trs:
            raise ValueError("Different number of samples between subjects"
                             ".")

    # Run SRM
    self.w_, self.s_ = self._srm(X)

    return self
def _set_y_z(self):
    """Validate and store the ``y_col`` data and, if set, the ``z_cols`` data.

    ``self._y`` receives the ``y_col`` column(s) of ``self.data``.
    ``self._z`` receives the ``z_cols`` column(s), or ``None`` when
    ``self.z_cols`` is ``None``. Both selections must be finite.
    """
    y_data = self.data.loc[:, self.y_col]
    assert_all_finite(y_data)
    self._y = y_data

    if self.z_cols is not None:
        z_data = self.data.loc[:, self.z_cols]
        assert_all_finite(z_data)
        self._z = z_data
    else:
        self._z = None
def transform(self, df, *_):
    """Fill missing ``Age`` values of ``df`` with predictions of ``self._model``.

    Rows whose ``Age`` is null are passed (without the ``Age`` column) to
    ``self._model.predict`` and the predictions are written back into
    ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``Age`` column; modified in place.
    *_ : ignored

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, checked to contain only finite values.
    """
    missing_age = df['Age'].isnull()

    # Skip prediction when nothing is missing: predicting on an empty frame
    # has no result to write back and makes estimators fail on zero samples.
    if missing_age.any():
        X = df[missing_age].drop(['Age'], axis=1)
        y = pd.Series(self._model.predict(X))
        # align predictions with the rows they were computed for
        y.index = X.index
        df.loc[y.index, 'Age'] = y

    assert_all_finite(df)
    return df
def test_int_overflow_mutual_info_score():
    """mutual_info_score (natural log base) stays finite on large counts."""
    # (number of 0-labels, number of 1-labels) for x = 1, 2, 3, 4, 5
    group_sizes = [(52632, 2529), (14660, 793), (3271, 204), (814, 39),
                   (316, 20)]

    x_values = []
    y_values = []
    for x_label, (n_zeros, n_ones) in enumerate(group_sizes, start=1):
        x_values += [x_label] * (n_zeros + n_ones)
        y_values += [0] * n_zeros + [1] * n_ones

    x = np.array(x_values)
    y = np.array(y_values)

    score = mutual_info_score(x.ravel(), y.ravel(), log_base='e')
    assert_all_finite(score)
def test_int_overflow_mutual_info_score():
    """mutual_info_score stays finite on large contingency counts."""
    # (number of 0-labels, number of 1-labels) for x = 1, 2, 3, 4, 5
    group_sizes = [(52632, 2529), (14660, 793), (3271, 204), (814, 39),
                   (316, 20)]

    x_values = []
    y_values = []
    for x_label, (n_zeros, n_ones) in enumerate(group_sizes, start=1):
        x_values += [x_label] * (n_zeros + n_ones)
        y_values += [0] * n_zeros + [1] * n_ones

    x = np.array(x_values)
    y = np.array(y_values)

    score = mutual_info_score(x.ravel(), y.ravel())
    assert_all_finite(score)
def test_int_overflow_mutual_info_fowlkes_mallows_score():
    """Both scores stay finite on large contingency counts."""
    # (number of 0-labels, number of 1-labels) for x = 1, 2, 3, 4, 5
    group_sizes = [(52632, 2529), (14660, 793), (3271, 204), (814, 39),
                   (316, 20)]

    x_values = []
    y_values = []
    for x_label, (n_zeros, n_ones) in enumerate(group_sizes, start=1):
        x_values += [x_label] * (n_zeros + n_ones)
        y_values += [0] * n_zeros + [1] * n_ones

    x = np.array(x_values)
    y = np.array(y_values)

    assert_all_finite(mutual_info_score(x, y))
    assert_all_finite(fowlkes_mallows_score(x, y))
def _validate_mcmc_fit_input(X_train, y_train, X_test):
    """Validate and convert the train/test inputs.

    ``y_train`` must be finite and is converted to a float64 array that is
    not forced to 2D. ``X_train`` and ``X_test`` must have the same number
    of columns; both are converted to float64, Fortran order, and may be
    CSC sparse matrices.

    Returns
    -------
    X_train, y_train, X_test
        The validated inputs.
    """
    check_consistent_length(X_train, y_train)
    assert_all_finite(y_train)
    y_train = check_array(y_train, ensure_2d=False, dtype=np.float64)

    # train and test must share the feature dimension
    assert X_train.shape[1] == X_test.shape[1]

    feature_options = dict(accept_sparse="csc", dtype=np.float64, order="F")
    X_train = check_array(X_train, **feature_options)
    X_test = check_array(X_test, **feature_options)
    return X_train, y_train, X_test
def fit(self, X: pd.DataFrame, y=None):
    """Learn the encoding of every selected column.

    Uses ``self.cols`` when set, otherwise all columns of ``X``. When
    ``self.fill`` is falsy, ``X`` must not contain NaN or infinite values.
    Each column is filled with ``'_MISSING'``, cast to ``str`` and passed to
    ``_encode_python``; the result is stored in ``self.categories_``.

    Returns
    -------
    self
    """
    selected = self.cols or X.columns.tolist()

    if not self.fill:
        assert_all_finite(X, allow_nan=False)

    self.categories_ = {}
    for name in selected:
        as_text = X[name].fillna('_MISSING').astype(str)
        self.categories_[name] = _encode_python(as_text)
    return self
def entropy(*args):
    """Return the base-2 entropy of the joint distribution of ``args``.

    Parameters
    ----------
    *args : sequences of hashable values
        Equal-length sequences; the i-th elements of all sequences together
        form one joint observation.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct joint observation.
    """
    from collections import Counter

    # Materialise the pairs: on Python 3 ``zip`` returns an iterator that
    # supports neither ``len`` nor ``count``.
    xy = list(zip(*args))
    n_obs = len(xy)

    # One counting pass instead of a ``count`` call per distinct value.
    # Every probability is strictly positive, so p * log2(p) is finite.
    proba = [float(freq) / n_obs for freq in Counter(xy).values()]

    result = -np.sum([p * np.log2(p) for p in proba])
    assert_all_finite(result)
    return result
def fit(self, X, y=None):
    """Compute the probabilistic Shared Response Model

    Parameters
    ----------
    X : list of 2D arrays, element i has shape=[voxels_i, samples]
        Each element in the list contains the fMRI data of one subject.
        An element may be ``None``; its shape is then taken from the
        other ranks through the reduction below.

    y : not used

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If there are fewer than two subjects, ranks disagree on the number
        of subjects, a subject has fewer samples than ``self.features``, or
        subjects have differing sample counts.
    """
    logger.info('Starting Probabilistic SRM')

    # Check the number of subjects
    if len(X) <= 1:
        raise ValueError("There are not enough subjects "
                         "({0:d}) to train the model.".format(len(X)))

    # Check for input data sizes
    number_subjects = len(X)
    number_subjects_vec = self.comm.allgather(number_subjects)
    for rank in range(self.comm.Get_size()):
        if number_subjects_vec[rank] != number_subjects:
            raise ValueError(
                "Not all ranks have same number of subjects")

    # Collect size information.
    # ``np.int`` was removed from NumPy; the builtin ``int`` is equivalent.
    shape0 = np.zeros((number_subjects,), dtype=int)
    shape1 = np.zeros((number_subjects,), dtype=int)

    for subject in range(number_subjects):
        if X[subject] is not None:
            assert_all_finite(X[subject])
            shape0[subject] = X[subject].shape[0]
            shape1[subject] = X[subject].shape[1]

    # sum the per-rank shapes element-wise across the communicator
    shape0 = self.comm.allreduce(shape0, op=MPI.SUM)
    shape1 = self.comm.allreduce(shape1, op=MPI.SUM)

    # Check if all subjects have same number of TRs
    number_trs = np.min(shape1)
    for subject in range(number_subjects):
        if shape1[subject] < self.features:
            raise ValueError(
                "There are not enough samples to train the model with "
                "{0:d} features.".format(self.features))
        if shape1[subject] != number_trs:
            raise ValueError("Different number of samples between subjects"
                             ".")

    # Run SRM
    self.sigma_s_, self.w_, self.mu_, self.rho2_, self.s_ = self._srm(X)

    return self
def fit(self, X, y=None):
    """Compute the probabilistic Shared Response Model

    Parameters
    ----------
    X : list of 2D arrays, element i has shape=[voxels_i, samples]
        Each element in the list contains the fMRI data of one subject.
        An element may be ``None``; its shape is then taken from the
        other ranks through the reduction below.

    y : not used

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If there are fewer than two subjects, ranks disagree on the number
        of subjects, a subject has fewer samples than ``self.features``, or
        subjects have differing sample counts.
    """
    logger.info('Starting Probabilistic SRM')

    # Check the number of subjects
    if len(X) <= 1:
        raise ValueError("There are not enough subjects "
                         "({0:d}) to train the model.".format(len(X)))

    # Check for input data sizes
    number_subjects = len(X)
    number_subjects_vec = self.comm.allgather(number_subjects)
    for rank in range(self.comm.Get_size()):
        if number_subjects_vec[rank] != number_subjects:
            raise ValueError("Not all ranks have same number of subjects")

    # Collect size information.
    # ``np.int`` was removed from NumPy; the builtin ``int`` is equivalent.
    shape0 = np.zeros((number_subjects, ), dtype=int)
    shape1 = np.zeros((number_subjects, ), dtype=int)

    for subject in range(number_subjects):
        if X[subject] is not None:
            assert_all_finite(X[subject])
            shape0[subject] = X[subject].shape[0]
            shape1[subject] = X[subject].shape[1]

    # sum the per-rank shapes element-wise across the communicator
    shape0 = self.comm.allreduce(shape0, op=MPI.SUM)
    shape1 = self.comm.allreduce(shape1, op=MPI.SUM)

    # Check if all subjects have same number of TRs
    number_trs = np.min(shape1)
    for subject in range(number_subjects):
        if shape1[subject] < self.features:
            raise ValueError(
                "There are not enough samples to train the model with "
                "{0:d} features.".format(self.features))
        if shape1[subject] != number_trs:
            raise ValueError("Different number of samples between subjects"
                             ".")

    # Run SRM
    self.sigma_s_, self.w_, self.mu_, self.rho2_, self.s_ = self._srm(X)

    return self
def item_finder_report(y_true, y_pred, disp=True):
    """
    Report brief summary of prediction performance

    * AUC
    * number of data
    * mean and standard dev. of true scores
    * mean and standard dev. of predicted scores

    Parameters
    ----------
    y_true : array, shape(n_samples,)
        Ground truth scores
    y_pred : array, shape(n_samples,)
        Predicted scores
    disp : bool, optional, default=True
        if True, print report (as JSON, to standard error)

    Returns
    -------
    stats : dict
        brief summary of prediction performance

    Raises
    ------
    ValueError
        If the true scores are not binary.
    """

    # validate inputs
    assert_all_finite(y_true)
    if not is_binary_score(y_true):
        raise ValueError('True scores must be binary')
    y_true = as_float_array(y_true)
    assert_all_finite(y_pred)
    y_pred = as_float_array(y_pred)
    check_consistent_length(y_true, y_pred)

    # descriptive statistics
    true_summary = {'mean': np.mean(y_true), 'stdev': np.std(y_true)}
    pred_summary = {'mean': np.mean(y_pred), 'stdev': np.std(y_pred)}
    stats = {
        'n_samples': y_true.size,
        'true': true_summary,
        'predicted': pred_summary,
    }

    # the AUC is only computed when both 0 and 1 occur in the true scores
    if is_binary_score(y_true, allow_uniform=False):
        stats['area under the curve'] = skm.roc_auc_score(y_true, y_pred)

    # display statistics
    if disp:
        report = json.dumps(stats, sort_keys=True, indent=4,
                            separators=(',', ': '), ensure_ascii=False)
        print(report, file=sys.stderr)

    return stats
def item_finder_statistics(y_true, y_pred):
    """
    Full Statistics of prediction performance

    * n_samples
    * true: mean, stdev
    * predicted: mean, stdev
    * area under the curve (only when the true scores contain both classes)

    Parameters
    ----------
    y_true : array, shape=(n_samples,)
        Ground truth scores
    y_pred : array, shape=(n_samples,)
        Predicted scores

    Returns
    -------
    stats : dict
        Full statistics of prediction performance

    Raises
    ------
    ValueError
        If the true scores are not binary.
    """

    # check inputs
    assert_all_finite(y_true)
    if not is_binary_score(y_true):
        raise ValueError('True scores must be binary')
    y_true = as_float_array(y_true)
    assert_all_finite(y_pred)
    y_pred = as_float_array(y_pred)
    check_consistent_length(y_true, y_pred)

    # calc statistics
    stats = {}

    # dataset size
    stats['n_samples'] = y_true.size

    # descriptive statistics of ground truth scores
    stats['true'] = {'mean': np.mean(y_true), 'stdev': np.std(y_true)}

    # descriptive statistics of predicted scores
    stats['predicted'] = {'mean': np.mean(y_pred), 'stdev': np.std(y_pred)}

    # statistics at least 0 and 1 must be contained in a score array
    if is_binary_score(y_true, allow_uniform=False):

        # AUC (area under the curve)
        stats['area under the curve'] = skm.roc_auc_score(y_true, y_pred)

    return stats
def score_predictor_report(y_true, y_pred, disp=True):
    """
    Report brief summary of prediction performance

    * mean absolute error
    * root mean squared error
    * number of data
    * mean and standard dev. of true scores
    * mean and standard dev. of predicted scores

    Parameters
    ----------
    y_true : array, shape(n_samples,)
        Ground truth scores
    y_pred : array, shape(n_samples,)
        Predicted scores
    disp : bool, optional, default=True
        if True, print report (as JSON, to standard error)

    Returns
    -------
    stats : dict
        brief summary of prediction performance
    """

    # validate both score arrays before computing anything
    assert_all_finite(y_true)
    y_true = as_float_array(y_true)
    assert_all_finite(y_pred)
    y_pred = as_float_array(y_pred)
    check_consistent_length(y_true, y_pred)

    # error measures; the MSE is clipped at zero before taking the root
    mae = skm.mean_absolute_error(y_true, y_pred)
    mse = skm.mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(np.maximum(mse, 0.))

    stats = {
        'mean absolute error': mae,
        'root mean squared error': rmse,
        'n_samples': y_true.size,
        'true': {'mean': np.mean(y_true), 'stdev': np.std(y_true)},
        'predicted': {'mean': np.mean(y_pred), 'stdev': np.std(y_pred)},
    }

    # display statistics
    if disp:
        report = json.dumps(stats, sort_keys=True, indent=4,
                            separators=(',', ': '), ensure_ascii=False)
        print(report, file=sys.stderr)

    return stats
def gpflow_predict(model, Xin):
    """Evaluate the model's predictive mean and standard deviation at ``Xin``.

    Parameters
    ----------
    model
        Object providing ``_build_predict``, ``likelihood`` and
        ``enquire_session``.
    Xin : array-like
        Input points; validated and converted with ``check_array``.

    Returns
    -------
    GPRResult
        Built from the evaluated mean and standard deviation, both of which
        are checked to be finite.
    """
    # NOTE(review): ``warn_on_dtype`` is not accepted by recent scikit-learn
    # releases of ``check_array`` - confirm the pinned version.
    Xin = check_array(Xin, copy=False, warn_on_dtype=True, dtype=FLOAT_DTYPES)
    fmean, fvar, _, _, _ = model._build_predict(Xin)  # pylint: disable=protected-access
    y_mean_var = model.likelihood.predict_mean_and_var(fmean, fvar)
    y_mean = y_mean_var[0]
    y_var = y_mean_var[1]
    y_std = tf.sqrt(y_var)

    session = model.enquire_session(session=None)
    with session.as_default():
        # fetch both tensors in one run so the graph is evaluated once
        y_mean_value, y_std_value = session.run([y_mean, y_std])
        assert_all_finite(y_mean_value)
        assert_all_finite(y_std_value)
        return GPRResult(y_mean_value, y_std_value)
def transform(self, df, *_):
    """Append pairwise interaction features to ``df``.

    For every ordered pair of columns the product is added; for pairs of
    different columns the difference and the quotient are added as well
    (zeros in the divisor are replaced by 1 before dividing). Input and
    output must both be finite.

    Returns
    -------
    pandas.DataFrame
        The original columns followed by the interaction columns.
    """
    assert_all_finite(df)

    features = {}
    for left in df:
        left_col = df[left]
        for right in df:
            right_col = df[right]
            features['{}*{}'.format(left, right)] = left_col * right_col
            if left == right:
                continue
            features['{}-{}'.format(left, right)] = left_col - right_col
            features['{}/{}'.format(left, right)] = (
                left_col / right_col.replace(0, 1))

    df_interaction = pd.DataFrame(features)
    df_interaction.index = df.index

    combined = pd.concat([df, df_interaction], axis=1)
    assert_all_finite(combined)
    return combined
def fit_transform(self, X, y=None):
    """Runs fit() and transform() together.

    Parameters
    ----------
    X : array-like or CSR matrix
        Input data; a CSR matrix is converted to a dense array first.
    y : array-like, optional
        Targets. When given, ``X`` and ``y`` are validated together.

    Returns
    -------
    The result of ``transform(X)`` on the fitted estimator.
    """
    dense_X = X.toarray() if isinstance(X, sparse.csr_matrix) else X

    # Test for presence, not truthiness: ``np.any(y)`` is False for
    # all-zero targets, which would skip the joint X/y validation.
    if y is not None:
        X, y = check_X_y(dense_X, y)
    else:
        X = check_array(dense_X)
    assert_all_finite(X)

    if y is None:
        # fit method of arity 1 (unsupervised transformation)
        return self.fit(X).transform(X)
    # fit method of arity 2 (supervised transformation)
    return self.fit(X, y).transform(X)
def predict(self, X):
    """Predict using the factorization machine

    Parameters
    ----------
    X : sparse matrix, shape = [n_samples, n_features]
        Must contain only finite values.

    Returns
    -------
    array, shape = [n_samples]
        Predicted target values per element in X, as returned by
        ``self.fm.predict``.
    """
    assert_all_finite(X)
    predictions = self.fm.predict(X)
    return predictions
def tf_optimize(model, Xnew_arr, learning_rate=0.01, maxiter=100,
                ucb_beta=3., active_dims=None, bounds=None):
    """Run Adam for ``maxiter`` steps on the inputs to minimise the loss.

    The loss is the first output of ``predict_mean_and_var`` minus
    ``ucb_beta`` times its second output.

    Parameters
    ----------
    model
        Object providing ``_build_predict``, ``likelihood`` and
        ``enquire_session``.
    Xnew_arr : array-like
        Starting points; validated with ``check_array``.
    learning_rate : float
        Learning rate of the Adam optimizer.
    maxiter : int
        Number of optimizer steps.
    ucb_beta : float
        Weight of the second predictive output in the loss.
    active_dims : sequence of int, optional
        Column indices through which gradients flow; all columns when
        empty or ``None``.
    bounds : pair, optional
        ``(lower, upper)`` used to clip the inputs; unbounded when ``None``.

    Returns
    -------
    GPRGDResult
        Built from the final mean, variance, loss and clipped inputs, all
        of which are checked to be finite.
    """
    # NOTE(review): ``warn_on_dtype`` is not accepted by recent scikit-learn
    # releases of ``check_array`` - confirm the pinned version.
    Xnew_arr = check_array(Xnew_arr, copy=False, warn_on_dtype=True,
                           dtype=FLOAT_DTYPES)

    Xnew = tf.Variable(Xnew_arr, name='Xnew', dtype=settings.float_type)
    if bounds is None:
        # ``np.infty`` was removed in NumPy 2.0; ``np.inf`` is the same value
        lower_bound = tf.constant(-np.inf, dtype=settings.float_type)
        upper_bound = tf.constant(np.inf, dtype=settings.float_type)
    else:
        lower_bound = tf.constant(bounds[0], dtype=settings.float_type)
        upper_bound = tf.constant(bounds[1], dtype=settings.float_type)
    Xnew_bounded = tf.minimum(tf.maximum(Xnew, lower_bound), upper_bound)

    if active_dims:
        indices = []
        updates = []
        n_rows = Xnew_arr.shape[0]
        for c in active_dims:
            for r in range(n_rows):
                indices.append([r, c])
                updates.append(Xnew_bounded[r, c])
        part_X = tf.scatter_nd(indices, updates, Xnew_arr.shape)
        # value equals Xnew_bounded; the stop_gradient term blocks gradients
        # for everything except the scattered (active) entries
        Xin = part_X + tf.stop_gradient(-part_X + Xnew_bounded)
    else:
        Xin = Xnew_bounded

    beta_t = tf.constant(ucb_beta, name='ucb_beta', dtype=settings.float_type)
    y_mean_var = model.likelihood.predict_mean_and_var(
        *model._build_predict(Xin))
    loss = tf.subtract(y_mean_var[0], tf.multiply(beta_t, y_mean_var[1]),
                       name='loss_fn')
    opt = tf.train.AdamOptimizer(learning_rate, epsilon=1e-6)
    train_op = opt.minimize(loss)
    variables = opt.variables()
    init_op = tf.variables_initializer([Xnew] + variables)

    session = model.enquire_session(session=None)
    with session.as_default():
        session.run(init_op)
        for _ in range(maxiter):
            session.run(train_op)
        Xnew_value = session.run(Xnew_bounded)
        y_mean_value, y_var_value = session.run(y_mean_var)
        loss_value = session.run(loss)

    assert_all_finite(Xnew_value)
    assert_all_finite(y_mean_value)
    assert_all_finite(y_var_value)
    assert_all_finite(loss_value)
    return GPRGDResult(y_mean_value, y_var_value, loss_value, Xnew_value)
def score_histogram(x, score_domain=(1, 5, 1)):
    """
    Histogram of scores

    Parameters
    ----------
    x : array, shape=(n_samples), dtype=float or int
        A set of scores
    score_domain : array, shape=(3,) OR int, optional
        Domain of scores, represented by a triple of the minimum, the maximum,
        and strides of the score, if array-like.
        The range between the minimum and the maximum are divided into the
        specified number of bins, if int (Python ``int`` or NumPy integer).
        default=(1, 5, 1).

    Returns
    -------
    hist : array_like, shape=(n_score_levels,)
        The number of data in each bin
    scores : array_like
        Candidates of possible scores: the bin centres when `score_domain`
        is an int; otherwise the values from the minimum up to the maximum
        in steps of the stride, with the maximum appended.
    """

    # check inputs
    assert_all_finite(x)

    # Accept builtin ints too: a plain ``int`` is not an ``np.integer`` and
    # would otherwise be treated as a (min, max, stride) triple.
    is_bin_count = isinstance(score_domain, (int, np.integer))
    if is_bin_count:
        bins = score_domain
    else:
        assert_all_finite(score_domain)
        bins = generate_score_bins(score_domain)

    # making histogram
    hist, bins = np.histogram(x, bins=bins)

    # candidates of possible scores
    if is_bin_count:
        scores = (bins[1:] + bins[:-1]) / 2
    else:
        scores = np.hstack([
            np.arange(score_domain[0], score_domain[1], score_domain[2],
                      dtype=float),
            score_domain[1]
        ])

    # return statistics
    return hist, scores
def fit(self, X: pd.DataFrame, y):
    """Compute the weight-of-evidence mapping for every selected column.

    Uses ``self.cols`` when set, otherwise all columns of ``X``. Columns
    not listed in ``self.conditional_cols`` must be finite. The result of
    ``woe`` for each column is stored in ``self.mapping_``.

    Returns
    -------
    self
    """
    selected = self.cols or X.columns.tolist()
    conditional = self.conditional_cols or []

    # mapping from feature value to woe value, per column
    self.mapping_ = {}
    for name in selected:
        is_conditional = name in conditional
        if not is_conditional:
            # missing value can not be handled by WoeEncoder
            # since np.nan will fail the equality check
            assert_all_finite(X[name])

        self.mapping_[name] = woe(X[name], y,
                                  conditional=is_conditional,
                                  na_values=self.na_values)
    return self
def score_histogram(x, score_domain=(1, 5, 1)):
    """
    Histogram of scores

    Parameters
    ----------
    x : array, shape=(n_samples), dtype=float or int
        A set of scores
    score_domain : array, shape=(3,) OR int, optional
        Domain of scores, represented by a triple of the minimum, the maximum,
        and strides of the score, if array-like.
        The range between the minimum and the maximum are divided into the
        specified number of bins, if int (Python ``int`` or NumPy integer).
        default=(1, 5, 1).

    Returns
    -------
    hist : array_like, shape=(n_score_levels,)
        The number of data in each bin
    scores : array_like
        Candidates of possible scores: the bin centres when `score_domain`
        is an int; otherwise the values from the minimum up to the maximum
        in steps of the stride, with the maximum appended.
    """

    # check inputs
    assert_all_finite(x)

    # Accept builtin ints too: a plain ``int`` is not an ``np.integer`` and
    # would otherwise be treated as a (min, max, stride) triple.
    is_bin_count = isinstance(score_domain, (int, np.integer))
    if is_bin_count:
        bins = score_domain
    else:
        assert_all_finite(score_domain)
        bins = generate_score_bins(score_domain)

    # making histogram
    hist, bins = np.histogram(x, bins=bins)

    # candidates of possible scores
    if is_bin_count:
        scores = (bins[1:] + bins[:-1]) / 2
    else:
        scores = np.hstack(
            [np.arange(score_domain[0], score_domain[1], score_domain[2],
                       dtype=float),
             score_domain[1]])

    # return statistics
    return hist, scores
def transform(self, X: pd.DataFrame, y=None):
    """Encode the selected columns with the categories learned in ``fit``.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame; it is copied, not modified.
    y : ignored

    Returns
    -------
    pandas.DataFrame
        Copy of ``X`` with each selected column replaced by its encoding.

    Raises
    ------
    ValueError
        If a selected column is missing and ``self.error == 'raise'``.
    """
    check_is_fitted(self, 'categories_')
    x = X.copy()
    for col in self.cols or X.columns:
        if col not in x:
            msg = 'Column {} is not found in the DataFrame'.format(col)
            if self.error == 'raise':
                raise ValueError(msg)
            if self.error == 'warn':
                warnings.warn(msg)
            # A missing column cannot be encoded; skip it instead of
            # failing with a KeyError on the lookup below.
            continue

        if not self.fill:
            assert_all_finite(x[col], allow_nan=False)
        else:
            x[col] = x[col].fillna('_MISSING').astype(str)

        cutoff = self.categories_[col]
        _, x[col] = _encode_python(x[col], uniques=cutoff, encode=True,
                                   unseen=self.unseen)
    return x
def test_baseline_categorical_crossentropy():
    """Baseline prediction is finite and equals log of the class frequency."""
    rng = np.random.RandomState(0)
    prediction_dim = 4
    loss = _LOSSES['categorical_crossentropy']()

    # constant targets (all zeros / all ones) must give finite baselines
    for make_constant in (np.zeros, np.ones):
        constant_targets = make_constant(shape=100).astype(np.float32)
        baseline = loss.get_baseline_prediction(constant_targets,
                                                prediction_dim)
        assert_all_finite(baseline)

    # Same logic as for above test. Here inverse_link_function = softmax and
    # link_function = log
    y_train = rng.randint(0, prediction_dim + 1, size=100).astype(np.float32)
    baseline_prediction = loss.get_baseline_prediction(y_train,
                                                       prediction_dim)
    assert baseline_prediction.shape == (1, prediction_dim)
    for k in range(prediction_dim):
        class_frequency = (y_train == k).mean()
        assert_almost_equal(baseline_prediction[:, k],
                            np.log(class_frequency))
def test_baseline_poisson():
    """Poisson baseline is a finite scalar equal to log(mean(y))."""
    rng = np.random.RandomState(0)
    loss = _LOSSES["poisson"](sample_weight=None)

    targets = rng.poisson(size=100).astype(np.float64)
    # Sanity check, make sure at least one sample is non-zero so we don't take
    # log(0)
    assert targets.sum() > 0

    baseline = loss.get_baseline_prediction(targets, None, 1)
    assert np.isscalar(baseline)
    assert baseline.dtype == targets.dtype
    assert_all_finite(baseline)
    # Make sure baseline prediction produces the log of the mean of all targets
    assert_almost_equal(np.log(targets.mean()), baseline)

    # Test baseline for y_true = 0
    targets[:] = 0.0
    zero_baseline = loss.get_baseline_prediction(targets, None, 1)
    assert_all_finite(zero_baseline)
def on_next(obj): nonlocal self X = obj[["p_log", "q_log"]] check_is_fitted(self, ["is_fitted"]) utils.assert_all_finite(X) X = utils.as_float_array(X) self._update_clustering(X) obj_2 = { "i_min": np.min(obj[["i"]]), "i_max": np.max(obj[["i"]]), "cluster": self.clustering, "X": X } if "start_time" in obj.keys(): obj_2["start_time"] = obj.iloc[-1]["start_time"] observer.on_next(obj_2)
def fit(self, X):
    """Compute the Robust Shared Response Model

    Parameters
    ----------
    X : list of 2D arrays, element i has shape=[voxels_i, timepoints]
        Each element in the list contains the fMRI data of one subject.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If ``self.lam`` is not positive, there are fewer than two
        subjects, there are fewer timepoints than ``self.features``, or
        subjects have differing numbers of timepoints.
    """
    logger.info('Starting RSRM')

    # the regularizer value must be strictly positive
    if self.lam <= 0.0:
        raise ValueError("Gamma parameter should be positive.")

    # at least two subjects are required
    n_subjects = len(X)
    if n_subjects <= 1:
        raise ValueError("There are not enough subjects in the input "
                         "data to train the model.")

    # the first subject fixes the expected number of timepoints
    n_trs = X[0].shape[1]
    if n_trs < self.features:
        raise ValueError(
            "There are not enough timepoints to train the model with "
            "{0:d} features.".format(self.features))

    # every subject must be finite and share that number of timepoints
    for subject_data in X:
        assert_all_finite(subject_data)
        if subject_data.shape[1] != n_trs:
            raise ValueError("Different number of alignment timepoints "
                             "between subjects.")

    # Create a new random state
    self.random_state_ = np.random.RandomState(self.rand_seed)

    # Run RSRM
    self.w_, self.r_, self.s_ = self._rsrm(X)

    return self
def test_baseline_categorical_crossentropy():
    """Baseline prediction is finite and equals log of the class frequency."""
    rng = np.random.RandomState(0)
    prediction_dim = 4
    loss = _LOSSES['categorical_crossentropy']()

    # constant targets (all zeros / all ones) must give finite baselines
    # of the same dtype as the targets
    for make_constant in (np.zeros, np.ones):
        constant_targets = make_constant(shape=100).astype(np.float64)
        baseline = loss.get_baseline_prediction(constant_targets,
                                                prediction_dim)
        assert baseline.dtype == constant_targets.dtype
        assert_all_finite(baseline)

    # Same logic as for above test. Here inverse_link_function = softmax and
    # link_function = log
    y_train = rng.randint(0, prediction_dim + 1, size=100).astype(np.float32)
    baseline_prediction = loss.get_baseline_prediction(y_train,
                                                       prediction_dim)
    assert baseline_prediction.shape == (prediction_dim, 1)
    for k in range(prediction_dim):
        class_frequency = (y_train == k).mean()
        assert np.allclose(baseline_prediction[k, :],
                           np.log(class_frequency))
def test_baseline_categorical_crossentropy():
    """Baseline prediction is finite and equals log of the class frequency."""
    rng = np.random.RandomState(0)
    prediction_dim = 4
    loss = _LOSSES["categorical_crossentropy"](sample_weight=None)

    # constant targets (all zeros / all ones) must give finite baselines
    # of the same dtype as the targets
    for make_constant in (np.zeros, np.ones):
        constant_targets = make_constant(shape=100).astype(np.float64)
        baseline = loss.get_baseline_prediction(constant_targets, None,
                                                prediction_dim)
        assert baseline.dtype == constant_targets.dtype
        assert_all_finite(baseline)

    # Same logic as for above test. Here inverse_link_function = softmax and
    # link_function = log
    y_train = rng.randint(0, prediction_dim + 1, size=100).astype(np.float32)
    baseline_prediction = loss.get_baseline_prediction(y_train, None,
                                                       prediction_dim)
    assert baseline_prediction.shape == (prediction_dim, 1)
    for k in range(prediction_dim):
        class_frequency = (y_train == k).mean()
        assert np.allclose(baseline_prediction[k, :],
                           np.log(class_frequency))
def test_baseline_binary_crossentropy():
    """Baseline prediction is finite and equals the log-odds of the mean."""
    rng = np.random.RandomState(0)
    loss = _LOSSES['binary_crossentropy']()

    # constant targets: the baseline must be finite and map back to the
    # constant through the inverse link
    for make_constant in (np.zeros, np.ones):
        constant_targets = make_constant(shape=100).astype(np.float32)
        baseline = loss.get_baseline_prediction(constant_targets, 1)
        assert_all_finite(baseline)
        recovered = loss.inverse_link_function(baseline)
        assert_almost_equal(recovered, constant_targets[0])

    # Make sure baseline prediction is equal to link_function(p), where p
    # is the proba of the positive class. We want predict_proba() to return p,
    # and by definition
    # p = inverse_link_function(raw_prediction) = sigmoid(raw_prediction)
    # So we want raw_prediction = link_function(p) = log(p / (1 - p))
    y_train = rng.randint(0, 2, size=100).astype(np.float32)
    baseline_prediction = loss.get_baseline_prediction(y_train, 1)
    assert baseline_prediction.shape == tuple()  # scalar
    p = y_train.mean()
    expected_log_odds = np.log(p / (1 - p))
    assert_almost_equal(baseline_prediction, expected_log_odds)
def test_multinomial_loss_fit_intercept_only():
    """Test that fit_intercept_only returns the mean functional for CCE."""
    rng = np.random.RandomState(0)
    n_classes = 4
    loss = HalfMultinomialLoss(n_classes=n_classes)

    # Same logic as test_specific_fit_intercept_only. Here inverse link
    # function = softmax and link function = log - symmetry term.
    y_train = rng.randint(0, n_classes + 1, size=100).astype(np.float64)
    baseline_prediction = loss.fit_intercept_only(y_true=y_train)
    assert baseline_prediction.shape == (n_classes, )

    # empirical frequency of each class
    p = np.array([(y_train == k).mean() for k in range(n_classes)],
                 dtype=y_train.dtype)
    log_p = np.log(p)
    assert_allclose(baseline_prediction, log_p - np.mean(log_p))
    assert_allclose(baseline_prediction[None, :], loss.link.link(p[None, :]))

    # constant targets must give finite baselines of the targets' dtype
    for make_constant in (np.zeros, np.ones):
        constant_targets = make_constant(shape=10).astype(np.float64)
        baseline = loss.fit_intercept_only(y_true=constant_targets)
        assert baseline.dtype == constant_targets.dtype
        assert_all_finite(baseline)
def test_baseline_binary_crossentropy():
    """Baseline prediction is finite and equals the log-odds of the mean."""
    rng = np.random.RandomState(0)
    loss = _LOSSES['binary_crossentropy']()

    # constant targets: the baseline must be finite and map back to the
    # constant through the inverse link
    for make_constant in (np.zeros, np.ones):
        constant_targets = make_constant(shape=100).astype(np.float64)
        baseline = loss.get_baseline_prediction(constant_targets, 1)
        assert_all_finite(baseline)
        recovered = loss.inverse_link_function(baseline)
        assert np.allclose(recovered, constant_targets[0])

    # Make sure baseline prediction is equal to link_function(p), where p
    # is the proba of the positive class. We want predict_proba() to return p,
    # and by definition
    # p = inverse_link_function(raw_prediction) = sigmoid(raw_prediction)
    # So we want raw_prediction = link_function(p) = log(p / (1 - p))
    y_train = rng.randint(0, 2, size=100).astype(np.float64)
    baseline_prediction = loss.get_baseline_prediction(y_train, 1)
    assert baseline_prediction.shape == tuple()  # scalar
    assert baseline_prediction.dtype == y_train.dtype
    p = y_train.mean()
    expected_log_odds = np.log(p / (1 - p))
    assert np.allclose(baseline_prediction, expected_log_odds)
def mean_squared_error(y_true, y_pred):
    """
    Root mean square error, mean square error, and its standard deviation.

    If you need only the mean squared error, use
    :func:`sklearn.metrics.mean_squared_error`

    Parameters
    ----------
    y_true : array, shape(n_samples,)
        Ground truth scores
    y_pred : array, shape(n_samples,)
        Predicted scores

    Returns
    -------
    rmse : float
        root mean squared error
    mean : float
        mean of squared errors
    stdev : float
        standard deviation of squared errors
    """

    # check inputs
    assert_all_finite(y_true)
    y_true = as_float_array(y_true)
    assert_all_finite(y_pred)
    y_pred = as_float_array(y_pred)
    check_consistent_length(y_true, y_pred)

    # calculate errors
    errs = (y_true - y_pred) ** 2
    mean = np.nanmean(errs)
    stdev = np.nanstd(errs)
    # clip at zero before taking the root
    rmse = np.sqrt(np.maximum(mean, 0.))

    return rmse, mean, stdev
def mean_absolute_percentage_error(y_true, y_pred):
    """
    Mean of ``|y_true - y_pred| / |y_true|`` over samples with non-zero
    ``y_true``.

    The result is a fraction, not multiplied by 100. Samples whose true
    value is zero are dropped before averaging.

    Use of this metric is not recommended; for illustration only.
    See other regression metrics on sklearn docs:
    http://scikit-learn.org/stable/modules/classes.html#regression-metrics

    Use like any other metric

    >>> y_true = [3, -0.5, 2, 7]; y_pred = [2.5, -0.3, 2, 8]
    >>> mean_absolute_percentage_error(y_true, y_pred)  # doctest: +ELLIPSIS
    0.1773...
    """
    y_true = np.asanyarray(y_true)
    y_pred = np.asanyarray(y_pred)
    assert_all_finite(y_true)
    assert_all_finite(y_pred)

    # Filter zero values in y_true (they would divide by zero)
    sel = (y_true != 0)
    y_true = y_true[sel]
    y_pred = y_pred[sel]

    # divide in float64: a float32 divisor would round the true values
    relative_errors = (y_true - y_pred) / y_true.astype(np.float64)
    return np.mean(np.abs(relative_errors))
def fit(self, X, pairs):
    """ Fit model with specified loss.

    Parameters
    ----------
    X : scipy.sparse.csc_matrix, (n_samples, n_features)
        Input data. It is transposed before being validated and passed to
        the solver.

    pairs : ndarray, shape = (n_compares, 2)
        Each row `i` defines a pair of samples such that
        the first returns a high value then the second
        FM(X[i,0]) > FM(X[i, 1]).
        Must hold integer-valued, zero-based indices.

    Returns
    -------
    self
        The fitted estimator, with ``w0_``, ``w_`` and ``V_`` set.

    Raises
    ------
    AssertionError
        If ``pairs`` contains non-integer values or an index outside
        ``[0, X.shape[1])`` of the transposed ``X``.
    """
    X = X.T
    X = check_array(X, accept_sparse="csc", dtype=np.float64)
    assert_all_finite(pairs)

    pairs = pairs.astype(np.float64)

    # check that pairs contain no real values
    assert_array_equal(pairs, pairs.astype(np.int32))
    # Indices are zero-based, so the largest valid one is X.shape[1] - 1;
    # an index equal to X.shape[1] is already out of range.
    assert pairs.max() < X.shape[1]
    assert pairs.min() >= 0
    self.w0_, self.w_, self.V_ = ffm.ffm_fit_sgd_bpr(self, X, pairs)
    return self
def fit(self, X, y=None):
    """Compute the probabilistic Shared Response Model

    Parameters
    ----------
    X : list of 2D arrays, element i has shape=[voxels_i, samples]
        Each element in the list contains the fMRI data of one subject.

    y : not used

    Returns
    -------
    self
        The fitted model, with ``sigma_s_``, ``w_``, ``mu_``, ``rho2_`` and
        ``s_`` set from the result of ``self._srm(X)``.

    Raises
    ------
    ValueError
        If fewer than two subjects are given, if the first subject has
        fewer samples than ``self.features``, or if subjects disagree on
        the number of samples.
    """
    if self.verbose:
        print('Running Probabilistic SRM')  # noqa FIXME

    # At least two subjects are needed
    n_subjects = len(X)
    if n_subjects <= 1:
        raise ValueError("There are not enough subjects "
                         "({0:d}) to train the model.".format(n_subjects))

    # The first subject defines the reference number of samples (TRs)
    n_trs = X[0].shape[1]
    if n_trs < self.features:
        raise ValueError(
            "There are not enough samples to train the model with "
            "{0:d} features.".format(self.features))

    # Every subject must be finite and have the reference number of TRs
    for subject_data in X:
        assert_all_finite(subject_data)
        if subject_data.shape[1] != n_trs:
            raise ValueError(
                "Different number of samples between subjects.")

    # Run SRM
    (self.sigma_s_, self.w_, self.mu_,
     self.rho2_, self.s_) = self._srm(X)
    return self
def testLearner(d_dfData, s_symbol, d_dfFeatures, d_dfClass, b_scaling, b_pca, fc_learnerFactory, i_trainPeriod, b_Plot = False):
    """Walk forward through one symbol's history and score a learner.

    For every position ``i`` from ``i_trainPeriod`` onwards the learner is
    trained on the previous ``i_trainPeriod`` rows and asked to predict row
    ``i``. Windows containing non-finite values are skipped. A summary line
    with the hit rates is printed at the end; nothing is returned.

    The upper bound of the walk uses ``i_forwardlook``, which is read from
    module scope.

    Parameters
    ----------
    d_dfData : unused
        Kept for interface compatibility.
    s_symbol : key
        Key selecting the feature and class frames, also used in the report.
    d_dfFeatures : mapping of s_symbol -> DataFrame
        Feature rows per symbol.
    d_dfClass : mapping of s_symbol -> DataFrame
        Class labels per symbol; the first column of row ``i`` is the label
        compared against the prediction.
    b_scaling : bool
        Standardize the training window and the predicted row with a
        ``StandardScaler`` fitted on the training window.
    b_pca : bool
        Project the training window and the predicted row with a
        40-component PCA fitted on the training window.
    fc_learnerFactory : callable
        Called as ``fc_learnerFactory(x_train, y_train, x_predict)`` and
        expected to return the predicted class.
    i_trainPeriod : int
        Number of rows in each training window.
    b_Plot : bool, optional
        Unused; kept for interface compatibility.
    """
    df_data = d_dfFeatures[s_symbol]
    df_classData = d_dfClass[s_symbol]

    success = 0.0
    success_up = 0.0
    success_down = 0.0
    count = 0

    for i in range(i_trainPeriod, df_data.index.size - i_forwardlook + 1):
        na_data = df_data.iloc[i - i_trainPeriod:i].values
        y_train = df_classData.iloc[i - i_trainPeriod:i].values.ravel()
        x_predict = df_data.iloc[i].values

        # Skip any window that contains NaN or infinite values.
        try:
            assert_all_finite(na_data)
            assert_all_finite(y_train)
            assert_all_finite(x_predict)
        except ValueError:
            continue

        if b_scaling:
            scaler = preprocessing.StandardScaler().fit(na_data)
            x_train = scaler.transform(na_data)
            # The predicted row must go through the same scaling as the
            # training rows; transform() wants a 2-D array, so the single
            # row is reshaped and flattened back to keep its 1-D shape.
            x_predict = scaler.transform(x_predict.reshape(1, -1)).ravel()
        else:
            x_train = na_data

        if b_pca:
            pca = decomposition.PCA(n_components=40)
            pca.fit(x_train)
            x_train = pca.transform(x_train)
            # transform() expects (n_samples, n_features): pass one sample.
            x_predict = pca.transform(x_predict.reshape(1, -1))

        i_prediction = fc_learnerFactory(x_train, y_train, x_predict)
        if i_prediction == df_classData.iloc[i][0]:
            success += 1
            if i_prediction == 1:
                success_up += 1
            else:
                success_down += 1
        count += 1

    # Report with the symbol this function was called for. A single
    # parenthesized argument prints the same on Python 2 and Python 3.
    if count == 0:
        print(s_symbol + " no prediction")
    else:
        print(s_symbol + " success rate: " + str(success/count) + " up: " + str(success_up/count) + " down: " + str(success_down/count) + " count: " + str(count))
# Debugging fragment that sanitizes XKrn / yKrn (both defined outside this
# fragment): drops NaNs, asserts finiteness, prints diagnostics and then
# overwrites selected entries with 1e-10.

# The next three calls discard their return values; only
# np.asarray_chkfinite can have an effect here (it raises on NaN/inf).
np.ravel(yKrn)
safe_asarray(yKrn) #.ravel()
np.asarray_chkfinite(yKrn)
print("l885: yKrn", type(yKrn) )
#yKrn = yKrn.astype(numpy.float32, copy=False)
#safe_asarray(yKrn).ravel()#print(len(yKrn) )
# All conversions below are evaluated and thrown away; XKrn and yKrn are
# not rebound by them.
np.array(yKrn,float); as_float_array(XKrn); as_float_array(yKrn) #yKrn.astype(float)
np.float64(yKrn); np.asarray( yKrn ) #warn_if_not_float(yKrn)
# Keep only the non-NaN entries of XKrn.
# NOTE(review): yKrn is rebuilt by indexing XKrn (already filtered) with a
# mask computed from yKrn -- this looks like it was meant to index yKrn;
# confirm before relying on yKrn afterwards.
XKrn = XKrn[np.logical_not(np.isnan(XKrn))]; yKrn = XKrn[np.logical_not(np.isnan(yKrn))]
assert_all_finite(XKrn); assert_all_finite(yKrn)
#X_vec = np.vectorize(XKrn)
#y_vec = np.vectorize(yKrn)
# ravel() results are discarded as well.
XKrn.ravel(), yKrn.ravel()
print("912: XKrn row, yKrn row", XKrn.shape, yKrn.shape )
#new_list =[ (F, T) [boolean test] for x in old_list ]
#chk0 = [ (0.001) [i == True] for i in yKrn ]
print ("minX", np.min(XKrn) )
print ("y shape", yKrn.shape)
# nonzero() returns a tuple of index arrays, one per dimension.
# NOTE(review): the positions are computed from `y`, not `yKrn`, although
# they are used to write into yKrn below -- confirm the two are aligned.
indices = (y == False).nonzero() # np.nonzero(yKrn)
print ("**nonzero yKrn", indices, y[indices])
# Overwrite the selected yKrn entries with a tiny positive value.
for i in indices: yKrn[i]=1e-10
# NOTE(review): np.nonzero selects the NON-zero entries of XKrn, so this
# replaces every non-zero value with 1e-10; presumably the zeros were the
# intended target -- confirm.
indx = np.nonzero(XKrn)
for i in indx: XKrn[i]=1e-10
def score_predictor_statistics(y_true, y_pred, score_domain=(1, 5, 1)):
    """
    Full Statistics of prediction performance

    The returned dictionary has these entries:

    * ``'n_samples'``: number of samples
    * ``'score levels'``: the possible score levels
    * ``'mean absolute error'``: mean, stdev
    * ``'mean squared error'``: rmse, mean, stdev
    * ``'true'``: mean, stdev, histogram, histogram density
    * ``'predicted'``: mean, stdev, histogram, histogram density

    Parameters
    ----------
    y_true : array, shape=(n_samples,)
        Ground truth scores
    y_pred : array, shape=(n_samples,)
        Predicted scores
    score_domain : array, shape=(3,)
        Domain of scores, represented by a triple: start, end, and stride
        default=(1, 5, 1).

    Returns
    -------
    stats : dict
        Full statistics of prediction performance
    """

    # validate both inputs
    assert_all_finite(y_true)
    y_true = as_float_array(y_true)
    assert_all_finite(y_pred)
    y_pred = as_float_array(y_pred)
    check_consistent_length(y_true, y_pred)

    def _describe(scores):
        # mean / stdev plus the histogram over the score domain
        summary = {'mean': np.mean(scores), 'stdev': np.std(scores)}
        counts, _ = score_histogram(scores, score_domain=score_domain)
        summary['histogram'] = counts
        summary['histogram density'] = (counts / counts.sum())
        return summary

    # every level from start up to, and including, the end of the domain
    first, last, stride = score_domain[0], score_domain[1], score_domain[2]
    levels = np.hstack([np.arange(first, last, stride, dtype=float), last])

    # error statistics
    mae_mean, mae_stdev = mean_absolute_error(y_true, y_pred)
    rmse, mse_mean, mse_stdev = mean_squared_error(y_true, y_pred)

    return {
        'n_samples': y_true.size,
        'score levels': levels,
        'mean absolute error': {'mean': mae_mean, 'stdev': mae_stdev},
        'mean squared error': {
            'rmse': rmse, 'mean': mse_mean, 'stdev': mse_stdev},
        'true': _describe(y_true),
        'predicted': _describe(y_pred),
    }
def fit(self, X, y, Z):
    """Compute the Semi-Supervised Shared Response Model

    Parameters
    ----------
    X : list of 2D arrays, element i has shape=[voxels_i, n_align]
        Each element in the list contains the fMRI data for alignment of
        one subject. There are n_align samples for each subject.

    y : list of arrays of int, element i has shape=[samples_i]
        Each element in the list contains the labels for the data samples
        in Z.

    Z : list of 2D arrays, element i has shape=[voxels_i, samples_i]
        Each element in the list contains the fMRI data of one subject
        for training the MLR classifier.

    Returns
    -------
    self
        The fitted model, with ``w_``, ``s_``, ``theta_`` and ``bias_`` set.

    Raises
    ------
    ValueError
        If ``alpha`` or ``gamma`` is out of range, or if the inputs are
        inconsistent in number of subjects, samples, voxels or labels.
    """
    logger.info('Starting SS-SRM')

    # alpha must lie strictly inside (0.0, 1.0)
    if self.alpha <= 0.0 or self.alpha >= 1.0:
        raise ValueError("Alpha parameter should be in range (0.0, 1.0)")

    # the regularizer must be strictly positive
    if self.gamma <= 0.0:
        raise ValueError("Gamma parameter should be positive.")

    # at least two subjects in each of the three inputs
    if any(len(per_subject) <= 1 for per_subject in (X, y, Z)):
        raise ValueError("There are not enough subjects in the input "
                         "data to train the model.")

    # ... and the same number of subjects in all of them
    if len(X) != len(y) or len(X) != len(Z):
        raise ValueError("Different number of subjects in data.")

    # enough alignment samples for the requested number of features
    n_align = X[0].shape[1]
    if n_align < self.features:
        raise ValueError(
            "There are not enough samples to train the model with "
            "{0:d} features.".format(self.features))

    # Per subject: finite data, the same number of alignment TRs as the
    # first subject, matching voxel counts between alignment and
    # classification data, and one label per classification sample.
    for subject, (align, classif, labels) in enumerate(zip(X, Z, y)):
        assert_all_finite(align)
        assert_all_finite(classif)
        if align.shape[1] != n_align:
            raise ValueError("Different number of alignment samples "
                             "between subjects.")
        if align.shape[0] != classif.shape[0]:
            raise ValueError("Different number of voxels between alignment"
                             " and classification data (subject {0:d})"
                             ".".format(subject))
        if classif.shape[1] != labels.size:
            raise ValueError("Different number of samples and labels in "
                             "subject {0:d}.".format(subject))

    # Map the classes to [0..C-1]
    mapped_labels = self._init_classes(y)

    # Run SS-SRM
    self.w_, self.s_, self.theta_, self.bias_ = self._sssrm(
        X, Z, mapped_labels)
    return self