Example #1
 def __init__(self, X, y, n_classes, batch_size):
     check_arrays(X, dtype=np.float32)
     check_arrays(y, dtype=None)
     self.X = X
     self.y = y
     self.n_classes = n_classes
     self.batch_size = batch_size
def benchmark(clf, X, y, cv=None):
    X, y = check_arrays(X, y, sparse_format='csr', allow_lists=True)
    cv = check_cv(cv, X, y, classifier=is_classifier(clf))
    
    # learning_curve_ = learning_curve(clf, X_all, y_all, cv=cv)
    
    train_times = []
    test_times = []
    confusion_matrices = []
    confusion_matrix_indices = []
    coefs = []
    for train, test in cv:
        X_train, y_train = X[train], y[train]
        X_test, y_test = X[test], y[test]
        
        t0 = time()
        clf.fit(X_train, y_train)
        train_times.append(time()-t0)
        
        t0 = time()
        y_pred = clf.predict(X_test)
        test_times.append(time()-t0)
    
        confusion_matrices.append(confusion_matrix(y_test, y_pred))
        confusion_matrix_indices.append(np.array(
            [[test[pred] for pred in true]
             for true in confusion_matrix_instances(y_test, y_pred)]))
    
        coefs.append(clf.coef_)
    
    return dict(
        train_times = np.array(train_times),
        test_times = np.array(test_times),
        confusion_matrices = np.array(confusion_matrices),
        confusion_matrix_indices = np.array(confusion_matrix_indices),
        coefs = np.array(coefs)
    )
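A hedged usage sketch for benchmark() above. It assumes an older scikit-learn release in which sklearn.utils.check_arrays still exists, and that benchmark and the project-level confusion_matrix_instances helper are importable; the classifier and dataset below are purely illustrative.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
clf = LogisticRegression()
results = benchmark(clf, X, y, cv=5)   # benchmark() as defined above
print(results['train_times'].mean())   # average fit time per fold
print(results['test_times'].mean())    # average predict time per fold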
Example #3
    def fit(self, X, y=None):
        """Fit the model to the data X.

        Parameters
        ----------
        X : {array-like, sparse matrix} shape (n_samples, n_features)
            Training data.

        Returns
        -------
        self : BernoulliRBM
            The fitted model.
        """
        X, = check_arrays(X, sparse_format='csr', dtype=np.float)
        n_samples = X.shape[0]
        rng = check_random_state(self.random_state)

        self.components_ = np.asarray(rng.normal(
            0, 0.01, (self.n_components, X.shape[1])),
                                      order='fortran')
        self.intercept_hidden_ = np.zeros(self.n_components, )
        self.intercept_visible_ = np.zeros(X.shape[1], )
        self.h_samples_ = np.zeros((self.batch_size, self.n_components))

        n_batches = int(np.ceil(float(n_samples) / self.batch_size))
        batch_slices = list(
            gen_even_slices(n_batches * self.batch_size, n_batches, n_samples))

        for iteration in xrange(1, self.n_iter + 1):
            for batch_slice in batch_slices:
                self._fit(X[batch_slice], rng)
        return self
Example #4
File: bag.py  Project: orazaro/kgml
    def predict(self, X):
        """Predict regression target for X.

The predicted regression target of an input sample is computed as the
mean predicted regression targets of the estimators in the ensemble.

Parameters
----------
X : array-like of shape = [n_samples, n_features]
The input samples.

Returns
-------
y : array of shape = [n_samples]
The predicted values.
"""
        # Check data
        X, = check_arrays(X)

        # Parallel loop
        n_jobs, n_estimators, starts = _partition_estimators(self)

        all_y_hat = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
            delayed(_parallel_predict_regression)(
                self.estimators_[starts[i]:starts[i + 1]],
                self.estimators_features_[starts[i]:starts[i + 1]],
                X)
            for i in range(n_jobs))

        # Reduce
        y_hat = sum(all_y_hat) / self.n_estimators

        return y_hat
Example #5
    def fit(self, X, y=None):
        """Fit the model to the data X.

        Parameters
        ----------
        X : {array-like, sparse matrix} shape (n_samples, n_features)
            Training data.

        Returns
        -------
        self : BernoulliRBM
            The fitted model.
        """
        X, = check_arrays(X, sparse_format='csr', dtype=np.float)
        n_samples = X.shape[0]
        rng = check_random_state(self.random_state)

        self.components_ = np.asarray(
            rng.normal(0, 0.01, (self.n_components, X.shape[1])),
            order='fortran')
        self.intercept_hidden_ = np.zeros(self.n_components, )
        self.intercept_visible_ = np.zeros(X.shape[1], )
        self.h_samples_ = np.zeros((self.batch_size, self.n_components))

        n_batches = int(np.ceil(float(n_samples) / self.batch_size))
        batch_slices = list(gen_even_slices(n_batches * self.batch_size,
                                            n_batches, n_samples))

        for iteration in xrange(1, self.n_iter + 1):
            for batch_slice in batch_slices:
                self._fit(X[batch_slice], rng)
        return self
Example #6
    def plot_vs_cut(self, y_true, proba, sample_weight=None):
        """
        Compute metric for each possible prediction threshold

        :param y_true: array-like true labels
        :param proba: array-like of shape [n_samples, 2] with predicted probabilities
        :param sample_weight: array-like weight

        :rtype: plotting.FunctionsPlot
        """
        from .. import plotting

        y_true, proba, sample_weight = check_arrays(y_true, proba,
                                                    sample_weight)
        ordered_proba, metrics_val = self.compute(y_true, proba, sample_weight)
        ind = numpy.argmax(metrics_val)

        print('Optimal cut=%1.4f, quality=%1.4f' %
              (ordered_proba[ind], metrics_val[ind]))

        plot_fig = plotting.FunctionsPlot(
            {self.metric.__name__: (ordered_proba, metrics_val)})
        plot_fig.xlabel = 'cut'
        plot_fig.ylabel = 'metrics ' + self.metric.__name__
        return plot_fig
Example #7
    def predict_proba(self, X):
        """Predict class probabilities for X.

        The predicted class probabilities of an input sample are computed as
        the predicted class probabilities of the underlying estimators. If the
        underlying estimator does not implement a ``predict_proba`` method,
        a ``NotImplementedError`` is raised.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape = [n_samples, n_features]
            The training input samples. Sparse matrices are accepted only if
            they are supported by the base estimator.

        Returns
        -------
        proba : array of shape = [n_samples, n_classes]
            The class probabilities of the input samples. The order of the
            classes corresponds to that in the attribute `classes_`.
        """
        # Check data
        X, = check_arrays(X)

        if not hasattr(self.estimator_, "predict_proba"):
            raise NotImplementedError(
                "Underlying estimator of class %s has no attribute "
                "``predict_proba``." % self.estimator_.__class__.__name__)

        return self.estimator_.predict_proba(X)
Example #8
    def predict(self, X):
        """Predict regression target for X.

The predicted regression target of an input sample is computed as the
mean predicted regression targets of the estimators in the ensemble.

Parameters
----------
X : array-like of shape = [n_samples, n_features]
The input samples.

Returns
-------
y : array of shape = [n_samples]
The predicted values.
"""
        # Check data
        X, = check_arrays(X)

        # Parallel loop
        n_jobs, n_estimators, starts = _partition_estimators(self)

        all_y_hat = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
            delayed(_parallel_predict_regression)(
                self.estimators_[starts[i]:starts[i + 1]],
                self.estimators_features_[starts[i]:starts[i + 1]], X)
            for i in range(n_jobs))

        # Reduce
        y_hat = sum(all_y_hat) / self.n_estimators

        return y_hat
Example #9
def loadMultiData(filenames,
                  weights,
                  weight_types,
                  isMergeFeatures=False,
                  data_size=-1):
    print "Loading data from multiple files: weights=%s, types=%s" % (
        str(weights), str(weight_types))
    datas = []
    for index in range(0, len(filenames)):
        filename = filenames[index]
        weight_type = weight_types[index]
        data = loadData(filename, weight_type=weight_type, data_size=data_size)
        datas.append(data)
    # Combine multiple matrices
    # Does not merge features. Just concatenate all feature spaces
    if not isMergeFeatures:
        combined = datas[0] * weights[0]
        for i in range(1, len(datas)):
            data = datas[i]
            data = data * weights[i]
            combined = sp.hstack([combined, data])
        combined = check_arrays(
            combined, sparse_format="csr", copy=False,
            dtype=np.float64)[0]  # convert to the type: csr sparse matrix
        # Changed: normalization is done in each method
        # combined_norms = normalize(combined,'l2',axis=1,copy=False) # Squared euclidean norm of each data point.
        return combined  #.astype('f') #convert type to float32 to save space
    else:  # The merging-features version is not implemented yet.
        pass
Example #10
    def _transform(self, X):
        """Assumes X contains only categorical features."""
        X = check_arrays(X, sparse_format='csc', allow_nans=True)[0]
        n_samples, n_features = X.shape

        indices = self.feature_indices_
        if n_features != len(indices):
            raise ValueError("X has different shape than during fitting."
                             " Expected %d, got %d."
                             % (len(indices), n_features))

        row_indices = np.tile(np.arange(n_samples, dtype=np.int32),
                              n_features)

        data = []
        column_indices = []

        for idx, feature in enumerate(range(n_features)):
            offset = np.sum(self.n_values[:idx+1])
            feature_indices_idx = self.feature_indices_[idx]
            column_indices_idx = [feature_indices_idx.get(x, offset)
                                  for x in X[:,idx]]
            data_idx = [1 if feature_indices_idx.get(x) is not None else 0
                        for x in X[:, idx]]

            column_indices.extend(column_indices_idx)
            data.extend(data_idx)

        out = sparse.coo_matrix((data, (row_indices, column_indices)),
                                shape=(n_samples, np.sum(self.n_values)),
                                dtype=self.dtype).tocsr()

        return out if self.sparse else out.toarray()
Example #11
    def transform(self, X, y=None, copy=None):
        """Perform standardization by centering and scaling

        Parameters
        ----------
        X : array-like with shape [n_samples, n_features]
            The data used to scale along the features axis.
        """
        copy = copy if copy is not None else self.copy
        X = check_arrays(X, copy=copy, sparse_format="csc")[0]
        if warn_if_not_float(X, estimator=self):
            X = X.astype(np.float)
        if sparse.issparse(X):
            if self.center_sparse:
                for i in range(X.shape[1]):
                    X.data[X.indptr[i]:X.indptr[i + 1]] -= self.mean_[i]
            elif self.with_mean:
                raise ValueError(
                    "Cannot center sparse matrices: pass `with_mean=False` "
                    "instead. See docstring for motivation and alternatives.")
            else:
                pass

            if self.std_ is not None:
                inplace_column_scale(X, 1 / self.std_)
        else:
            if self.with_mean:
                X -= self.mean_
            if self.with_std:
                X /= self.std_
        return X
Example #12
File: mape.py  Project: martin1/thesis
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = check_arrays(y_true, y_pred)

    ## Note: does not handle mix 1d representation
    #if _is_1d(y_true): 
    #    y_true, y_pred = _check_1d_array(y_true, y_pred)

    return round(np.mean(np.abs((y_true - y_pred) / y_true)) * 100, 2)
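For orientation, a self-contained NumPy sketch of the same MAPE computation on toy data (no check_arrays needed), assuming y_true contains no zeros:

import numpy as np

y_true = np.array([100.0, 200.0, 300.0])
y_pred = np.array([110.0, 190.0, 330.0])
# mean(|(y_true - y_pred) / y_true|) * 100, rounded to two decimals as above
mape = round(np.mean(np.abs((y_true - y_pred) / y_true)) * 100, 2)
print(mape)  # 8.33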
Example #13
    def mean_absolute_percentage_error(y_true, y_pred):
        y_true, y_pred = check_arrays(y_true, y_pred)

        ## Note: does not handle mix 1d representation
        #if _is_1d(y_true):
        #    y_true, y_pred = _check_1d_array(y_true, y_pred)

        return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
Example #14
    def fit(self, X, y=None):
        """Don't trust the documentation of this module!

        Compute the mean and std to be used for later scaling.

        Parameters
        ----------
        X : array-like or CSR matrix with shape [n_samples, n_features]
            The data used to compute the mean and standard deviation
            used for later scaling along the features axis.
        """
        X = check_arrays(X, copy=self.copy, sparse_format="csc")[0]
        if warn_if_not_float(X, estimator=self):
            X = X.astype(np.float)
        if sparse.issparse(X):
            if self.center_sparse:
                means = []
                vars = []

                # This only works for csc matrices...
                for i in range(X.shape[1]):
                    if X.indptr[i] == X.indptr[i + 1]:
                        means.append(0)
                        vars.append(1)
                    else:
                        vars.append(X.data[X.indptr[i]:X.indptr[i + 1]].var())
                        # If the variance is 0, set all occurrences of this
                        # feature to 1
                        means.append(X.data[X.indptr[i]:X.indptr[i + 1]].mean())
                        if 0.0000001 >= vars[-1] >= -0.0000001:
                            means[-1] -= 1

                self.std_ = np.sqrt(np.array(vars))
                self.std_[np.array(vars) == 0.0] = 1.0
                self.mean_ = np.array(means)

                return self
            elif self.with_mean:
                raise ValueError(
                    "Cannot center sparse matrices: pass `with_mean=False` "
                    "instead. See docstring for motivation and alternatives.")
            else:
                self.mean_ = None

            if self.with_std:
                var = mean_variance_axis0(X)[1]
                self.std_ = np.sqrt(var)
                self.std_[var == 0.0] = 1.0
            else:
                self.std_ = None
            return self
        else:
            self.mean_, self.std_ = _mean_and_std(X,
                                                  axis=0,
                                                  with_mean=self.with_mean,
                                                  with_std=self.with_std)
        return self
Example #15
def loadData(filename,
             weight_type="count",
             feature_perc=1.0,
             data_size=-1,
             items_subset=set()):
    """
    filename: the name of the data file
    weight_type: weight type of the tokens: "count", "tf-idf", "tf"
    feature_perc: percentage of features to select according to their weighted
        values (count: number of documents containing the feature; tf-idf: the tf-idf value)
    data_size: number of data points to select; the default -1 selects all data
    items_subset: only read lines whose numbers are in items_subset, a set of line numbers
    """
    weight_type = weight_type.lower()
    print "Loading data from %s, weight_type = %s" % (filename, weight_type)
    raw_data = []
    lines = file(filename, "r").readlines()
    i = 0
    for line in lines:
        if len(items_subset) == 0 or (i in items_subset):
            line = line.replace('\n', '')
            arr = ast.literal_eval(line.split("|")[2])  #converted to list
            raw_data.append(arr)
            if (data_size > 0 and i >= data_size): break
        i = i + 1
    data, feature_names, features_score = fit_transform(
        raw_data, weight_type=weight_type
    )  # type of data is <class 'scipy.sparse.coo.coo_matrix'>
    #write_features("%s_features" %filename, feature_names)
    # feature selection
    # Note: after selection, the vocabulary of feature names changes; it is not updated here yet.
    if feature_perc < 1:
        n_features = data.shape[1]
        upbound = n_features * feature_perc
        dict_feat_score = {}
        for i in range(0, len(features_score)):
            dict_feat_score[i] = features_score[i]
        sorted_list = sorted(
            dict_feat_score.items(), key=lambda d: d[1], reverse=True
        )  #Sort the dict by its values, descending(reverse=True),ascending(reverse=False)
        selected_features = []
        i = 0
        for key, value in sorted_list:
            selected_features.append(key)
            i = i + 1
            if i > upbound:
                break
        data = data.tocsc()
        data = data[:, selected_features]
        print "\t Selected top %d(%f) from %d features:" % (
            len(selected_features), feature_perc, n_features)
    # print vectorizer.get_features_tfidf()
    # print vectorizer.vocabulary_
    data = check_arrays(
        data, sparse_format="csr", copy=False,
        dtype=np.float64)[0]  # convert to the type: csr sparse matrix
    # Change: Normalization is done in each method
    # data_norms = normalize(data,'l2',axis=1,copy=False) # Squared euclidean norm(l2-norm) of each data point.
    return data  #.astype('f') #convert type to float32 to save space
Example #16
    def fit(self, X, y):
        """Fit MLP Classifier according to X, y

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.

        y : array-like, shape = [n_samples] or [n_samples, n_classes]
        Target values. It determines the problem type.

        *binary*
        If y is a vector of integers with two unique values.

        *multiclass*
        If y is a vector of integers with three or more values
        or if y is a two-dimensional array of integers and there exists only
        one non-zero element per row.

        *multiclass-multioutput*
        If y is two-dimensional array of integers with two unique values
        and there exists more than one non-zero element per row.

        *continuous*
        If y is a vector of floats.

        *continuous-multioutput*
        If y is a two-dimensional array of floats.

        Returns
        -------
        self : object
        Returns self.
        """
        X, = check_arrays(X, sparse_format='dense')

        n_samples, self.input_size_ = X.shape

        y = np.atleast_1d(y)

        self.type_of_target_ = type_of_target(y)
        if self.verbose > 0:
            print("The inferred type of y is %s" % self.type_of_target_)
        if self.type_of_y is not None:
            if self.type_of_y != self.type_of_target_:
                raise ValueError("Passed type of y is %s, inferred type is %s"
                                 % (self.type_of_y, self.type_of_target_))

        self.check_type_implemented()
        y = self._get_output(y)
        X, y = self._scale(X, y)
        self._inst_mlp()
        self._fit_mlp(X, y)
        if self.dropout and self.type_of_target_ in ['continuous',
                                                     'continuous-multioutput']:
            self._lineregress(X, y)
        return self
Example #17
    def fit(self, X, y, store_covariances=False, tol=1.0e-4):
        """
        Fit the QDA model according to the given training data and parameters.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.

        y : array, shape = [n_samples]
            Target values (integers)

        store_covariances : boolean
            If True the covariance matrices are computed and stored in the
            `self.covariances_` attribute.
        """
        X, y = check_arrays(X, y)
        self.classes_, y = unique(y, return_inverse=True)
        n_samples, n_features = X.shape
        n_classes = len(self.classes_)
        if n_classes < 2:
            raise ValueError('y has less than 2 classes')
        if self.priors is None:
            self.priors_ = np.bincount(y) / float(n_samples)
        else:
            self.priors_ = self.priors

        cov = None
        if store_covariances:
            cov = []
        means = []
        scalings = []
        rotations = []
        for ind in xrange(n_classes):
            Xg = X[y == ind, :]
            meang = Xg.mean(0)
            means.append(meang)
            Xgc = Xg - meang
            # Xgc = U * S * V.T
            U, S, Vt = np.linalg.svd(Xgc, full_matrices=False)
            rank = np.sum(S > tol)
            if rank < n_features:
                warnings.warn("Variables are collinear")
            S2 = (S**2) / (len(Xg) - 1)
            if store_covariances:
                # cov = V * (S^2 / (n-1)) * V.T
                cov.append(np.dot(S2 * Vt.T, Vt))
            scalings.append(S2)
            rotations.append(Vt.T)
        if store_covariances:
            self.covariances_ = cov
        self.means_ = np.asarray(means)
        self.scalings_ = np.asarray(scalings)
        self.rotations_ = rotations
        return self
    def fit_all(self, X, y, n_shop, last_obs_plan):
        # if not warmstart - clear the estimator state
        if not self.warm_start:
            self._clear_state()

        # Check input
        X, = check_arrays(X, dtype=DTYPE, sparse_format="dense")
        y = column_or_1d(y, warn=True)
        n_samples, n_features = X.shape
        self.n_features = n_features
        random_state = check_random_state(self.random_state)
        self._check_params()

        if not self._is_initialized():
            if self.verbose:
                print 'Initializing gradient boosting...'
            # init state
            self._init_state()

            # fit initial model
            if not self.fix_history:
                idx = get_truncated_shopping_indices(n_shop)
            else:
                idx = np.arange(len(n_shop))

            # init predictions by averaging over the shopping histories
            y_pred = self.init_.predict(last_obs_plan[idx])
            print 'First training accuracy:', accuracy_score(y, y_pred.argmax(axis=1))
            begin_at_stage = 0
        else:
            # add more estimators to fitted model
            # invariant: warm_start = True
            if self.n_estimators < self.estimators_.shape[0]:
                raise ValueError('n_estimators=%d must be larger or equal to '
                                 'estimators_.shape[0]=%d when '
                                 'warm_start==True'
                                 % (self.n_estimators,
                                    self.estimators_.shape[0]))
            begin_at_stage = self.estimators_.shape[0]
            y_pred = self.decision_function(X)
            self._resize_state()

        # fit the boosting stages
        n_stages = self._fit_stages(X, y, y_pred, random_state, begin_at_stage, n_shop)
        # change shape of arrays after fit (early-stopping or additional tests)
        if n_stages != self.estimators_.shape[0]:
            self.estimators_ = self.estimators_[:n_stages]
            self.train_score_ = self.train_score_[:n_stages]
            if hasattr(self, 'oob_improvement_'):
                self.oob_improvement_ = self.oob_improvement_[:n_stages]
            if hasattr(self, '_oob_score_'):
                self._oob_score_ = self._oob_score_[:n_stages]

        return self
Example #19
 def inverse_transform(self, X):
     """Undo the scaling of X according to feature_range.
     Parameters
     ----------
     X : array-like with shape [n_samples, n_features]
     Input data that will be transformed.
     """
     X = check_arrays(X, sparse_format="dense", copy=self.copy)[0]
     X -= self.min_
     X /= self.scale_
     return X
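For comparison, a minimal round-trip sketch using the current sklearn MinMaxScaler API, whose min_ and scale_ attributes play the same role as in the method above:

import numpy as np
from sklearn.preprocessing import MinMaxScaler

X = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
scaler = MinMaxScaler(feature_range=(0, 1)).fit(X)
X_scaled = scaler.transform(X)                      # X * scale_ + min_
X_back = (X_scaled - scaler.min_) / scaler.scale_   # same arithmetic as inverse_transform above
assert np.allclose(X_back, X)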
Example #20
File: mrmr.py  Project: kemaleren/sklmrmr
    def fit(self, X, y):
        self._validate()
        X, y = check_arrays(X, y, sparse_format="csc")
        n_samples, n_features = X.shape

        # discretize continuous features
        if np.issubdtype(X.dtype, float):
            X_new = X.astype(np.int)
            if np.any(X_new != X):
                raise ValueError('X could not safely be converted to integers.'
                                 ' MRMR does not support continuous values.')
            X = X_new

        if np.issubdtype(y.dtype, float):
            y_new = y.astype(np.int)
            if np.any(y_new != y):
                raise ValueError('y could not safely be converted to integers.'
                                 ' MRMR does not support continuous values.')
            y = y_new

        if self.k is None:
            k = n_features // 2
        else:
            k = self.k

        X_classes = np.array(list(set(X.reshape((n_samples * n_features,)))))
        y_classes = np.array(list(set(y.reshape((n_samples,)))))

        if len(X_classes) > self.warn_limit:
            print('Warning: X contains {} discrete values. MRMR may'
                  ' run slow'.format(len(X_classes)))
        if len(y_classes) > self.warn_limit:
            print('Warning: y contains {} discrete values. MRMR may'
                  ' run slow'.format(len(y_classes)))

        method = self.methods[self.method]
        idxs, _ = _mrmr(n_samples, n_features, y.astype(np.long),
                        X.astype(np.long), y_classes.astype(np.long),
                        X_classes.astype(np.long), y_classes.shape[0],
                        X_classes.shape[0], k, method, self.normalize)

        support_ = np.zeros(n_features, dtype=np.bool)
        ranking_ = np.ones(n_features, dtype=np.int) + k

        support_[idxs] = True
        for i, idx in enumerate(idxs, start=1):
            ranking_[idx] = i

        self.n_features_ = support_.sum()
        self.support_ = support_
        self.ranking_ = ranking_
        self.selected_ = np.argsort(self.ranking_)[:self.n_features_]

        return self
Example #21
    def fit(self, X):
        """Fit the model to the data X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.

        Returns
        -------
        self : convolutionRBM
            The fitted model.
        """
        X, = check_arrays(X, sparse_format='csr', dtype=np.float)
        n_samples = X.shape[0]
        rng = check_random_state(self.random_state)

        self.components_ = np.asarray(rng.normal(
            0, 0.001, (self.n_groups, self.window_size * self.window_size)),
                                      order='fortran')
        self.intercept_hidden_ = np.zeros(self.n_groups)

        self.intercept_visible_ = 0

        self.h_samples_ = np.zeros((self.batch_size, self.n_components))

        n_batches = int(np.ceil(float(n_samples) / self.batch_size))
        batch_slices = list(
            self.gen_even_slices(n_batches * self.batch_size, n_batches,
                                 n_samples))

        verbose = self.verbose
        begin = time.time()  # start timer for the verbose report below

        for iteration in xrange(1, self.n_iter + 1):
            reconstructError = 0
            for batch_slice in batch_slices:
                if (not self.use_theano):
                    reconstructError += self._fit(X[batch_slice], rng)
                else:
                    reconstructError += self._fit_theano(X[batch_slice], rng)

            print "step:", iteration, "reconstruct Error: ", reconstructError

            if verbose:
                end = time.time()
                print(
                    "[%s] Iteration %d, pseudo-likelihood = %.2f,"
                    " time = %.2fs" %
                    (type(self).__name__, iteration,
                     self.score_samples(X).mean(), end - begin))
                begin = end

        return self
Example #22
def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None):

    y_true, y_pred = check_arrays(y_true, y_pred)
    assert(beta > 0)
    if labels is None:
        labels = unique_labels(y_true, y_pred)
    else:
        labels = np.asarray(labels, dtype=np.int)

    n_labels = labels.size
    true_pos = np.zeros(n_labels, dtype=np.double)
    false_pos = np.zeros(n_labels, dtype=np.double)
    false_neg = np.zeros(n_labels, dtype=np.double)
    support = np.zeros(n_labels, dtype=np.long)

    for i, label_i in enumerate(labels):
        true_pos[i] = np.sum(y_pred[y_true == label_i] == label_i)
        false_pos[i] = np.sum(y_pred[y_true != label_i] == label_i)
        false_neg[i] = np.sum(y_pred[y_true == label_i] != label_i)
        support[i] = np.sum(y_true == label_i)

    try:
        # oddly, we may get an "invalid" rather than a "divide" error here
        old_err_settings = np.seterr(divide='ignore', invalid='ignore')

        # precision and recall
        # Micro-averaging is used
        precision = true_pos.sum() / (true_pos.sum() + false_pos.sum())
        recall = true_pos.sum() / (true_pos.sum() + false_neg.sum())

        # print false_pos
        # print false_neg
        # print false_pos.sum()
        # print false_neg.sum()

        # # handle division by 0.0 in precision and recall
        # precision[(true_pos + false_pos) == 0.0] = 0.0
        # recall[(true_pos + false_neg) == 0.0] = 0.0

        # fbeta score
        beta2 = beta ** 2
        fscore = (1 + beta2) * (precision * recall) / (
            beta2 * precision + recall)
        # print (beta2 * precision + recall)

        # handle division by 0.0 in fscore
        if (precision + recall) == 0.0:
            fscore = 0.0
        # fscore[(precision + recall) == 0.0] = 0.0
    finally:
        np.seterr(**old_err_settings)

    return precision, recall, fscore, support
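A hedged usage sketch for the micro-averaged variant above; it assumes the function and its imports (numpy as np, check_arrays, unique_labels) are in scope:

y_true = np.array([0, 1, 1, 0, 1, 2])
y_pred = np.array([0, 1, 0, 0, 1, 2])
precision, recall, fscore, support = precision_recall_fscore_support(y_true, y_pred)
# With micro-averaging on a single-label problem, precision == recall == accuracy (5/6 here).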
Example #23
def test_check_arrays():
    # check that error is raised on different length inputs
    X = [0, 1]
    Y = np.arange(3)
    assert_raises(ValueError, check_arrays, X, Y)

    # check error for sparse matrix and array
    X = sp.csc_matrix(np.arange(4))
    assert_raises(ValueError, check_arrays, X, Y)

    # check the y=None pattern
    X = [0, 1, 2]
    X_, Y_, Z_ = check_arrays(X, Y, None)
    assert_true(Z_ is None)

    # check that lists are converted
    X_, Y_ = check_arrays(X, Y)
    assert_true(isinstance(X_, np.ndarray))
    assert_true(isinstance(Y_, np.ndarray))

    # check that Y was not copied:
    assert_true(Y_ is Y)

    # check copying
    X_, Y_ = check_arrays(X, Y, copy=True)
    assert_false(Y_ is Y)

    # check forcing dtype
    X_, Y_ = check_arrays(X, Y, dtype=np.int)
    assert_equal(X_.dtype, np.int)
    assert_equal(Y_.dtype, np.int)

    X_, Y_ = check_arrays(X, Y, dtype=np.float)
    assert_equal(X_.dtype, np.float)
    assert_equal(Y_.dtype, np.float)

    # test check_ccontiguous
    Y = np.arange(6).reshape(3, 2).copy('F')
    # if we don't specify it, it is not changed
    X_, Y_ = check_arrays(X, Y)
    assert_true(Y_.flags['F_CONTIGUOUS'])
    assert_false(Y_.flags['C_CONTIGUOUS'])

    X_, Y_ = check_arrays(X, Y, check_ccontiguous=True)
    assert_true(Y_.flags['C_CONTIGUOUS'])
    assert_false(Y_.flags['F_CONTIGUOUS'])

    # check that lists are passed through if allow_lists is true
    X_, Y_ = check_arrays(X, Y, allow_lists=True)
    assert_true(isinstance(X_, list))
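check_arrays was removed in later scikit-learn releases; as a rough sketch for orientation only (not a drop-in substitute), the closest modern counterparts are check_array and check_X_y:

import numpy as np
from sklearn.utils import check_array, check_X_y

X = [[0.0, 1.0], [2.0, 3.0], [4.0, 5.0]]
y = [0, 1, 0]
X_arr = check_array(X, accept_sparse='csr', dtype=np.float64)  # lists are converted to ndarrays
X_arr, y_arr = check_X_y(X, y)                                 # also checks consistent lengths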
Example #24
File: bag.py  Project: orazaro/kgml
    def predict_log_proba(self, X):
        """Predict class log-probabilities for X.

The predicted class log-probabilities of an input sample is computed as
the log of the mean predicted class probabilities of the base
estimators in the ensemble.

Parameters
----------
X : array-like of shape = [n_samples, n_features]
The input samples.

Returns
-------
p : array of shape = [n_samples, n_classes]
The class log-probabilities of the input samples. Classes are
ordered by arithmetical order.
"""
        if hasattr(self.base_estimator_, "predict_log_proba"):
            # Check data
            X, = check_arrays(X)

            if self.n_features_ != X.shape[1]:
                raise ValueError("Number of features of the model must "
                                 "match the input. Model n_features is {0} "
                                 "and input n_features is {1} "
                                 "".format(self.n_features_, X.shape[1]))

            # Parallel loop
            n_jobs, n_estimators, starts = _partition_estimators(self)

            all_log_proba = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
                delayed(_parallel_predict_log_proba)(
                    self.estimators_[starts[i]:starts[i + 1]],
                    self.estimators_features_[starts[i]:starts[i + 1]],
                    X,
                    self.n_classes_)
                for i in range(n_jobs))

            # Reduce
            log_proba = all_log_proba[0]

            for j in range(1, len(all_log_proba)):
                log_proba = logaddexp(log_proba, all_log_proba[j])

            log_proba -= np.log(self.n_estimators)

            return log_proba

        else:
            return np.log(self.predict_proba(X))
Example #25
    def cross_validate(self, k=10):
        """Performs a k-fold cross validation of our training data.

        Args:
            k: The number of folds for cross validation.
        """
        self.scores = []

        X, y = check_arrays(self.feature_vector,
                            self.classification_vector,
                            sparse_format='csr')
        cv = cross_validation.check_cv(k,
                                       self.feature_vector,
                                       self.classification_vector,
                                       classifier=True)

        for train, test in cv:
            self.classifier1.fit(self.feature_vector[train],
                                 self.classification_vector[train])
            self.classifier2.fit(self.feature_vector[train],
                                 self.classification_vector[train])
            self.classifier3.fit(self.feature_vector[train],
                                 self.classification_vector[train])
            classification1 = self.classifier1.predict(
                self.feature_vector[test])
            classification2 = self.classifier2.predict(
                self.feature_vector[test])
            classification3 = self.classifier3.predict(
                self.feature_vector[test])

            classification = []
            for predictions in zip(classification1, classification2,
                                   classification3):
                neutral_count = predictions.count(0)
                positive_count = predictions.count(1)
                negative_count = predictions.count(-1)
                if (neutral_count == negative_count
                        and negative_count == positive_count):
                    classification.append(predictions[0])
                elif (neutral_count > positive_count
                      and neutral_count > negative_count):
                    classification.append(0)
                elif (positive_count > neutral_count
                      and positive_count > negative_count):
                    classification.append(1)
                elif (negative_count > neutral_count
                      and negative_count > positive_count):
                    classification.append(-1)
            classification = numpy.array(classification)

            self.scores.append(self.score_func(y[test], classification))
Example #26
    def fit(self, X, winnerTakeAll, plList, y=None):
        """Fit the model to the data X.

        Parameters
        ----------
        X : {array-like, sparse matrix} shape (n_samples, n_features)
            Training data.

        Returns
        -------
        self : BernoulliRBM
            The fitted model.
        """
        X, = check_arrays(X, sparse_format='csr', dtype=np.float)
        n_samples = X.shape[0]
        rng = check_random_state(self.random_state)

        self.components_ = np.asarray(
            rng.normal(0, 0.001, (self.n_components, X.shape[1])),
            order='fortran')
        self.intercept_hidden_ = np.zeros(self.n_components, )
        self.intercept_visible_ = np.zeros(X.shape[1], )
        self.h_samples_ = np.zeros((self.batch_size, self.n_components))

        n_batches = int(np.ceil(float(n_samples) / self.batch_size))
        batch_slices = list(self.gen_even_slices(n_batches * self.batch_size,
                                                 n_batches, n_samples))
        verbose = self.verbose
        for iteration in xrange(self.n_iter):
            pl = 0.
            if verbose:
                begin = time.time()
	    
            batch_index = 0
            for batch_slice in batch_slices:
                if batch_index + 1 != n_batches - 1:
                    # next_batch = batch_slice
                    next_h_pos_mean_hidden = self._mean_hiddens(X[batch_index + 1])
                pl_batch = self._fit(X[batch_slice], rng, winnerTakeAll)
                if verbose:
                    pl += pl_batch.sum()
                    # self.printOutWeight()
                batch_index += 1

            if verbose:
                pl /= n_samples
                end = time.time()
                print("Iteration %d, pseudo-likelihood = %.2f, time = %.2fs"
                      % (iteration, pl, end - begin))
                plList[iteration] = pl
            # self.printOutWeight()
        return self
Example #27
def f_regression_cov(X, y, C):
    """Univariate linear regression tests

    Quick linear model for testing the effect of a single regressor,
    sequentially for many regressors.

    This is done in 3 steps:
    1. the regressor of interest and the data are orthogonalized
    wrt constant regressors
    2. the cross correlation between data and regressors is computed
    3. it is converted to an F score then to a p-value

    Parameters
    ----------
    X : {array-like, sparse matrix}  shape = (n_samples, n_features)
        The set of regressors that will be tested sequentially.

    y : array of shape (n_samples,)
        The data matrix.

    C : {array-like, sparse matrix}  shape = (n_samples, n_covariates)
        The set of covariates.


    Returns
    -------
    F : array, shape=(n_features,)
        F values of features.

    pval : array, shape=(n_features,)
        p-values of F-scores.
    """

    X, C, y = check_arrays(X, C, y, dtype=np.float)
    y = y.ravel()

    assert C.shape[1] < C.shape[0]
    cpinv = np.linalg.pinv(C)
    X -= np.dot(C,(np.dot(cpinv, X)))
    y -= np.dot(C,(np.dot(cpinv, y)))

    # compute the correlation
    corr = np.dot(y, X)
    corr /= np.asarray(np.sqrt(safe_sqr(X).sum(axis=0))).ravel()
    corr /= np.asarray(np.sqrt(safe_sqr(y).sum())).ravel()

    # convert to p-value
    dof = (X.shape[0] - 1 - C.shape[1]) / (1) #(df_fm / (df_rm - df_fm))
    F = corr ** 2 / (1 - corr ** 2) * dof
    pv = stats.f.sf(F, 1, dof)
    return F, pv
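A hedged usage sketch for f_regression_cov above, assuming its imports (numpy as np, scipy.stats as stats, check_arrays, safe_sqr) are available and an older scikit-learn is installed; the data below are illustrative only:

rng = np.random.RandomState(0)
X = rng.randn(100, 5)
C = np.ones((100, 1))                     # a single constant covariate
y = 2.0 * X[:, 0] + 0.1 * rng.randn(100)
F, pval = f_regression_cov(X, y, C)       # feature 0 should receive the largest F value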
Example #28
    def partial_fit(self, X, y=None, weights=None):
        """Update k means estimate on a single mini-batch X.

        Parameters
        ----------
        X: array-like, shape = [n_samples, n_features]
            Coordinates of the data points to cluster.
        """

        X = check_arrays(X, sparse_format="csr", copy=False)[0]
        n_samples, n_features = X.shape
        if hasattr(self.init, '__array__'):
            self.init = np.ascontiguousarray(self.init, dtype=np.float64)

        if n_samples == 0:
            return self

        x_squared_norms = _squared_norms(X)
        self.random_state_ = check_random_state(self.random_state)
        if (not hasattr(self, 'counts_')
                or not hasattr(self, 'cluster_centers_')):
            # this is the first call partial_fit on this object:
            # initialize the cluster centers
            self.cluster_centers_ = _init_centroids(
                X, self.n_clusters, self.init,
                random_state=self.random_state_,
                x_squared_norms=x_squared_norms, init_size=self.init_size, weights=weights)

            self.initial_cluster_centers_ = self.cluster_centers_.copy()

            self.counts_ = np.zeros(self.n_clusters, dtype=np.int32)
            random_reassign = False
        else:
            # The lower the minimum count is, the more we do random
            # reassignment, however, we don't want to do random
            # reassignment too often, to allow for building up counts
            random_reassign = self.random_state_.randint(
                10 * (1 + self.counts_.min())) == 0

        _mini_batch_step(X, x_squared_norms, self.cluster_centers_,
                         self.counts_, np.zeros(0, np.double), 0,
                         random_reassign=random_reassign,
                         random_state=self.random_state_,
                         reassignment_ratio=self.reassignment_ratio,
                         verbose=self.verbose, weights=weights, sphered=self.sphered)

        if self.compute_labels:
            self.labels_, self.inertia_ = _labels_inertia(
                X, x_squared_norms, self.cluster_centers_, weights=weights)

        return self
def _check_clf_targets(y_true, y_pred):
    """Check that y_true and y_pred belong to the same classification task

    This converts multiclass or binary types to a common shape, and raises a
    ValueError for a mix of multilabel and multiclass targets, a mix of
    multilabel formats, for the presence of continuous-valued or multioutput
    targets, or for targets of different lengths.

    Column vectors are squeezed to 1d.

    Parameters
    ----------
    y_true : array-like,

    y_pred : array-like

    Returns
    -------
    type_true : one of {'multilabel-indicator', 'multilabel-sequences', \
    'multiclass', 'binary'}
    The type of the true target data, as output by
    ``utils.multiclass.type_of_target``

    y_true : array or indicator matrix or sequence of sequences

    y_pred : array or indicator matrix or sequence of sequences
    """

    y_true, y_pred = check_arrays(y_true, y_pred, allow_lists=True)
    type_true = type_of_target(y_true)
    type_pred = type_of_target(y_pred)

    y_type = set([type_true, type_pred])
    if y_type == set(["binary", "multiclass"]):
        y_type = set(["multiclass"])

    if len(y_type) > 1:
        raise ValueError("Can't handle mix of {0} and {1}".format(type_true,
                                                                  type_pred))

    # We can't have more than one value on y_type => The set is no more needed
    y_type = y_type.pop()

    # No metrics support "multiclass-multioutput" format
    if y_type not in ["binary", "multiclass", "multilabel-indicator", "multilabel-sequences"]:
        raise ValueError("{0} is not supported".format(y_type))

    if y_type in ["binary", "multiclass"]:
        y_true = column_or_1d(y_true)
        y_pred = column_or_1d(y_pred)

    return y_type, y_true, y_pred
Example #30
    def predict_log_proba(self, X):
        """Predict class log-probabilities for X.

The predicted class log-probabilities of an input sample is computed as
the log of the mean predicted class probabilities of the base
estimators in the ensemble.

Parameters
----------
X : array-like of shape = [n_samples, n_features]
The input samples.

Returns
-------
p : array of shape = [n_samples, n_classes]
The class log-probabilities of the input samples. Classes are
ordered by arithmetical order.
"""
        if hasattr(self.base_estimator_, "predict_log_proba"):
            # Check data
            X, = check_arrays(X)

            if self.n_features_ != X.shape[1]:
                raise ValueError("Number of features of the model must "
                                 "match the input. Model n_features is {0} "
                                 "and input n_features is {1} "
                                 "".format(self.n_features_, X.shape[1]))

            # Parallel loop
            n_jobs, n_estimators, starts = _partition_estimators(self)

            all_log_proba = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
                delayed(_parallel_predict_log_proba)(
                    self.estimators_[starts[i]:starts[i + 1]],
                    self.estimators_features_[starts[i]:starts[i + 1]],
                    X,
                    self.n_classes_)
                for i in range(n_jobs))
                for i in range(n_jobs))

            # Reduce
            log_proba = all_log_proba[0]

            for j in range(1, len(all_log_proba)):
                log_proba = logaddexp(log_proba, all_log_proba[j])

            log_proba -= np.log(self.n_estimators)

            return log_proba

        else:
            return np.log(self.predict_proba(X))
Example #31
    def _fit_transform(self, X):
        self.nbrs_ = NearestNeighbors(self.n_neighbors,
                                      algorithm=self.neighbors_algorithm)

        random_state = check_random_state(self.random_state)
        X, = check_arrays(X, sparse_format='dense')
        self.nbrs_.fit(X)
        self.embedding_, self.reconstruction_error_ = \
            locally_linear_embedding(
                self.nbrs_, self.n_neighbors, self.n_components,
                eigen_solver=self.eigen_solver, tol=self.tol,
                max_iter=self.max_iter, method=self.method,
                hessian_tol=self.hessian_tol, modified_tol=self.modified_tol,
                random_state=random_state)
Example #32
    def fit(self, X):
        """Fit the model to the data X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.

        Returns
        -------
        self : convolutionRBM
            The fitted model.
        """
        X, = check_arrays(X, sparse_format='csr', dtype=np.float)
        n_samples = X.shape[0]
        rng = check_random_state(self.random_state)

        self.components_ = np.asarray(
            rng.normal(0, 0.001,
                       (self.n_groups, self.window_size * self.window_size)),
            order='fortran')
        self.intercept_hidden_ = np.zeros(self.n_groups)

        self.intercept_visible_ = 0

        self.h_samples_ = np.zeros((self.batch_size, self.n_components))

        n_batches = int(np.ceil(float(n_samples) / self.batch_size))
        batch_slices = list(self.gen_even_slices(n_batches * self.batch_size,
                                                 n_batches, n_samples))

        verbose = self.verbose
        begin = time.time()  # start timer for the verbose report below

        for iteration in xrange(1, self.n_iter + 1):
            reconstructError = 0
            for batch_slice in batch_slices:
                if not self.use_theano:
                    reconstructError += self._fit(X[batch_slice], rng)
                else:
                    reconstructError += self._fit_theano(X[batch_slice], rng)

            print "step:", iteration, "reconstruct Error: ", reconstructError

            if verbose:
                end = time.time()
                print("[%s] Iteration %d, pseudo-likelihood = %.2f,"
                      " time = %.2fs"
                      % (type(self).__name__, iteration,
                         self.score_samples(X).mean(), end - begin))
                begin = end

        return self
Example #33
    def fit(self, X, y=None):
        """Compute the minimum and maximum to be used for later scaling.
        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data used to compute the per-feature minimum and maximum
            used for later scaling along the features axis.
        """
        X = check_arrays(X, sparse_format="csc", copy=self.copy)[0]
        warn_if_not_float(X, estimator=self)
        feature_range = self.feature_range
        if feature_range[0] >= feature_range[1]:
            raise ValueError("Minimum of desired feature range must be smaller"
                             " than maximum. Got %s." % str(feature_range))
        if sparse.issparse(X):
            data_min = []
            data_max = []
            data_range = []
            for i in range(X.shape[1]):
                if X.indptr[i] == X.indptr[i + 1]:
                    data_min.append(0)
                    data_max.append(0)
                    data_range.append(0)
                else:
                    data_min.append(X.data[X.indptr[i]:X.indptr[i + 1]].min())
                    data_max.append(X.data[X.indptr[i]:X.indptr[i + 1]].max())
            data_min = np.array(data_min)
            data_max = np.array(data_max)
            data_range = data_max - data_min

        else:
            data_min = np.min(X, axis=0)
            data_range = np.max(X, axis=0) - data_min

        # Do not scale constant features
        if isinstance(data_range, np.ndarray):
            # For a sparse matrix, constant features will be set to one!
            if sparse.issparse(X):
                for i in range(len(data_min)):
                    if data_range[i] == 0.0:
                        data_min[i] = data_min[i] - 1
            data_range[data_range == 0.0] = 1.0
        elif data_range == 0.:
            data_range = 1.

        self.scale_ = (feature_range[1] - feature_range[0]) / data_range
        self.min_ = feature_range[0] - data_min * self.scale_
        self.data_range = data_range
        self.data_min = data_min
        return self
Example #34
    def cross_validate(self, k=10):
        """Performs a k-fold cross validation of our training data.

        Args:
            k: The number of folds for cross validation.
        """
        self.scores = []

        X, y = check_arrays(self.feature_vector,
                            self.classification_vector,
                            sparse_format='csr')
        cv = cross_validation.check_cv(
            k, self.feature_vector, self.classification_vector,
            classifier=True)

        for train, test in cv:
            self.classifier1.fit(self.feature_vector[train],
                          self.classification_vector[train])
            self.classifier2.fit(self.feature_vector[train],
                          self.classification_vector[train])
            self.classifier3.fit(self.feature_vector[train],
                          self.classification_vector[train])
            classification1 = self.classifier1.predict(
                self.feature_vector[test])
            classification2 = self.classifier2.predict(
                self.feature_vector[test])
            classification3 = self.classifier3.predict(
                self.feature_vector[test])

            classification = []
            for predictions in zip(classification1, classification2,
                                   classification3):
                neutral_count = predictions.count(0)
                positive_count = predictions.count(1)
                negative_count = predictions.count(-1)
                if (neutral_count == negative_count and
                    negative_count == positive_count):
                    classification.append(predictions[0])
                elif (neutral_count > positive_count and
                    neutral_count > negative_count):
                    classification.append(0)
                elif (positive_count > neutral_count and
                    positive_count > negative_count):
                    classification.append(1)
                elif (negative_count > neutral_count and
                    negative_count > positive_count):
                    classification.append(-1)
            classification = numpy.array(classification)

            self.scores.append(self.score_func(y[test], classification))
Example #35
def combineData(datas, weights, norm):
    """
    First normalize each view, then combine
    """
    combined = norm_data(datas[0], norm) * weights[0]
    for i in range(1, len(datas)):
        data = norm_data(datas[i], norm) * weights[i]
        combined = sp.hstack([combined, data])
    combined = check_arrays(
        combined, sparse_format="csr", copy=False,
        dtype=np.float64)[0]  # convert to the type: csr sparse matrix
    # Changed: normalization is done in each method
    # combined_norms = normalize(combined,'l2',axis=1,copy=False) # Squared euclidean norm of each data point.
    return combined  #.astype('f') #convert type to float32 to save space
def f_regression_nosparse(X, y, center=True):
    """Univariate linear regression tests

    Quick linear model for testing the effect of a single regressor,
    sequentially for many regressors.

    This is done in 3 steps:
    1. the regressor of interest and the data are orthogonalized
       with respect to constant regressors
    2. the cross correlation between data and regressors is computed
    3. it is converted to an F score then to a p-value

    Parameters
    ----------
    X : {array-like, sparse matrix}  shape = (n_samples, n_features)
        The set of regressors that will be tested sequentially.

    y : array of shape (n_samples,)
        The data matrix.

    center : bool, default True
        If True, X and y will be centered.

    Returns
    -------
    F : array, shape=(n_features,)
        F values of features.

    pval : array, shape=(n_features,)
        p-values of F-scores.
    """
    X, y = check_arrays(X, y, dtype=np.float)
    y = y.ravel()
    if center:
        y = y - np.mean(y)
        X = X.copy('F')  # faster in fortran
        X -= X.mean(axis=0)

    # compute the correlation
    corr = np.dot(y, X)
    # XXX could use corr /= row_norms(X.T) here, but the test doesn't pass
    corr /= np.asarray(np.sqrt((X ** 2).sum(axis=0))).ravel()
    corr /= norm(y)

    # convert to p-value
    degrees_of_freedom = y.size - (2 if center else 1)
    F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom
    pv = stats.f.sf(F, 1, degrees_of_freedom)
    return F, pv
Example #37
    def transform(self, X):
        """Compute the hidden layer activation probabilities, P(h=1|v=X).

        Parameters
        ----------
        X : {array-like, sparse matrix} shape (n_samples, n_features)
            The data to be transformed.

        Returns
        -------
        h : array, shape (n_samples, n_components)
            Latent representations of the data.
        """
        X, = check_arrays(X, sparse_format='csr', dtype=np.float)
        return self._mean_hiddens(X)
Example #38
    def transform(self, X):
        """Compute the hidden layer activation probabilities, P(h=1|v=X).

        Parameters
        ----------
        X : {array-like, sparse matrix} shape (n_samples, n_features)
            The data to be transformed.

        Returns
        -------
        h : array, shape (n_samples, n_components)
            Latent representations of the data.
        """
        X, = check_arrays(X, sparse_format='csr', dtype=np.float)
        return self._mean_hiddens(X)
    def fit(self, X):
        """Fit SGVB to the data

        Parameters
        ----------
        X : array-like, shape (N, n_features)
            The data that the SGVB needs to fit on

        Returns
        -------
        list_lowerbound : list of float
            The lower bound per iteration.
        """
        X, = check_arrays(X, sparse_format='csr', dtype=np.float)
        [N, dimX] = X.shape
        rng = check_random_state(self.random_state)

        self._initParams(dimX, rng)
        list_lowerbound = np.array([])

        n_batches = int(np.ceil(float(N) / self.batch_size))
        batch_slices = list(gen_even_slices(n_batches * self.batch_size,
                                            n_batches, N))

        if self.verbose:
            print "Initializing gradients for AdaGrad"
        for i in xrange(10):
            self._initH(X[batch_slices[i]], rng)

        begin = time.time()
        for iteration in xrange(1, self.n_iter + 1):
            iteration_lowerbound = 0

            for batch_slice in batch_slices:
                lowerbound = self._updateParams(X[batch_slice], N, rng)
                iteration_lowerbound += lowerbound

            if self.verbose:
                end = time.time()
                print("[%s] Iteration %d, lower bound = %.2f,"
                      " time = %.2fs"
                      % (self.__class__.__name__, iteration,
                         iteration_lowerbound / N, end - begin))
                begin = end

            list_lowerbound = np.append(
                list_lowerbound, iteration_lowerbound / N)
        return list_lowerbound
    def fit(self, X, y, sample_weight=None):
        """Fit Naive Bayes classifier according to X, y

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape = [n_samples]
            Target values.

        sample_weight : array-like, shape = [n_samples], optional
            Weights applied to individual samples (1. for unweighted).

        Returns
        -------
        self : object
            Returns self.
        """
        X, y = check_arrays(X, y, sparse_format='csr')
        X = X.astype(np.float)
        y = column_or_1d(y, warn=True)
        _, n_features = X.shape

        labelbin = LabelBinarizer()
        Y = labelbin.fit_transform(y)
        self.classes_ = labelbin.classes_
        if Y.shape[1] == 1:
            Y = np.concatenate((1 - Y, Y), axis=1)

        # convert to float to support sample weight consistently
        Y = Y.astype(np.float64)
        if sample_weight is not None:
            Y *= array2d(sample_weight).T

        class_prior = self.class_prior

        # Count raw events from data before updating the class log prior
        # and feature log probas
        n_effective_classes = Y.shape[1]
        self.class_count_ = np.zeros(n_effective_classes, dtype=np.float64)
        self.feature_count_ = np.zeros((n_effective_classes, n_features),
                                       dtype=np.float64)
        self._count(X, Y)
        self._update_feature_log_prob()
        self._update_class_log_prior(class_prior=class_prior)
        return self
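This fit() follows the discrete naive Bayes pattern that scikit-learn's MultinomialNB exposes publicly, so the sample_weight behaviour can be exercised directly with that class:

import numpy as np
from sklearn.naive_bayes import MultinomialNB

rng = np.random.RandomState(0)
X = rng.randint(0, 5, size=(100, 10))
y = rng.randint(0, 3, size=100)
w = rng.rand(100)                  # per-sample weights

clf = MultinomialNB().fit(X, y, sample_weight=w)
print(clf.classes_)                # classes learned from y, as via LabelBinarizer above
print(clf.class_count_)            # weighted class counts accumulated by _count()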
Example #41
    def fit(self, X, y, sample_weight=None):
        """Fit Naive Bayes classifier according to X, y

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape = [n_samples]
            Target values.

        sample_weight : array-like, shape = [n_samples], optional
            Weights applied to individual samples (1. for unweighted).

        Returns
        -------
        self : object
            Returns self.
        """
        X, y = check_arrays(X, y, sparse_format='csr')
        X = X.astype(np.float)
        y = column_or_1d(y, warn=True)
        _, n_features = X.shape

        labelbin = LabelBinarizer()
        Y = labelbin.fit_transform(y)
        self.classes_ = labelbin.classes_
        if Y.shape[1] == 1:
            Y = np.concatenate((1 - Y, Y), axis=1)

        # convert to float to support sample weight consistently
        Y = Y.astype(np.float64)
        if sample_weight is not None:
            Y *= array2d(sample_weight).T

        class_prior = self.class_prior

        # Count raw events from data before updating the class log prior
        # and feature log probas
        n_effective_classes = Y.shape[1]
        self.class_count_ = np.zeros(n_effective_classes, dtype=np.float64)
        self.feature_count_ = np.zeros((n_effective_classes, n_features),
                                       dtype=np.float64)
        self._count(X, Y)
        self._update_feature_log_prob()
        self._update_class_log_prior(class_prior=class_prior)
        return self
Example #42
    def fit(self, X):
        """Fit SGVB to the data

        Parameters
        ----------
        X : array-like, shape (N, n_features)
            The data that the SGVB needs to fit on

        Returns
        -------
        list_lowerbound : array of float
            Average lower bound per training iteration.
        """
        X, = check_arrays(X, sparse_format='csr', dtype=np.float)
        [N, dimX] = X.shape
        rng = check_random_state(self.random_state)

        self._initParams(dimX, rng)
        list_lowerbound = np.array([])

        n_batches = int(np.ceil(float(N) / self.batch_size))
        batch_slices = list(
            gen_even_slices(n_batches * self.batch_size, n_batches, N))

        if self.verbose:
            print "Initializing gradients for AdaGrad"
        for i in xrange(10):
            self._initH(X[batch_slices[i]], rng)

        begin = time.time()
        for iteration in xrange(1, self.n_iter + 1):
            iteration_lowerbound = 0

            for batch_slice in batch_slices:
                lowerbound = self._updateParams(X[batch_slice], N, rng)
                iteration_lowerbound += lowerbound

            if self.verbose:
                end = time.time()
                print(
                    "[%s] Iteration %d, lower bound = %.2f,"
                    " time = %.2fs" % (self.__class__.__name__, iteration,
                                       iteration_lowerbound / N, end - begin))
                begin = end

            list_lowerbound = np.append(list_lowerbound,
                                        iteration_lowerbound / N)
        return list_lowerbound
Example #43
0
 def transform(self, X):
     """Scaling features of X according to feature_range.
     Parameters
     ----------
     X : array-like with shape [n_samples, n_features]
     Input data that will be transformed.
     """
     X = check_arrays(X, sparse_format="csc", copy=self.copy)[0]
     if sparse.issparse(X):
         for i in range(X.shape[1]):
             X.data[X.indptr[i]:X.indptr[i + 1]] *= self.scale_[i]
             X.data[X.indptr[i]:X.indptr[i + 1]] += self.min_[i]
     else:
         X *= self.scale_
         X += self.min_
     return X
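A worked sketch of what the dense branch above computes: with scale_ = (range_max - range_min) / (data_max - data_min) and min_ = range_min - data_min * scale_ learned during fit, the transform is simply X * scale_ + min_ (plain NumPy, for illustration):

import numpy as np

X_train = np.array([[1., 10.], [2., 20.], [3., 30.]])
feature_range = (0., 1.)

data_min, data_max = X_train.min(axis=0), X_train.max(axis=0)
scale_ = (feature_range[1] - feature_range[0]) / (data_max - data_min)
min_ = feature_range[0] - data_min * scale_

X_new = np.array([[2., 25.]])
print(X_new * scale_ + min_)   # -> [[0.5  0.75]]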
Example #44
    def transform(self, X, y=None, copy=None):
        """
	Perform standardization by calculating percentile within trained data.

	Parameters
	----------
	X : array-like with shape [n_samples, n_features]
	The data used to scale along the features axis.
	"""
        copy = copy if copy is not None else self.copy
        X = check_arrays(X, copy=copy, sparse_format="csr")[0]
        if sp.issparse(X):
	    #TODO: implement for sparse arrays
	    pass
        else:
	    return (self.tform_func(X)/100)
Example #45
0
    def fit(self, X, y=None):
        """Fit the model to the data X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.

        Returns
        -------
        self : BernoulliRBM
            The fitted model.
        """
        X, = check_arrays(X, sparse_format='csc', dtype=np.float)
        n_samples = X.shape[0]
        rng = check_random_state(self.random_state)

        self.components_ = np.asarray(
            rng.normal(0, 0.01, (self.n_components, X.shape[1])),
            order='fortran')
        self.intercept_hidden_ = np.zeros(self.n_components, )
        self.intercept_visible_ = np.zeros(X.shape[1], )
        self.h_samples_ = np.zeros((self.batch_size, self.n_components))

        n_batches = int(np.ceil(float(n_samples) / self.batch_size))
        batch_slices = list(gen_even_slices(n_batches * self.batch_size,
                                            n_batches))
        verbose = self.verbose
        for iteration in xrange(self.n_iter):
            pl = 0.
            if verbose:
                begin = time.time()

            for batch_slice in batch_slices:
                pl_batch = self._fit(X[batch_slice], rng)

                if verbose:
                    pl += pl_batch.sum()

            if verbose:
                pl /= n_samples
                end = time.time()
                print("Iteration %d, pseudo-likelihood = %.2f, time = %.2fs"
                      % (iteration, pl, end - begin))

        return self
Example #46
    def fit(self, X, y, mask=None):
        """Fit Gaussian Naive Bayes according to X, y

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like, shape = [n_samples]
            Target values.
        mask : array-like, shape = [n_samples, n_features]
            Binary, 1 at unobserved features.

        Returns
        -------
        self : object
            Returns self.
        """
        X, y = check_arrays(X, y, sparse_format='dense')

        n_samples, n_features = X.shape

        if n_samples != y.shape[0]:
            raise ValueError("X and y have incompatible shapes")

        if mask is not None:
            mask = array2d(mask)
            X = X.copy()
            X[mask] = np.nan

        self.classes_ = unique_y = np.unique(y)
        n_classes = unique_y.shape[0]

        self.theta_ = np.zeros((n_classes, n_features))
        self.sigma_ = np.zeros((n_classes, n_features))
        self.class_prior_ = np.zeros(n_classes)
        self._n_ij = []
        epsilon = 1e-9
        for i, y_i in enumerate(unique_y):
            self.theta_[i, :] = bn.nanmean(X[y == y_i, :], axis=0)
            self.sigma_[i, :] = bn.nanvar(X[y == y_i, :], axis=0) + epsilon
            self.class_prior_[i] = np.float(np.sum(y == y_i)) / n_samples
            self._n_ij.append(-0.5 * np.sum(np.log(np.pi * self.sigma_[i, :])))
        self._logprior = np.log(self.class_prior_)
        return self
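A minimal sketch of the per-class statistics this fit computes when a mask is supplied: masked entries become NaN and are ignored by nan-aware mean/variance (np.nanmean/np.nanvar stand in for bn.nanmean/bn.nanvar here):

import numpy as np

X = np.array([[1., 2.], [3., 4.], [5., 6.]])
y = np.array([0, 0, 1])
mask = np.array([[0, 1], [0, 0], [0, 0]], dtype=bool)  # feature 1 of sample 0 unobserved

Xm = X.copy()
Xm[mask] = np.nan
for label in np.unique(y):
    theta = np.nanmean(Xm[y == label], axis=0)          # per-class feature means
    sigma = np.nanvar(Xm[y == label], axis=0) + 1e-9    # per-class variances + epsilon
    print(label, theta, sigma)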
Example #47
0
    def fit(self, X, y=None):
        """Fit the model to the data X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.

        Returns
        -------
        self : BernoulliRBM
            The fitted model.
        """
        X, = check_arrays(X, sparse_format='csc', dtype=np.float)
        n_samples = X.shape[0]
        rng = check_random_state(self.random_state)
        self.components_ = np.asarray(rng.normal(
            0, 0.01, (self.n_components, X.shape[1])),
                                      order='fortran')
        self.intercept_hidden_ = np.zeros(self.n_components, )
        self.intercept_visible_ = np.zeros(X.shape[1], )
        self.h_samples_ = np.zeros((self.batch_size, self.n_components))

        n_batches = int(np.ceil(float(n_samples) / self.batch_size))
        batch_slices = list(
            gen_even_slices(n_batches * self.batch_size, n_batches))
        verbose = self.verbose
        for iteration in xrange(self.n_iter):
            pl = 0.
            if verbose:
                begin = time.time()

            for batch_slice in batch_slices:
                pl_batch = self._fit(X[batch_slice], rng)

                if verbose:
                    pl += pl_batch.sum()

            if verbose:
                pl /= n_samples
                end = time.time()
                print("Iteration %d, pseudo-likelihood = %.2f, time = %.2fs" %
                      (iteration, pl, end - begin))

        return self
Example #48
0
File: bag.py  Project: orazaro/kgml
    def predict_proba(self, X):
        """Predict class probabilities for X.

The predicted class probabilities of an input sample are computed as
the mean predicted class probabilities of the base estimators in the
ensemble. If base estimators do not implement a ``predict_proba``
method, then it resorts to voting and the predicted class probabilities
of an input sample represent the proportion of estimators predicting
each class.

Parameters
----------
X : array-like of shape = [n_samples, n_features]
The input samples.

Returns
-------
p : array of shape = [n_samples, n_classes]
The class probabilities of the input samples. Classes are
ordered by arithmetical order.
"""
        # Check data
        X, = check_arrays(X)

        if self.n_features_ != X.shape[1]:
            raise ValueError("Number of features of the model must "
                             "match the input. Model n_features is {0} and "
                             "input n_features is {1}."
                             "".format(self.n_features_, X.shape[1]))

        # Parallel loop
        n_jobs, n_estimators, starts = _partition_estimators(self)

        all_proba = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
            delayed(_parallel_predict_proba)(
                self.estimators_[starts[i]:starts[i + 1]],
                self.estimators_features_[starts[i]:starts[i + 1]],
                X,
                self.n_classes_)
            for i in range(n_jobs))

        # Reduce
        proba = sum(all_proba) / self.n_estimators

        return proba
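A minimal sketch of the reduce step above: the per-job probability blocks are summed and divided by the total number of estimators, i.e. a plain average of the base estimators' predict_proba outputs:

import numpy as np

# Suppose two parallel jobs, each holding one base estimator; every block is
# that job's summed predict_proba output for the same two samples.
all_proba = [
    np.array([[0.9, 0.1], [0.2, 0.8]]),
    np.array([[0.7, 0.3], [0.4, 0.6]]),
]
n_estimators = 2
proba = sum(all_proba) / n_estimators   # the "Reduce" line above
print(proba)                            # [[0.8 0.2] [0.3 0.7]] - rows still sum to 1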
Example #49
0
    def fit(self, X, y, mask=None):
        """Fit Gaussian Naive Bayes according to X, y

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like, shape = [n_samples]
            Target values.
        mask : array-like, shape = [n_samples, n_features]
            Binary, 1 at unobserved features.

        Returns
        -------
        self : object
            Returns self.
        """
        X, y = check_arrays(X, y, sparse_format='dense')

        n_samples, n_features = X.shape

        if n_samples != y.shape[0]:
            raise ValueError("X and y have incompatible shapes")

        if mask is not None:
            mask = array2d(mask)
            X = X.copy()
            X[mask] = np.nan

        self.classes_ = unique_y = np.unique(y)
        n_classes = unique_y.shape[0]

        self.theta_ = np.zeros((n_classes, n_features))
        self.sigma_ = np.zeros((n_classes, n_features))
        self.class_prior_ = np.zeros(n_classes)
        self._n_ij = []
        epsilon = 1e-9
        for i, y_i in enumerate(unique_y):
            self.theta_[i, :] = bn.nanmean(X[y == y_i, :], axis=0)
            self.sigma_[i, :] = bn.nanvar(X[y == y_i, :], axis=0) + epsilon
            self.class_prior_[i] = np.float(np.sum(y == y_i)) / n_samples
            self._n_ij.append(-0.5 * np.sum(np.log(np.pi * self.sigma_[i, :])))
        self._logprior = np.log(self.class_prior_)
        return self
Example #50
0
    def compute(self, y_true, proba, sample_weight=None):
        """
        Compute metric for each possible prediction threshold

        :param y_true: array-like true labels
        :param proba: array-like of shape [n_samples, 2] with predicted probabilities
        :param sample_weight: array-like weight

        :rtype: tuple(array, array)
        :return: thresholds and corresponding metric values
        """
        y_true, proba, sample_weight = check_arrays(y_true, proba, sample_weight)
        pred = proba[:, self.signal_label]
        b, s, thresholds = roc_curve(y_true == self.signal_label, pred, sample_weight=sample_weight)

        metric_values = self.metric(s * self.expected_s, b * self.expected_b)
        thresholds = numpy.clip(thresholds, pred.min() - 1e-6, pred.max() + 1e-6)
        return thresholds, metric_values
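A hedged sketch of what compute() does: roc_curve yields background and signal efficiencies, which are scaled by the expected yields and passed to a metric; the simple significance s / sqrt(b + 1) below is an illustrative choice, not the metric from the source:

import numpy as np
from sklearn.metrics import roc_curve

y_true = np.array([0, 0, 1, 1, 1, 0])
proba = np.array([0.1, 0.4, 0.35, 0.8, 0.7, 0.2])

b, s, thresholds = roc_curve(y_true == 1, proba)        # b: background eff., s: signal eff.
expected_s, expected_b = 100.0, 1000.0
metric_values = s * expected_s / np.sqrt(b * expected_b + 1.0)  # illustrative s / sqrt(b + 1)
print(thresholds[np.argmax(metric_values)])             # threshold with the best metric value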
Example #51
def syn_counts(n_samples=50, offset=0.0, xv=(1., -0.5, 1.0), random_state=None):
    """Synthetic count data generator with len(xv) - 1 features.

    Returns
    -------
    X : np.array, shape=(n_samples, len(xv) - 1)
        The features
    y : np.array, shape=(n_samples,)
        The response
    """
    rs = check_random_state(random_state)
    xv, = check_arrays(xv)
    p = xv.shape[0] - 1
    X = np.c_[np.ones(n_samples), rs.normal(size=n_samples * p).reshape((n_samples, p))]
    xb = np.dot(X, xv)
    exb = np.exp(xb + offset)
    py = rs.poisson(lam=exb, size=n_samples)
    return X[:, 1:], py
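A usage sketch for syn_counts (assuming it and the check_* helpers it uses are importable): generate Poisson counts and eyeball the sample mean, which is driven by exp(xv[0] + offset) because xv[0] multiplies the constant column:

import numpy as np

X, y = syn_counts(n_samples=2000, offset=0.5, xv=(1., -0.5, 1.0), random_state=0)
print(X.shape, y.shape)   # (2000, 2) (2000,) - the constant column is stripped off
print(y.mean())           # roughly exp(1.0 + 0.5), inflated by the random-feature term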
Example #52
0
File: stacking.py  Project: luoq/datatrek
def cross_val_predict(estimator, X, y, cv=5, n_jobs=1, refit=False, predict_fun="predict"):
    X, y = check_arrays(X, y, sparse_format='csr', allow_lists=True)
    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))
    pred = Parallel(n_jobs=n_jobs)(
        delayed(_cross_val_predict)(
            clone(estimator), X, y, train, test, predict_fun)
        for train, test in cv)
    pred = np.concatenate(pred)
    if cv.indices:
        index = np.concatenate([test for _, test in cv])
    else:
        index = np.concatenate([np.where(test)[0] for _, test in cv])
    ## pred[index] = pred doesn't work as expected
    pred[index] = pred.copy()
    if refit:
        return pred, clone(estimator).fit(X,y)
    else:
        return pred
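A hedged usage sketch of the cross_val_predict above for stacking (it assumes the function and the old scikit-learn utilities it relies on are importable): out-of-fold predictions from a first-level model become an extra feature for a second-level model, so the meta-model never sees in-fold predictions:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

rng = np.random.RandomState(0)
X = rng.rand(200, 5)
y = (X[:, 0] + X[:, 1] > 1.0).astype(int)

oof = cross_val_predict(DecisionTreeClassifier(random_state=0), X, y, cv=5)  # out-of-fold labels
meta_X = np.column_stack([X, oof])             # stack them as an extra feature
meta_clf = LogisticRegression().fit(meta_X, y)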
Example #53
0
File: bag.py  Project: orazaro/kgml
    def decision_function(self, X):
        """Average of the decision functions of the base classifiers.

Parameters
----------
X : array-like of shape = [n_samples, n_features]
The input samples.

Returns
-------
score : array, shape = [n_samples, k]
The decision function of the input samples. The columns correspond
to the classes in sorted order, as they appear in the attribute
``classes_``. Regression and binary classification are special
cases with ``k == 1``, otherwise ``k==n_classes``.

"""
        # Trigger an exception if not supported
        if not hasattr(self.base_estimator_, "decision_function"):
            raise NotImplementedError

        # Check data
        X, = check_arrays(X)

        if self.n_features_ != X.shape[1]:
            raise ValueError("Number of features of the model must "
                             "match the input. Model n_features is {1} and "
                             "input n_features is {2} "
                             "".format(self.n_features_, X.shape[1]))

        # Parallel loop
        n_jobs, n_estimators, starts = _partition_estimators(self)

        all_decisions = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
            delayed(_parallel_decision_function)(
                self.estimators_[starts[i]:starts[i + 1]],
                self.estimators_features_[starts[i]:starts[i + 1]],
                X)
            for i in range(n_jobs))

        # Reduce
        decisions = sum(all_decisions) / self.n_estimators

        return decisions
Example #54
0
    def predict(self, X):
        """Predict class for X.

        The predicted class of an input sample is computed as the predicted
        class of the underlying estimator.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape = [n_samples, n_features]
            The training input samples. Sparse matrices are accepted only if
            they are supported by the base estimator.

        Returns
        -------
        y : array of shape = [n_samples]
            The predicted classes.
        """
        X, = check_arrays(X)
        return self.estimator_.predict(X)
Example #55
0
    def predict_proba(self, X):
        """Predict class probabilities for X.

The predicted class probabilities of an input sample are computed as
the mean predicted class probabilities of the base estimators in the
ensemble. If base estimators do not implement a ``predict_proba``
method, then it resorts to voting and the predicted class probabilities
of an input sample represent the proportion of estimators predicting
each class.

Parameters
----------
X : array-like of shape = [n_samples, n_features]
The input samples.

Returns
-------
p : array of shape = [n_samples, n_classes]
The class probabilities of the input samples. Classes are
ordered by arithmetical order.
"""
        # Check data
        X, = check_arrays(X)

        if self.n_features_ != X.shape[1]:
            raise ValueError("Number of features of the model must "
                             "match the input. Model n_features is {0} and "
                             "input n_features is {1}."
                             "".format(self.n_features_, X.shape[1]))

        # Parallel loop
        n_jobs, n_estimators, starts = _partition_estimators(self)

        all_proba = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
            delayed(_parallel_predict_proba)(
                self.estimators_[starts[i]:starts[i + 1]],
                self.estimators_features_[starts[i]:starts[i + 1]],
                X,
                self.n_classes_)
            for i in range(n_jobs))

        # Reduce
        proba = sum(all_proba) / self.n_estimators

        return proba
Example #56
0
    def decision_function(self, X):
        """Average of the decision functions of the base classifiers.

Parameters
----------
X : array-like of shape = [n_samples, n_features]
The input samples.

Returns
-------
score : array, shape = [n_samples, k]
The decision function of the input samples. The columns correspond
to the classes in sorted order, as they appear in the attribute
``classes_``. Regression and binary classification are special
cases with ``k == 1``, otherwise ``k==n_classes``.

"""
        # Trigger an exception if not supported
        if not hasattr(self.base_estimator_, "decision_function"):
            raise NotImplementedError

        # Check data
        X, = check_arrays(X)

        if self.n_features_ != X.shape[1]:
            raise ValueError("Number of features of the model must "
                             "match the input. Model n_features is {1} and "
                             "input n_features is {2} "
                             "".format(self.n_features_, X.shape[1]))

        # Parallel loop
        n_jobs, n_estimators, starts = _partition_estimators(self)

        all_decisions = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
            delayed(_parallel_decision_function)(
                self.estimators_[starts[i]:starts[i + 1]],
                self.estimators_features_[starts[i]:starts[i + 1]], X)
            for i in range(n_jobs))

        # Reduce
        decisions = sum(all_decisions) / self.n_estimators

        return decisions