Example #1
    def __init__(self, Xin, n_components=None, fit_type='sklearn'):
        self.n_samples, self.n_features = Xin.shape

        # Center data
        self.mean = np.mean(Xin, axis=0)
        X = Xin - self.mean

        if n_components is None:
            self.n_components = min(self.n_features,self.n_samples)
        elif not 0 <= n_components <= self.n_features:
            raise ValueError("n_components=%r invalid for n_features=%d" % (n_components, self.n_features))
        else:
            self.n_components = n_components

        self.components = None
        self.variance = None
        self.ll = None
        self.n_iters = None

        fa = FactorAnalysis(n_components=self.n_components)  # use the validated component count
        if fit_type=='sklearn':
            fa.fit(X)
            self.components = fa.components_
            self.variance = fa.noise_variance_
            self.ll = fa.loglike_
            self.n_iters = fa.n_iter_
        else:
            self.my_fit(X)
Example #2
def SlumIndex(rates):
	#calculate Slum index

	#PCA with 4 components
	# pca = PCA(n_components=4)
	# pca.fit(rates[['NoWater','DirtFloor','AvrPersPerRoom','NoSewage']])

	# #Here we STORE PARAMETER FOR SLUM IMPACT using the weights and vectors from pca
	# rates['SlumIndex'] = zeros(len(rates))
	# weights=pca.explained_variance_ratio_
	# #pca.components_ are the transformation vectors, rates are the original ones
	# new_vectors = dot(transpose(pca.components_), transpose(rates[['NoWater','DirtFloor','AvrPersPerRoom','NoSewage']].values))
	
	# #Finally we get the index with the eigenvalues
	# rates['SlumIndex'] = transpose(dot(weights,new_vectors))

	# pca = PCA(n_components=4)
	# new_vectors = pca.fit_transform(rates[['NoWater','DirtFloor','AvrPersPerRoom','NoSewage']])
	# rates['SlumIndex'] = dot(pca.explained_variance_ratio_,transpose(new_vectors))
	facAn = FactorAnalysis(n_components = 1)
	facAn.fit(rates[['NoWater','DirtFloor','AvrPersPerRoom','NoSewage']])
	rates['SlumIndex'] = dot(facAn.components_**2,transpose(rates[['NoWater','DirtFloor','AvrPersPerRoom','NoSewage']].values))[0]
	
	# rates['SlumIndex'] = rates[['NoWater','DirtFloor','AvrPersPerRoom','NoSewage']].values.sum(axis=1)
	return rates[['ID','SlumIndex']]
def fit_factor_analysis(percentage=0.8):
    """
    Runs the factor analysis.

    Parameters:

        percentage: float, default:0.8

        The percentage of the cumulative sum of the eigenvalues to be held. This number defines the number of loading factors in the analysis.

    Returns:
        
        X: array of floats [n_samples,n_factors]

            The transformed data after the factor analysis.

        components: array of floats [n_factors, n_features]

            The components of the factor analysis
    """
    fa = FactorAnalysis()
    fa.fit(data)
    C = fa.get_covariance()
    l,e = np.linalg.eigh(C)
    cs = np.cumsum(l[::-1])/np.sum(l)
    n = np.sum(cs<percentage)

    fa.n_components = n
    X_ = fa.fit_transform(data)
    components = fa.components_
    return X_,components
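# Usage sketch (not part of the original module): fit_factor_analysis reads a
# module-level `data` array, so a caller would set it first; toy data assumed here.
rng = np.random.RandomState(0)
data = rng.randn(300, 10)
X_, components = fit_factor_analysis(percentage=0.8)
print(X_.shape, components.shape)   # (300, n_factors) and (n_factors, 10)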
Example #4
class MyRegressor(ModelDesign):
    def __init__(self, data):
        ModelDesign.__init__(self, data)
        # PCA(n_components=150) was an alternative here; FactorAnalysis is the
        # reduction actually used below.
        self.reduction = FactorAnalysis(n_components=150)
        self.model = svm.NuSVR()

    def train(self):
        data = self.data
        X, Y = data.getXY(data.trainSize)
        X = np.array(X, dtype=np.float32)
        Y = np.array(Y, dtype=np.float32)
        Y = np.reshape(Y, newshape=Y.size)
        print("running", self.model, "regressor for", self.name)
        t1 = time.time()
        self.reduction.fit(X)
        X = self.reduction.transform(X)
        self.model = self.model.fit(X, Y)
        t2 = time.time()
        print("finished in", t2 - t1, "s")
        X, Y = data.getTestData()
        if X is not None:
            X = self.reduction.transform(X)
            Y1 = self.model.predict(X)
            Y = np.reshape(Y, newshape=Y.size)
            loss = np.sqrt(np.mean(np.square(Y1 - Y)))
            print("test RMSE=", loss)

    def predict(self, x):
        x = self.reduction.transform(x)
        y = self.model.predict(x)
        Y = np.reshape(y, newshape=(y.size, 1))
        #Y=self.data.rescale(Y)
        return Y
Example #5
def get_factors(shoppers, n_components=4, random_state=903, **kwargs):
    """
    Find Factors to represent the shopper-level features in compressed space.
    These factors will be used to map simplified user input from application
    to the full feature space used in modeling.

    Args:
        shoppers (pd.DataFrame): full set of shoppers in feature data (train + test)
        n_components (int): number of factors to mine. Defaults to 4 and should stay that way (application
                            UI based on these 4 analyzed factors)
        random_state (int): sets random state for factor analysis algorithm. Defaults to 903 (and should stay that way)
        kwargs: additional keyword arguments for sklearn.decomposition.FactorAnalysis

    Returns:
        pd.DataFrame: will have n_components rows and n_features columns. The values of this matrix
                      can be used to map factors to full feature set (on std normal scale).

    """
    # Remove columns which should not be considered in factor analysis
    x = shoppers
    for col in ['user_id', 'n_orders', 'label']:
        if col in x.columns:
            x = x.drop(columns=col)

    # Need to scale data as columns on incommensurate scales
    cols = x.columns
    x = preprocessing.scale(x)
    fa = FactorAnalysis(n_components, random_state=random_state, **kwargs)
    fa.fit(x)
    return pd.DataFrame(fa.components_, columns=cols)
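# Usage sketch with synthetic data (assumes the pandas/numpy/sklearn imports
# used by get_factors above):
rng = np.random.RandomState(0)
shoppers = pd.DataFrame(rng.randn(100, 6),
                        columns=['f1', 'f2', 'f3', 'f4', 'f5', 'f6'])
shoppers['user_id'] = np.arange(100)      # dropped inside get_factors
factor_map = get_factors(shoppers, n_components=4)
print(factor_map.shape)                   # (4, 6): factors x features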
Example #6
def factor_analysis(data, num_features, components=1):
    print('-- Model: FactorAnalysis, numcomp: %d --' % components)
    fa = FactorAnalysis(n_components=components, random_state = 1)
    X  = np.reshape(np.stack(data, axis=0), (-1,num_features)) #reshape to (data_size,num_vars) 
    fa.fit(X)

    return fa
Example #7
def test_factor_analysis():
    """Test FactorAnalysis ability to recover the data covariance structure
    """
    rng = np.random.RandomState(0)
    n_samples, n_features, n_components = 20, 5, 3

    # Some random settings for the generative model
    W = rng.randn(n_components, n_features)
    # latent variable of dim 3, 20 of it
    h = rng.randn(n_samples, n_components)
    # using gamma to model different noise variance
    # per component
    noise = rng.gamma(1, size=n_features) * rng.randn(n_samples, n_features)

    # generate observations
    # wlog, mean is 0
    X = np.dot(h, W) + noise
    assert_raises(ValueError, FactorAnalysis, svd_method='foo')
    fa_fail = FactorAnalysis()
    fa_fail.svd_method = 'foo'
    assert_raises(ValueError, fa_fail.fit, X)
    fas = []
    for method in ['randomized', 'lapack']:
        fa = FactorAnalysis(n_components=n_components, svd_method=method)
        fa.fit(X)
        fas.append(fa)

        X_t = fa.transform(X)
        assert_equal(X_t.shape, (n_samples, n_components))

        assert_almost_equal(fa.loglike_[-1], fa.score(X).sum())

        diff = np.all(np.diff(fa.loglike_))
        assert_greater(diff, 0., 'Log likelihood did not increase')

        # Sample Covariance
        scov = np.cov(X, rowvar=0., bias=1.)

        # Model Covariance
        mcov = fa.get_covariance()
        diff = np.sum(np.abs(scov - mcov)) / W.size
        assert_less(diff, 0.1, "Mean absolute difference is %f" % diff)
        fa = FactorAnalysis(n_components=n_components,
                            noise_variance_init=np.ones(n_features))
        assert_raises(ValueError, fa.fit, X[:, :2])

    f = lambda x, y: np.abs(getattr(x, y))  # sign will not be equal
    fa1, fa2 = fas
    for attr in ['loglike_', 'components_', 'noise_variance_']:
        assert_almost_equal(f(fa1, attr), f(fa2, attr))
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter('always', ConvergenceWarning)
        fa1.max_iter = 1
        fa1.verbose = True
        fa1.fit(X)
        assert_true(w[-1].category == ConvergenceWarning)

        warnings.simplefilter('always', DeprecationWarning)
        FactorAnalysis(verbose=1)
        assert_true(w[-1].category == DeprecationWarning)
def rotated_scaled_fa(n_comp, arr_pq,varimax_=True):
    '''Perform factor analysis on a matrix
    IN:
    - n_comp, int, number of latent dimensions
    - arr_pq, arr, shape:  samples (persons) x features (questions)
    - varimax_, bool, whether to perform a varimax rotation (default=True)
    OUT:
    - arr_qd, arr, shape: features x latent-dimension
    - arr_pd, arr, shape: samples x latent dimensions
    '''
    fa = FactorAnalysis(n_comp)
    fa.fit(arr_pq)
    
    arr_pd = fa.transform(arr_pq)
    arr_qd = fa.components_.T

    ## do the varimax-rotation
    if varimax_:
        arr_dp = np.transpose(arr_pd)

        L1, T = fr.rotate_factors(arr_qd, 'varimax')
        arr_qd_new = np.dot(arr_qd, T)

        T_m1 = np.linalg.inv(T)

        arr_pd_new = np.dot(T_m1, arr_dp)
        arr_pd_new = np.transpose(arr_pd_new)

        return arr_qd_new, arr_pd_new
    else:
        return arr_qd, arr_pd
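# Usage sketch (not from the original module); `fr` above is assumed to be a
# factor-rotation helper such as the factor_rotation package.
rng = np.random.RandomState(0)
answers = rng.randn(500, 12)                       # persons x questions
arr_qd, arr_pd = rotated_scaled_fa(3, answers)     # varimax-rotated by default
print(arr_qd.shape, arr_pd.shape)                  # (12, 3) (500, 3)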
Example #9
    def initialize(self):
        """
        Initialize the model.
        """
        # inverse variance weighted mean
        if np.sum(self.obsvar) != 0.0:
            self.mean = np.sum(self.data / self.obsvar, axis=0) / \
                np.sum(1.0 / self.obsvar, axis=0)
        else:
            self.mean = np.mean(self.data, axis=0)

        # use Factor Analysis to initialize factor loadings
        if self.M == 0:
            self.lam = np.zeros(1)
        else:
            fa = FactorAnalysis(n_components=self.M)
            fa.fit(self.data)
            self.lam = fa.components_.T

        # initialize jitter
        if self.jtype is None:
            self.jitter = np.array([])
        elif self.jtype == 'one':
            self.jitter = 0.0
        else:
            self.jitter = np.zeros(self.D)

        # save a copy
        self.initial_mean = self.mean.copy()
        self.initial_jitter = self.jitter.copy()
        self.initial_lambda = self.lam.copy()
Example #10
File: EM.py  Project: jwubz123/5470-
def initializing(Y, K, singleSigma=False):
    N, D = Y.shape
    model = FactorAnalysis(n_components=K)
    zeroedY = deepcopy(Y)
    mus = np.zeros([D, 1])

    for j in range(D):
        mus[j] = zeroedY[:, j].mean()
        zeroedY[:, j] = zeroedY[:, j] - mus[j]

    model.fit(zeroedY)

    A = model.components_.transpose()
    sigmas = np.atleast_2d(np.sqrt(model.noise_variance_)).transpose()
    if singleSigma:
        sigmas = np.mean(sigmas) * np.ones(sigmas.shape)

    means = []
    ps = []
    for j in range(D):
        non_zero_idxs = np.abs(Y[:, j]) > 1e-6
        means.append(Y[non_zero_idxs, j].mean())
        ps.append(1 - non_zero_idxs.mean())

    lamb, pcov = curve_fit(exp_lam, means, ps, p0=.05)
    lamb = lamb[0]

    return A, mus, sigmas, lamb
Example #11
    def initialize(self):
        """
        Initialize the model.
        """
        # inverse variance weighted mean
        if np.sum(self.obsvar) != 0.0:
            self.mean = np.sum(self.data / self.obsvar, axis=0) / \
                np.sum(1.0 / self.obsvar, axis=0)
        else:
            self.mean = np.mean(self.data, axis=0)

        # use Factor Analysis to initialize factor loadings
        if self.M == 0:
            self.lam = np.zeros(1)
        else:
            fa = FactorAnalysis(n_components=self.M)
            fa.fit(self.data)
            self.lam = fa.components_.T

        # initialize jitter
        if self.jtype is None:
            self.jitter = np.array([])
        elif self.jtype == 'one':
            self.jitter = 0.0
        else:
            self.jitter = np.zeros(self.D)

        # save a copy
        self.initial_mean = self.mean.copy()
        self.initial_jitter = self.jitter.copy()
        self.initial_lambda = self.lam.copy()
Example #12
class FactorAnalysis():
    def __init__(self, cols, n_components):
        self.n_components = n_components
        # note: sklearn's FactorAnalysis must be imported under an alias in this
        # module, otherwise this wrapper class shadows it and would call itself
        self.model = FactorAnalysis(n_components=n_components)
        self.columns = cols

    def fit(self, data):
        self.model.fit(data[self.columns])

    def fit_transform(self, data):
        transformed = self.model.fit_transform(data[self.columns])
        transformed = pd.DataFrame(
            transformed,
            columns=["fa_" + str(i + 1) for i in range(self.n_components)])
        data = pd.concat([data, transformed], axis=1)
        data = data.drop(self.columns, axis=1)
        return data

    def transform(self, data):
        transformed = self.model.transform(data[self.columns])
        transformed = pd.DataFrame(
            transformed,
            columns=["fa_" + str(i + 1) for i in range(self.n_components)])
        data = pd.concat([data, transformed], axis=1)
        data = data.drop(self.columns, axis=1)
        return data
Example #13
def aic(mm):
    aic = []
    for i in range(1, 10):
        fa = FactorAnalysis(n_components=i, tol=0.0001, max_iter=5000)
        fa.fit(mm)
        d = n * i
        b = 100 * fa.score(mm) - d
        aic.append(b)
    return aic
Example #14
    def runFA(self):
        print("Starting FA")
        print("Dimensionality reduction")
        numFeatures = 30
        if (self.dataset == "otto"):
            numFeatures = 93
        n_components = range(1, numFeatures + 1)

        decisiontree = DecisionTreeClassifier(criterion='gini',
                                              max_depth=15,
                                              min_samples_split=5)
        fa = FactorAnalysis(max_iter=1000)

        pipe = Pipeline(steps=[('fa', fa), ('decisionTree', decisiontree)])

        # Plot the fa spectrum
        fa.fit(self.dataX)
        X = fa.components_
        import numpy as np
        centered_matrix = X - X.mean(axis=1)[:, np.newaxis]
        cov = np.dot(centered_matrix, centered_matrix.T)
        eigvals, eigvecs = np.linalg.eig(cov)
        best_n = 11
        if (self.dataset == "otto"):
            best_n = 30

        self.plotFAGraph(n_components, eigvals, best_n)

        fig, ax = plt.subplots()
        ax.bar(n_components, eigvals, linewidth=2, color='blue')
        plt.axis('tight')
        plt.xlabel('n_components')
        ax.set_ylabel('Eigen Values')

        gridSearch = GridSearchCV(pipe,
                                  dict(fa__n_components=n_components),
                                  cv=3)
        gridSearch.fit(self.dataX, self.dataY)
        results = gridSearch.cv_results_
        ax1 = ax.twinx()

        #Plotting the accuracies and best component
        ax1.plot(results['mean_test_score'],
                 linewidth=2,
                 color='red',
                 label="CV score")
        ax1.set_ylabel('Mean Cross Validation Accuracy')
        ax1.axvline(best_n,
                    linestyle=':',
                    label='best n_components = %s' % (str(best_n)),
                    linewidth=2)

        plt.legend(prop=dict(size=12), loc="upper right")
        plt.title("Accuracy of DT and Eigen Values of Latent Variables [" +
                  self.dataset + "]")
        plt.savefig("./fa/" + self.dataset + "_best-n_components.png")
        plt.close()
Example #15
def run_fa(dataset, min_components, max_components):

    X, y = load_dataset(dataset)
    data = X

    n_samples, n_features = data.shape
    n_labels = len(np.unique(y))
    labels = y

    results = []

    for n_components in range(min_components, max_components):
        print('n_components: ', n_components)

        for svd_method in ['lapack', 'randomized']:

            scores = []
            data = X.copy()
            fa = FactorAnalysis(n_components=n_components,
                                svd_method=svd_method,
                                random_state=random_state)

            t0 = time()
            fa.fit(X)

            scores.append(n_components)
            scores.append(svd_method)
            scores.append(time() - t0)
            scores.append(fa.score(X))

            results.append(scores)

    # N-Components vs Log Likelihood
    plot_results(np.array(results),
                 trends_index=1,
                 x_axis_index=0,
                 x_axis_label='K-Components',
                 y_axis_index=[3],
                 y_axis_label='Log Likelihood',
                 title=dataset.title() + ': FactorAnalysis',
                 filename='-'.join(['fa', dataset, 'loglike']))

    # N-Components vs Time
    plot_results(np.array(results),
                 trends_index=1,
                 x_axis_index=0,
                 x_axis_label='K-Components',
                 y_axis_index=[2],
                 y_axis_label='Time',
                 title=dataset.title() + ': FactorAnalysis',
                 filename='-'.join(['fa', dataset, 'time']))

    results = np.array(results)
    np.savetxt('output-csv/' + ('-'.join([dataset, 'fa.csv'])),
               results,
               delimiter=",",
               fmt="%s")
Example #16
def bic(mm):
    bic = []
    for i in range(1, 10):
        fa = FactorAnalysis(n_components=i, tol=0.0001, max_iter=5000)
        fa.fit(mm)
        d = n * i
        b = 100 * fa.score(mm) - (math.log(100) * d) / 2
        bic.append(b)
    return bic
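# A sketch (not from the original module) making the assumptions behind aic()
# and bic() explicit: `n` is the number of observed features and 100 the number
# of samples, so 100 * fa.score(mm) approximates the total log-likelihood and
# d = n * i counts the free loading parameters.
import math
from sklearn.decomposition import FactorAnalysis

def information_criteria(mm, max_components=10):
    n_samples, n_features = mm.shape
    out = []
    for i in range(1, max_components):
        fa = FactorAnalysis(n_components=i, tol=0.0001, max_iter=5000).fit(mm)
        ll = n_samples * fa.score(mm)                     # total log-likelihood
        d = n_features * i                                # loading parameters
        out.append((ll - d,                               # ~ -AIC/2, as in aic()
                    ll - (math.log(n_samples) * d) / 2))  # ~ -BIC/2, as in bic()
    return out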
def get_inv_diag_plus_low_rank_cov_op(X, rank=2):
    fa = FactorAnalysis(n_components=rank)
    fa.fit(X)
    components = fa.components_
    noise_vars = fa.noise_variance_
    activations = fa.transform(X)

    return _woodbury_inverse(_diagonal_operator(1. / noise_vars),
                 aslinearoperator(np.linalg.inv(1. / len(activations) * 
                                  activations.T.dot(activations))),
                 components.T, components)
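# Structural sanity check (a sketch, not part of the original function): the
# Woodbury inverse above exploits the low-rank-plus-diagonal covariance that
# FactorAnalysis fits, i.e. components_.T @ components_ + diag(noise_variance_).
import numpy as np
from sklearn.decomposition import FactorAnalysis

rng = np.random.RandomState(0)
X = rng.randn(200, 6) @ rng.randn(6, 6)
fa = FactorAnalysis(n_components=2).fit(X)
W, psi = fa.components_, fa.noise_variance_
assert np.allclose(fa.get_covariance(), W.T @ W + np.diag(psi))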
Example #18
def factor_analysis_method(train_x,
                           train_y,
                           validate_x,
                           validate_y,
                           fa_threshold,
                           is_split=1):
    # Fill in missing values
    train_x = train_x.fillna(0)
    train_x = train_x.values
    validate_x = validate_x.fillna(0)
    validate_x = validate_x.values

    # Normalization: ensure there are no missing values beforehand; the result is automatically an ndarray
    # scaler = MinMaxScaler()
    # train_x = scaler.fit_transform(train_x)
    # validate_x = scaler.fit_transform(validate_x)

    # Convert the DataFrames to unlabeled ndarrays so they can be fed to the model
    train_y = train_y.values
    validate_y = validate_y.values

    if is_split == 1:
        # Pull out the one-hot columns first
        onehot_train_x_left = train_x[:, :30]
        train_x_mid = train_x[:, 30:454]
        # onehot_train_x_right = train_x[:, 454:]
        onehot_validate_x_left = validate_x[:, :30]
        validate_x_mid = validate_x[:, 30:454]
        # onehot_validate_x_right = validate_x[:, 454:]
    else:
        train_ts_code_1 = train_x[:, 0]
        train_x_mid = train_x[:, 1:]
        valid_ts_code_1 = validate_x[:, 0]
        validate_x_mid = validate_x[:, 1:]

    # factor_analysis
    fa = FactorAnalysis(n_components=fa_threshold)
    selected_train_x = fa.fit(train_x_mid).transform(train_x_mid)
    selected_validate_x = fa.transform(validate_x_mid)  # reuse the FA fitted on train_x_mid

    # Stitch the ts_code columns back on
    if is_split == 1:  # ts_code spans 30 one-hot columns
        selected_train_x = np.hstack((onehot_train_x_left, selected_train_x))
        selected_validate_x = np.hstack(
            (onehot_validate_x_left, selected_validate_x))
    else:  # ts_code is a single column
        # print(train_ts_code_1.reshape(-1,1).shape)
        # print(selected_train_x.shape)
        selected_train_x = np.hstack(
            (train_ts_code_1.reshape(-1, 1), selected_train_x))
        selected_validate_x = np.hstack(
            (valid_ts_code_1.reshape(-1, 1), selected_validate_x))

    return selected_train_x, train_y, selected_validate_x, validate_y
Example #19
 def main_loop(self):
     self.aic_score = np.zeros(2 * self.M + 1)
     self.bic_score = np.zeros(2 * self.M + 1)
     for i in range(self.real_m - self.M, self.real_m + self.M + 1):
         self.m = i
         fa_model = FactorAnalysis(n_components=self.m)
         fa_model.fit(self.x)
         self.log_likelihood = fa_model.score(self.x) * self.N
         self.aic_score[i - self.real_m + self.M] = self.AIC()
         self.bic_score[i - self.real_m + self.M] = self.BIC()
     if self.verbose:
         self.show_line()
Example #20
def initializeParams(Y, K, singleSigma=False, makePlot=False):
    """
	initializes parameters using a standard factor analysis model (on imputed data) + exponential curve fitting.
	Checked.
	Input:
	Y: data matrix, n_samples x n_genes
	K: number of latent components
	singleSigma: uses only a single sigma as opposed to a different sigma for every gene
	makePlot: makes a mu - p_0 plot and shows the decaying exponential fit.
	Returns:
	A, mus, sigmas, decay_coef: initialized model parameters.
	"""

    N, D = Y.shape
    model = FactorAnalysis(n_components=K)
    zeroedY = deepcopy(Y)
    mus = np.zeros([D, 1])

    for j in range(D):
        non_zero_idxs = np.abs(Y[:, j]) > 1e-6
        mus[j] = zeroedY[:, j].mean()
        zeroedY[:, j] = zeroedY[:, j] - mus[j]

    model.fit(zeroedY)

    A = model.components_.transpose()
    sigmas = np.atleast_2d(np.sqrt(model.noise_variance_)).transpose()
    if singleSigma:
        sigmas = np.mean(sigmas) * np.ones(sigmas.shape)

    # Now fit decay coefficient
    means = []
    ps = []
    for j in range(D):
        non_zero_idxs = np.abs(Y[:, j]) > 1e-6
        means.append(Y[non_zero_idxs, j].mean())
        ps.append(1 - non_zero_idxs.mean())

    decay_coef, pcov = curve_fit(exp_decay, means, ps, p0=.05)
    decay_coef = decay_coef[0]

    mse = np.mean(np.abs(ps - np.exp(-decay_coef * (np.array(means)**2))))

    if (mse > 0) and makePlot:
        from matplotlib.pyplot import figure, scatter, plot, title, show
        figure()
        scatter(means, ps)
        plot(np.arange(min(means), max(means), .1),
             np.exp(-decay_coef * (np.arange(min(means), max(means), .1)**2)))
        title('Decay Coef is %2.3f; MSE is %2.3f' % (decay_coef, mse))
        show()

    return A, mus, sigmas, decay_coef
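# The helper exp_decay is not defined in this example; a form consistent with
# the MSE computation above (assumed, ZIFA-style zero inflation) would be:
def exp_decay(x, decay_coef):
    return np.exp(-decay_coef * (np.asarray(x) ** 2))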
Example #21
def initializeParams(Y, K, singleSigma=False, makePlot=False):
	"""
	initializes parameters using a standard factor analysis model (on imputed data) + exponential curve fitting.
	Checked.
	Input:
	Y: data matrix, n_samples x n_genes
	K: number of latent components
	singleSigma: uses only a single sigma as opposed to a different sigma for every gene
	makePlot: makes a mu - p_0 plot and shows the decaying exponential fit.
	Returns:
	A, mus, sigmas, decay_coef: initialized model parameters.
	"""

	N, D = Y.shape
	model = FactorAnalysis(n_components=K)
	zeroedY = deepcopy(Y)
	mus = np.zeros([D, 1])

	for j in range(D):
		non_zero_idxs = np.abs(Y[:, j]) > 1e-6
		mus[j] = zeroedY[:, j].mean()
		zeroedY[:, j] = zeroedY[:, j] - mus[j]

	model.fit(zeroedY)

	A = model.components_.transpose()
	sigmas = np.atleast_2d(np.sqrt(model.noise_variance_)).transpose()
	if singleSigma:
		sigmas = np.mean(sigmas) * np.ones(sigmas.shape)

	# Now fit decay coefficient
	means = []
	ps = []
	for j in range(D):
		non_zero_idxs = np.abs(Y[:, j]) > 1e-6
		means.append(Y[non_zero_idxs, j].mean())
		ps.append(1 - non_zero_idxs.mean())

	decay_coef, pcov = curve_fit(exp_decay, means, ps, p0=.05)
	decay_coef = decay_coef[0]

	mse = np.mean(np.abs(ps - np.exp(-decay_coef * (np.array(means) ** 2))))

	if (mse > 0) and makePlot:
		from matplotlib.pyplot import figure, scatter, plot, title, show
		figure()
		scatter(means, ps)
		plot(np.arange(min(means), max(means), .1), np.exp(-decay_coef * (np.arange(min(means), max(means), .1) ** 2)))
		title('Decay Coef is %2.3f; MSE is %2.3f' % (decay_coef, mse))
		show()

	return A, mus, sigmas, decay_coef
def main():
    print ("Running CV on Log Likelihood approach.")
    LL()

    start_time = time.time()
    totalX = []
    totalY = []
    flag = True
    countTrain = 0
    print ("\n\nNow testing on separate data.")
    with open("creditcard.csv", "rb") as f:
        data = csv.reader(f)
        for row in data:
            if flag:
                flag = False
                continue
            countTrain += 1
            if countTrain > 228000:          #CV on 80% of data
                totalX.append([float(i) for i in row[:-1]])
                totalY.append(int(row[-1]))

    #newTotalX = np.fft.fft(totalX)
    totalX = scalar.fit_transform(totalX)
    print ("Data Loaded")
    clf = FactorAnalysis()
    clf.fit(totalX)
    #logLik = clf.score(totalX)
    Y = []
    llScores = clf.score_samples(totalX)						#calculates log likelihood of each sample (instead of average of whole data set)
    for i in range(len(totalY)):
        if llScores[i] > -60 and llScores[i] < -25:
            Y.append(0)
        else:
            Y.append(1)
	#prints running time of algorithm
    print("%s seconds" % (time.time() - start_time))
	#print results
    print ("Results")
    auc = roc_auc_score(totalY, Y)
    print("Area under curve : " + str(auc))
    fpr, tpr, _ = roc_curve(totalY, Y)
    print ("False Positive Rate : " + str(fpr[1]))
    _, recall, _ = precision_recall_curve(totalY, Y)
    print ("Recall : " + str(recall[1]))

	#to plot ROC curve
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, color='darkorange', label='ROC curve (area = %0.3f)' % auc)
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.legend(loc="lower right")
    plt.show()
    def fit(self, y):
        """Fit the GPFA model parameters to the observations y.

        Parameters
        ----------
        y : ndarray (time, features)
        """
        if isinstance(y, np.ndarray) and y.ndim == 2:
            y = [y]
        y_all = np.concatenate(y)
        self.mean_ = y_all.mean(axis=0, keepdims=True)
        y = [yi - self.mean_ for yi in y]
        n = y[0].shape[1]
        T = [yi.shape[0] for yi in y]
        model = FA(self.n_factors, svd_method='lapack')
        model.fit(y_all)

        self.R_ = np.diag(model.noise_variance_)
        self.C_ = model.components_.T
        self.d_ = np.zeros(n)
        self.tau_ = self.tau_init + self.rng.rand(self.n_factors)
        # Allocated and reuse these
        C = self.C_
        R = self.R_
        big_K = {
            Ti: calc_big_K(Ti, self.n_factors, self.tau_, self.var_n)
            for Ti in set(T)
        }
        y_cov = {
            Ti: block_dot_B(block_dot_A(C, big_K[Ti], Ti), C.T, Ti) +
            make_block_diag(R, Ti)
            for Ti in set(T)
        }
        big_d = {Ti: np.tile(self.d_, Ti) for Ti in set(T)}
        big_y = [yi.ravel() for yi in y]
        ll_pre = log_likelihood(big_d, y_cov, big_y, T)
        if self.verbose:
            print("FA log likelihood:", ll_pre)

        converged = False
        for ii in range(self.max_iter):
            ll = self._em_iter(y, big_K)
            if abs(ll - ll_pre) / np.amax([abs(ll), abs(ll_pre), 1.
                                           ]) <= self.tol:
                converged = True
                break
            ll_pre = ll
        if not converged:
            warnings.warn("EM max_iter reached.", ConvergenceWarning)
        return self
Example #24
def factor_analysis_dimensionality_score(data_in, dimensions, nfold, maxiter=1000, verbose=False):
    '''
    Estimate the latent dimensionality of an input dataset by applying cross-validated 
    factor analysis (FA) to input data and returning the maximum likelihood values. 
    
    Args:
        data_in (nt, nch): Time series data in
        dimensions (ndim): 1D Array of dimensions to compute FA for 
        nfold (int): Number of cross validation folds to compute. Must be >= 1
        maxiter (int): Maximum number of FA iterations to compute if there is no convergence. Defaults to 1000.
        verbose (bool): Display % of dimensions completed. Defaults to False

    Returns:
        tuple: Tuple containing:
            | **log_likelihood_score (ndim, nfold):** Array of MLE FA score for each dimension for each fold
            | **iterations_required (ndim, nfold):** How many iterations of FA were required to converge for each fold
    '''

    # Initialize arrays
    log_likelihood_score = np.zeros((np.max(np.shape(dimensions)), nfold))
    iterations_required = np.zeros((np.max(np.shape(dimensions)), nfold))

    if verbose == True:
        print('Cross validating and fitting ...')

    # Compute the maximum likelihood score for each dimension using factor analysis    
    for dim_idx in range(len(dimensions)):
        fold_idx = 0

        # Handle the case without cross validation.
        if nfold == 1:
            fa = FactorAnalysis(n_components=dimensions[dim_idx], max_iter=maxiter)
            fafit = fa.fit(data_in.T)
            log_likelihood_score[dim_idx, fold_idx] = fafit.score(data_in.T)
            iterations_required[dim_idx, fold_idx] = fafit.n_iter_
            warnings.warn("Without cross validation the highest dimensional model will always fit best.")

        # Every other case with cross validation
        else:
            for trainidx, testidx in model_selection.KFold(n_splits=nfold).split(data_in.T):
                fa = FactorAnalysis(n_components=dimensions[dim_idx], max_iter=maxiter)
                fafit = fa.fit(data_in[:, trainidx].T)
                log_likelihood_score[dim_idx, fold_idx] = fafit.score(data_in[:, testidx].T)
                iterations_required[dim_idx, fold_idx] = fafit.n_iter_
                fold_idx += 1

        if verbose == True:
            print(str((100 * (dim_idx + 1)) // len(dimensions)) + "% Complete")

    return log_likelihood_score, iterations_required
Example #25
def sd_fa(fname, components, result_name):
    '''
    Factor analysis computation
    '''
    cl_data, area_list = data_set(fname)
    values = cl_data.values
    fa = FactorAnalysis(n_components=components)
    # Standardize the data
    values = preprocessing.scale(values)
    try:
        fa.fit(values)
    except Exception as e:
        logging.error("factor analysis fit error")
        sys.exit()
Example #26
class FactorAnalysisImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
def makefac(df, ncomp, csv_path, comp_name=""):
    pca_df = PCA(n_components=ncomp)
    pca_df.fit(df)
    print(pca_df.explained_variance_ratio_.sum())
    fac_df = FactorAnalysis(n_components=ncomp, svd_method='lapack')
    fac_df.fit(df)
    fac_df_df = pd.DataFrame(fac_df.components_, columns=df.columns)
    fac_df_df.to_csv(csv_path)
    new_cols = []
    for i in range(ncomp):
        new_cols.append(comp_name + '_' + str(i))
    trans_df = pd.DataFrame(fac_df.transform(df),
                            index=df.index,
                            columns=new_cols)
    return trans_df
Example #28
def gridsearch_svm(Xtrain, Ytrain, Xval, Yval):
    #---------------------------------- Scaling
    X1, scaler = scale_data(Xtrain)
    X2 = scale_data(Xval, scaler)
    #---------------------------------- Factor analysis
    fa = FactorAnalysis()
    X1 = fa.fit_transform(X1)
    X2 = fa.transform(X2)  # project validation data with the FA fitted on X1
    #---------------------------------- Cross validation and grid search
    cv = ShuffleSplit(len(Xtrain),
                      n_iter=1,
                      train_size=0.25,
                      test_size=.03,
                      random_state=0)
    params = {'C': [1, 10], 'kernel': ['rbf', 'linear']}
    svr = svm.SVC(verbose=True, shrinking=False)
    classifier = grid_search.GridSearchCV(svr, params, verbose=3, cv=cv)
    t0 = time()
    classifier.fit(X1, Ytrain)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)
    #---------------------------------- Prediction on validation set:
    t0 = time()
    pred = list(classifier.predict(X2))
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)
    if hasattr(classifier, 'coef_'):
        print("dimensionality: %d" % classifier.coef_.shape[1])
        print("density: %f" % density(classifier.coef_))
    print 'F1-score : ', f1_score(Yval, pred, average='binary')
    print("classification report:")
    print(classification_report(Yval, pred, target_names=['0', '1'], digits=4))
    print("confusion matrix:")
    print(confusion_matrix(Yval, pred))
    return classifier, scaler
Example #29
def dataTransformations(x):

    x.rename(columns={'OCUPVIVPAR': 'Dwellers'}, inplace=True)
    #water
    x['Water'] = x['VPH_AGUAFV'] / x['Houses']

    #Sanitation use VPH_EXCSA and VPH_NODREN
    x['Sanitation'] = (x['Houses'] - x['VPH_EXCSA'] +
                       x['VPH_NODREN']) / (2. * x['Houses'])

    #Overcrowding use VPH_1CUART and PRO_OCUP_C
    # x['Density'] = 1. - 1./(1. +x['PRO_OCUP_C'])
    x['Density'] = x['PRO_OCUP_C'] - 2.
    x.loc[x.Density < 0, 'Density'] = 0.
    x['Density'] = 1. - 1. / (1. + x.Density)
    x['Density'] = x['Density'] / x['Density'].max()

    #Structure VPH_1CUART and VPH_PISOTI
    x['Structure'] = (x['VPH_PISOTI'] + x['VPH_1CUART']) / (2 * x['Houses'])

    ssiData = pd.DataFrame(
        normalize(x[['Water', 'Structure', 'Density', 'Sanitation']], axis=0),
        columns=['Water', 'Structure', 'Density', 'Sanitation'])

    # x.loc[:,'Factor'] = zeros(len(x)
    facAn = FactorAnalysis(n_components=1)
    facAn.fit(ssiData)
    x.loc[:, 'Factor'] = dot(facAn.components_**2,
                             transpose(ssiData.values))[0]

    #K-Means
    k_meansX = ssiData

    # do the clustering
    k_means = KMeans(n_clusters=4)
    k_means.fit(k_meansX)
    x.loc[:, 'K_Means'] = k_means.labels_

    #linear combination

    x.loc[:,
          'LC'] = x[['Water', 'Structure', 'Sanitation'
                     ]].sum(axis=1) + (x['PRO_OCUP_C'] / x['PRO_OCUP_C'].max())

    #save x to csv
    # x.to_csv(folderPath+'dataTrans.csv')
    return x
Example #30
def initialization_point(y, J):
    """
    Run factor analysis to get a reasonable initialization point for the
    optimisation process.

    :param y:
        An array of the data that has shape (N, D) where N is the number of
        stars and D is the dimensionality of the data.

    :param J:
        The number of latent factors.

    :returns:
        A dictionary of initial values that can be fed directly to Stan.
    """

    fa = FactorAnalysis(J)
    fa.fit(y)

    # TODO: Re-order the matrix of elements such that the low absolute values 
    #       are in the upper triangular part of the matrix, and that the entries
    #       along the diagonal are positive.

    N, D = y.shape

    L, psi = (fa.components_.T, fa.noise_variance_)

    # The beta diagonal values must be positive.
    beta_diag = np.clip(L.T[np.diag_indices(J)], 0, np.inf) + 1e-3

    # A hack to get the lower triangular beta values is to set the upper
    # triangular (including the diagonal) to non-finite values then re-order
    # and flatten the array.
    beta_lower_triangular = np.copy(L)
    beta_lower_triangular[np.triu_indices_from(L, 0)] = np.nan
    beta_lower_triangular = beta_lower_triangular.T.flatten()
    _ = np.isfinite(beta_lower_triangular)
    beta_lower_triangular = beta_lower_triangular[_]

    sigma_L = np.std(beta_lower_triangular)

    init = dict(psi=psi,
                beta_diag=beta_diag,
                beta_lower_triangular=beta_lower_triangular,
                sigma_L=sigma_L)

    return init
Example #31
    def expMlpc(self):
        pca = PCA(n_components=self.pcaBest)
        pca.fit(self.pcaDataX)
        self.pcaDataX = pca.transform(self.pcaDataX)
        self.pcaTrainX, self.pcaTestX, self.pcaTrainY, self.pcaTestY = train_test_split(
            self.pcaDataX, self.pcaDataY, test_size=0.3, random_state=0)
        print(self.pcaTrainX.shape)

        ica = FastICA(n_components=self.icaBest, max_iter=1000)
        ica.fit(self.icaDataX)
        self.icaDataX = ica.transform(self.icaDataX)
        self.icaTrainX, self.icaTestX, self.icaTrainY, self.icaTestY = train_test_split(
            self.icaDataX, self.icaDataY, test_size=0.3, random_state=0)
        print(self.icaTrainX.shape)

        rp = random_projection.GaussianRandomProjection(
            n_components=self.rpBest)
        rp.fit(self.rpDataX)
        self.rpDataX = rp.transform(self.rpDataX)
        self.rpTrainX, self.rpTestX, self.rpTrainY, self.rpTestY = train_test_split(
            self.rpDataX, self.rpDataY, test_size=0.3, random_state=0)
        print(self.rpTrainX.shape)

        fa = FactorAnalysis(n_components=self.faBest, max_iter=1000)
        fa.fit(self.faDataX)
        self.faDataX = fa.transform(self.faDataX)
        self.faTrainX, self.faTestX, self.faTrainY, self.faTestY = train_test_split(
            self.faDataX, self.faDataY, test_size=0.3, random_state=0)
        print(self.faTrainX.shape)

        normalResults = self.mlpc(self.trainX, self.trainY, self.testX,
                                  self.testY)
        pcaResults = self.mlpc(self.pcaTrainX, self.pcaTrainY, self.pcaTestX,
                               self.pcaTestY)
        icaResults = self.mlpc(self.icaTrainX, self.icaTrainY, self.icaTestX,
                               self.icaTestY)
        rpResults = self.mlpc(self.rpTrainX, self.rpTrainY, self.rpTestX,
                              self.rpTestY)
        faResults = self.mlpc(self.faTrainX, self.faTrainY, self.faTestX,
                              self.faTestY)

        print(normalResults)
        print(pcaResults)
        print(icaResults)
        print(rpResults)
        print(faResults)
Example #32
def fit_system(intervals, n_components):
    print("fit function entered")
    y_prime = []
    y = []
    #pca = PCA(n_components=n_components)
    pca = FactorAnalysis(n_components=n_components)
    pca.fit(np.concatenate(intervals))
    for interval,t in iterate_intervals(intervals, new_binsize):
        transformed = pca.transform(interval)
        y_prime.append(np.gradient(transformed, t, axis=0))
        y.append(transformed)

    y = np.concatenate(y)
    y_prime = np.concatenate(y_prime)
    A = np.linalg.lstsq(y, y_prime, rcond=None)[0]
    A = A.T
    return A, pca
def factor_analysis(x, dims=3):
  x = to_ndarray(x)
  s = scale(x, axis=0, with_mean=True, with_std=True, copy=True)
  fa_model = FactorAnalysis(n_components=dims, svd_method="lapack")
  fitted = fa_model.fit(s)
  y = fitted.transform(s)
  print("Factor Analysis - Reduced dims from {} to {}".format( x.shape, y.shape ))
  return y, fitted
Example #34
def factor_analysis(x, dims=3):
    x = to_ndarray(x)
    s = scale(x, axis=0, with_mean=True, with_std=True, copy=True)
    fa_model = FactorAnalysis(n_components=dims, svd_method="lapack")
    fitted = fa_model.fit(s)
    y = fitted.transform(s)
    print("Factor Analysis - Reduced dims from {} to {}".format(
        x.shape, y.shape))
    return y, fitted
Example #35
def dataTransformations(x):

	x.rename(columns={'OCUPVIVPAR': 'Dwellers'}, inplace=True)
	#water
	x['Water'] = x['VPH_AGUAFV']/x['Houses']

	#Sanitation use VPH_EXCSA and VPH_NODREN
	x['Sanitation'] = (x['Houses'] - x['VPH_EXCSA'] + x['VPH_NODREN']) / (2.*x['Houses'])

	#Overcrowding use VPH_1CUART and PRO_OCUP_C
	# x['Density'] = 1. - 1./(1. +x['PRO_OCUP_C'])
	x['Density'] = x['PRO_OCUP_C']-2.
	x.loc[x.Density<0,'Density'] = 0.
	x['Density'] = 1. - 1./(1. + x.Density)
	x['Density'] = x['Density']/x['Density'].max()
	
	#Structure VPH_1CUART and VPH_PISOTI
	x['Structure'] = (x['VPH_PISOTI'] + x['VPH_1CUART']) / (2*x['Houses'])

	ssiData = pd.DataFrame(normalize(x[['Water','Structure','Density','Sanitation']],axis=0), columns=['Water','Structure','Density','Sanitation'])

	# x.loc[:,'Factor'] = zeros(len(x)	
	facAn = FactorAnalysis(n_components = 1)
	facAn.fit(ssiData)
	x.loc[:,'Factor'] = dot(facAn.components_**2,transpose(ssiData.values))[0]

	#K-Means
	k_meansX = ssiData

	# do the clustering
	k_means = KMeans(n_clusters=4)
	k_means.fit(k_meansX) 
	x.loc[:,'K_Means'] = k_means.labels_

	#linear combination

	x.loc[:,'LC'] = x[['Water','Structure','Sanitation']].sum(axis=1) + (x['PRO_OCUP_C']/ x['PRO_OCUP_C'].max())

	


	#save x to csv
	# x.to_csv(folderPath+'dataTrans.csv')
	return x
Example #36
def fs_for_hybrid_data(x_train_left,
                       y_train,
                       x_validate_left,
                       y_validate,
                       method=0,
                       method_threshold=10,
                       is_auto=1):
    if method == 0:
        # None
        selected_x_train = x_train_left
        selected_x_validate = x_validate_left
    elif method == 1:
        # PCA
        print("Using the PCA method; results:")
        if is_auto == 1:
            pca = PCA(n_components='mle', whiten=False)
        else:
            pca = PCA(n_components=method_threshold, whiten=False)
        selected_x_train = pca.fit(x_train_left).transform(x_train_left)
        print(pca.explained_variance_ratio_)
        selected_x_validate = pca.fit(x_validate_left).transform(
            x_validate_left)
        print(pca.explained_variance_ratio_)
    elif method == 2:
        # Factor analysis
        fa = FactorAnalysis(n_components=method_threshold)
        selected_x_train = fa.fit(x_train_left).transform(x_train_left)
        selected_x_validate = fa.fit(x_validate_left).transform(
            x_validate_left)
    else:
        # Chi-squared test
        selected_x_train = SelectKBest(chi2, k=method_threshold).fit_transform(
            x_train_left, y_train)
        selected_x_validate = SelectKBest(chi2,
                                          k=method_threshold).fit_transform(
                                              x_validate_left, y_validate)

    # Standardize again after dimensionality reduction
    minmax_scaler = MinMaxScaler()
    selected_x_train = minmax_scaler.fit_transform(selected_x_train)
    selected_x_validate = minmax_scaler.fit_transform(selected_x_validate)

    return selected_x_train, selected_x_validate
Example #37
def test_factor_analysis():
    """Test FactorAnalysis ability to recover the data covariance structure
    """
    rng = np.random.RandomState(0)
    n_samples, n_features, n_components = 20, 5, 3

    # Some random settings for the generative model
    W = rng.randn(n_components, n_features)
    # latent variable of dim 3, 20 of it
    h = rng.randn(n_samples, n_components)
    # using gamma to model different noise variance
    # per component
    noise = rng.gamma(1, size=n_features) \
                * rng.randn(n_samples, n_features)

    # generate observations
    # wlog, mean is 0
    X = np.dot(h, W) + noise

    fa = FactorAnalysis(n_components=n_components)
    fa.fit(X)
    X_t = fa.transform(X)
    assert_true(X_t.shape == (n_samples, n_components))

    assert_almost_equal(fa.loglike_[-1], fa.score(X).sum())

    # Make log likelihood increases at each iteration
    assert_true(np.all(np.diff(fa.loglike_) > 0.))

    # Sample Covariance
    scov = np.cov(X, rowvar=0., bias=1.)

    # Model Covariance
    mcov = fa.get_covariance()
    diff = np.sum(np.abs(scov - mcov)) / W.size
    assert_true(diff < 0.1, "Mean absolute difference is %f" % diff)

    fa = FactorAnalysis(n_components=n_components,
                        noise_variance_init=np.ones(n_features))
    assert_raises(ValueError, fa.fit, X[:, :2])
Example #38
from sklearn.lda import LDA

iris = datasets.load_iris()

X = iris.data
y = iris.target
target_names = iris.target_names

pca = PCA(n_components=2)
X_r = pca.fit_transform(X)

lda = LDA(n_components=2)
X_r2 = lda.fit(X, y).transform(X)

fa = FactorAnalysis(n_components=2)
X_r3 = fa.fit(X).transform(X)

# Percentage of variance explained for each component
print('explained variance ratio (first two components): %s'
              % str(pca.explained_variance_ratio_))
print(sum(pca.explained_variance_ratio_))
plt.figure()
for c, i, target_name in zip("rgb", [0, 1, 2], target_names):
    plt.scatter(X_r[y == i, 0], X_r[y == i, 1], c=c, label=target_name)
plt.legend(loc="best")
plt.title('PCA of IRIS dataset')

plt.figure()
for c, i, target_name in zip("rgb", [0, 1, 2], target_names):
    plt.scatter(X_r2[y == i, 0], X_r2[y == i, 1], c=c, label=target_name)
plt.legend(loc="best")
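# X_r3 is computed above but not plotted in this excerpt; a matching scatter for
# the FactorAnalysis projection (an added sketch, same pattern as the PCA plot):
plt.figure()
for c, i, target_name in zip("rgb", [0, 1, 2], target_names):
    plt.scatter(X_r3[y == i, 0], X_r3[y == i, 1], c=c, label=target_name)
plt.legend(loc="best")
plt.title('Factor Analysis of IRIS dataset')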
def test_factor_analysis():
    # Test FactorAnalysis ability to recover the data covariance structure
    rng = np.random.RandomState(0)
    n_samples, n_features, n_components = 20, 5, 3

    # Some random settings for the generative model
    W = rng.randn(n_components, n_features)
    # latent variable of dim 3, 20 of it
    h = rng.randn(n_samples, n_components)
    # using gamma to model different noise variance
    # per component
    noise = rng.gamma(1, size=n_features) * rng.randn(n_samples, n_features)

    # generate observations
    # wlog, mean is 0
    X = np.dot(h, W) + noise

    assert_raises(ValueError, FactorAnalysis, svd_method='foo')
    fa_fail = FactorAnalysis()
    fa_fail.svd_method = 'foo'
    assert_raises(ValueError, fa_fail.fit, X)
    fas = []
    for method in ['randomized', 'lapack']:
        fa = FactorAnalysis(n_components=n_components, svd_method=method)
        fa.fit(X)
        fas.append(fa)

        X_t = fa.transform(X)
        assert_equal(X_t.shape, (n_samples, n_components))

        assert_almost_equal(fa.loglike_[-1], fa.score_samples(X).sum())
        assert_almost_equal(fa.score_samples(X).mean(), fa.score(X))

        diff = np.all(np.diff(fa.loglike_))
        assert_greater(diff, 0., 'Log likelihood did not increase')

        # Sample Covariance
        scov = np.cov(X, rowvar=0., bias=1.)

        # Model Covariance
        mcov = fa.get_covariance()
        diff = np.sum(np.abs(scov - mcov)) / W.size
        assert_less(diff, 0.1, "Mean absolute difference is %f" % diff)
        fa = FactorAnalysis(n_components=n_components,
                            noise_variance_init=np.ones(n_features))
        assert_raises(ValueError, fa.fit, X[:, :2])

    f = lambda x, y: np.abs(getattr(x, y))  # sign will not be equal
    fa1, fa2 = fas
    for attr in ['loglike_', 'components_', 'noise_variance_']:
        assert_almost_equal(f(fa1, attr), f(fa2, attr))

    fa1.max_iter = 1
    fa1.verbose = True
    assert_warns(ConvergenceWarning, fa1.fit, X)

    # Test get_covariance and get_precision with n_components == n_features
    # with n_components < n_features and with n_components == 0
    for n_components in [0, 2, X.shape[1]]:
        fa.n_components = n_components
        fa.fit(X)
        cov = fa.get_covariance()
        precision = fa.get_precision()
        assert_array_almost_equal(np.dot(cov, precision),
                                  np.eye(X.shape[1]), 12)
import os

from data import load_data
from sklearn.decomposition import FactorAnalysis
try:
    import cPickle as pickle
except:
    import pickle

# Factor Analysis

# ================================================================
# Apply factor analysis on the tf-idf matrix and transform raw documents into
# intermediate representation.
docs_tfidf, vocab_tfidf, vocabulary = load_data(subset='all')
n_components = 40
fa = FactorAnalysis(n_components=n_components)
fa.fit(docs_tfidf.toarray())
fa_words = fa.transform(vocab_tfidf.toarray())

# Create a dict to hold the new fa words.
fa_dict = dict(zip(vocabulary, fa_words))

# Store the intermediate representation fa words on disk.
fa_dict_filename = 'fa_dict.pk'
if not os.path.exists(fa_dict_filename):
    fa_dict_file = open(fa_dict_filename, 'wb')
    pickle.dump(fa_dict, fa_dict_file)

# Store the estimator on disk for further usage.
fa_estimator_filename = 'fa_estimator.pk'
if not os.path.exists(fa_estimator_filename):
    fa_estimator_file = open(fa_estimator_filename, 'wb')
Example #41
def factorAna(x,testData,n_components):
    fa = FactorAnalysis(n_components) 
    fa.fit(x)   
    newData = fa.transform(testData)  
    return newData    
Example #42
def initalizeParams(Y, k, method = 'standard'):
	"""
	initializes parameters. 
	By default, (method set to "standard") initializes using a mixture model. 
	If method is set to "high_dimensional", first does dimensionality reduction using factor analysis 
	and then clusters the low-dimensional data. 
	Checked.
	"""
	assert(method in ['high_dimensional', 'standard'])
	if method == 'high_dimensional':
		N, D = Y.shape
		#initialize using factor analysis. 
		model = FactorAnalysis(n_components = 5)
		low_dim_Y = model.fit_transform(Y)
		kmeans_model = KMeans(n_clusters = k)
		z = kmeans_model.fit_predict(low_dim_Y)
		cluster_mus = np.zeros([D, k])
		cluster_weights = np.zeros([k,])
		cluster_sigmas = np.zeros([D, k])
		
		for z_i in sorted(set(z)):
			idxs = (z == z_i)
			cluster_weights[z_i] = np.mean(idxs)
			cluster_Y = Y[idxs, :]
			cluster_Y_is_nonzero = np.abs(cluster_Y) > 1e-6
			cluster_mus[:, z_i] = cluster_Y.sum(axis = 0) / cluster_Y_is_nonzero.sum(axis = 0)
			
			cluster_sigmas[:, z_i] = np.sqrt(((cluster_Y ** 2).sum(axis = 0) - 2 * cluster_mus[:, z_i] * (cluster_Y.sum(axis = 0)) + cluster_mus[:, z_i]**2 * cluster_Y_is_nonzero.sum(axis = 0)) / cluster_Y_is_nonzero.sum(axis = 0))
			for j in range(1, 5):
				assert(np.abs(cluster_sigmas[j, z_i] - np.std(cluster_Y[cluster_Y_is_nonzero[:, j], j])) < 1e-4)		
		
		
	if method == 'standard':
		N, D = Y.shape
		model = GMM(n_components = k)
		imputedY = deepcopy(Y)
		for j in range(D):
			non_zero_idxs = np.abs(Y[:, j]) > 1e-6
			for i in range(N):
				if Y[i][j] == 0:
					imputedY[i][j] = np.random.choice(Y[non_zero_idxs, j])
		model.fit(imputedY)
		cluster_mus = model.means_.transpose()
		cluster_weights = model.weights_
		cluster_sigmas = np.sqrt(model.covars_.transpose())
		
	#now fit decay coefficient
	means = []
	ps = []
	for j in range(D):
		non_zero_idxs = np.abs(Y[:, j]) > 1e-6
		means.append(Y[non_zero_idxs, j].mean())
		ps.append(1 - non_zero_idxs.mean())
	
	
	decay_coef, pcov = curve_fit(exp_decay, means, ps)
	mse = np.mean(np.abs(ps - np.exp(-decay_coef * (np.array(means) ** 2))))
	print 'Decay Coef is %2.3f; MSE is %2.3f' % (decay_coef, mse)
	
	decay_coef = decay_coef[0]

	
	assert(np.all(cluster_sigmas > 0))
	return cluster_mus, cluster_sigmas, cluster_weights, decay_coef

kf = cross_validation.KFold(cdata.shape[0], n_folds=4)

max_components=30

sc=numpy.zeros((max_components,4))

for n_components in range(1,max_components):
    fa=FactorAnalysis(n_components=n_components)
    fold=0
    for train,test in kf:
        train_data=cdata[train,:]
        test_data=cdata[test,:]

        fa.fit(train_data)
        sc[n_components,fold]=fa.score(test_data)
        fold+=1

meanscore=numpy.mean(sc,1)
meanscore[0]=-numpy.inf
maxscore=numpy.argmax(meanscore)
print('cross-validation suggests %d components' % maxscore)

# now run it on full dataset to get components
fa=FactorAnalysis(n_components=maxscore)
fa.fit(cdata)

for c in range(maxscore):
    s=numpy.argsort(fa.components_[c,:])
    print('')
Example #44
def learn(data):
    model=FA(n_components =2)
    model.fit(data)
    return PreferenceGenerator(model.components_)
Example #45
def simulate(data, factors=0, maxtrials=5, multiplier=1, seed=0):
    n = len(data)
    dim = len(data[0])
    simulated = np.zeros((n,dim))
    distribution = np.zeros((n,dim))
    iteration = 0
    BestRMSR = 1
    trialsWithoutImprovement = 0

    #apply distribution from supplied data
    distribution = data.copy()
    TargetCorr = corr(data.T)
    IntermidiateCorr = TargetCorr.copy()
    BestCorr = IntermidiateCorr
    #print data.shape
    #print simulated.shape
    #print TargetCorr, TargetCorr.shape

    if(factors == 0):
        eigvalsObserved = np.linalg.eigvals(IntermidiateCorr)
        eigvalsRandom = np.zeros((100,dim))
        randomData = np.zeros((n,dim))

        for i in range(0, 100):
            for j in range(0, dim):
                randomData[:, j] = np.random.permutation(distribution[:, j])
            eigvalsRandom[i, :] = np.linalg.eigvals(corr(randomData.T))
        eigvalsRandom = np.mean(eigvalsRandom, axis=0)
        factors = max(1, np.sum(eigvalsObserved > eigvalsRandom))

    #steps 5,6
    SharedComp = np.random.normal(0, 1, (n, factors))
    UniqueComp = np.random.normal(0, 1, (n, dim))
    SharedLoad = np.zeros((dim, factors))
    UniqueLoad = np.zeros(dim)

    while trialsWithoutImprovement < maxtrials:
        iteration += 1

        #Calculate factor loadings and apply to reproduce desired correlations (steps 7, 8)
        fa = FactorAnalysis()
        fa.n_components = factors
        fa.fit(IntermidiateCorr)
        FactLoadings = fa.components_.T
        #print FactLoadings.shape

        if (factors == 1):
            SharedLoad[:, 0] = FactLoadings[:, 0]
        else:
            SharedLoad = FactLoadings
        #print SharedLoad

        SharedLoad = np.clip(SharedLoad, -1, 1)
        #print SharedLoad

        if (SharedLoad[0, 0] < 0):
            SharedLoad *= -1
        #print SharedLoad

        SharedLoadSq = SharedLoad * SharedLoad
        #print SharedLoadSq

        for i in range(0, dim):
            SharedLoadSum = np.sum(SharedLoadSq[i, :])
            if(SharedLoadSum < 1):
                UniqueLoad[i] = 1 - SharedLoadSum
            else:
                UniqueLoad[i] = 0
        UniqueLoad = np.sqrt(UniqueLoad)
        #print UniqueLoad

        MergedShare = np.dot(SharedComp, SharedLoad.T)
        for i in range(0, dim):
            simulated[:, i] = MergedShare[:, i] + UniqueComp[:, i]*UniqueLoad[i]
        #print simulated

        #Replace normal with nonnormal distributions (step 9)
        for i in range(0, dim):
            indices = np.argsort(simulated[:, i])
            simulated = np.array(simulated)[indices]
            simulated[:, i] = distribution[:, i]
        #print simulated
        #print distribution

        #Calculate RMSR correlation, compare to lowest value, take appropriate action (steps 10, 11, 12)
        ReproducedCorr = corr(simulated.T)
        ResidualCorr = TargetCorr - ReproducedCorr
        #print ResidualCorr

        RMSR = np.sqrt(np.sum(np.tril(ResidualCorr) ** 2) / (0.5 * (dim*dim - dim)))
        #print RMSR

        if (RMSR < BestRMSR):
            BestRMSR = RMSR
            BestCorr = IntermidiateCorr
            BestRes = ResidualCorr
            IntermidiateCorr = IntermidiateCorr + multiplier*ResidualCorr
            trialsWithoutImprovement = 0
        else:
            trialsWithoutImprovement += 1
            CurrentMultiplier = multiplier * (0.5 ** trialsWithoutImprovement)
            try:
                IntermidiateCorr = BestCorr + CurrentMultiplier * BestRes
            except NameError:
                BestRes = ResidualCorr
                IntermidiateCorr = BestCorr + CurrentMultiplier * BestRes

    #Construct the data set with the lowest RMSR correlation (step 13)
    fa = FactorAnalysis()
    fa.n_components = factors
    fa.fit(BestCorr)
    FactLoadings = fa.components_.T

    if (factors == 1):
        SharedLoad[:, 0] = FactLoadings[:, 0]
    else:
        SharedLoad = FactLoadings

    SharedLoad = np.clip(SharedLoad, -1, 1)

    if (SharedLoad[0, 0] < 0):
        SharedLoad *= -1

    SharedLoadSq = SharedLoad * SharedLoad

    for i in range(0, dim):
        SharedLoadSum = np.sum(SharedLoadSq[i, :])
        if(SharedLoadSum < 1):
            UniqueLoad[i] = 1 - SharedLoadSum
        else:
            UniqueLoad[i] = 0
    UniqueLoad = np.sqrt(UniqueLoad)

    MergedShare = np.dot(SharedComp, SharedLoad.T)
    for i in range(0, dim):
        simulated[:, i] = MergedShare[:, i] + UniqueComp[:, i]*UniqueLoad[i]

    simulated = preprocessing.scale(simulated)

    for i in range(0, dim):
        indices = np.argsort(simulated[:, i])
        simulated = np.array(simulated)[indices]
        simulated[:, i] = distribution[:, i]

    #return the simulated data set (step 14)
    #print 'RMSR', BestRMSR

    return simulated
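# Hypothetical usage of simulate() (a sketch; the `corr` helper used above is
# assumed to return a correlation matrix from variables-in-rows, like np.corrcoef):
rng = np.random.RandomState(0)
real = rng.lognormal(size=(200, 5))            # non-normal source data
fake = simulate(real, factors=2, maxtrials=3)
print(np.round(np.corrcoef(real.T) - np.corrcoef(fake.T), 2))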