Example #1
def factor_analysis_method(train_x, train_y, validate_x, validate_y, fa_threshold, is_split=1):
    # Fill missing values
    train_x = train_x.fillna(0)
    train_x = train_x.values
    validate_x = validate_x.fillna(0)
    validate_x = validate_x.values

    # Normalize; inputs must contain no NaNs beforehand, and the output automatically becomes an ndarray
    # scaler = MinMaxScaler()
    # train_x = scaler.fit_transform(train_x)
    # validate_x = scaler.fit_transform(validate_x)

    # Turn the DataFrames into unlabeled ndarrays so they can be fed to the model
    train_y = train_y.values
    validate_y = validate_y.values

    if is_split == 1:
        # Pull out the one-hot columns first
        onehot_train_x_left = train_x[:, :30]
        train_x_mid = train_x[:, 30:454]
        # onehot_train_x_right = train_x[:, 454:]
        onehot_validate_x_left = validate_x[:, :30]
        validate_x_mid = validate_x[:, 30:454]
        # onehot_validate_x_right = validate_x[:, 454:]
    else:
        train_ts_code_1 = train_x[:, 0]
        train_x_mid = train_x[:, 1:]
        valid_ts_code_1 = validate_x[:, 0]
        validate_x_mid = validate_x[:, 1:]

    # factor_analysis
    fa = FactorAnalysis(n_components=fa_threshold)
    selected_train_x = fa.fit_transform(train_x_mid)
    # Reuse the factors fitted on the training data; refitting on the
    # validation set would leak information and change the latent basis
    selected_validate_x = fa.transform(validate_x_mid)

    # Re-attach the ts_code columns
    if is_split == 1:  # ts_code is one-hot encoded across 30 columns
        selected_train_x = np.hstack((onehot_train_x_left, selected_train_x))
        selected_validate_x = np.hstack((onehot_validate_x_left, selected_validate_x))
    else:  # ts_code is a single column
        # print(train_ts_code_1.reshape(-1,1).shape)
        # print(selected_train_x.shape)
        selected_train_x = np.hstack((train_ts_code_1.reshape(-1, 1), selected_train_x))
        selected_validate_x = np.hstack((valid_ts_code_1.reshape(-1, 1), selected_validate_x))

    return selected_train_x, train_y, selected_validate_x, validate_y
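
A minimal sketch of how this function might be called, taking the `is_split=0` path (single `ts_code` column first); the shapes, random data, and `fa_threshold=5` are illustrative assumptions, not values from the original project:

import numpy as np
import pandas as pd
from sklearn.decomposition import FactorAnalysis

rng = np.random.default_rng(0)
train_x = pd.DataFrame(rng.normal(size=(100, 11)))    # column 0 stands in for ts_code
validate_x = pd.DataFrame(rng.normal(size=(40, 11)))
train_y = pd.Series(rng.integers(0, 2, size=100))
validate_y = pd.Series(rng.integers(0, 2, size=40))

tr_x, tr_y, va_x, va_y = factor_analysis_method(
    train_x, train_y, validate_x, validate_y, fa_threshold=5, is_split=0)
print(tr_x.shape)  # (100, 6): the ts_code column plus 5 factors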
Example #2
    def expMlpc(self):
        pca = PCA(n_components=self.pcaBest)
        pca.fit(self.pcaDataX)
        self.pcaDataX = pca.transform(self.pcaDataX)
        self.pcaTrainX, self.pcaTestX, self.pcaTrainY, self.pcaTestY = train_test_split(
            self.pcaDataX, self.pcaDataY, test_size=0.3, random_state=0)
        print(self.pcaTrainX.shape)

        ica = FastICA(n_components=self.icaBest, max_iter=1000)
        ica.fit(self.icaDataX)
        self.icaDataX = ica.transform(self.icaDataX)
        self.icaTrainX, self.icaTestX, self.icaTrainY, self.icaTestY = train_test_split(
            self.icaDataX, self.icaDataY, test_size=0.3, random_state=0)
        print(self.icaTrainX.shape)

        rp = random_projection.GaussianRandomProjection(
            n_components=self.rpBest)
        rp.fit(self.rpDataX)
        self.rpDataX = rp.transform(self.rpDataX)
        self.rpTrainX, self.rpTestX, self.rpTrainY, self.rpTestY = train_test_split(
            self.rpDataX, self.rpDataY, test_size=0.3, random_state=0)
        print(self.rpTrainX.shape)

        fa = FactorAnalysis(n_components=self.faBest, max_iter=1000)
        fa.fit(self.faDataX)
        self.faDataX = fa.transform(self.faDataX)
        self.faTrainX, self.faTestX, self.faTrainY, self.faTestY = train_test_split(
            self.faDataX, self.faDataY, test_size=0.3, random_state=0)
        print(self.faTrainX.shape)

        normalResults = self.mlpc(self.trainX, self.trainY, self.testX,
                                  self.testY)
        pcaResults = self.mlpc(self.pcaTrainX, self.pcaTrainY, self.pcaTestX,
                               self.pcaTestY)
        icaResults = self.mlpc(self.icaTrainX, self.icaTrainY, self.icaTestX,
                               self.icaTestY)
        rpResults = self.mlpc(self.rpTrainX, self.rpTrainY, self.rpTestX,
                              self.rpTestY)
        faResults = self.mlpc(self.faTrainX, self.faTrainY, self.faTestX,
                              self.faTestY)

        print(normalResults)
        print(pcaResults)
        print(icaResults)
        print(rpResults)
        print(faResults)
Example #3
def testAlgorithm():
    random.seed(30)
    np.random.seed(32)
    n = 200
    d = 20
    k = 2
    sigma = .3
    n_clusters = 3
    decay_coef = .1
    X, Y, Z, ids = generateSimulatedDimensionalityReductionData(
        n_clusters, n, d, k, sigma, decay_coef)

    Zhat, params = block_ZIFA.fitModel(Y, k)
    colors = ['red', 'blue', 'green']
    cluster_ids = sorted(list(set(ids)))
    model = FactorAnalysis(n_components=k)
    factor_analysis_Zhat = model.fit_transform(Y)
    figure(figsize=[15, 5])
    subplot(131)
    for id in cluster_ids:
        scatter(Z[ids == id, 0], Z[ids == id, 1], color=colors[id - 1], s=4)
        title('True Latent Positions\nFraction of Zeros %2.3f' %
              (Y == 0).mean())
        xlim([-4, 4])
        ylim([-4, 4])
    subplot(132)
    for id in cluster_ids:
        scatter(Zhat[ids == id, 0],
                Zhat[ids == id, 1],
                color=colors[id - 1],
                s=4)
        xlim([-4, 4])
        ylim([-4, 4])
        title('ZIFA Estimated Latent Positions')
        #title(titles[method])
    subplot(133)
    for id in cluster_ids:
        scatter(factor_analysis_Zhat[ids == id, 0],
                factor_analysis_Zhat[ids == id, 1],
                color=colors[id - 1],
                s=4)
        xlim([-4, 4])
        ylim([-4, 4])
        title('Factor Analysis Estimated Latent Positions')

    show()
Example #4
    def factor_analysis(self, data, **kwargs):
        fa = FactorAnalysis(n_components=self.p, random_state=0, **kwargs)

        y = np.concatenate(data, -1)
        y = np.concatenate(y, 0)
        y = y.transpose()

        x = fa.fit_transform(y)

        split_ind = np.cumsum([d.shape[-1] for d in data])[:-1]
        x = x.transpose()
        x = np.array_split(x, split_ind, axis=-1)
        w = fa.components_.transpose()
        w = np.split(w, data[0].shape[0])
        w = np.stack(w, axis=0)

        return x, w
Example #5
def fit_system(intervals, n_components):
    print("fit function entered")
    y_prime = []
    y = []
    # pca = PCA(n_components=n_components)
    pca = FactorAnalysis(n_components=n_components)  # the name `pca` is kept from the commented-out PCA variant
    pca.fit(np.concatenate(intervals))
    for interval,t in iterate_intervals(intervals, new_binsize):
        transformed = pca.transform(interval)
        y_prime.append(np.gradient(transformed, t, axis=0))
        y.append(transformed)

    y = np.concatenate(y)
    y_prime = np.concatenate(y_prime)
    A = np.linalg.lstsq(y, y_prime, rcond=None)[0]
    A = A.T
    return A, pca
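
The `lstsq` call solves y·M ≈ y′ for M, so `A = M.T` gives the linear dynamics y′ ≈ A·y in the reduced space. A self-contained check of just that step, on synthetic data rather than the original `intervals`:

import numpy as np

rng = np.random.default_rng(1)
A_true = np.array([[0.0, -1.0], [1.0, 0.0]])  # pure rotation dynamics
y = rng.normal(size=(500, 2))                 # reduced states, one per row
y_prime = y @ A_true.T                        # exact derivatives y' = A y
A_est = np.linalg.lstsq(y, y_prime, rcond=None)[0].T
print(np.allclose(A_est, A_true))             # True: the dynamics matrix is recovered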
Example #6
def get_model(args):
    """
    Get the base model class for the dimension reduction
    requested in the arguments

    :param args: output of ArgumentParser().parse_args()
    :return: model to be used for dimension reduction
    """
    if args.method == 'umap' and UMAP_AVAILABLE:
        model = umap.UMAP(n_components=args.dims)
    elif args.method == 'pca':
        model = PCA(n_components=args.dims)
    elif args.method == 'pca-scaled':
        model = ScaledPCA(args.dims)
    elif args.method == 'rpca':
        model = RandomizedPCA(n_components=args.dims)
    elif args.method == 'tsne':
        model = TSNE(n_components=args.dims)
    elif args.method == 'mctsne':
        model = MCTSNE(n_components=args.dims, n_jobs=args.njobs)
    elif args.method == 'spectral':
        model = SpectralEmbedding(n_components=args.dims, n_jobs=args.njobs)
    elif args.method == 'lle':
        model = LocallyLinearEmbedding(n_components=args.dims,
                                       n_jobs=args.njobs)
    elif args.method == 'isomap':
        model = Isomap(n_components=args.dims, n_jobs=args.njobs)
    elif args.method == 'mds':
        model = MDS(n_components=args.dims, n_jobs=args.njobs)
    elif args.method == 'fa':
        model = FactorAnalysis(n_components=args.dims)
    elif args.method == 'fica':
        model = FastICA(n_components=args.dims)
    elif args.method == 'zifa' and ZIFA_AVAILABLE:
        model = ZIFA_Wrapper(args.dims)
    elif args.method == 'lda':
        model = LatentDirichletAllocation(args.dims)
    elif args.method == 'nmf':
        model = NMF(args.dims)
    elif args.method == 'scscope' and SCSCOPE_AVAILABLE:
        model = ScScope(args.dims)
    else:
        print("ERROR: Invalid embedding option", file=sys.stderr)
        sys.exit(1)
    return model
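
A hedged usage sketch, assuming the module's imports and availability flags (`UMAP_AVAILABLE` etc.) are in scope and that the parsed arguments carry the attributes the function reads (`method`, `dims`, `njobs`):

import argparse
import numpy as np

args = argparse.Namespace(method='fa', dims=2, njobs=1)
model = get_model(args)  # FactorAnalysis(n_components=2)
X = np.random.RandomState(0).normal(size=(200, 10))
embedding = model.fit_transform(X)
print(embedding.shape)   # (200, 2)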
Example #7
def main():
    df = pd.read_csv('data/Pokemon.csv')
    X = df[[
        'HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed', 'Generation',
        'Legendary'
    ]].values

    fa = FactorAnalysis(n_components=2, tol=0.001, random_state=42)
    X_new = fa.fit_transform(X)

    print('X_new')
    print(X_new.shape)
    print(X_new)
    print('')

    print('components')
    print(fa.components_.shape)
    print(fa.components_)
Example #8
def reduce_dimension(name, x, n_components):
    algorithms = {
        'factor_analysis': FactorAnalysis(random_state=0, n_components=n_components),
        'fast_ica': FastICA(random_state=0, n_components=n_components),
        'nmf': Pipeline([('min_max', MinMaxScaler()),
                         ('nmf', NMF(random_state=0, n_components=n_components))]),
        'pca': PCA(random_state=0, n_components=n_components),
        'sparse_pca': SparsePCA(random_state=0, n_components=n_components),
        'truncated_svd': TruncatedSVD(random_state=0, n_components=n_components)
    }
    # algorithms[name] fails fast with a KeyError for an unsupported name,
    # where algorithms.get(name) would surface later as a cryptic Pipeline error
    return Pipeline([(name, algorithms[name]),
                     ('min_max', MinMaxScaler())]).fit_transform(x)
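
A small usage sketch, assuming `Pipeline`, `MinMaxScaler`, and the decomposition classes are imported; the trailing `MinMaxScaler` means every method returns features rescaled to [0, 1]:

import numpy as np

X = np.random.RandomState(0).rand(100, 8)
X_fa = reduce_dimension('factor_analysis', X, 3)
print(X_fa.shape, X_fa.min(), X_fa.max())  # (100, 3) 0.0 1.0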
Example #9
    def plot_model_selection(self, n_components):
        plt.figure(figsize=(8, 6))

        for reducer_name in ["PCA", "Factor Analysis"]:
            if reducer_name == "PCA":
                reducer = PCA()
            else:
                reducer = FactorAnalysis()

            scores, component = self._run_CV(reducer, n_components)
            self._plot_scores(scores, component, n_components, reducer_name)

        plt.xlabel('nb of components')
        plt.ylabel('CV scores')
        plt.legend(loc='lower right')
        plt.title(
            "Model selection with Probabilistic PCA and Factor Analysis")
        plt.show()
Example #10
def fa(n_com):
    """
    Use sklearn's FactorAnalysis to assess collinearity among all candidate
    variables and output the factor component matrix for a given number of factors.
    :param n_com: int
        Number of factors (dimensionality of the reduced samples)
    :return: pd.DataFrame
        Factor component (loading) matrix, one row per factor
    """
    feature_all = pd.read_csv('./data/samples_all.csv', index_col=0)  # load the sample set; the first column is the sample ID
    data = feature_all.fillna(0)
    fa = FactorAnalysis(n_components=n_com)
    # Reduce dimensionality (FactorAnalysis accepts but ignores the y argument)
    data_fa = fa.fit_transform(data.iloc[:, :-1], y=data.iloc[:, -1])

    # Factor component (loading) matrix
    factor = fa.components_
    factor_df = pd.DataFrame(factor, columns=feature_all.iloc[:, :-1].columns)
    return factor_df
Example #11
    def __init__(self, n_components=10, Data=None):
        '''
        :param n_components: dimensionality of y
        :param Data: dataset
        '''
        self.model = FactorAnalysis(n_components=n_components)

        self.n_components = n_components
        if Data is None:
            self.dataset = Dataset()
            self.dataset.generate()
        else:
            self.dataset = Data
        self.data = self.dataset.data
        self.aic = []
        self.bic = []
        self.aic_b = None
        self.colors = ['red', 'blue']
Example #12
    def _fit(self, X, n_components, sample_size):
        self._reset()
        Y = np.delete(X, range(0, 14), axis=1)  # drop columns 0-13, the knob columns
        np.random.shuffle(Y)  # shuffles only the rows, by default
        Y = Y[:sample_size, :]  # sample only sample_size rows from the matrix
        Y = Y.transpose()
        print("Shape before:", Y.shape)
        model = FactorAnalysis(n_components=n_components, random_state=0)
        model.fit(Y)  # the transformed output is not needed, only the fitted loadings
        self.model_ = model
        print(self.model_.components_.shape)  # (factors, metrics) before the transpose below
        # filter out any components that are identically zero
        self.model_.components_ = self.model_.components_.transpose()
        components_mask = np.sum(self.model_.components_ != 0.0, axis=1) > 0.0
        self.components_ = self.model_.components_[components_mask]
        print("Shape after:", self.components_.shape)

        return self
Example #13
def initializeParams(Y, K, singleSigma=False, makePlot=False):
    """
    initializes parameters using a standard factor analysis model (on imputed data) + exponential curve fitting. 
    Checked. 
    Input: 
    Y: data matrix, n_samples x n_genes
    K: number of latent components
    singleSigma: uses only a single sigma as opposed to a different sigma for every gene 
    makePlot: makes a mu - p_0 plot and shows the decaying exponential fit. 
    Returns: 
    A, mus, sigmas, decay_coef: initialized model parameters. 
    """

    N, D = Y.shape
    model = FactorAnalysis(n_components=K)
    zeroedY = deepcopy(Y)
    mus = np.zeros([D, 1])
    for j in range(D):
        non_zero_idxs = np.abs(Y[:, j]) > 1e-6
        mus[j] = zeroedY[:, j].mean()
        zeroedY[:, j] = zeroedY[:, j] - mus[j]
    model.fit(zeroedY)
    A = model.components_.transpose()
    sigmas = np.atleast_2d(np.sqrt(model.noise_variance_)).transpose()
    if singleSigma:
        sigmas = np.mean(sigmas) * np.ones(sigmas.shape)
    # now fit decay coefficient
    means = []
    ps = []
    for j in range(D):
        non_zero_idxs = np.abs(Y[:, j]) > 1e-6
        means.append(Y[non_zero_idxs, j].mean())
        ps.append(1 - non_zero_idxs.mean())
    decay_coef, pcov = curve_fit(exp_decay, means, ps, p0=.05)
    decay_coef = decay_coef[0]
    mse = np.mean(np.abs(ps - np.exp(-decay_coef * (np.array(means) ** 2))))
    if (mse > 0) and makePlot:
        figure()
        scatter(means, ps)
        plot(np.arange(min(means), max(means), .1),
             np.exp(-decay_coef * (np.arange(min(means), max(means), .1) ** 2)))
        title('Decay Coef is %2.3f; MSE is %2.3f' % (decay_coef, mse))
        show()
    return A, mus, sigmas, decay_coef
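
`exp_decay` is used by `curve_fit` above but not shown; from the MSE line, the fitted curve is p0 = exp(-lambda * mu^2), so a consistent definition (an assumption reconstructed from that line, matching the ZIFA zero-inflation model) would be:

import numpy as np

def exp_decay(x, decay_coef):
    # probability of observing a zero decays exponentially in the squared mean
    return np.exp(-decay_coef * (np.asarray(x) ** 2))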
Example #14
def EM_batched(y_batches,
               latent_dim,
               n_iter,
               ss_eps=1e-8,
               print_interval=None):
    # initialize with factor analysis
    n = latent_dim
    num_batches = len(y_batches)
    fa = FactorAnalysis(n_components=n)
    x_hat_batches, P_batches, P_adj_batches = [], [], []
    for i in range(num_batches):
        np.random.seed(42)
        x_hat = fa.fit_transform(y_batches[i])
        T = len(y_batches[i])
        P = np.repeat(np.eye(n)[np.newaxis, :, :], T, axis=0)
        P_adj = np.array(
            [np.outer(x_hat[t], x_hat[t - 1]) for t in range(1, T)])
        x_hat_batches.append(x_hat)
        P_batches.append(P)
        P_adj_batches.append(P_adj)

    #run EM
    ll_vals = np.zeros(n_iter)
    for i in range(n_iter):
        A, Q, C, R, pi_1, V_1 = M_step_batched(y_batches, x_hat_batches,
                                               P_batches, P_adj_batches)
        x_hat_batches, P_batches, P_adj_batches = E_step_batched(y_batches,
                                                                 A,
                                                                 Q,
                                                                 C,
                                                                 R,
                                                                 pi_1,
                                                                 V_1,
                                                                 ss_eps=ss_eps)
        if print_interval is not None and i % print_interval == 0:
            print("iter", i)
            ll_vals[i] = log_likelihood_batched(x_hat_batches, y_batches, A, Q,
                                                C, R, pi_1, V_1)
        else:
            ll_vals[i] = np.nan
    if print_interval is not None:
        print("iter", n_iter)

    return x_hat_batches, P_batches, P_adj_batches, A, Q, C, R, pi_1, V_1, ll_vals
Example #15
    def _initialize(y, d_latent):
        """ Initialize models parameters and initial latents with Factor Analysis

        Args:
            y (np.ndarray): (T, N, d_obs), observed data
            d_latent (int): dimensionality of latent space

        Returns:
            A (np.ndarray): State dynamics matrix (d_latent, d_latent)
            C (np.ndarray): Observation matrix (d_obs, d_latent)
            Q (np.ndarray): Covariance matrix of state noise (d_latent, d_latent)
            R (np.ndarray): Covariance "matrix" of observation noise; only diagonal elements (d_obs,)
            pi_0 (np.ndarray): Initial state estimate (d_latent, )
            V_0 (np.ndarray): Initial state covariance estimate (d_latent, )
        """
        if y.ndim == 2:
            y = np.expand_dims(y, 1)
        assert y.shape[2] >= d_latent, \
            'FA requires d_obs >= d_latent; will implement AR'
        T = y.shape[0]
        N = y.shape[1]
        y = y.reshape(-1, y.shape[-1])
        fa = FactorAnalysis(n_components=d_latent)
        fa.fit(y)
        C = fa.components_.T
        R = fa.noise_variance_

        # Woodbury matrix identity for y (R + CC')^{-1} C
        Phi = np.diag(1. / R)
        temp1 = Phi.dot(C)
        temp2 = Phi - temp1.dot(
            LA.pinv(np.eye(d_latent) + C.T.dot(temp1))).dot(temp1.T)
        temp1 = y.dot(temp2).dot(C)
        pi_0 = np.mean(temp1, axis=0)
        Q = np.cov(temp1.T)
        V_0 = Q

        t1 = temp1[:N * T - 1]
        t2 = temp1[1:N * T, :]

        A = LA.pinv(t1.T.dot(t1) + Q).dot(t1.T.dot(t2))

        return A, C, Q, R, pi_0, V_0
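
The Woodbury step above avoids inverting the full d_obs x d_obs matrix R + CC': with Phi = R^{-1} diagonal, (R + CC')^{-1} = Phi - Phi C (I + C' Phi C)^{-1} C' Phi. A quick numerical check of that identity, with illustrative shapes:

import numpy as np
import numpy.linalg as LA

rng = np.random.default_rng(0)
d_obs, d_latent = 6, 2
C = rng.normal(size=(d_obs, d_latent))
R = rng.uniform(0.5, 2.0, size=d_obs)  # diagonal observation-noise variances

Phi = np.diag(1.0 / R)
temp1 = Phi.dot(C)
woodbury = Phi - temp1.dot(
    LA.pinv(np.eye(d_latent) + C.T.dot(temp1))).dot(temp1.T)
direct = LA.inv(np.diag(R) + C.dot(C.T))
print(np.allclose(woodbury, direct))  # True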
Example #16
    def initialize(self, obs):
        '''Initialize data using factor analysis and ordinary least squares'''

        self.set_obs(obs)
        fa = FactorAnalysis(n_components=self.d_latent)
        fa.fit(np.concatenate(self.obs))
        R = np.diag(fa.noise_variance_)
        Q = np.eye(self.d_latent)  # there may be a better way to initialize this
        C = fa.components_.T
        xt = np.concatenate([fa.transform(y[:-1, :]) for y in self.obs])
        xtp1 = np.concatenate([fa.transform(y[1:, :]) for y in self.obs])
        #A = np.linalg.lstsq(xt, xtp1)[0].T
        A = np.random.randn(self.d_latent, self.d_latent)
        A = (A - A.T) / 2  #random skew symmetric matrix
        N = sum(y.shape[0] for y in obs)  # builtin sum: np.sum over a generator is deprecated
        pi_0 = np.zeros((len(obs), self.d_latent))
        V_0 = np.array([np.eye(self.d_latent) for i in range(len(obs))])
        self.set_params(A, C, Q, R, pi_0, V_0)
Example #17
def fs_for_hybrid_data(x_train_left,
                       y_train,
                       x_validate_left,
                       y_validate,
                       method=0,
                       method_threshold=10,
                       is_auto=1):
    if method == 0:
        # None
        selected_x_train = x_train_left
        selected_x_validate = x_validate_left
    elif method == 1:
        # PCA
        print("使用PCA方法,方法结果为:")
        if is_auto == 1:
            pca = PCA(n_components='mle', whiten=False)
        else:
            pca = PCA(n_components=method_threshold, whiten=False)
        pca.fit(x_train_left)
        selected_x_train = pca.transform(x_train_left)
        print(pca.explained_variance_ratio_)
        # Project the validation set with the components learned on the training set
        selected_x_validate = pca.transform(x_validate_left)
    elif method == 2:
        # Factor analysis
        fa = FactorAnalysis(n_components=method_threshold)
        selected_x_train = fa.fit_transform(x_train_left)
        # Same rule as above: fit on the training set only, then transform validation
        selected_x_validate = fa.transform(x_validate_left)
    else:
        # Chi-squared test
        selector = SelectKBest(chi2, k=method_threshold)
        selected_x_train = selector.fit_transform(x_train_left, y_train)
        # Keep the features chosen on the training set instead of re-selecting
        selected_x_validate = selector.transform(x_validate_left)

    # Rescale to [0, 1] again after dimensionality reduction
    minmax_scaler = MinMaxScaler()
    selected_x_train = minmax_scaler.fit_transform(selected_x_train)
    selected_x_validate = minmax_scaler.transform(selected_x_validate)

    return selected_x_train, selected_x_validate
Example #18
def selectfeatures_pca(n_features, step, X, Y):
    pca = PCA(svd_solver='full')
    fa = FactorAnalysis()
    n_components = np.arange(0, n_features, step)
    pca_scores, fa_scores = [], []
    for n in n_components:
        pca.n_components = n
        fa.n_components = n
        pca_scores.append(np.mean(cross_val_score(pca, X, Y)))
        fa_scores.append(np.mean(cross_val_score(fa, X, Y)))
        print(pca_scores)
    plt.figure()
    plt.plot(n_components, pca_scores, 'b', label='PCA scores')
    plt.plot(n_components, fa_scores, 'r', label='FA scores')
    plt.xlabel('nb of components')
    plt.ylabel('CV scores')
    plt.legend(loc='lower right')
    plt.title('Feature Selection using PCA and FA')
    plt.savefig('FeatureSelectionPCAnFA.png', format='png')
Example #19
    def reduce(self, x, algorithm='pca', n_components='mle'):
        from sklearn.decomposition import PCA, FactorAnalysis, TruncatedSVD, randomized_svd

        if algorithm == 'pca':
            _method = PCA(n_components=n_components,
                          copy=True,
                          svd_solver='auto')
            _res = _method.fit_transform(x)
        elif algorithm == 'factor':
            # Factor analysis
            _method = FactorAnalysis(n_components=n_components)
            _res = _method.fit_transform(x)
        elif algorithm == 'rsvd':
            # randomized_svd returns the triple (U, s, Vt), not a fitted transformer
            _u, _s, _vt = randomized_svd(M=x, n_components=4)
            _res = _u * _s  # project onto the top singular directions
        elif algorithm == 'tsvd':
            _method = TruncatedSVD(n_components=3, n_iter=4)
            _res = _method.fit_transform(x)
        else:
            raise ValueError('unknown algorithm: %s' % algorithm)

        return _res
Example #20
def factor_analysis(tests):
	from sklearn.decomposition import FactorAnalysis
	from sklearn.model_selection import cross_val_score  # sklearn.cross_validation has been removed
	
	matrix = correct_matrix(tests,kind='ctrl')
	print(matrix.shape)
	# matrix must have a number of rows divisible by 3.  
	# if it does not, eliminate some rows, or pass cv=a to cross_val_score,
	# where 'a' is a number by which the number of rows is divisible.  
	fa = FactorAnalysis()
	fa_scores = []
	n_components = np.arange(1,41)
	for n in n_components:
		fa.n_components = n
		fa_scores.append(np.mean(cross_val_score(fa, matrix)))

	plt.plot(n_components,fa_scores)
	
	return n_components,fa_scores
Example #21
def dimension_reduction(train_x, train_y, test_x, n_col, method = 'fact'):
    # Obtain column names
    attr_list = train_x.columns
    
    # Using RFE to rank features and then select
    if method == 'RFE':
        # Using RFE to rank attributes
        lin_reg = LinearRegression()
        rfe = RFE(lin_reg, n_features_to_select=n_col)  # keyword arg required by recent sklearn
        fit = rfe.fit(train_x, train_y)
    
        # Select the most relevant attributes for machine learning
        fit_list = fit.support_.tolist()
        indexes = [index for index in range(len(fit_list)) if fit_list[index]]
    
        # Print out attributes selected and ranking
        print('\nAttributes selected are: ', itemgetter(*indexes)(attr_list))
        print('\nAttributes Ranking: ', fit.ranking_)

        train_x_returned = train_x.iloc[:,indexes]
        test_x_returned = test_x.iloc[:,indexes]
    
    # Using factor analysis
    elif method == 'fact':
        fact_anal = FactorAnalysis(n_components=n_col)
        train_x_returned = pd.DataFrame(fact_anal.fit_transform(train_x))
        test_x_returned = pd.DataFrame(fact_anal.transform(test_x))
    
        train_x_returned.columns = [''.join(['feature_',str(i)]) for i in list(train_x_returned.columns)]
        test_x_returned.columns = [''.join(['feature_', str(i)]) for i in list(test_x_returned.columns)]
    
    # Using PCA
    elif method == 'PCA':
        pca_down = PCA(n_components=n_col)
        train_x_returned = pd.DataFrame(pca_down.fit_transform(train_x))
        test_x_returned = pd.DataFrame(pca_down.transform(test_x))
    
        train_x_returned.columns = [''.join(['feature_',str(i)]) for i in list(train_x_returned.columns)]
        test_x_returned.columns = [''.join(['feature_', str(i)]) for i in list(test_x_returned.columns)]
    
    # Returned selected or regenerated features
    return train_x_returned, test_x_returned
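
A sketch of the 'fact' path, assuming pandas DataFrames as inputs; the shapes and random data are illustrative:

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
train_x = pd.DataFrame(rng.normal(size=(80, 10)))
test_x = pd.DataFrame(rng.normal(size=(20, 10)))
train_y = pd.Series(rng.normal(size=80))

tr, te = dimension_reduction(train_x, train_y, test_x, n_col=3, method='fact')
print(list(tr.columns))  # ['feature_0', 'feature_1', 'feature_2']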
Example #22
def initialize(trials, params, config):
    """Make skeleton"""
    # TODO: fast initialization for large dataset
    from sklearn.decomposition import FactorAnalysis

    zdim = params["zdim"]
    xdim = params["xdim"]

    # TODO: use only a subsample of trials?
    y = np.concatenate([trial["y"] for trial in trials], axis=0)
    subsample = np.random.choice(y.shape[0], max(y.shape[0] // 10, 50))
    ydim = y.shape[-1]
    fa = FactorAnalysis(n_components=zdim, random_state=0)
    z = fa.fit_transform(y[subsample, :])
    a = fa.components_
    b = np.log(np.maximum(np.mean(y, axis=0, keepdims=True), config["eps"]))
    noise = np.var(y[subsample, :] - z @ a, ddof=0, axis=0)

    # stupid way of update
    # two cases
    # 1) no key
    # 2) empty value (None)
    if params.get("a") is None:
        params.update(a=a)
    if params.get("b") is None:
        params.update(b=b)
    if params.get("noise") is None:
        params.update(noise=noise)

    for trial in trials:
        length = trial["y"].shape[0]

        if trial.get("mu") is None:
            trial.update(mu=fa.transform(trial["y"]))

        if trial.get("x") is None:
            trial.update(x=np.ones((length, xdim, ydim)))

        trial.update({
            "w": np.zeros((length, zdim)),
            "v": np.zeros((length, zdim))
        })
Example #23
def plot_pca_features(X, sensible_features):

    data = X
    data[data == np.inf] = np.nan
    data = data.dropna()

    sfindex = [data.columns.get_loc(col) for col in sensible_features]
    print(sfindex)

    # normalisation
    for col in data:
        data[col] -= data[col].min()
        data[col] /= data[col].max()

    pca = PCA(n_components=2)
    pca.fit(data.to_numpy().T)
    fa = FactorAnalysis(n_components=2)
    fa.fit(data.to_numpy().T)

    pca_df = pd.DataFrame(pca.transform(data.to_numpy().T))
    pca_df = pca_df.loc[[col in sensible_features for col in data.columns]]
    # pca_df = pca_df.loc[[data.columns.get_loc(col) for col in sensible_features]]
    pca_df.columns = ["PC1", "PC2"]
    pca_df["x_text"] = pca_df[
        "PC1"]  # + 0.1 * np.array([len(f) for f in data.columns])
    pca_df["y_text"] = pca_df["PC2"]
    pca_df["labels"] = data.columns
    exp1, exp2 = pca.explained_variance_ratio_.round(3)

    plt = ggplot() + \
          geom_hline(yintercept=0, color="white", linetype="dashed") + \
          geom_vline(xintercept=0, color="lightgrey", linetype="dashed") + \
          geom_point(data=pca_df, mapping=aes("PC1", "PC2")) + \
          geom_text(data=pca_df,
                    mapping=aes("x_text", "y_text", label="labels"),
                    size=10,
                    color="black") + \
          xlab(f"PCA1 (explained {exp1*100}% variance)") + \
          ylab(f"PCA2 (explained {exp2*100}% variance)") + \
          ggtitle("Features PCA plot")

    return plt
Example #24
    def fa(self, n):
        fa = FactorAnalysis(n_components=n)
        fa.fit(self.sample)
        m = fa.components_
        factors = pd.DataFrame(fa.components_.T)
        draw_heatmap(factors, "Factor analysis")
        m1 = m**2
        m2 = np.sum(m1, axis=1)
        nn = fa.noise_variance_
        for i in range(n):
            print("component " + str(i) + ": " +
                  str((100 * m2[i]) / (np.sum(m2) + np.sum(nn))))

        print('fa_components: ', fa.components_)
        fa = FactorAnalyzer()
        fa.analyze(pd.DataFrame(data=self.sample),
                   n_factors=n,
                   method='minres')  # 'maxres' is not a FactorAnalyzer method; 'minres' is the standard choice
        print(fa.get_factor_variance())
        draw_heatmap(fa.loadings, "Loadings")
Example #25
 def dim_reduction_method(self):
     """
     select dimensionality reduction method
     """
     if self.dim_reduction=='pca':
         return PCA()
     elif self.dim_reduction=='factor-analysis':
         return FactorAnalysis()
     elif self.dim_reduction=='fast-ica':
         return FastICA()
     elif self.dim_reduction=='kernel-pca':
         return KernelPCA()
     elif self.dim_reduction=='sparse-pca':
         return SparsePCA()
     elif self.dim_reduction=='truncated-svd':
         return TruncatedSVD()
     elif self.dim_reduction is not None:
         raise ValueError('%s is not a supported dimensionality reduction method. Valid inputs are: \
                          "pca","factor-analysis","fast-ica","kernel-pca","sparse-pca","truncated-svd".'
                          % (self.dim_reduction))
Example #26
 def varIfItemDel(self):
     '''
     Returns a pandas series containing the scale's variance if each item were deleted.
     
     This method provides what the scale's variance would be if each item were deleted. This is also displayed 
     when the psychometrics() method is called.
     '''
     varIfItemDel = []
     for i in self.data:
         if self._scoring == 'mean':
             a = self._data.drop(i, axis=1).mean(axis=1)
         elif self._scoring == 'sum':
             a = self._data.drop(i, axis=1).sum(axis=1)
         elif self._scoring == 'z': 
             a = (self._data.drop(i, axis=1).sum(axis=1)-self._data.drop(i, axis=1).sum(axis=1).mean())/(
                 self._data.drop(i, axis=1).sum(axis=1).std())
         elif self._scoring == 'factor':
             a = pd.Series([s[0] for s in FactorAnalysis(n_components=1).fit_transform(self._data.drop(i, axis=1))])
         varIfItemDel.append(a.var())
     return pd.Series(varIfItemDel, index = self.data.columns)
Example #27
    def think(self):
        while not self.suicide:
            time.sleep(0.2)
            df = pd.DataFrame(self.xs_not_decomposed).T
            xs = self.scalerx.fit_transform(df.astype(np.float32))
            ser = self.scalery.fit_transform(
                pd.Series(self.ys_not_decomposed).values.astype(
                    np.float32).reshape(-1, 1))

            dec_tmp = FactorAnalysis(self.dec_comps)
            xs = dec_tmp.fit_transform(xs)
            self.decomposer = dec_tmp

            model_tmp = svm.SVR()
            model_tmp.fit(xs, ser.reshape(-1))
            self.model = model_tmp
            count_sleep = np.minimum(60, len(self.ys_not_decomposed))
            while count_sleep > 0 and not self.suicide:
                time.sleep(1)
                count_sleep -= 1
Example #28
def process_dim_reduction(method='pca', n_dim=10):
    """
    Default linear dimensionality reduction method. For each method, return a
    BaseEstimator instance corresponding to the method given as input.
    Attributes
    -------
    method: str, default to 'pca'
        Method used for dimensionality reduction.
        Implemented: 'pca', 'ica', 'fa' (Factor Analysis),
        'nmf' (Non-negative matrix factorisation), 'sparsepca' (Sparse PCA),
        'pls' (Partial Least Squares).

    n_dim: int, default to 10
        Number of domain-specific factors to compute.

    Return values
    -------
    Classifier, i.e. BaseEstimator instance
    """

    if method.lower() == 'pca':
        clf = PCA(n_components=n_dim)

    elif method.lower() == 'ica':
        print('ICA')
        clf = FastICA(n_components=n_dim)

    elif method.lower() == 'fa':
        clf = FactorAnalysis(n_components=n_dim)

    elif method.lower() == 'nmf':
        clf = NMF(n_components=n_dim)

    elif method.lower() == 'sparsepca':
        clf = SparsePCA(n_components=n_dim, alpha=10., tol=1e-4, verbose=10, n_jobs=1)

    elif method.lower() == 'pls':
        clf = PLS(n_components=n_dim)

    else:
        raise NameError('%s is not an implemented method' % (method))

    return clf
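
Usage is a one-liner; 'pls' is the only branch whose estimator needs a target at fit time. A sketch, assuming this module's imports:

import numpy as np

X = np.random.RandomState(0).normal(size=(100, 50))
clf = process_dim_reduction(method='fa', n_dim=10)
X_fa = clf.fit_transform(X)
print(X_fa.shape)  # (100, 10)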
Example #29
    def think(self):
        while not self.suicide:
            if len(self.xs_not_decomposed) >= 1:
                time.sleep(0.2)
                try:
                    df = pd.DataFrame(
                        self.xs_not_decomposed).T.iloc[-self.mem:]
                    df = df.dropna(axis=1)
                    self.features_not_dropped = list(df.columns)
                    scaler_temp = StandardScaler()
                    xs = scaler_temp.fit_transform(df.astype(np.float32))
                    self.scalerx = scaler_temp
                    scaler_temp = StandardScaler()
                    ser = scaler_temp.fit_transform(
                        pd.Series(
                            self.ys_not_decomposed).values[-self.mem:].astype(
                                np.float32).reshape(-1, 1))
                    self.scalery = scaler_temp
                except ValueError:
                    print(traceback.format_exc())
                    pprint(
                        pd.DataFrame(
                            self.xs_not_decomposed).T.iloc[-self.mem:])
                    pprint(
                        pd.Series(self.ys_not_decomposed).values[-self.mem:])
                    time.sleep(5)
                    continue

                dec_tmp = FactorAnalysis(self.dec_comps)
                xs = dec_tmp.fit_transform(xs)
                self.decomposer = dec_tmp

                model_tmp = svm.SVR()
                model_tmp.fit(xs, ser.reshape(-1))
                self.model = model_tmp
                count_sleep = np.minimum(60, len(self.ys_not_decomposed))
                while count_sleep > 0 and not self.suicide:
                    time.sleep(1)
                    count_sleep -= 1
            else:
                time.sleep(5)
Example #30
 def aic_select(self):
     '''
     using AIC to select model
     :return: None
     '''
     self.aic_b = True
     low = float('inf')  # sentinel; a fixed 99999 could be exceeded by real AIC values
     for n in range(1, self.n_components + 1):
         fa = FactorAnalysis(n_components=n)
         fa.fit(self.data)
         dm = 2 * self.data.shape[1] * (n + 1) - n * (n + 1)
         #dm = 2*self.data.shape[1]*n
         aic = -2 * fa.score(self.data) * self.data.shape[0] + dm
         self.aic.append(aic)
         if self.aic[-1] < low:
             low = self.aic[-1]
             self.res_n = n
             self.model = deepcopy(fa)
     # print('------aic-------\n', self.aic)
     # self.res_n = self.aic.index(low) + 1
     print('selected components:', self.res_n, '\n')
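
A note on the criterion above: `FactorAnalysis.score` returns the average log-likelihood per sample, so multiplying by `self.data.shape[0]` recovers the total log-likelihood, and the AIC is -2 ln L + dm, with dm the parameter-count penalty (the particular dm formula is this code's convention). A minimal standalone version, as a sketch:

import numpy as np
from sklearn.decomposition import FactorAnalysis

def fa_aic(data, n):
    fa = FactorAnalysis(n_components=n).fit(data)
    total_loglik = fa.score(data) * data.shape[0]   # score() is the per-sample average
    dm = 2 * data.shape[1] * (n + 1) - n * (n + 1)  # same penalty as aic_select
    return -2 * total_loglik + dm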