def train_eval(self, train_index, test_index, ignore_eval=False):
        normalized_train, normalized_test = normalize_by_train(self.source[train_index], self.source[test_index])

        if self.comp is not None:
            if self.use_scikit is not None:
                if self.use_scikit == 'cca':
                    dim_reduction = CCA(n_components=self.comp)
                else:
                    dim_reduction = PCA(n_components=self.comp)
                # fit the dimensionality reduction (CCA or PCA) on the training data only
                dim_reduction.fit(normalized_train, self.target[train_index])
                # convert source into lower dimensional representation
                normalized_train = dim_reduction.transform(normalized_train)
                normalized_test = dim_reduction.transform(normalized_test)
            else:
                _, wa, _ = tutorial_on_cca(normalized_train, self.target[train_index])
                normalized_train = normalized_train @ wa[:, :self.comp]
                normalized_test = normalized_test @ wa[:, :self.comp]

        model = self.build_model()
        model.fit(normalized_train, self.target[train_index])

        prediction = model.predict(normalized_test)

        # res_df.to_csv(f"{self.out_name}/res1.csv")
        if not ignore_eval:
            return self.evaluate_regression(prediction, test_index)
        else:
            return prediction
Example #2
class CCA_method():
    def __init__(self, n_latents):

        self._n_latents = n_latents
        self._cca = CCA(n_components=n_latents,
                        scale=False,
                        max_iter=10000,
                        tol=1e-8)
        self._Q = np.eye(self._n_latents)

    def fit(self, X, Y):

        # projections U'X, V'Y such that U'X and V'Y are maximally correlated
        self._cca.fit(X, Y)

        # get time-course of projected data
        UX, VY = self._cca.transform(X, Y)

        # learn linear regression VY = UX * Q
        # (Q will be optimal in least-squares sense)
        self._Q = np.linalg.pinv(UX).dot(VY)

    def predict(self, X):

        # transform source data into latent space
        UX = self._cca.transform(X)

        # predict latent activity in target space
        QUX = UX.dot(self._Q)

        # predict observed activity in target space
        Ypred = QUX.dot(self._cca.y_loadings_.T)

        return Ypred
    def compCorrCoefs(self, learningSet, EEGSignals):
        n_components = 1
        cca = CCA(n_components)
        #print(EEGSignals.shape)
        '''
        correlation14 = abs(np.corrcoef(np.mean(learningSet[0:3].T, axis=1),np.mean(EEGSignals.T, axis=1))[0, 1])
        correlation28 = abs(np.corrcoef(np.mean(learningSet[3:6].T, axis=1),np.mean(EEGSignals.T, axis=1))[0, 1])
        correlation8 = abs(np.corrcoef(np.mean(learningSet[6:9].T, axis=1),np.mean(EEGSignals.T, axis=1))[0, 1])

        print(learningSet[0][0],learningSet[1][0],learningSet[2][0])
        for i in range(0,9,3):
            print(abs(np.corrcoef(learningSet[i].T,EEGSignals[int(i/3)].T)[0, 1]),
                  abs(np.corrcoef(learningSet[i+1].T,EEGSignals[int(i/3)].T)[0, 1]),
                  abs(np.corrcoef(learningSet[i+2].T,EEGSignals[int(i/3)].T)[0, 1]))
        print("---")
        '''

        cca.fit(learningSet[0:3].T, EEGSignals.T)
        U, V = cca.transform(learningSet[0:3].T, EEGSignals.T)
        correlation14 = abs(np.corrcoef(U.T, V.T)[0, 1])

        cca.fit(learningSet[3:6].T, EEGSignals.T)
        U, V = cca.transform(learningSet[3:6].T, EEGSignals.T)
        correlation28 = abs(np.corrcoef(U.T, V.T)[0, 1])

        cca.fit(learningSet[6:9].T, EEGSignals.T)
        U, V = cca.transform(learningSet[6:9].T, EEGSignals.T)
        correlation8 = abs(np.corrcoef(U.T, V.T)[0, 1])

        return correlation14, correlation28, correlation8
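# A minimal usage sketch for CCA_method above (synthetic data; the shapes and
# noise level are arbitrary assumptions, not part of the original class):
import numpy as np

X = np.random.randn(200, 10)                      # source view
Y = X @ np.random.randn(10, 8) + 0.1 * np.random.randn(200, 8)
model = CCA_method(n_latents=3)
model.fit(X, Y)                                   # learns the projections and Q
Y_pred = model.predict(X)                         # (200, 8), in the centered target space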
def CCA_transform(train_feature, train_label, test_feature, n_components):
    """ CCA: Canonical Correlation Analysis
    """
    from sklearn.cross_decomposition import CCA
    cca = CCA(n_components).fit(train_feature, train_label)
    
    train_feature_transformed = cca.transform(train_feature)
    test_feature_transformed = cca.transform(test_feature)
    
    return train_feature_transformed, test_feature_transformed
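# A hedged usage sketch for CCA_transform (synthetic data, arbitrary shapes):
import numpy as np

train_X = np.random.randn(100, 20)
train_y = np.random.randn(100, 3)     # 2-D labels, as CCA expects a matrix Y
test_X = np.random.randn(30, 20)
train_low, test_low = CCA_transform(train_X, train_y, test_X, n_components=2)
# train_low.shape == (100, 2), test_low.shape == (30, 2)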
Example #6
def cca(vocab1, vocab2, cca_model=None, dim=300, max_iter=1000, thre=0.5):
    if not cca_model:
        cca_model = CCA(n_components=dim, max_iter=max_iter)
        try:
            cca_model.fit(vocab1, vocab2)
            [cca_vec1, cca_vec2] = cca_model.transform(vocab1, vocab2)
        except Exception:
            print('svd cannot converge, try smaller dim')
            # re-raise so the unbound cca_vec1/cca_vec2 below are never reached
            raise
    else:
        [cca_vec1, cca_vec2] = cca_model.transform(vocab1, vocab2)
    comb_cca = (thre * cca_vec1 + (1 - thre) * cca_vec2)
    return comb_cca, cca_vec1, cca_vec2, cca_model
Example #7
def CalculaCCA(data):
    samplingRate = 500

    data_filtered = butter_bandpass_filter(data, 4.0, 35.0, samplingRate)

    data_notch = butter_bandstop_filter(data_filtered, 58.0, 62.0,
                                        samplingRate, 4)

    numpyBuffer = np.array(data_notch)
    size = np.shape(data_notch)

    freq1 = getReferenceSignals(size[1], 5, samplingRate)
    freq2 = getReferenceSignals(size[1], 7, samplingRate)
    freq3 = getReferenceSignals(size[1], 9, samplingRate)
    freq4 = getReferenceSignals(size[1], 11, samplingRate)

    cca = CCA(n_components=1)

    cca.fit(numpyBuffer.T, freq1.T)
    O1_a, O1_b = cca.transform(numpyBuffer.T, freq1.T)
    result1 = np.corrcoef(O1_a.T, O1_b.T)[0, 1]

    cca.fit(numpyBuffer.T, freq2.T)
    O1_a, O1_b = cca.transform(numpyBuffer.T, freq2.T)
    result2 = np.corrcoef(O1_a.T, O1_b.T)[0, 1]

    cca.fit(numpyBuffer.T, freq3.T)
    O1_a, O1_b = cca.transform(numpyBuffer.T, freq3.T)
    result3 = np.corrcoef(O1_a.T, O1_b.T)[0, 1]

    cca.fit(numpyBuffer.T, freq4.T)
    O1_a, O1_b = cca.transform(numpyBuffer.T, freq4.T)
    result4 = np.corrcoef(O1_a.T, O1_b.T)[0, 1]

    results = [abs(result1), abs(result2), abs(result3), abs(result4)]
    frequencies = [5, 7, 9, 11]
    # the stimulus frequency whose reference signals correlate best wins
    value = frequencies[int(np.argmax(results))]

    return value
Example #8
def compute_mcc(args, config):
    rep1 = pickle.load(
        open(
            os.path.join(args.checkpoints, 'seed{}'.format(args.seed),
                         'test_representations.p'), 'rb'))['rep']
    rep2 = pickle.load(
        open(
            os.path.join(args.checkpoints, 'seed{}'.format(args.second_seed),
                         'test_representations.p'), 'rb'))['rep']

    # cutoff = 50 if args.dataset == 'CIFAR100' else 5
    # ii = np.where(res_cond[0]['lab'] < cutoff)[0]  # in sample points to learn from
    # iinot = np.where(res_cond[0]['lab'] >= cutoff)[0]  # out of sample points
    cutoff = 5000  # half the test dataset
    ii = np.arange(cutoff)
    iinot = np.arange(cutoff, 2 * cutoff)

    mcc_strong_out = mean_corr_coef_out_of_sample(x=rep1[ii],
                                                  y=rep2[ii],
                                                  x_test=rep1[iinot],
                                                  y_test=rep2[iinot])
    mcc_strong_in = (mean_corr_coef(x=rep1[ii], y=rep2[ii]))

    pickle.dump({
        'in': mcc_strong_in,
        'out': mcc_strong_out
    },
                open(
                    os.path.join(
                        args.output,
                        'mcc_strong_{}_{}.p'.format(args.seed,
                                                    args.second_seed)), 'wb'))

    cca_dim = 20
    cca = CCA(n_components=cca_dim)
    cca.fit(rep1[ii], rep2[ii])
    res_out = cca.transform(rep1[iinot], rep2[iinot])
    mcc_weak_out = mean_corr_coef(res_out[0], res_out[1])
    res_in = cca.transform(rep1[ii], rep2[ii])
    mcc_weak_in = mean_corr_coef(res_in[0], res_in[1])

    pickle.dump({
        'in': mcc_weak_in,
        'out': mcc_weak_out
    },
                open(
                    os.path.join(
                        args.output,
                        'mcc_weak_{}_{}.p'.format(args.seed,
                                                  args.second_seed)), 'wb'))
Example #9
def compute_mcc(rep1,
                rep2,
                weak=False,
                cca_dim=None,
                return_cca_outputs=False):
    # print(rep1.shape)
    assert rep1.shape == rep2.shape
    cutoff = rep1.shape[0] // 2
    ii = np.arange(cutoff)
    iinot = np.arange(cutoff, 2 * cutoff)

    # in sample and out of sample mcc
    mcc_strong_out = mean_corr_coef_out_of_sample(x=rep1[ii],
                                                  y=rep2[ii],
                                                  x_test=rep1[iinot],
                                                  y_test=rep2[iinot])
    mcc_strong_in = (mean_corr_coef(x=rep1[ii], y=rep2[ii]))

    # pickle.dump({'in': mcc_strong_in, 'out': mcc_strong_out},
    #             open(os.path.join(args.output, 'mcc_strong_{}_{}.p'.format(args.seed, args.second_seed)), 'wb'))

    print("MCC strong: in {:.4f}, out {:.4f}.".format(mcc_strong_in,
                                                      mcc_strong_out))
    # this computes in and out of sample mcc after applying CCA.
    # NB: if the number of samples is too small with respect to the size of the embedding, this does not work correctly!
    if weak:
        if cca_dim is None:
            cca_dim = rep1.shape[1]
        cca = CCA(n_components=cca_dim)
        cca.fit(
            rep1[ii], rep2[ii]
        )  # this raises an error if the shape[1] of the tensors is smaller than cca_dim
        res_out = cca.transform(rep1[iinot], rep2[iinot])
        # NB: mean_corr_coef still searches for the best permutation of components,
        # which is arguably redundant after CCA; in practice it makes no difference,
        # since the diagonal (already matched) components are the best ones anyway.
        mcc_weak_out = mean_corr_coef(res_out[0], res_out[1])
        res_in = cca.transform(rep1[ii], rep2[ii])
        mcc_weak_in = mean_corr_coef(res_in[0], res_in[1])
        print("MCC weak: in {:.4f}, out {:.4f}.".format(
            mcc_weak_in, mcc_weak_out))
        if return_cca_outputs:
            return mcc_strong_in, mcc_strong_out, mcc_weak_in, mcc_weak_out, res_out[
                0], res_out[1]
        else:
            return mcc_strong_in, mcc_strong_out, mcc_weak_in, mcc_weak_out

    return mcc_strong_in, mcc_strong_out
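# A usage sketch (synthetic data; assumes mean_corr_coef and
# mean_corr_coef_out_of_sample are importable, as the function above does):
import numpy as np

rep1 = np.random.randn(2000, 40)
rep2 = rep1 @ np.random.randn(40, 40)   # a linear scramble of rep1
mccs = compute_mcc(rep1, rep2, weak=True, cca_dim=20)
# -> (mcc_strong_in, mcc_strong_out, mcc_weak_in, mcc_weak_out)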
Example #10
def cca_classify(X_eeg_signals, Yi_frequency_signals):
    cca = CCA(1)
    corr_results = []
    for fr in range(0, Yi_frequency_signals.shape[0]):
        X = X_eeg_signals
        Yi = Yi_frequency_signals[fr, :, :]
        # compute the correlation between X and Yi
        cca.fit(X.T, np.squeeze(Yi).T)
        X_train_r, Yi_train_r = cca.transform(X.T, np.squeeze(Yi).T)
        corr = np.corrcoef(X_train_r[:, 0], Yi_train_r[:, 0])[0, 1]
        # record the correlation of X with this Yi
        corr_results.append(corr)
    if corr_results[np.argmax(corr_results)] > 0.50:
        # correlation threshold
        global index
        global all_data
        classify_result = np.argmax(corr_results) + 1
        print(corr_results)
        index += 1
        # accumulate this trial's data (pd.concat replaces the removed DataFrame.append)
        TT = pd.DataFrame(X_eeg_signals)
        all_data = pd.concat([all_data, np.transpose(TT[1:9])])
        if index == 50:
            # write the accumulated data to disk
            all_data = pd.DataFrame(all_data)
            all_data.to_csv('./j_8_all_data.csv', index=False)
        return classify_result
    else:
        return -1
Example #11
def main(args):
    (training_file, label_file, test_file, u_file, e, c, output_file,
     components) = args
    X_training = load_feat(training_file)
    n = len(X_training)
    U = load_feat(u_file)
    y_training = [float(line.strip()) for line in open(label_file)]

    U = np.asarray(U)
    X_training = np.asarray(X_training)
    #X = preprocessing.normalize(X, norm='l2')
    y_training = np.asarray(y_training)

    X_test = load_feat(test_file)
    # NOTE: test_label is not among the unpacked args; it must exist at module level
    y_test = [float(line.strip()) for line in open(test_label)]
    X_test = np.asarray(X_test)
    X_test[np.isnan(X_test)] = 0.0
    #test_X = preprocessing.normalize(test_X, norm='l2')
    y_test = np.asarray(y_test)
    s = min(len(X_training), len(U))

    cca = CCA(n_components=components, max_iter=50)
    (X_cca, U_cca) = cca.fit_transform(X_training[:s], U[:s])
    X_test_cca = cca.transform(X_test)

    svr = SVR(C=c, epsilon=e, kernel='rbf')
    svr.fit(X_cca, y_training[:s])
    pred = svr.predict(X_test_cca)

    with open(output_file, 'w') as output:
        for p in pred:
            print(p, file=output)
    return
Example #12
    def map_spaces(self, algo, src_mapped_embed=None, trg_mapped_embed=None):

        # (There may be duplicates in self.shared_vocab_src and/or self.shared_vocab_trg,
        # swap_vocab can be used to only inspect one-to-one translations)
        src_embed = self.model_src[self.shared_vocab_src]
        trg_embed = self.model_trg[self.shared_vocab_trg]

        os.makedirs(algo, exist_ok=True)

        if algo == "procrustes":
            logging.info(
                "Calculating Rotation Matrix (Procrustes Problem) and applying it to first embedding"
            )
            #ortho, _ = orthogonal_procrustes(src_embed, trg_embed)
            # does the same as
            u, _, vt = np.linalg.svd(trg_embed.T.dot(src_embed))
            w = vt.T.dot(u.T)
            self.model_src.vectors.dot(w, out=self.model_src.vectors)

        elif algo == "noise":
            logging.info(
                "Calculating Rotation Matrix with noise aware algorithm and applying it to first embedding"
            )
            transform_matrix, alpha, clean_indices, noisy_indices = noise_aware(
                src_embed, trg_embed)
            #write cleaned vocab to file
            with open("vocab.clean.txt", 'w') as v:
                for src, trg in np.asarray(self.shared_vocab)[clean_indices]:
                    v.write("{}\t{}\n".format(src, trg))
            self.model_src.vectors.dot(transform_matrix,
                                       out=self.model_src.vectors)
            logging.info("Percentage of clean indices: {}".format(alpha))

        elif algo == "cca":
            logging.info(
                "Calculating Mapping based on CCA and applying it to both embeddings"
            )
            cca = CCA(n_components=100, max_iter=5000)
            cca.fit(src_embed, trg_embed)
            self.model_src.vectors, self.model_trg.vectors = cca.transform(
                self.model_src.vectors, self.model_trg.vectors)

        elif algo == "gcca":
            logging.info(
                "Calculating Mapping based on GCCA and applying it to both embeddings"
            )
            gcca = GCCA()
            gcca.fit([src_embed, trg_embed])
            transform_l = gcca.transform_as_list(
                (self.model_src.vectors, self.model_trg.vectors))
            # gcca computes positive and negative correlations (eigenvalues), sorted in ascending order.
            # We are only interested in the positive portion
            self.model_src.vectors = transform_l[0][:, 100:]
            self.model_trg.vectors = transform_l[1][:, 100:]

        # save transformed model(s)
        if src_mapped_embed:
            self.model_src.save(os.path.join(algo, src_mapped_embed))
        if trg_mapped_embed:
            self.model_trg.save(os.path.join(algo, trg_mapped_embed))
Example #13
def find_correlation_cca_method1(signal, reference_signals, n_components=2):
    r"""
    Perform canonical correlation analysis (CCA)
    Reference: https://github.com/aaravindravi/Brain-computer-interfaces/blob/master/notebook_12_class_cca.ipynb

    Args:
        signal : ndarray, shape (channel,time)
            Input signal in time domain
        reference_signals : ndarray, shape (len(flick_freq),2*num_harmonics,time)
            Required sinusoidal reference templates corresponding to the flicker frequency for SSVEP classification
        n_components : int, default: 2
            number of components to keep (for sklearn.cross_decomposition.CCA)
    Returns:
        result : array, size: len(flick_freq)
            Probability for each reference signals
    Dependencies:
        CCA : sklearn.cross_decomposition.CCA
        np : numpy package
    """

    cca = CCA(n_components)
    corr = np.zeros(n_components)
    result = np.zeros(reference_signals.shape[0])
    for freq_idx in range(0, reference_signals.shape[0]):
        cca_x = signal.T
        cca_y = np.squeeze(reference_signals[freq_idx, :, :]).T
        cca.fit(cca_x, cca_y)
        a, b = cca.transform(cca_x, cca_y)
        for ind_val in range(0, n_components):
            corr[ind_val] = np.corrcoef(a[:, ind_val], b[:, ind_val])[0, 1]
        result[freq_idx] = np.max(corr)
    return result
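# A synthetic call, for orientation (8 channels, 512 samples, 4 candidate
# frequencies with 2 harmonics each; all numbers are placeholders):
import numpy as np

signal = np.random.randn(8, 512)                # (channel, time)
reference_signals = np.random.randn(4, 4, 512)  # (len(flick_freq), 2*num_harmonics, time)
result = find_correlation_cca_method1(signal, reference_signals)
best_freq_idx = int(np.argmax(result))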
Example #14
def CCA_project_vectors(args,
                        src_dico,
                        tgt_dico,
                        src_full,
                        tgt_full,
                        src_train,
                        tgt_train,
                        NUM_dim=100):

    print('Exporting embeddings...')
    OutputDir = "output/{}-{}/".format(args.src_lang, args.tgt_lang)
    if not os.path.exists(OutputDir):
        os.makedirs(OutputDir)

    cca = CCA(n_components=NUM_dim)
    print("Fitting...")
    cca.fit(src_train, tgt_train)
    print(cca.get_params())
    X_c, Y_c = cca.transform(src_full, tgt_full)
    src_out, tgt_out = utils.norm_embeddings(X_c), utils.norm_embeddings(Y_c)
    print("Exporting embeddings...")
    utils.export_embeddings(src_dico[0], src_out,
                            OutputDir + 'projected.{}'.format(args.src_lang))
    utils.export_embeddings(tgt_dico[0], tgt_out,
                            OutputDir + 'projected.{}'.format(args.tgt_lang))
    print("work over!")
Example #15
def visualize_with_cca(X, y, title):
    cca = CCA(n_components=2)
    cca.fit(X, y)
    X_cca = cca.transform(X)
    Xax = X_cca[:, 0]
    Yax = X_cca[:, 1]
    labels = (y > 0).astype(int)
    cdict = {0: 'red', 1: 'green'}
    labl = {0: 'home_loss', 1: 'home_win'}
    marker = {0: '*', 1: 'o'}
    alpha = {0: .3, 1: .5}

    fig, ax = plt.subplots(figsize=(7, 5))
    fig.patch.set_facecolor('white')

    for l in np.unique(labels):
        ix = np.where(labels == l)
        ax.scatter(Xax[ix],
                   Yax[ix],
                   c=cdict[l],
                   s=40,
                   label=labl[l],
                   marker=marker[l],
                   alpha=alpha[l])

    plt.xlabel("First Principal Component", fontsize=14)
    plt.ylabel("Second Principal Component", fontsize=14)
    plt.legend()
    plt.title(title)
    plt.show()
Example #16
class CCAFusion(TransformerMixin, BaseEstimator):
    def __init__(self, c1, c2):
        self.pipes = [c1, c2]
        self.max_iter = 500
        self.cca = None

    def fit(self, X, y=None, **fit_params):
        C = []
        n_components = None
        for pipe in self.pipes:
            c = pipe.fit_transform(X, y)
            if hasattr(c, 'toarray'):
                c = c.toarray()
            if n_components is None:
                n_components = c.shape[1]
            else:
                n_components = min(c.shape[1], n_components)
            C += [c]
        self.cca = CCA(n_components=n_components, max_iter=self.max_iter)
        self.cca.fit(*C)
        return self

    def transform(self, X, y=None):
        C = []
        for pipe in self.pipes:
            c = pipe.transform(X)  # sklearn transform() takes X only
            if hasattr(c, 'toarray'):
                c = c.toarray()
            C += [c]
        return self.cca.transform(*C)[0]

    def fit_transform(self, X, y=None, **fit_params):
        return self.fit(X, y, **fit_params).transform(X, y)
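# One possible wiring of CCAFusion (hedged sketch; the two stand-in pipelines
# below are arbitrary choices, not from the original code):
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

X = np.random.randn(100, 30)
fusion = CCAFusion(StandardScaler(), PCA(n_components=5))
X_fused = fusion.fit_transform(X)   # CCA-projected view of the first pipeline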
def cca(m1, m2, preprocessing=None):
    """
    Use CCA to decompose two views and plot result.

    Params:
        m1, m2: Every column is an example and every row a feature.
        preprocessing: If None, no pre-processing; if 'orth', center to zero mean and perform PCA.
    """
    # Adjust means to be 0 and perform PCA.
    if preprocessing == "orth":
        # Zero means.
        m1 -= np.mean(m1, axis=1, keepdims=True)

        # print("m1=", np.sum(m1, axis=1))
        m2 -= np.mean(m2, axis=1, keepdims=True)

        # (PCA step not implemented in this snippet.)
    cca = CCA(n_components=3, max_iter=100)
    cca.fit(m1.T, m2.T)

    X_c = cca.transform(m1.T)

    fig, ax = plt.subplots()
    ax.set_title('Fig.2.(c)')
    # ax.set_color_cycle(['blue', 'green', 'red'])
    ax.set_prop_cycle('color', ['blue', 'red', 'green'])
    ax.plot(X_c)
    # ax.plot(Y_c)
    plt.show()
Example #19
    def fit_cca(self, outfile=''):

        # fits linear CCA constraint and replaces pretrained name embeddings with CCA transformed embeddings

        self.load_embeddings()
        self.extract_pretrained_prototype_embeddings()

        items, vectors = zip(
            *[(k, v) for k, v in self.pretrained_prototype_embeddings.items()
              if k in self.exemplar_to_concept])
        concept_embs = Reach(vectors, items)

        train_vectors = []
        for x in items:
            train_vectors.append(self.train_embeddings[x])
        train_vectors = Reach.normalize(train_vectors)

        cca = CCA(n_components=self.train_embeddings.size, max_iter=10000)
        cca.fit(train_vectors, concept_embs.norm_vectors)

        # transform all name embeddings using the CCA mapping
        all_name_embeddings = deepcopy(self.pretrained_name_embeddings)
        items = [x for _, x in sorted(all_name_embeddings.indices.items())]
        projected_name_embeddings = cca.transform(
            all_name_embeddings.norm_vectors)
        new_name_embeddings = Reach(projected_name_embeddings, items)

        self.pretrained_name_embeddings = new_name_embeddings
        self.load_embeddings()

        if outfile:
            with open('{}_cca.p'.format(outfile), 'wb') as f:
                pickle.dump(cca, f)
Example #20
def project_vectors(origForeignVecFile,
                    origEnVecFile,
                    subsetEnVecFile,
                    subsetForeignVecFile,
                    outputEnFile,
                    outputForeignFile,
                    NUMCC=40):
    '''
    Feed the dictionary vectors into CCA to obtain projection vectors, then build bilingual embeddings.
    :param origForeignVecFile: foreign-language vector matrix
    :param origEnVecFile: English vector matrix
    :param subsetEnVecFile: English vectors of the dictionary entries
    :param subsetForeignVecFile: foreign-language vectors of the dictionary entries
    :param outputEnFile: projected English word vectors
    :param outputForeignFile: projected foreign-language word vectors
    :param NUMCC: number of canonical components
    '''
    '''Read the data, strip the leading word token, and keep only the vectors'''

    tmp = np.loadtxt(origEnVecFile, dtype=str, delimiter=' ')
    origEnVecs = tmp[:, 1:].astype(float)
    tmp2 = np.loadtxt(origForeignVecFile, dtype=str, delimiter=' ')
    origForeignVecs = tmp2[:, 1:].astype(float)
    tmp3 = np.loadtxt(subsetEnVecFile, dtype=str, delimiter=' ')
    subsetEnVecs = tmp3[:, 1:].astype(float)
    tmp4 = np.loadtxt(subsetForeignVecFile, dtype=str, delimiter=' ')
    subsetForeignVecs = tmp4[:, 1:].astype(float)
    '''Preprocessing: L2-normalize each row'''
    #origEnVecs=preprocessing.normalize(origEnVecs)
    #origForeignVecs=preprocessing.normalize(origForeignVecs)
    subsetEnVecs = preprocessing.normalize(subsetEnVecs)
    subsetForeignVecs = preprocessing.normalize(subsetForeignVecs)
    '''Train the CCA'''
    '''
    num = [NUMCC]
    regs = [1e-1]
    cca = rcca.CCACrossValidate(regs=regs,numCCs=num,kernelcca=False,cutoff=0.1)
    cca.train([subsetEnVecs, subsetForeignVecs])
    '''
    cca = CCA(n_components=NUMCC)
    cca.fit(subsetEnVecs, subsetForeignVecs)
    print(cca.get_params())
    X_c, Y_c = cca.transform(origEnVecs, origForeignVecs)
    '''Generate the projected vectors'''
    #tmpOutput = rcca._listdot([d.T for d in [origEnVecs, origForeignVecs]], cca.ws)
    origEnVecsProjected = preprocessing.normalize(X_c)
    #origEnVecsProjected = preprocessing.scale(tmpOutput[0])
    origEnVecsProjected = np.column_stack(
        (tmp[:, :1], origEnVecsProjected.astype(str)))
    origForeignVecsProjected = preprocessing.normalize(Y_c)
    #origForeignVecsProjected = preprocessing.scale(tmpOutput[1])
    origForeignVecsProjected = np.column_stack(
        (tmp2[:, :1], origForeignVecsProjected.astype(str)))
    np.savetxt(outputEnFile, origEnVecsProjected, fmt="%s", delimiter=' ')
    np.savetxt(outputForeignFile,
               origForeignVecsProjected,
               fmt="%s",
               delimiter=' ')
    print("work over!")
Example #21
def cca_analysis(X, Y, X_dev, Y_dev):
	cca = CCA(n_components=1, max_iter=2000)
	cca.fit(X, Y)
	X_dev_c, Y_dev_c = cca.transform(X_dev, Y_dev)

	corrcoef = np.corrcoef(X_dev_c.T, Y_dev_c.T)[0,1]

	return corrcoef
Example #22
def cca_d_h(d_var, h_var, components_num):
	cca = CCA(n_components=components_num, scale=True, max_iter=2000)
	cca.fit(d_var, h_var)
	d_c, h_c = cca.transform(d_var, h_var)
	# recover the loading maps by ordinary least squares (normal equations)
	ah = np.linalg.inv((h_var.T).dot(h_var)).dot(h_var.T).dot(h_c)
	ad = np.linalg.inv((d_var.T).dot(d_var)).dot(d_var.T).dot(d_c)

	return d_c, h_c, ad, ah
Example #23
def _CCA(data, graph, n):
    cca = CCA(n_components=n)
    adjacencyMatrix = createAffinityMatrix(graph)
    cca.fit(data, adjacencyMatrix)
    X_c, Y_c = cca.transform(data, adjacencyMatrix)

    writeCSV(X_c, 'CCA_X')
    writeCSV(Y_c, 'CCA_Y')
Example #24
def cca_score(X, Y):
    # Calculate the CCA score of the first component pair
    ca = CCA(n_components=1)
    ca.fit(X, Y)
    Xc, Yc = ca.transform(X, Y)
    score = np.corrcoef(Xc[:, 0], Yc[:, 0])

    return score[0][1]
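# For instance (synthetic data, correlated by construction):
import numpy as np

X = np.random.randn(50, 5)
Y = X[:, :2] + 0.1 * np.random.randn(50, 2)
print(cca_score(X, Y))   # close to 1.0 here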
Example #25
def mean_canonical_correlations(scaled_features, df):

    cca = CCA(1)
    cca.fit(scaled_features, df.iloc[:, -1])
    X_c, Y_c = cca.transform(scaled_features, df.iloc[:, -1])

    # note: this averages the canonical scores of X; it is not itself a correlation
    mean_canonical_correlation = np.mean(X_c)
    return mean_canonical_correlation
Example #26
    def CCA_analysis(self, recordings_index, trial_index, brain_area):
        path = self.all_data_path + '/' + self.selected_recordings[
            recordings_index]

        #Prepare rates
        rates = self.convert_one_population_to_rates(recordings_index,
                                                     trial_index, brain_area).T

        #Prepare behavior
        trials = np.load(path + '/' + 'trials.intervals.npy')
        #Behavioral data
        mot_timestamps = np.load(path + '/' + 'face.timestamps.npy')
        mot_energy = np.load(path + '/' + 'face.motionEnergy.npy')

        beh_range = np.bitwise_and(
            mot_timestamps[:, 1] >= trials[trial_index][0],
            mot_timestamps[:, 1] <= trials[trial_index][1])
        #print(np.where(beh_range==True))
        #print(mot_timestamps[beh_range])
        beh_subset = mot_energy[beh_range]

        beh_subset_aligned = self.align_rate_and_behavior(
            beh_subset, rates[:, 0]).reshape(-1, 1)

        from sklearn.cross_decomposition import CCA

        cca = CCA(n_components=2)
        cca.fit(rates, beh_subset_aligned)
        X_train_r, Y_train_r = cca.transform(rates, beh_subset_aligned)
        print(X_train_r.shape)
        print(Y_train_r.shape)
        plt.scatter(X_train_r[:, 0],
                    Y_train_r[:],
                    label="train",
                    marker="*",
                    c="b",
                    s=50)

        plt.show()

        plt.scatter(X_train_r[:, 1],
                    Y_train_r[:],
                    label="train",
                    marker="*",
                    c="b",
                    s=50)

        plt.show()
        #rates_test=self.convert_one_population_to_rates(recordings_index,2,brain_area).T

        #X_test_r, Y_test_r = cca.transform(rates_test, beh_subset_aligned)
        #plt.scatter(X_test_r[:, 0], Y_test_r[:], label="test",
        #marker="^", c="b", s=50)

        #plt.show()

        print(beh_subset_aligned.shape)
        print(rates.shape)
Example #27
    def solve(self):
        v1, v2 = self.list_view
        clf = CCA(n_components=self.m_rank)
        clf = clf.fit(v1.T, v2.T)
        self.model = clf

        X_c, Y_c = clf.transform(v1.T, v2.T)
        self.list_projection = [X_c, Y_c]
        self.list_U = [clf.x_rotations_, clf.y_rotations_]
Example #28
def cca(src_dict, tgt_dict, bi_dict, dim=250):

    #with open('../data/seed_embedding.dat', 'wb') as f:
    #    pickle.dump(x, f)
    #    pickle.dump(y, f)
    cca_model = CCA(n_components=dim)
    src_mat, tgt_mat = make_training_matrices(src_dict, tgt_dict, bi_dict)
    cca_model.fit(src_mat, tgt_mat)
    return cca_model.transform(src_dict.embed, tgt_dict.embed)
Example #29
    def fit_cal(self,F):             
        
        # PlsDemo
        PD_m=PlsDemo(self.x_m_cal,self.y_cal,self.max_folds,self.max_components)       
        W_m,T_m,P_m,coefs_B_m,RMSECV_m,min_RMSECV_m,comp_best=PD_m.pls_fit(F) 
#         print "comp_best =", comp_best
        
        cca_m = CCA(comp_best)
        cca_m.fit(self.x_m_cal, self.y_cal)
        X_score, Y_score = cca_m.transform(self.x_m_cal, self.y_cal)
        W_m = cca_m.x_rotations_      
        x_m_cal_mean = np.mean(self.x_m_cal, axis=0)
        x_m_cal_center = np.subtract(self.x_m_cal, x_m_cal_mean)
        L_m = np.dot(x_m_cal_center, W_m)
        
        cca_s = CCA(comp_best)
        cca_s.fit(self.x_s_std, self.y_std)
        X_score, Y_score = cca_s.transform(self.x_s_std, self.y_std)
        W_s = cca_s.x_rotations_
        x_s_std_mean = np.mean(self.x_s_std, axis=0)
        x_s_std_center = np.subtract(self.x_s_std, x_s_std_mean)
        L_s = np.dot(x_s_std_center, W_s)
        
#         print "L.shape =", np.shape(L_m),np.shape(L_s)
        
        F_1 = np.linalg.lstsq(L_s, L_m)[0]
        F_2 = np.linalg.lstsq(L_m, self.x_m_std)[0]

        coefficient = np.dot(np.dot(np.dot(W_s, F_1), F_2), coefs_B_m)
        
        #RMSEC
#         xs_std_center=np.subtract(self.x_s_std, self.x_s_std.mean(axis=0))
# #         xs_std_center=np.subtract(self.x_s_std, self.x_m_cal.mean(axis=0))
#         y_predict=np.dot(xs_std_center, coefficient)+self.y_cal.mean(axis=0)
#         RMSEC=np.sqrt(np.sum(np.square(np.subtract(y_predict,self.y_std)),axis=0)/self.y_std.shape[0])
#         print "RMSEC =", RMSEC
        
        xs_cal_center=np.subtract(self.x_s_cal, self.x_s_cal.mean(axis=0))
#         xs_cal_center=np.subtract(self.x_s_cal, self.x_m_cal.mean(axis=0))
        y_predict=np.dot(xs_cal_center, coefficient)+self.y_cal.mean(axis=0)
        RMSEC=np.sqrt(np.sum(np.square(np.subtract(y_predict,self.y_cal)),axis=0)/self.y_cal.shape[0])

        return coefficient, comp_best, RMSEC
Example #30
    def cca_feature(self, data, parameter_list):
        cca = CCA(1)
        result = []
        for i in range(parameter_list[-1]):
            reference_signals = self.reference_signals(parameter_list[1][i], parameter_list[2], parameter_list[3])
            cca.fit(data.T, reference_signals.T)
            x, y = cca.transform(data.T, np.squeeze(reference_signals).T)
            corr = np.corrcoef(x[:, 0], y[:, 0])[0, 1]
            result.append(corr)
        return result
Example #31
    def predict(self):
        if self.k_CCA is None:
            if self.verbose: print('Going to compute best components first')
            self.determine_CCA_components()

        # self.cca_predictions, _ = self.ccaCV.predict(self.features, self.ccaCV.ws)
        cca = CCA(n_components=self.k_CCA)
        cca.fit(self.features[:6000], self.graph[:6000])
        self.cca_predictions = cca.transform(self.features)
        if self.verbose:
            print('Produced predictions')
            print('Size of predictions {}'.format(self.cca_predictions.shape))
Example #32
def load_mutation_data():
    if os.path.isfile(mutation_pickle_path):
        pickle_load = pickle.load(open(mutation_pickle_path, 'rb'))
        return pickle_load[0], pickle_load[1]

    gene_effect_df = pd.read_csv(
        r"C:\Users\Nitay\Documents\courses\roded-seminar\Achilles_gene_dependency.csv"
    )
    mutations_df = pd.read_csv(
        r"C:\Users\Nitay\Documents\courses\roded-seminar\CCLE_mutations.csv")

    mutations_df = mutations_df[mutations_df["isDeleterious"].fillna(False)]

    gene_effect_df = gene_effect_df.set_index("Unnamed: 0").T
    gene_effect_df.columns.names = ["cell_line"]
    gene_effect_df.index.names = ["gene"]

    def clean_gene_name(name):
        return name.split("(")[0].strip()

    clean_gene_effect_df = gene_effect_df.rename(index=clean_gene_name)

    common_genes = set(clean_gene_effect_df.index).intersection(
        set(mutations_df['Hugo_Symbol']))
    mutations_cell_line = set(mutations_df['DepMap_ID'])

    new_mutations_df = pd.DataFrame(np.zeros(
        (len(common_genes), len(mutations_cell_line))),
                                    columns=mutations_cell_line,
                                    index=common_genes)
    for i, row in mutations_df.iterrows():
        cell_line = row["DepMap_ID"]
        gene = row['Hugo_Symbol']
        if gene in common_genes and cell_line in mutations_cell_line:
            new_mutations_df.loc[gene, cell_line] = 1

    filtered_gene_effect_df = clean_gene_effect_df.filter(items=common_genes,
                                                          axis=0)
    filtered_mutations_df = new_mutations_df.loc[new_mutations_df.sum(1) > 0,
                                                 new_mutations_df.sum(0) > 0]

    from sklearn.cross_decomposition import CCA
    Y = filtered_gene_effect_df.values
    X = filtered_mutations_df.values
    cca = CCA(n_components=10)
    cca.fit(X, Y)
    X_c = cca.transform(X)
    filtered_mutations_df = pd.DataFrame(X_c)

    pickle.dump([filtered_gene_effect_df, filtered_mutations_df],
                open(mutation_pickle_path, "wb"))

    return filtered_gene_effect_df, filtered_mutations_df
Example #33
def canonical_approach():
    from sklearn.cross_decomposition import CCA

    (X, Y), cities = pull_xy_data()

    cca = CCA(n_components=2)
    cca.fit(X, Y)

    ccaX, ccaY = cca.transform(X, Y)

    plot(ccaX, cities, ["CC01", "CC02", "CC03"], 1)

    return "OK What Now?"
Example #34
def cca_for_ssvep(input_data, sampling_rate, compared_frequencies):

    # TODO: Strick input checks, exceptions and avoid crashing and processing errors

    # Pre-allocate SSVEP signals matrix to be compared with original EEG recordings using CCA
    number_time_points = input_data.shape[1]
    number_harmonics = 2
    cca_base_signal_matrix = [[] for loop_var in compared_frequencies]

    # Pre-allocate output: one correlation coefficient (Rho) for each target SSVEP frequency
    # Note: Row 1 is for default Rho scores, Row 2 is for the Rho scores After cca transformation
    cca_rho_values = numpy.zeros([1, len(compared_frequencies)], dtype='float')

    # For each target frequency, fill Y matrix with sine and cosine signals for every harmonic
    for loop_frequencies in range(len(compared_frequencies)):

        # For this current SSVEP frequency, pre-allocate the harmonics matrix
        cca_base_signal_matrix[loop_frequencies] = numpy.zeros([number_harmonics * 2, number_time_points])
        time_points_count = numpy.arange(number_time_points, dtype='float')
        time_points_count = time_points_count / sampling_rate

        # Generate sine and cosine reference signals, for every harmonic
        for loop_harmonics in range(number_harmonics):

            # Compute the reference signals for current harmonic
            base_constant = 2 * numpy.pi * (loop_harmonics + 1) * compared_frequencies[loop_frequencies]
            base_sine_signal = numpy.sin((base_constant * time_points_count))
            base_cosine_signal = numpy.cos((base_constant * time_points_count))

            # Copy signals back to reference matrix
            base_position = loop_harmonics + 1
            sine_position = (2 * (base_position - 1) + 1)
            cosine_position = 2 * base_position
            cca_base_signal_matrix[loop_frequencies][sine_position - 1, :] = base_sine_signal
            cca_base_signal_matrix[loop_frequencies][cosine_position - 1, :] = base_cosine_signal

        # After the loop, extract the y_matrix from reference matrix for current SSVEP frequency
        y_matrix = cca_base_signal_matrix[loop_frequencies]

        # Create a CCA object, project both (transposed) views, and compute Rho
        cca_object = CCA(n_components=number_harmonics)
        cca_object.fit(numpy.transpose(input_data), numpy.transpose(y_matrix))
        values_x, values_y = cca_object.transform(numpy.transpose(input_data),
                                                  numpy.transpose(y_matrix))
        # Rho: correlation of the first pair of canonical variates
        cca_rho_values[0, loop_frequencies] = numpy.corrcoef(values_x[:, 0],
                                                             values_y[:, 0])[0, 1]

    # After loop return and exit
    return cca_rho_values
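# A hedged synthetic call (channel count, duration, rate, and frequency list
# are placeholders, not values from the original code):
import numpy

eeg = numpy.random.randn(8, 1000)         # (channels, time points)
freqs = [8.0, 10.0, 12.0]
rhos = cca_for_ssvep(eeg, 250.0, freqs)
detected = freqs[int(numpy.argmax(rhos))]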
def mainExec(name_file1, name_file2, features1, features2):
    '''
    Given two files with names, and two files with features, perform the Stacked Auxiliary Embedding method
    on two matrices. The first one is the concatenation of both feature lists, the second matrix contains tf-idf weighted
    representations of the training sentences of Flickr30kEntities. The intermediate CCA model is written to disk,
    as well as the final model
    :param name_file1
    :param name_file2
    :param features1
    :param features2
    '''
    print "Creating vocabulary"
    voc = readVocabulary()
    print "Generating document vectors"
    occurrenceVectors, idf = createOccurrenceVectors(voc)
    print "Weighing vectors"
    weightedVectors = weight_tfidf(occurrenceVectors, idf)
    print "creating feature dictionary"
    featuresDict = createFeatDict(weightedVectors.keys(), name_file1, name_file2, features1, features2 )
    imagematrix, sentenceMatrix = createSnippetMatrices(featuresDict, weightedVectors)

    print "Modelling cca"
    cca = CCA(n_components = 128)
    cca = fitCCA(cca, imagematrix, sentenceMatrix, "ccasnippetmodel.p")

    trainingimages, trainingsentences = createTrainMatrices(voc)
    trans_img, trans_sent = cca.transform(trainingimages, trainingsentences)

    nn_img = nearest_neighbor(trainingimages)
    nn_sent = nearest_neighbor(trainingsentences)
    print "NN Image: " + str(nn_img)
    print "NN Sentence: " + str(nn_sent)
    augmented_imgs, augmented_sentences = augmentMatrices(nn_img, nn_sent, trainingimages, trainingsentences, trans_img,
                                                          trans_sent)
    print "Fitting augmented CCA model"
    augmentedcca = CCA(n_components=96)
    augmentedcca = fitCCA(augmentedcca, augmented_imgs, augmented_sentences, "augmentedcca.p")
    print "Writing the model to disk"

    resultingModel = StackedCCAModel(nn_img, nn_sent, cca, augmentedcca)

    pickle.dump(resultingModel, open("completestackedCCAModel.p", 'wb'))
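# The PLS2 fragment below begins mid-example; the following setup is an
# assumption, adapted from the scikit-learn cross-decomposition example this
# snippet appears to come from:
import numpy as np
from sklearn.cross_decomposition import PLSRegression, CCA

n = 1000
p = 10
q = 3
X = np.random.normal(size=n * p).reshape((n, p))
B = np.array([[1, 2] + [0] * (p - 2)] * q).T
# each column of Y is 1*X1 + 2*X2 plus noise and an offset of 5 (next line)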
Y = np.dot(X, B) + np.random.normal(size=n * q).reshape((n, q)) + 5

pls2 = PLSRegression(n_components=3)
pls2.fit(X, Y)
print("True B (such that: Y = XB + Err)")
print(B)
# compare pls2.coef_ with B
print("Estimated B")
print(np.round(pls2.coef_, 1))
pls2.predict(X)

# PLS regression, with univariate response, a.k.a. PLS1

n = 1000
p = 10
X = np.random.normal(size=n * p).reshape((n, p))
y = X[:, 0] + 2 * X[:, 1] + np.random.normal(size=n * 1) + 5
pls1 = PLSRegression(n_components=3)
pls1.fit(X, y)
# note that the number of components exceeds 1 (the dimension of y)
print("Estimated betas")
print(np.round(pls1.coef_, 1))

# #############################################################################
# CCA (PLS mode B with symmetric deflation)
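# X_train/Y_train are not defined in this fragment; a stand-in split, assumed
# in the spirit of the same scikit-learn example, could be:
n = 500
l1 = np.random.normal(size=n)
l2 = np.random.normal(size=n)
latents = np.array([l1, l1, l2, l2]).T
X = latents + np.random.normal(size=4 * n).reshape((n, 4))
Y = latents + np.random.normal(size=4 * n).reshape((n, 4))
X_train, Y_train = X[: n // 2], Y[: n // 2]
X_test, Y_test = X[n // 2:], Y[n // 2:]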

cca = CCA(n_components=2)
cca.fit(X_train, Y_train)
X_train_r, Y_train_r = cca.transform(X_train, Y_train)
X_test_r, Y_test_r = cca.transform(X_test, Y_test)
def mainExec(name_file, features):
    '''
    Based on a list of image names and image features, learn a CCA model based on Stacked Auxiliary Embedding and
    save this model to disk.
    :param name_file
    :param features
    :return:
    '''
    print "Creating vocabulary"
    voc = readVocabulary()
    print "Generating document vectors"
    occurrenceVectors, idf = createOccurrenceVectors(voc)
    print "Weighing vectors"
    weightedVectors = weight_tfidf(occurrenceVectors, idf)

    sentenceMatrix = []
    imagematrix = []
    print "Creating matrices"
    currentSentence = 0
    for i in weightedVectors.keys():
        if isLargeEnough(i):
            currentSentence += 1
            print "current Sentence: " + str(currentSentence)
            for j in range(len(weightedVectors[i])):
                weightedVectors[i][j] = float(weightedVectors[i][j])
            if currentSentence == 1:
                sentenceMatrix = weightedVectors[i]
                imagematrix = getImage(i,name_file, features)
            elif currentSentence ==2:
                sentenceMatrix = np.concatenate(([sentenceMatrix], [weightedVectors[i]]), axis = 0)
                imagematrix = np.concatenate(([imagematrix], [getImage(i,name_file, features)]), axis = 0)
            else:
                sentenceMatrix = np.concatenate((sentenceMatrix, [weightedVectors[i]]), axis = 0)
                imagematrix = np.concatenate((imagematrix, [getImage(i,name_file, features)]), axis = 0)

    print "Modelling cca"
    cca = CCA(n_components=128)
    cca.fit(sentenceMatrix, imagematrix)
    pickle.dump(cca, open("ccasnippetmodel.p",'w+'))

    idf = np.zeros(len(voc))
    trainingimages = []
    trainingsentences = []
    dp = getDataProvider('flickr30k')
    currentPair = 0
    for pair in dp.sampleImageSentencePair():
        currentPair += 1
        if currentPair % 100 == 0:
            print("Current pair: " + str(currentPair))
        img = pair['image']['feat']
        trainingimages.append(img)
        sentence = getFullSentence(pair)
        for i in range(len(sentence)):
            if sentence[i] > 0:
                idf[i] += 1
        trainingsentences.append(sentence)
    for i in range(len(trainingsentences)):
        trainingsentences[i] = trainingsentences[i]*idf

    trans_img, trans_sent = cca.transform(trainingimages, trainingsentences)
    nn_img = nearest_neighbor(trainingimages)
    nn_sent = nearest_neighbor(trainingsentences)

    augmented_imgs = []
    augmented_sentences = []
    for i in range(len(trans_img)):
        # np.concatenate keeps the augmented copy (list.extend returns None)
        augm_img = np.concatenate([trainingimages[i], phi(3000, nn_img, trans_img[i])])
        augmented_imgs.append(augm_img)

    for i in range(len(trans_sent)):
        augm_sent = np.concatenate([trainingsentences[i], phi(3000, nn_sent, trans_sent[i])])
        augmented_sentences.append(augm_sent)

    augmentedcca = CCA(n_components=96)
    augmentedcca.fit(augmented_sentences, augmented_imgs)

    pickle.dump(augmentedcca, open("augmentedcca.p", 'wb'))
Example #38
    if not os.path.isdir(dir_name):
        os.makedirs(dir_name)

    OutputLog().set_path(dir_name)
    OutputLog().set_verbosity(configuration.output_parameters['verbosity'])

    data_config = ConfigParser.ConfigParser()
    data_config.read(data_set_config)
    data_parameters = ConfigSectionMap("dataset_parameters", data_config)

    # construct data set
    data_set = Container().create(data_parameters['name'], data_parameters)

    cca_model = CCA(n_components=top, scale=True, copy=False)

    train_transformed_x, train_transformed_y = cca_model.fit_transform(data_set.trainset[0], data_set.trainset[1])
    test_transformed_x, test_transformed_y = cca_model.transform(data_set.testset[0], data_set.testset[1])

    OutputLog().write('test results:')
    correlations, trace_correlation, var, x_test, y_test, test_best_layer = TraceCorrelationTester(
        data_set.testset[0],
        data_set.testset[1], top).test(IdentityTransformer(), configuration.hyper_parameters)

    OutputLog().write('train results:')
    correlations, train_trace_correlation, var, x_train, y_train, train_best_layer = TraceCorrelationTester(
        data_set.trainset[0],
        data_set.trainset[1], top).test(IdentityTransformer(), configuration.hyper_parameters)

    OutputLog().write('\nTest results : \n')

    configuration.hyper_parameters.print_parameters(OutputLog())
Example #39
__author__ = 'cancobanoglu'
'''
 CCA is Canonical Correlation Analysis
'''

print(__doc__)

from sklearn.cross_decomposition import CCA

X = [[0., 0., 1.], [1., 0., 0.], [2., 2., 2.], [3., 5., 4.]]
Y = [[0.1, -0.2], [0.9, 1.1], [6.2, 5.9], [11.9, 12.3]]

cca = CCA(n_components=1)
cca.fit(X, Y)
# fit() returns the fitted estimator:
# CCA(copy=True, max_iter=500, n_components=1, scale=True, tol=1e-06)

X_c, Y_c = cca.transform(X, Y)
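# Follow-up: the first canonical correlation can be read directly off the scores.
import numpy as np

print(np.corrcoef(X_c[:, 0], Y_c[:, 0])[0, 1])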
Example #40
	# session.execute("DROP TABLE IF EXISTS Tweet")
	rows = session.execute("SELECT text, hashtags FROM Tweet limit 1000")
	X, Y = [], []
	for row in rows:
		X.append(row.text)
		Y.append([x.lower() for x in row.hashtags])
	vectorizer = StemmedTfidfVectorizer(min_df=1, stop_words='english', decode_error='ignore')
	# print(vectorizer)

	X = vectorizer.fit_transform(X).toarray()
	# print '40', X
	# print type(X)
	Y_indicator = LabelBinarizer().fit(Y).transform(Y)
	cca = CCA(n_components = 100, max_iter=10)
	cca.fit(X, Y_indicator)
	X = cca.transform(X)
	# print '45', X
	# print type(X)
	classif = OneVsRestClassifier(SVC(kernel='linear'))
	classif.fit(X, Y)

	for row in rows:
		# row = rows[0]
		# print vectorizer.transform([row.text]).toarray()
		# print cca.predict(vectorizer.transform([row.text]).toarray())
		transformed = vectorizer.transform([row.text]).toarray()
		# print '55', transformed
		ccad = cca.transform(transformed)
		# print '57', ccad
		predicts = classif.predict(ccad)
		if len(predicts) > 0:
Example #41
# check type of array
#print(np.dtype(data_selection))

# force dtype = float32
data_selection = data_selection.astype(np.float32, copy=False)

# complete cases
data_selection = data_selection[~np.isnan(data_selection).any(axis=1)]
data_selection = data_selection[np.isfinite(data_selection).any(axis=1)]

# target variable / covariates
y = data_selection[:,0:3]
x = data_selection[:,4:]

# split test-train (uses sklearn's old cross_validation module; newer versions
# provide train_test_split in sklearn.model_selection)
x_train, x_test, y_train, y_test = cross_validation.train_test_split(x, y, test_size=0.2, random_state=0)


cca = CCA(n_components=1,scale=True)
cca.fit(x_train, y_train)
#CCA(copy=True, max_iter=500, n_components=1, scale=True, tol=1e-06),
X_train_r, Y_train_r = cca.transform(x_train,y_train)
X_test_r, Y_test_r = cca.transform(x_test, y_test)

print(type(X_train_r))
print(np.shape(X_train_r))
print(np.shape(Y_train_r))
print(np.shape(x))

print(np.corrcoef(X_train_r[:,0],Y_train_r[:,0]))
print(np.corrcoef(X_test_r[:,0],Y_test_r[:,0]))
Example #42
    for i in range (5):
        plt.plot(nComponents,plsRegScores[i,:],lw=3)

    plt.xlim(1,np.amax(nComponents))
    plt.title('PLS Regression accuracy')
    plt.xlabel('Number of components')
    plt.ylabel('accuracy')
    plt.legend (['LR','LDA','GNB','Linear SVM','rbf SVM'],loc='lower right')
    plt.grid(True)

if (0):
    #%% Canonical Correlation Analysis
    nComponents = np.arange(1,nClasses +1)
    cca = CCA(n_components=nClasses)
    cca.fit(Xtrain,Ytrain)
    XtrainT = cca.transform(Xtrain)
    XtestT = cca.transform(Xtest)
    ccaScores = np.zeros((5, len(nComponents)))
    for i,n in enumerate(nComponents):
        ccaScores[:,i] = util.classify(XtrainT[:,0:n],XtestT[:,0:n],labelsTrain,labelsTest)
    
    cca = CCA(n_components=3)
    cca.fit(Xtrain,Ytrain)
    xt = cca.transform(Xtrain)
    fig = plt.figure()
    util.plotData(fig,xt,labelsTrain,classColors)
    plt.title('First 3 components of projected data')
    

    #%% Plot accuracies for CCA
    plt.figure()
Example #43
class CCA_Model:
    def __init__(self,n_components):
        self.n_components = n_components
        self.cca = CCA(n_components=n_components)
        self.ntop  = 10


    def learn_model(self, X_chanel, Y_chanel, Y_Distinct=None):
        """
        :param X_chanel: array-like for the X channel
        :param Y_chanel: array-like for the Y channel
        :return:
        """
        print("Start learning...")

        self.x_dim = len(X_chanel[0])
        self.y_dim = len(Y_chanel[0])
        self.cca.fit(X_chanel, Y_chanel)
        if Y_Distinct is None:
            self.X_transform, self.Y_transform = self.cca.transform(X_chanel, Y_chanel)
        else:
            self.X_transform, self.Y_transform = self.cca.transform(X_chanel, Y_Distinct)

        print("Learning completed")


    def get_bet_match_index_transform_x2y(self, x_transform):
        shape = self.Y_transform.shape
        scores = np.ndarray(shape[0], dtype=float)
        for i in range(shape[0]):
            scores[i] = np.dot(self.Y_transform[i], x_transform)
            #scores[i] = entropy(x_transform, self.Y_transform[i])

        indices = (-scores).argsort()[:self.ntop]
        return [indices, scores[indices]]


    def get_bet_match_index_transform_y2x(self, y_transform):
        shape = self.X_transform.shape
        scores = np.ndarray(shape[0], dtype=float)
        for i in range(shape[0]):
            scores[i] = np.dot(self.X_transform[i], y_transform)
            #scores[i] = entropy(y_transform, self.X_transform[i])
        indices = (-scores).argsort()[:self.ntop]

        return [indices, scores[indices]]

    def get_best_match_cross_indices_x2y(self, x_inputs):
        x_transformes = self.cca.transform(x_inputs)
        results = []
        for x_transform in x_transformes:
            results.append(self.get_bet_match_index_transform_x2y(x_transform))
        return results

    def get_best_match_cross_indices_y2x(self, y_inputs):
        # transform() needs an X argument; a zero vector is passed as a dummy
        _, y_transformes = self.cca.transform([[0 for i in range(self.x_dim)]], y_inputs)
        results = []
        for y_transform in y_transformes:
            results.append(self.get_bet_match_index_transform_y2x(y_transform))
        return results