Example #1
def FastICA_data(test_x, train_x, params):
    print('centering data ...')
    center_test, center_train = center_data(test_x, train_x)

    print('applying ICA to data ...')
    components = int(params['components'])
    ica = FastICA(n_components=components, whiten=True).fit(center_train)
    ica_train_x = ica.transform(center_train)
    ica_test_x = ica.transform(center_test)
    return ica_test_x, ica_train_x
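A minimal usage sketch for the function above. `center_data` is a helper from the original project and is not shown here, so a hypothetical stand-in (subtracting the training mean) is defined for illustration; the shapes and the params dict are likewise illustrative, and note the snippet passes whiten=True, which newer scikit-learn releases may warn about.

# Usage sketch (illustrative only); center_data below is a stand-in for the project's own helper.
import numpy as np
from sklearn.decomposition import FastICA  # import assumed by the snippet above

def center_data(test_x, train_x):
    mean = train_x.mean(axis=0)
    return test_x - mean, train_x - mean

rng = np.random.RandomState(0)
train_x = rng.rand(200, 20)
test_x = rng.rand(50, 20)

ica_test_x, ica_train_x = FastICA_data(test_x, train_x, {'components': 10})
print(ica_train_x.shape, ica_test_x.shape)   # (200, 10) (50, 10)
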
Example #2
File: ica.py Project: kuntzer/sclas
class ICA(method.Method):
	
	def __init__(self, params):
		self.params = params
		self.ica = FastICA(**params)
	
	def __str__(self):
		return "FastICA"
		
	def train(self, data):
		"""
		Train the FastICA on the whitened data
		
		:param data: whitened data, ready to use
		"""
		self.ica.fit(data)
	
	def encode(self, data):
		"""
		Encodes the ready to use data
		
		:returns: encoded data with dimension n_components
		"""
		return self.ica.transform(data)
	
	def decode(self, components):
		"""
		Decode the data to return whitened reconstructed data
		
		:returns: reconstructed data
		"""
		return self.ica.inverse_transform(components)
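The train/encode/decode methods above are thin wrappers around FastICA.fit, transform and inverse_transform. A minimal round-trip sketch of the same calls on synthetic data (shapes are illustrative; the project's method.Method base class is not needed for this):

import numpy as np
from sklearn.decomposition import FastICA

rng = np.random.RandomState(0)
data = rng.standard_normal((500, 8))          # stand-in for whitened input data

ica = FastICA(n_components=4, random_state=0)
ica.fit(data)                                 # what train() does
encoded = ica.transform(data)                 # what encode() does -> (500, 4)
decoded = ica.inverse_transform(encoded)      # what decode() does -> (500, 8)
print(encoded.shape, decoded.shape)
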
Example #3
File: part2.py Project: rbaxter1/CS7641
 def best_ica_nba(self):
     dh = data_helper()
     X_train, X_test, y_train, y_test = dh.get_nba_data()
     
     scl = RobustScaler()
     X_train_scl = scl.fit_transform(X_train)
     X_test_scl = scl.transform(X_test)
     
     ica = FastICA(n_components=X_train_scl.shape[1])
     X_train_transformed = ica.fit_transform(X_train_scl)
     X_test_transformed = ica.transform(X_test_scl)
     
     ## top 2
     kurt = kurtosis(X_train_transformed)
     i = kurt.argsort()[::-1]
     X_train_transformed_sorted = X_train_transformed[:, i]
     X_train_transformed = X_train_transformed_sorted[:,0:2]
     
     kurt = kurtosis(X_test_transformed)
     i = kurt.argsort()[::-1]
     X_test_transformed_sorted = X_test_transformed[:, i]
     X_test_transformed = X_test_transformed_sorted[:,0:2]
     
     # save
     filename = './' + self.save_dir + '/nba_ica_x_train.txt'
     pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/nba_ica_x_test.txt'
     pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/nba_ica_y_train.txt'
     pd.DataFrame(y_train).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/nba_ica_y_test.txt'
     pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
Example #4
def main(mode):
    path = "/local/attale00/extracted_pascal__4__Multi-PIE"
    path_ea = path + "/color128/"

    allLabelFiles = utils.getAllFiles("/local/attale00/a_labels")

    labeledImages = [i[0:16] + ".png" for i in allLabelFiles]

    # labs=utils.parseLabelFiles(path+'/Multi-PIE/labels','mouth',labeledImages,cutoffSeq='.png',suffix='_face0.labels')
    labs = utils.parseLabelFiles(
        "/local/attale00/a_labels", "mouth", labeledImages, cutoffSeq=".png", suffix="_face0.labels"
    )

    testSet = fg.dataContainer(labs)
    roi = (50, 74, 96, 160)
    X = fg.getAllImagesFlat(path_ea, testSet.fileNames, (128, 256), roi=roi)

    # perform ICA
    if mode not in ["s", "v"]:
        ica = FastICA(n_components=100, whiten=True)
        ica.fit(X)
        meanI = np.mean(X, axis=0)
        X1 = X - meanI
        data = ica.transform(X1)
        filters = ica.components_

    elif mode in ["s", "v"]:
        W = np.load("/home/attale00/Desktop/classifiers/ica/filter1.npy")
        m = np.load("/home/attale00/Desktop/classifiers/ica/meanI1.npy")
        X1 = X - m
        data = np.dot(X1, W.T)

    for i in range(len(testSet.data)):
        testSet.data[i].extend(data[i, :])

    strel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))

    # fg.getHogFeature(testSet,roi,path=path_ea,ending='.png',extraMask = None,orientations = 3, cells_per_block=(6,2),maskFromAlpha=False)
    # fg.getColorHistogram(testSet,roi,path=path_ea,ending='.png',colorspace='lab',bins=10)
    testSet.targetNum = list(map(utils.mapMouthLabels2Two, testSet.target))

    rf = classifierUtils.standardRF(max_features=np.sqrt(len(testSet.data[0])), min_split=5, max_depth=40)
    if mode in ["s", "v"]:
        print "Classifying with loaded classifier"
        classifierUtils.classifyWithOld(
            path, testSet, mode, clfPath="/home/attale00/Desktop/classifiers/ica/rf128ICA_1"
        )
    elif mode in ["c"]:
        print "cross validation of data"
        print "Scores"
        # print classifierUtils.standardCrossvalidation(rf,testSet,n_jobs=5)
        # _cvDissect(testSet,rf)
        classifierUtils.dissectedCV(rf, testSet)
        print "----"

    elif mode in ["save"]:
        print "saving new classifier"
        _saveRF(testSet)
    else:
        print "not doing anything"
Example #5
def align(movie_data, options, args, lrh):
  print('pICA (scikit-learn)')
  nvoxel = movie_data.shape[0]
  nTR    = movie_data.shape[1]
  nsubjs = movie_data.shape[2]

  align_algo = args.align_algo
  nfeature   = args.nfeature
  randseed    = args.randseed
  if not os.path.exists(options['working_path']):
    os.makedirs(options['working_path'])

  # zscore the data
  bX = np.zeros((nsubjs*nTR,nvoxel))
  for m in range(nsubjs):
    for t in range(nTR):
      bX[nTR*m+t,:] = stats.zscore(movie_data[:,t,m].T ,axis=0, ddof=1)
  del movie_data
 
  np.random.seed(randseed)
  A = np.mat(np.random.random((nfeature,nfeature)))

  ica = FastICA(n_components= nfeature, max_iter=500,w_init=A,random_state=randseed)
  ica.fit(bX.T)
  R = ica.transform(bX.T)

  niter = 10  
  # initialization when first time run the algorithm
  np.savez_compressed(options['working_path']+align_algo+'_'+lrh+'_'+str(niter)+'.npz',\
                                R = R,  niter=niter)
  return niter
Example #6
def ica(tx, ty, rx, ry):
    compressor = ICA(whiten=True)  # for some people, whiten needs to be off
    compressor.fit(tx, y=ty)
    newtx = compressor.transform(tx)
    newrx = compressor.transform(rx)
    em(newtx, ty, newrx, ry, add="wICAtr", times=10)
    km(newtx, ty, newrx, ry, add="wICAtr", times=10)
    nn(newtx, ty, newrx, ry, add="wICAtr")
Example #7
File: lazy.py Project: ctw/eeglcf
def fastica(eeg_data):
    """
    Sample function to apply `FastICA`_ to the EEG data.

    Parameters
    ----------
    eeg_data : array
        EEG data in a CxTxE array. With C the number of channels, T the number
        of time samples and E the number of events.

    Returns
    -------
    ica : ICA object
        Trained `FastICA`_ object.
    ica_data : array
        EEG projected data in a CxTxE array. With C the number of components, T
        the number of time samples and E the number of events.
    """

    # Dimension shapes
    ch_len = eeg_data.shape[ch_dim]
    t_len = eeg_data.shape[t_dim]
    ev_len = eeg_data.shape[ev_dim]

    # -------------------------------------------------------------------------
    # 1. Fit the FastICA model

    # We need to collapse time and events dimensions
    coll_data = eeg_data.transpose([t_dim, ev_dim, ch_dim])\
        .reshape([t_len*ev_len, ch_len])

    # Fit model
    ica = FastICA()
    ica.fit(coll_data)

    # Normalize ICs to unit norm
    k = np.linalg.norm(ica.mixing_, axis=0)  # per-column L2 norms
    ica.mixing_ /= k
    ica.components_[:] = (ica.components_.T * k).T

    # -------------------------------------------------------------------------
    # 2. Transform data

    # Project data
    bss_data = ica.transform(coll_data)

    # Adjust shape and dimensions back to "eeg_data" shape
    ic_len = bss_data.shape[1]
    bss_data = np.reshape(bss_data, [ev_len, t_len, ic_len])
    new_order = [0, 0, 0]
    # TODO: Check the following order
    new_order[ev_dim] = 0
    new_order[ch_dim] = 2
    new_order[t_dim] = 1
    bss_data = bss_data.transpose(new_order)

    # End
    return ica, bss_data
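A usage sketch for fastica above. ch_dim, t_dim and ev_dim are module-level constants in the original file; the values below (channels, time, events) and the data shape are assumptions for illustration.

import numpy as np
from sklearn.decomposition import FastICA  # import the original module already has

# Assumed dimension indices; in the original module these are defined elsewhere.
ch_dim, t_dim, ev_dim = 0, 1, 2

rng = np.random.RandomState(0)
eeg_data = rng.standard_normal((8, 100, 20))   # 8 channels, 100 samples, 20 events

ica, ica_data = fastica(eeg_data)
print(ica_data.shape)                          # components x time x events, e.g. (8, 100, 20)
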
Example #8
def ICA(model_data, components = None, transform_data = None):
    t0 = time()
    ica = FastICA(n_components=components)
    if transform_data is None:
        projection = ica.fit_transform(model_data)
    else:
        ica.fit(model_data)
        projection = ica.transform(transform_data)
    print("ICA Time: %0.3f" % (time() - t0))
    return projection
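A short usage sketch of the wrapper above, exercising both the fit_transform path and the separate transform path (synthetic data, illustrative sizes):

import numpy as np
from time import time                      # imports the snippet above relies on
from sklearn.decomposition import FastICA

rng = np.random.RandomState(0)
model_data = rng.rand(300, 12)
new_data = rng.rand(60, 12)

proj_all = ICA(model_data, components=5)                            # fit_transform path
proj_new = ICA(model_data, components=5, transform_data=new_data)   # fit + transform path
print(proj_all.shape, proj_new.shape)      # (300, 5) (60, 5)
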
Example #9
def var_test_ica(flux_arr_orig, exposure_list, wavelengths, low_n=3, hi_n=100, n_step=1, show_plots=False,
                    show_summary_plot=False, save_summary_plot=True, test_ind=7, real_time_progress=False,
                    idstr=None):
    start_ind = np.min(np.nonzero(flux_arr_orig[test_ind]))
    end_ind = np.max(np.nonzero(flux_arr_orig[test_ind]))

    perf_table = Table(names=["n", "avg_diff2", "max_diff_scaled"], dtype=["i4", "f4", "f4"])
    if hi_n > flux_arr_orig.shape[0]-1:
        hi_n = flux_arr_orig.shape[0]-1

    for n in range(low_n, hi_n, n_step):
        ica = FastICA(n_components = n, whiten=True, max_iter=750, random_state=1234975)
        test_arr = flux_arr_orig[test_ind].copy()

        flux_arr = np.vstack([flux_arr_orig[:test_ind], flux_arr_orig[test_ind+1:]])
        ica_flux_arr = flux_arr.copy()  #keep back one for testing
        ica.fit(ica_flux_arr)

        ica_trans = ica.transform(test_arr.copy().reshape(1, -1))
        ica_rev = ica.inverse_transform(ica_trans.copy())

        avg_diff2 = np.ma.sum(np.ma.power(test_arr-ica_rev[0],2)) / (end_ind-start_ind)
        max_diff_scaled = np.ma.max(np.ma.abs(test_arr-ica_rev[0])) / (end_ind-start_ind)
        perf_table.add_row([n, avg_diff2, max_diff_scaled])

        if real_time_progress:
            print "n: {:4d}, avg (diff^2): {:0.5f}, scaled (max diff): {:0.5f}".format(n, avg_diff2, max_diff_scaled)

        if show_plots:
            plt.plot(wavelengths, test_arr)
            plt.plot(wavelengths, ica_rev[0])
            plt.plot(wavelengths, test_arr-ica_rev[0])

            plt.legend(['orig', 'ica', 'orig-ica'])
            plt.xlim((wavelengths[start_ind], wavelengths[end_ind]))

            plt.title("n={}, avg (diff^2)={}".format(n, avg_diff2))
            plt.tight_layout()
            plt.show()
            plt.close()

    if show_summary_plot or save_summary_plot:
        plt.plot(perf_table['n'], perf_table['avg_diff2'])
        plt.plot(perf_table['n'], perf_table['max_diff_scaled'])
        plt.title("performance")
        plt.tight_layout()
        if show_summary_plot:
            plt.show()
        if save_summary_plot:
            if idstr is None:
                idstr = random.randint(1000000, 9999999)
            plt.savefig("ica_performance_{}.png".format(idstr))
        plt.close()

    return perf_table
Example #10
def ICA_reduction(posture, trainblock, componenet):
    currentdirectory = os.getcwd()  # get the directory.
    parentdirectory = os.path.abspath(currentdirectory + "/../..")  # Get the parent directory(2 levels up)
    path = parentdirectory + '/Output Files/E5-Dimensionality Reduction/posture-' + str(posture) + '/TrainBlock-' + str(trainblock)
    if not os.path.exists(path):
        os.makedirs(path)
    i_user = 1
    block = 1
    AUC = []
    while i_user <= 31:
        while block <= 6:
            train_data = np.genfromtxt("../../Output Files/E3-Genuine Impostor data per user per posture/posture-"+str(posture)+"/User-"+str(i_user)+"/1-"+str(i_user)+"-"+str(posture)+"-"+str(trainblock)+"-GI.csv", dtype=float, delimiter=",")
            test_data = np.genfromtxt("../../Output Files/E3-Genuine Impostor data per user per posture/posture-"+str(posture)+"/User-"+str(i_user)+"/1-"+str(i_user)+"-"+str(posture)+"-"+str(block)+"-GI.csv", dtype=float, delimiter=",")

            target_train = np.ones(len(train_data))
            row = 0
            while row < len(train_data):
                if np.any(train_data[row, 0:3] != [1, i_user, posture]):
                    target_train[row] = 0
                row += 1

            row = 0
            target_test = np.ones(len(test_data))
            while row < len(test_data):
                if np.any(test_data[row, 0:3] != [1, i_user, posture]):
                    target_test[row] = 0
                row += 1

            sample_train = train_data[:, [3,4,5,6,7,9,11,12,13,14,15,16,17]]
            sample_test = test_data[:, [3,4,5,6,7,9,11,12,13,14,15,16,17]]
            scaler = preprocessing.MinMaxScaler().fit(sample_train)
            sample_train_scaled = scaler.transform(sample_train)
            sample_test_scaled = scaler.transform(sample_test)

            ica = FastICA(n_components=componenet, max_iter=150)
            sample_train_ica = ica.fit(sample_train_scaled).transform(sample_train_scaled)
            sample_test_ica = ica.transform(sample_test_scaled)

            clf = ExtraTreesClassifier(n_estimators=100)
            clf.fit(sample_train_ica, target_train)

            prediction = clf.predict(sample_test_ica)
            auc = metrics.roc_auc_score(target_test, prediction)
            AUC.append(auc)

            block += 1

        block = 1
        i_user += 1
    print(AUC)
    AUC = np.array(AUC)
    AUC = AUC.reshape(31, 6)
    np.savetxt("../../Output Files/E5-Dimensionality Reduction/posture-"+str(posture)+"/TrainBlock-"+str(trainblock)+"/ICA-"+str(componenet)+"-Component.csv", AUC, delimiter=",")
Example #11
def main(mode):
    path = '/local/attale00/AFLW_ALL/'
    path_ea = '/local/attale00/AFLW_cropped/mouth_img_error/'
#    
    fileNames = utils.getAllFiles(path_ea);

    
    labs=utils.parseLabelFiles(path+'/labels/labels','mouth_opening',fileNames,cutoffSeq='.png',suffix='_face0.labels')
    
    testSet = fg.dataContainer(labs)
    components = 150
    roi=None
    X=fg.getAllImagesFlat(path_ea,testSet.fileNames,(40,120),roi=roi)
#    X=fg.getAllImagesFlat(path_ea,testSet.fileNames,(120,40),roi=roi,resizeFactor = .5)
# 
# perform ICA
    if mode not in ['s','v']:
        ica = FastICA(n_components=components,whiten=True)
        ica.fit(X)
        meanI=np.mean(X,axis=0)
        X1=X-meanI
        data=ica.transform(X1)
        filters=ica.components_
        
    elif mode in ['s','v']:
        W=np.load('/home/attale00/Desktop/classifiers/patches/filterMP1.npy')
        m=np.load('/home/attale00/Desktop/classifiers/patches/meanIMP1.npy')
        X1=X-m
        data=np.dot(X1,W.T)    
    
    for i in range(len(fileNames)):
            testSet.data[i].extend(data[i,:])
            
    print('feature vector length: {}'.format(len(testSet.data[0])))

    testSet.targetNum = list(map(utils.mapMouthLabels2Two, testSet.target))
    rf = classifierUtils.standardRF(max_features=np.sqrt(len(testSet.data[0])), min_split=13, max_depth=40)
    #rf = svm.NuSVC()
    #rf = linear_model.SGDClassifier(loss='perceptron', eta0=1, learning_rate='constant', penalty=None)
    if mode in ['s','v']:
        print('Classifying with loaded classifier')
        _classifyWithOld(path, testSet, mode)
    elif mode in ['c']:
        print('cross validation of data')
        rValues = classifierUtils.dissectedCV(rf, testSet)
        pickle.dump(rValues, open('errorpatch_ica', 'wb'))
    elif mode in ['save']:
        print('saving new classifier')
        _saveRF(testSet, rf, filters=filters, meanI=meanI)
    else:
        print('not doing anything')
Example #12
def test_fit_transform():
    """Test FastICA.fit_transform"""
    rng = np.random.RandomState(0)
    X = rng.random_sample((100, 10))
    for whiten, n_components in [[True, 5], [False, 10]]:

        ica = FastICA(n_components=5, whiten=whiten, random_state=0)
        Xt = ica.fit_transform(X)
        assert_equal(ica.components_.shape, (n_components, 10))
        assert_equal(Xt.shape, (100, n_components))

        ica = FastICA(n_components=5, whiten=whiten, random_state=0)
        ica.fit(X)
        assert_equal(ica.components_.shape, (n_components, 10))
        Xt2 = ica.transform(X)

        assert_array_almost_equal(Xt, Xt2)
Example #13
def fastICA(X):

    from sklearn.decomposition import FastICA  # the FastICA library
    n, p = X.shape

    M = np.mean(X, axis=0)
    M_est = M

    X2 = X - M

    decomposer = FastICA(n_components=p)
    decomposer.fit(X2)

    A_est = decomposer.mixing_
    W_est = np.linalg.inv(A_est)
    S_est = decomposer.transform(X2)

    return S_est, W_est, M_est
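A usage sketch checking the conventions of the return values above: with A_est = inv(W_est), the observed data should be recovered as S_est @ A_est.T + M_est (synthetic non-Gaussian sources, illustrative sizes):

import numpy as np

rng = np.random.RandomState(0)
S = rng.laplace(size=(1000, 3))            # non-Gaussian sources
A = rng.rand(3, 3)                         # mixing matrix
X = S @ A + 5.0                            # observed, shifted mixtures

S_est, W_est, M_est = fastICA(X)
A_est = np.linalg.inv(W_est)
X_rec = S_est @ A_est.T + M_est            # reconstruction from the estimates
print(np.allclose(X, X_rec))               # True (up to numerical error)
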
Example #14
def test_fit_transform():
    # Test FastICA.fit_transform
    rng = np.random.RandomState(0)
    X = rng.random_sample((100, 10))
    for whiten, n_components in [[True, 5], [False, None]]:
        n_components_ = (n_components if n_components is not None else
                         X.shape[1])

        ica = FastICA(n_components=n_components, whiten=whiten, random_state=0)
        Xt = ica.fit_transform(X)
        assert_equal(ica.components_.shape, (n_components_, 10))
        assert_equal(Xt.shape, (100, n_components_))

        ica = FastICA(n_components=n_components, whiten=whiten, random_state=0)
        ica.fit(X)
        assert_equal(ica.components_.shape, (n_components_, 10))
        Xt2 = ica.transform(X)

        assert_array_almost_equal(Xt, Xt2)
Example #15
def compute_PCA_ICA_NMF(n_components=5):
    spec_mean = spectra.mean(0)

    # PCA: use randomized PCA for speed
    pca = RandomizedPCA(n_components - 1)
    pca.fit(spectra)
    pca_comp = np.vstack([spec_mean,
                          pca.components_])

    # ICA treats sequential observations as related.  Because of this, we need
    # to fit with the transpose of the spectra
    ica = FastICA(n_components - 1)
    ica.fit(spectra.T)
    ica_comp = np.vstack([spec_mean,
                          ica.transform(spectra.T).T])

    # NMF requires all elements of the input to be greater than zero
    spectra[spectra < 0] = 0
    nmf = NMF(n_components)
    nmf.fit(spectra)
    nmf_comp = nmf.components_

    return pca_comp, ica_comp, nmf_comp
Example #16
    print('//===========================pca==========================')
    pca = PCA(n)
    traindata_pca = pca.fit_transform(traindata)
    testdata_pca = pca.transform(testdata)
    Faceidentifier(traindata_pca,trainlabel,testdata_pca,testlabel)

    print('//===========================sfa==========================')
    sfa = sfa.SFA()
    traindata_sfa = sfa.fit_transform(traindata.T,conponents =n).T
    testdata_sfa = sfa.transform(testdata.T).T
    Faceidentifier(traindata_sfa,trainlabel,testdata_sfa,testlabel)
    
    print('//===========================fastica==========================')
    fastica = FastICA(n)
    traindata_fastica = fastica.fit_transform(traindata)
    testdata_fastica = fastica.transform(testdata)
    Faceidentifier(traindata_fastica,trainlabel,testdata_fastica,testlabel)
    
    for i in range(0,9):
        if i == 0:
            b = 0.1
        elif i == 1:
            b = 0.2
        elif i == 2:
            b = 0.5
        elif i == 3:
            b = 0.8
        elif i == 4:
            b = 1
        elif i == 5:
            b = 2
Example #17
         'data': lda_transformed_data
     }
 for i in range(2, 9):
     estimators['untransformed_{}'.format(i)] = {
         'est': KMeans(n_clusters=i),
         'clusters': i,
         'data': data
     }
 pca_transformed_data = pca.transform(data)
 for i in range(2, 9):
     estimators['pca_transformed_{}'.format(i)] = {
         'est': KMeans(n_clusters=i),
         'clusters': i,
         'data': pca_transformed_data
     }
 ica_transformed_data = ica.transform(data)
 print('kurt', kurtosis(ica_transformed_data))
 with open('outputs/ica_transformed_{}.csv'.format(SAMPLE_SIZE), 'w') as ica_data:
     writer = csv.writer(ica_data)
     for x, temp_y in zip(ica_transformed_data, y):
         x = list(x)
         x.append(temp_y)
         writer.writerow(x)
 for i in range(2, 9):
     estimators['ica_transformed_{}'.format(i)] = {
         'est': KMeans(n_clusters=i),
         'clusters': i,
         'data': ica_transformed_data
     }
 for i in range(8):
     print('Random Proj')
Example #18
testSet = fg.dataContainer(labs)


roi=(0,37,0,115)

 
X=fg.getAllImagesFlat(path_ea,testSet.fileNames,(37,115),roi=roi)
 
#        
# perform ICA
ica = FastICA(n_components=100,whiten=True)
ica.fit(X)
meanI=np.mean(X,axis=0)
X1=X-meanI
data=ica.transform(X1)
filters=ica.components_
for i in range(len(fileNames)):
    testSet.data[i].extend(data[i,:])

testSet.targetNum = list(map(utils.mapMouthLabels2Two, testSet.target))



###############################################################################
# define a pipeline combining a text feature extractor with a simple
# classifier
clf = RandomForestClassifier()

parameters = {'n_estimators': range(10, 40,20),
                  'max_depth': range(5, 40,5),
Example #19
# select 2500 random epochs from the training data
random.seed(3)
train_eeg_matrix = np.vstack(train_data[random.sample(range(n_train_epochs), 2500), :, :56])

# Compute ICA
ica = FastICA(n_components=train_eeg_matrix.shape[1], random_state=9)
# train on part of the data
ica.fit(train_eeg_matrix)
del train_eeg_matrix
log('ICA computed')

# 2d matrix with all training data we have
data_matrix = np.vstack(train_data[:, :, :])

train_data = ica.transform(data_matrix[:, :56])                    # transform channels to sources data
train_data = np.concatenate((train_data, data_matrix[:, 56:]), 1)  # append additional features
train_data = np.array_split(train_data, n_train_epochs)            # split to epochs
del data_matrix
log('train source data retrieved')


test_data, _ = load_data(folder_name, 'test')
test_data = np.array(get_windows(test_data, window_start, window_size))
n_test_epochs = test_data.shape[0]

# 2d matrix with all test data we have
data_matrix = np.vstack(test_data[:, :, :])

test_data = ica.transform(data_matrix[:, :56])                   # transform channels to sources data
test_data = np.concatenate((test_data, data_matrix[:, 56:]), 1)  # append additional features
Example #20
 def ica(self, whiten = True):
     ica = FastICA(n_components = 5, whiten = whiten)
     ica.fit(self.train)
     self.train = ica.transform(self.train)
     self.test = ica.transform(self.test)
Example #21
def task1c(wine):
    data = pd.read_csv("winequality-" + wine + ".csv", sep=';')
    X = data.drop('quality', axis=1)
    y = data['quality']
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    pca = PCA()
    pca.fit(X)
    pca_data = pca.transform(X)
    per_var = np.round(pca.explained_variance_ratio_ * 100, decimals=1)
    labels = ['label' + str(x) for x in range(1, len(per_var) + 1)]
    plt.bar(x=range(1, 12), height=per_var, tick_label=labels)
    plt.ylabel('Percentage of Explained Variance')
    plt.xlabel('Principal Component')
    plt.title(wine + ' wine PCA Variance')
    plt.show()
    pca_df = pd.DataFrame(pca_data, columns=labels)

    plt.scatter(pca_df.label1, pca_df.label2, edgecolors='black')
    plt.xlabel('label1 - {0}%'.format(per_var[0]))
    plt.ylabel('label2 - {0}%'.format(per_var[1]))
    plt.title(wine + ' wine PCA label1 and label2')
    plt.show()
    plt.scatter(pca_df.label1, pca_df.label3, edgecolors='black')
    plt.xlabel('label1 - {0}%'.format(per_var[0]))
    plt.ylabel('label3 - {0}%'.format(per_var[2]))
    plt.title(wine + ' wine PCA label1 and label3')
    plt.show()
    plt.scatter(pca_df.label2, pca_df.label3, edgecolors='black')
    plt.xlabel('label2 - {0}%'.format(per_var[1]))
    plt.ylabel('label3 - {0}%'.format(per_var[2]))
    plt.title(wine + ' wine PCA label2 and label3')
    plt.show()

    print("correlation between label1 and quality: " +
          str(pca_df.label1.corr(y)))
    print("correlation between label2 and quality: " +
          str(pca_df.label2.corr(y)))
    print("correlation between label3 and quality: " +
          str(pca_df.label3.corr(y)))

    plt.clf()

    ica = FastICA(n_components=3)
    ica.fit(X)
    ica_data = ica.transform(X)
    labels = ['label' + str(x) for x in range(1, 4)]
    ica_df = pd.DataFrame(ica_data, columns=labels)

    plt.scatter(ica_df.label1, ica_df.label2, edgecolors='black')
    plt.xlabel('label1')
    plt.ylabel('label2')
    plt.title(wine + ' wine ICA label1 and label2')
    plt.show()
    plt.scatter(ica_df.label1, ica_df.label3, edgecolors='black')
    plt.xlabel('label1')
    plt.ylabel('label3')
    plt.title(wine + ' wine ICA label1 and label3')
    plt.show()
    plt.scatter(ica_df.label2, ica_df.label3, edgecolors='black')
    plt.xlabel('label2')
    plt.ylabel('label3')
    plt.title(wine + ' wine ICA label2 and label3')
    plt.show()

    print("correlation between label1 and quality: " +
          str(ica_df.label1.corr(y)))
    print("correlation between label2 and quality: " +
          str(ica_df.label2.corr(y)))
    print("correlation between label3 and quality: " +
          str(ica_df.label3.corr(y)))

    plt.clf()
    X = data.drop('quality', axis=1)
    y = data['quality']
    nmf = NMF(n_components=3, max_iter=10000)
    nmf.fit(X)
    nmf_data = nmf.transform(X)
    nmf_df = pd.DataFrame(nmf_data, columns=labels)

    plt.scatter(nmf_df.label1, nmf_df.label2, edgecolors='black')
    plt.xlabel('label1')
    plt.ylabel('label2')
    plt.title(wine + ' wine NMF label1 and label2')
    plt.show()
    plt.scatter(nmf_df.label1, nmf_df.label3, edgecolors='black')
    plt.xlabel('label1')
    plt.ylabel('label3')
    plt.title(wine + ' wine NMF label1 and label3')
    plt.show()
    plt.scatter(nmf_df.label2, nmf_df.label3, edgecolors='black')
    plt.xlabel('label2')
    plt.ylabel('label3')
    plt.title(wine + ' wine NMF label2 and label3')
    plt.show()

    print("correlation between label1 and quality: " +
          str(nmf_df.label1.corr(y)))
    print("correlation between label2 and quality: " +
          str(nmf_df.label2.corr(y)))
    print("correlation between label3 and quality: " +
          str(nmf_df.label3.corr(y)))

    print("end")
Example #22
# shape
print('Shape train: {}\nShape test: {}'.format(train.shape, test.shape))

from sklearn.decomposition import PCA, FastICA
n_comp = 10

# PCA
pca = PCA(n_components=n_comp, random_state=42)
pca2_results_train = pca.fit_transform(train.drop(["y"], axis=1))
pca2_results_test = pca.transform(test)

# ICA
ica = FastICA(n_components=n_comp, random_state=42)
ica2_results_train = ica.fit_transform(train.drop(["y"], axis=1))
ica2_results_test = ica.transform(test)

# Append decomposition components to datasets
for i in range(1, n_comp + 1):
    train['pca_' + str(i)] = pca2_results_train[:, i - 1]
    test['pca_' + str(i)] = pca2_results_test[:, i - 1]

    train['ica_' + str(i)] = ica2_results_train[:, i - 1]
    test['ica_' + str(i)] = ica2_results_test[:, i - 1]

y_train = train["y"]
y_mean = np.mean(y_train)

############################################### modeling #########################################
'''
model1=XGBRegressor(n_estimators=500,max_depth=4)
Example #23
class spectral_data(object):
    def __init__(self, df):

        try:
            uppercols = df.columns.levels[0]
            lowercols = list(df.columns.levels[1].values)
        except:
            df.columns = pd.MultiIndex.from_tuples(list(df.columns))
            uppercols = df.columns.levels[0]
            lowercols = list(df.columns.levels[1].values)

        for i, val in enumerate(lowercols):
            try:
                lowercols[i] = float(val)
            except:
                lowercols[i] = val

        levels = [uppercols, lowercols]
        df.columns.set_levels(levels, inplace=True)
        self.df = df

    def interp(self, xnew):
        xnew = np.array(xnew, dtype='float')

        metadata_cols = self.df.columns.levels[0] != 'wvl'
        metadata = self.df[self.df.columns.levels[0][metadata_cols]]
        old_wvls = np.array(self.df['wvl'].columns, dtype='float')
        old_spectra = np.array(self.df['wvl'])
        new_spectra = np.empty([len(old_spectra[:, 0]), len(xnew)]) * np.nan
        interp_index = (xnew > min(old_wvls)) & (xnew < max(old_wvls))

        f = sp.interpolate.interp1d(old_wvls, old_spectra, axis=1)
        new_spectra[:, interp_index] = f(xnew[interp_index])

        xnew = list(xnew)
        for i, x in enumerate(xnew):
            xnew[i] = ('wvl', x)

        new_df = pd.DataFrame(new_spectra, columns=pd.MultiIndex.from_tuples(xnew), index=self.df.index)
        new_df = pd.concat([new_df, metadata], axis=1)

        self.df = new_df

    def cal_tran(self, refdata, matchcol_ref, matchcol_transform, method, methodparams):
        C_matrix = []
        col = np.array([j.upper() for j in self.df[('meta', matchcol_transform)]])
        col_ref = np.array([j.upper() for j in refdata[('meta', matchcol_ref)]])
        for i in col:
            matches = np.where(col_ref == i, 1, 0)
            C_matrix.append(matches)

        C_matrix = np.transpose(np.array(C_matrix))

        if method == 'LRA - Low Rank Alignment':
            refdata_trans, transdata_trans = LRA(np.array(refdata['wvl']), np.array(self.df['wvl']), C_matrix,
                                                 methodparams['d'])
            refdata_trans = pd.DataFrame(refdata_trans)
            transdata_trans = pd.DataFrame(transdata_trans)
            pass
        if method == 'PDS Piecewise Direct Standardization':
            print('PDS not implemented yet!!')

        pass

    # This function masks out specified ranges of the data
    def mask(self, maskfile, maskvar='wvl'):
        df_spectra = self.df[maskvar]  # extract just the spectra from the data frame
        metadata_cols = self.df.columns.levels[0] != maskvar  # extract just the metadata
        metadata = self.df[self.df.columns.levels[0][metadata_cols]]

        mask = pd.read_csv(maskfile, sep=',')  # read the mask file
        tmp = []
        for i in mask.index:
            tmp.append((np.array(self.df[maskvar].columns, dtype='float') >= mask.ix[i, 'min_wvl']) & (
                np.array(self.df[maskvar].columns, dtype='float') <= mask.ix[i, 'max_wvl']))

        # combine the indexes for each range in the mask file into a single masking vector and use that to mask the spectra
        masked = np.any(np.array(tmp), axis=0)
        spectcols = list(df_spectra.columns)  # get the list of columns in the spectra dataframe
        for i, j in enumerate(masked):  # change the first level of the tuple from 'wvl' to 'masked' where appropriate
            if j == True:
                spectcols[i] = ('masked', spectcols[i])
            else:
                spectcols[i] = (maskvar, spectcols[i])
        df_spectra.columns = pd.MultiIndex.from_tuples(
            spectcols)  # assign the multiindex columns based on the new tuples
        self.df = pd.concat([df_spectra, metadata], axis=1)  # merge the masked spectra back with the metadata

    def multiply_vector(self, vectorfile):
        df_spectra = self.df['wvl']
        # TODO: check to make sure wavelengths match before multiplying

        vector = np.array(pd.read_csv(vectorfile, sep=',', header=None))[:, 1]
        if df_spectra.shape[1] == vector.shape[0]:
            self.df['wvl'] = df_spectra.multiply(vector, axis=1)
        else:
            print('Vector is not the same size as the spectra!')

    def peak_area(self, peaks_mins_file=None):
        df = self.df  # create a copy of the data
        wvls = df['wvl'].columns.values  # get the wavelengths

        if peaks_mins_file is not None:
            peaks_mins = pd.read_csv(peaks_mins_file, sep=',')
            peaks = peaks_mins['peaks']
            mins = peaks_mins['mins']
            pass
        else:
            ave_spect = np.average(np.array(df['wvl']), axis=0)  # find the average of the spectra in the data frame
            peaks = wvls[
                sp.signal.argrelextrema(ave_spect, np.greater_equal)[0]]  # find the maxima in the average spectrum
            mins = wvls[sp.signal.argrelextrema(ave_spect, np.less_equal)[0]]  # find the minima in the average spectrum

        wvls = df['wvl'].columns.values  # get the wavelengths

        spectra = np.array(df['wvl'])
        for i in range(len(peaks)):

            # get the wavelengths between two minima
            try:
                low = mins[np.where(mins < peaks[i])[0][-1]]
            except:
                low = mins[0]

            try:
                high = mins[np.where(mins > peaks[i])[0][0]]
            except:
                high = mins[-1]

            peak_indices = np.all((wvls > low, wvls < high), axis=0)
            # plot.plot(wvls,ave_spect)
            # plot.plot(wvls[peak_indices],ave_spect[peak_indices])
            # plot.show()
            df[('peak_area', peaks[i])] = spectra[:, peak_indices].sum(axis=1)

        self.df = df
        return peaks, mins

    # This function divides the data up into a specified number of random folds
    def random_folds(self, nfolds=5, seed=10, groupby=None):
        self.df[('meta', 'Folds')] = np.nan  # Create an entry in the data frame that holds the folds
        foldslist = np.array(self.df[('meta', 'Folds')])
        if groupby == None:  # if no column name is listed to group on, just create random folds
            n = len(self.df.index)
            folds = cross_validation.KFold(n, nfolds, shuffle=True, random_state=seed)
            i = 1
            for train, test in folds:
                foldslist[test] = i
                i = i + 1

        else:
            # if a column name is provided, get all the unique values and define folds
            # so that all rows of a given value fall in the same fold
            # (this is useful to ensure that training and test data are truly independent)
            unique_inds = np.unique(self.df[groupby])
            folds = cross_validation.KFold(len(unique_inds), nfolds, shuffle=True, random_state=seed)
            foldslist = np.array(self.df[('meta', 'Folds')])
            i = 1
            for train, test in folds:
                tmp = unique_inds[test]
                tmp_full_list = np.array(self.df[groupby])
                tmp_ind = np.in1d(tmp_full_list, tmp)
                foldslist[tmp_ind] = i
                i = i + 1

        self.df[('meta', 'Folds')] = foldslist

    # this function divides the data up into a specified number of folds, using sorting
    # To try to get folds that look similar to each other
    def stratified_folds(self, nfolds=5, sortby=None):
        self.df[('meta', 'Folds')] = np.NaN  # Create an entry in the data frame that holds the folds
        self.df.sort_values(by=sortby, inplace=True)  # sort the data frame by the column of interest
        uniqvals = np.unique(self.df[sortby])  # get the unique values from the column of interest

        # assign folds by stepping through the unique values
        fold_num = 1
        for i in uniqvals:
            ind = self.df[sortby] == i  # find where the data frame matches the unique value
            self.df.set_value(self.df.index[ind], ('meta', 'Folds'), fold_num)
            # Increment the fold number, reset to 1 if it is greater than the desired number of folds
            fold_num = fold_num + 1
            if fold_num > nfolds:
                fold_num = 1

        # sort by index to return the df to its original order
        self.df.sort_index(inplace=True)
        self.folds_hist(sortby,50)


    def folds_hist(self, col_to_plot, nbins, xlabel='wt.%', ylabel='# of spectra'):
        folds_uniq = np.unique(self.df[('meta', 'Folds')])
        for f in folds_uniq:
            temp = self.rows_match(('meta', 'Folds'), [f])
            vals = np.array(temp.df[col_to_plot])
            bins = np.linspace(0, np.max(vals), nbins)
            plot.hist(vals, linewidth=0.5, edgecolor='k')
            plot.xlabel(xlabel)
            plot.ylabel(ylabel)
            plot.title(str(col_to_plot[1]) + '- Fold ' + str(f))
            fig = plot.gcf()
            fig.savefig('hist_fold_' + str(f) + '_' + col_to_plot[1] + '.png')
            plot.close()

    # This function normalizes specified ranges of the data by their respective sums
    def norm(self, ranges, col_var='wvl'):
        df_tonorm = self.df[col_var]
        top_level_cols = self.df.columns.levels[0]
        top_level_cols = top_level_cols[top_level_cols != col_var]
        df_other = self.df[top_level_cols]
        cols = df_tonorm.columns.values

        df_sub_norm = []
        allind = []
        for i in ranges:
            # Find the indices for the range
            ind = (np.array(cols, dtype='float') >= i[0]) & (np.array(cols, dtype='float') <= i[1])
            # find the columns for the range
            normcols = cols[ind]
            # keep track of the indices used for all ranges
            allind.append(ind)
            # normalize over the current range
            df_sub_norm.append(norm_total(df_tonorm[normcols]))

        # collapse the list of indices used to a single array
        allind = np.sum(allind, axis=0)
        # identify columns that were not used by where the allind array is less than 1
        cols_excluded = cols[np.where(allind < 1)]
        # create a separate data frame containing the un-normalized columns
        df_masked = df_tonorm[cols_excluded]
        # combine the normalized data frames into one
        df_norm = pd.concat(df_sub_norm, axis=1)

        # make the columns into multiindex
        df_masked.columns = [['masked'] * len(df_masked.columns), df_masked.columns]
        df_norm.columns = [[col_var] * len(df_norm.columns), df_norm.columns.values]

        # combine the normalized data frames, the excluded columns, and the metadata into a single data frame
        df_new = pd.concat([df_other, df_norm, df_masked], axis=1)
        self.df = df_new

    # This function applies baseline removal to the data
    def remove_baseline(self, method='ALS', segment=True, params=None):
        wvls = np.array(self.df['wvl'].columns.values, dtype='float')
        spectra = np.array(self.df['wvl'], dtype='float')

        # set baseline removal object (br) to the specified method
        if method == 'ALS':
            br = ALS()
        elif method == 'Dietrich':
            br = Dietrich()
        elif method == 'Polyfit':
            br = PolyFit()
        elif method == 'AirPLS':
            br = AirPLS()
        elif method == 'FABC':
            br = FABC()
        elif method == 'KK':
            br = KK()
        elif method == 'Mario':
            br = Mario()
        elif method == 'Median':
            br = MedianFilter()
        elif method == 'Rubberband':
            br = Rubberband()
        elif method == 'CCAM':
            br = ccam_br()
            # if method == 'wavelet':
            #   br=Wavelet()
        else:
            print(method + ' is not recognized!')

        # if parameters are provided, use them to set the parameters of br
        if params is not None:
            for i in params.keys():
                try:
                    setattr(br, i, params[i])
                except:
                    print('Required keys are:')
                    print(br.__dict__.keys())
                    print('Exiting without removing baseline!')
                    return
        br.fit(wvls, spectra, segment=segment)
        self.df_baseline = self.df.copy()
        self.df_baseline['wvl'] = br.baseline
        self.df['wvl'] = self.df['wvl']-self.df_baseline['wvl']
    # This function finds rows of the data frame where a specified column has
    # values matching a specified set of values
    # (Useful for extracting folds)
    def rows_match(self, column_name, isin_array, invert=False):
        if invert:
            new_df = self.df.loc[~self.df[column_name].isin(isin_array)]
        else:
            new_df = self.df.loc[self.df[column_name].isin(isin_array)]
        return spectral_data(new_df)

    # This function takes the sum of data over two specified wavelength ranges,
    # calculates the ratio of the sums, and adds the ratio as a column in the data frame
    def ratio(self, range1, range2, rationame=''):
        cols = self.df['wvl'].columns.values
        cols1 = cols[(cols >= range1[0]) & (cols <= range1[1])]
        cols2 = cols[(cols >= range2[0]) * (cols <= range2[1])]

        df1 = self.df['wvl'].loc[:, cols1]
        df2 = self.df['wvl'].loc[:, cols2]

        sum1 = df1.sum(axis=1)
        sum2 = df2.sum(axis=1)

        ratio = sum1 / sum2

        self.df[('ratio', rationame)] = ratio

    def standard_scale(self, col):
        self.df[col] = StandardScaler().fit_transform(self.df[col])

    def deriv(self):
        new_df=self.df.copy()
        wvls=self.df['wvl'].columns.values
        new_df['wvl'] = self.df['wvl'].diff(axis=1)/wvls
        foo=new_df['wvl'].columns.values
        new_df=new_df.drop(('wvl',self.df['wvl'].columns.values[0]),axis=1)
        foo2=new_df['wvl'].columns.values
        return spectral_data(new_df)

    def dim_red(self, col, method, params, kws, load_fit=None):
        if method == 'PCA':
            self.do_dim_red = PCA(*params, **kws)
        if method == 'FastICA':
            self.do_dim_red = FastICA(*params, **kws)
        if method == 't-SNE':
            self.do_dim_red = TSNE(*params, **kws)
        if method == 'LLE':
            self.do_dim_red = LocallyLinearEmbedding(*params, **kws)
        if method == 'JADE-ICA':
            self.do_dim_red = JADE(*params, **kws)
        # TODO: Add ICA-JADE here
        if load_fit:
            self.do_dim_red = load_fit
        else:
            if method != 't-SNE':
                self.do_dim_red.fit(self.df[col])
                dim_red_result = self.do_dim_red.transform(self.df[col])
            else:
                dim_red_result = self.do_dim_red.fit_transform(self.df[col])

        for i in list(range(1, dim_red_result.shape[1] + 1)):  # will need to revisit this for other methods that don't use n_components to make sure column names still make sense
            self.df[(method, str(i))] = dim_red_result[:, i - 1]

        return self.do_dim_red

    def outlier_removal(self, col, method, params):
        if method == 'Isolation Forest':
            self.do_outlier_removal = IsolationForest(**params)
        else:
            method == None
        self.do_outlier_removal.fit(np.array(self.df[col]))
        outlier_scores = self.do_outlier_removal.decision_function(np.array(self.df[col]))
        self.df[('meta','Outlier Scores - '+method+str(params))] = outlier_scores
        #is_outlier = self.do_outlier_removal.predict(np.array(self.df[col]))
        #self.df[('meta', 'Outliers - ' + method + str(params))] = is_outlier

        return self.do_outlier_removal

    def pca(self, col, nc=None, load_fit=None):
        if nc:
            self.do_pca = PCA(n_components=nc)
            self.do_pca.fit(self.df[col])
        if load_fit:  # use this to load a previous fit rather than fit the current data
            self.do_pca = load_fit
        pca_result = self.do_pca.transform(self.df[col])
        for i in list(range(1, self.do_pca.n_components + 1)):
            self.df[('PCA', i)] = pca_result[:, i - 1]

    def ica(self, col, nc=None, load_fit=None):
        if nc:
            self.do_ica = FastICA(n_components=nc)
            self.do_ica.fit(self.df[col])
        if load_fit:  # use this to load a previous fit rather than fit the current data
            self.do_ica = load_fit
        ica_result = self.do_ica.transform(self.df[col])
        for i in list(range(1, self.do_ica.n_components + 1)):
            self.df[('ICA', i)] = ica_result[:, i - 1]


    def ica_jade(self, col, nc=None, load_fit=None, corrcols=None):
        if load_fit is not None:  # use this to load a previous fit rather than fit the current data
            scores = np.dot(load_fit, self.df[col])
        else:
            scores = jade(self.df[col].values, m=nc, verbose=False)
        loadings = np.dot(scores, self.df[col])

        icacols = []
        for i in list(range(1, len(scores[:, 0]) + 1)):
            if np.abs(np.max(loadings[i - 1, :])) < np.abs(
                    np.min(loadings[i - 1, :])):  # flip the sign if necessary to look nicer
                loadings[i - 1, :] = loadings[i - 1, :] * -1
                scores[i - 1, :] = scores[i - 1, :] * -1
            icacols.append(('ICA-JADE', i))
            self.df[('ICA-JADE', i)] = scores[i - 1, :].T
        self.ica_jade_loadings = loadings

        if corrcols:
            combined_cols = corrcols + icacols
            corrdf = self.df[combined_cols].corr().drop(icacols, 1).drop(corrcols, 0)
            ica_jade_ids = []
            for i in corrdf.loc['ICA-JADE'].index:
                tmp = corrdf.loc[('ICA-JADE', i)]
                match = tmp.values == np.max(tmp)
                ica_jade_ids.append(corrcols[np.where(match)[0]][1] + ' (r=' + str(np.round(np.max(tmp), 1)) + ')')
                pass
            self.ica_jade_corr = corrdf
            self.ica_jade_ids = ica_jade_ids

    def col_within_range(self, rangevals, col):
        mask = (self.df[('meta', col)] > rangevals[0]) & (self.df[('meta', col)] < rangevals[1])
        return self.df.loc[mask]

    def enumerate_duplicates(self, col):
        rows = self.df[('meta', col)]
        rows = rows.fillna('-')
        rows = [str(x) for x in rows]
        unique_rows = np.unique(rows)
        rows=np.array(rows)
        rows_list=list(rows)
        for i in unique_rows:
            if i != '-':
                matchindex = np.where(rows == i)[0]

                if len(matchindex) > 1:
                    for n, name in enumerate(rows[matchindex]):
                        rows_list[matchindex[n]] = i+ ' - ' + str(n + 1)

        self.df[('meta', col)] = rows_list
Example #24
def ica_original_25_components():
    filename = ("nba_original_ica_transformed_25d_matrix.npy")
    ica = FastICA(n_components=37, algorithm='deflation', max_iter=100)
    ica.fit(players_stat)
    transformed_data = ica.transform(players_stat)
    np.save(filename,transformed_data)
Example #25
    plt.xlabel('Number of components')
    plt.ylabel('accuracy')
    plt.legend(['LR', 'LDA', 'GNB', 'Linear SVM', 'rbf SVM'],
               loc='lower right')
    plt.grid(True)

if (0):
    # ICA
    from sklearn.decomposition import FastICA

    nComponents = np.arange(1, nClasses + 1 + 50)
    icaScores = np.zeros((5, np.alen(nComponents)))
    for i, n in enumerate(nComponents):
        icaT = FastICA(n_components=n, max_iter=10000)
        icaT.fit(Xtrain, labelsTrain)
        XtrainT = icaT.transform(Xtrain)
        XtestT = icaT.transform(Xtest)
        icaScores[:, i] = util.classify(XtrainT, XtestT, labelsTrain,
                                        labelsTest)

    ica = FastICA(n_components=3, max_iter=10000)
    ica.fit(Xtrain, labelsTrain)
    xt = ica.transform(Xtrain)
    fig = plt.figure()
    util.plotData(fig, xt[:, :3], labelsTrain, classColors)
    plt.title('First 3 components of projected data')

    #%% Plot accuracies for ICA
    plt.figure()
    for i in range(5):
        plt.plot(nComponents, icaScores[i, :], lw=3)
Example #26
    y_pred = clf.predict(X_test_pca)

    accuracies.append(float(np.sum(y_test == y_pred)) / len(y_pred))
    components.append(n_components)

    print('For ' + str(n_components) + ' components, accuracy is ' +
          str(float(np.sum(y_test == y_pred)) / len(y_pred)) +
          ' confusion matrix is: ')
    # print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))
    # print(classification_report(y_test, y_pred, target_names=target_names))
    #############  ICA
    ica = FastICA(n_components=n_components)
    S_ = ica.fit_transform(X)
    A_ = ica.mixing_

    X_train_ica = ica.transform(X_train)
    X_test_ica = ica.transform(X_test)

    param_grid = {
        'C': [1e3, 5e3, 1e4, 5e4, 1e5],
        'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
    }
    clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
    clf = clf.fit(X_train_ica, y_train)
    y_pred = clf.predict(X_test_ica)

    accuracies_ica.append(float(np.sum(y_test == y_pred)) / len(y_pred))
    components_ica.append(n_components)

    print('For ' + str(n_components) + ' components, accuracy is ' +
          str(float(np.sum(y_test == y_pred)) / len(y_pred)) +
Example #27
def get_dc_feature(df_train,
                   df_test,
                   n_comp=12,
                   id_column=None,
                   label_column=None):
    """
    Construct decomposition (dimensionality-reduction) features.
    """
    train = df_train.copy()
    test = df_test.copy()

    if id_column:
        train_id = train[id_column]
        test_id = test[id_column]
        train = drop_columns(train, [id_column])
        test = drop_columns(test, [id_column])
    if label_column:
        train_y = train[label_column]
        train = drop_columns(train, [label_column])

    # tSVD
    tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
    tsvd_results_train = tsvd.fit_transform(train)
    tsvd_results_test = tsvd.transform(test)

    # PCA
    pca = PCA(n_components=n_comp, random_state=420)
    pca2_results_train = pca.fit_transform(train)
    pca2_results_test = pca.transform(test)

    # ICA
    ica = FastICA(n_components=n_comp, random_state=420)
    ica2_results_train = ica.fit_transform(train)
    ica2_results_test = ica.transform(test)

    # GRP
    grp = GaussianRandomProjection(n_components=n_comp,
                                   eps=0.1,
                                   random_state=420)
    grp_results_train = grp.fit_transform(train)
    grp_results_test = grp.transform(test)

    # SRP
    srp = SparseRandomProjection(n_components=n_comp,
                                 dense_output=True,
                                 random_state=420)
    srp_results_train = srp.fit_transform(train)
    srp_results_test = srp.transform(test)

    # Append decomposition components to datasets
    for i in range(1, n_comp + 1):
        train['pca_' + str(i)] = pca2_results_train[:, i - 1]
        test['pca_' + str(i)] = pca2_results_test[:, i - 1]

        train['ica_' + str(i)] = ica2_results_train[:, i - 1]
        test['ica_' + str(i)] = ica2_results_test[:, i - 1]

        train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
        test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]

        train['grp_' + str(i)] = grp_results_train[:, i - 1]
        test['grp_' + str(i)] = grp_results_test[:, i - 1]

        train['srp_' + str(i)] = srp_results_train[:, i - 1]
        test['srp_' + str(i)] = srp_results_test[:, i - 1]

    if id_column:
        train[id_column] = train_id
        test[id_column] = test_id
    if label_column:
        train[label_column] = train_y

    return train, test
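A usage sketch for get_dc_feature above on synthetic frames. drop_columns is a helper from the original project and is not shown, so a hypothetical stand-in is defined; the imports mirror what the original file needs, and the column names and n_comp value are illustrative.

import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD, PCA, FastICA
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection

def drop_columns(df, cols):               # stand-in for the project's helper
    return df.drop(columns=cols)

rng = np.random.RandomState(0)
df_train = pd.DataFrame(rng.rand(100, 8), columns=['f%d' % i for i in range(8)])
df_train['ID'] = np.arange(100)
df_train['y'] = rng.rand(100)
df_test = pd.DataFrame(rng.rand(40, 8), columns=['f%d' % i for i in range(8)])
df_test['ID'] = np.arange(40)

train, test = get_dc_feature(df_train, df_test, n_comp=3,
                             id_column='ID', label_column='y')
print(train.shape, test.shape)            # original columns plus 5 * n_comp new ones
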
Example #28
        if labels[i] == predictions[i]:
            true += 1
        else:
            false += 1
    return (true / (false + true)) * 100


if __name__ == '__main__':
    train_images = transform_images(train_path)
    test_images = transform_images(test_path)

    x = []
    y = []

    for i in range(50, 500, 10):
        ica = FastICA(n_components=i, whiten=True)
        train = ica.fit(train_images[0]).transform(train_images[0])
        test = ica.transform(test_images[0])

        clsf = svm.SVC()
        clsf.fit(train, train_images[1])
        predvals = clsf.predict(test)

        x.append(i)
        y.append(benchmark(test_images[1], predvals))

    plt.plot(x, y)
    plt.xlabel('Number of independent components')
    plt.ylabel('Performance in %')
    plt.show()
Example #29
            (numDay - 2) * duration), ]
        Y_inner_trainingset = Y_arr[subtrain[train], ].reshape(
            (numDay - 2) * duration)
        X_validate = X_all[X_arr[subtrain[validate], ], ].reshape(
            duration, 600)
        Y_validate = Y_all[X_arr[subtrain[validate]]].reshape(duration)

        for C in components:
            print(test, subtrain[validate], subtrain[train], C)

            ica = FastICA(n_components=C, max_iter=5000,
                          tol=0.0001)  #tol = 0.001
            X_inner_train = ica.fit_transform(
                X_inner_trainingset
            )  #pull components from ica fit transformation
            X_inner_test = ica.transform(X_validate)

            clf = svm.SVC(kernel='linear',
                          class_weight='balanced',
                          probability=True)
            y_inner_score = clf.fit(
                X_inner_train,
                Y_inner_trainingset).decision_function(X_inner_test)
            fpr, tpr, _ = roc_curve(Y_validate, y_inner_score)
            roc_auc[t, v, c] = auc(fpr, tpr)

            c += 1
        v += 1

    best[t, 0] = int(np.argmax(np.mean(roc_auc[t, :, :], axis=0)))
    print('Best components | %0.0f' % (components[int(best[t, 0])]))
Example #30
def refineregressor(
    fmridata,
    fmritr,
    shiftedtcs,
    weights,
    passnum,
    lagstrengths,
    lagtimes,
    lagsigma,
    lagmask,
    R2,
    theprefilter,
    optiondict,
    padtrs=60,
    bipolar=False,
    includemask=None,
    excludemask=None,
    debug=False,
    rt_floatset=np.float64,
    rt_floattype="float64",
):
    """

    Parameters
    ----------
    fmridata : 4D numpy float array
       fMRI data
    fmritr : float
        Data repetition rate, in seconds
    shiftedtcs : 4D numpy float array
        Time aligned voxel timecourses
    weights :  unknown
        unknown
    passnum : int
        Number of the pass (for labelling output)
    lagstrengths : 3D numpy float array
        Maximum correlation coefficient in every voxel
    lagtimes : 3D numpy float array
        Time delay of maximum crosscorrelation in seconds
    lagsigma : 3D numpy float array
        Gaussian width of the crosscorrelation peak, in seconds.
    lagmask : 3D numpy float array
        Mask of voxels with successful correlation fits.
    R2 : 3D numpy float array
        Square of the maximum correlation coefficient in every voxel
    theprefilter : function
        The filter function to use
    optiondict : dict
        Dictionary of all internal rapidtide configuration variables.
    padtrs : int, optional
        Number of timepoints to pad onto each end
    includemask : 3D array
        Mask of voxels to include in refinement.  Default is None (all voxels).
    excludemask : 3D array
        Mask of voxels to exclude from refinement.  Default is None (no voxels).
    debug : bool
        Enable additional debugging output.  Default is False
    rt_floatset : function
        Function to coerce variable types
    rt_floattype : {'float32', 'float64'}
        Data type for internal variables

    Returns
    -------
    volumetotal : int
        Number of voxels processed
    outputdata : float array
        New regressor
    maskarray : 3D array
        Mask of voxels used for refinement
    """
    inputshape = np.shape(fmridata)
    if optiondict["ampthresh"] < 0.0:
        if bipolar:
            theampthresh = tide_stats.getfracval(np.fabs(lagstrengths),
                                                 -optiondict["ampthresh"],
                                                 nozero=True)
        else:
            theampthresh = tide_stats.getfracval(lagstrengths,
                                                 -optiondict["ampthresh"],
                                                 nozero=True)
        print(
            "setting ampthresh to the",
            -100.0 * optiondict["ampthresh"],
            "th percentile (",
            theampthresh,
            ")",
        )
    else:
        theampthresh = optiondict["ampthresh"]
    if bipolar:
        ampmask = np.where(
            np.fabs(lagstrengths) >= theampthresh, np.int16(1), np.int16(0))
    else:
        ampmask = np.where(lagstrengths >= theampthresh, np.int16(1),
                           np.int16(0))
    if optiondict["lagmaskside"] == "upper":
        delaymask = np.where(
            (lagtimes - optiondict["offsettime"]) > optiondict["lagminthresh"],
            np.int16(1),
            np.int16(0),
        ) * np.where(
            (lagtimes - optiondict["offsettime"]) < optiondict["lagmaxthresh"],
            np.int16(1),
            np.int16(0),
        )
    elif optiondict["lagmaskside"] == "lower":
        delaymask = np.where(
            (lagtimes - optiondict["offsettime"]) <
            -optiondict["lagminthresh"],
            np.int16(1),
            np.int16(0),
        ) * np.where(
            (lagtimes - optiondict["offsettime"]) >
            -optiondict["lagmaxthresh"],
            np.int16(1),
            np.int16(0),
        )
    else:
        abslag = abs(lagtimes) - optiondict["offsettime"]
        delaymask = np.where(abslag > optiondict["lagminthresh"], np.int16(1),
                             np.int16(0)) * np.where(
                                 abslag < optiondict["lagmaxthresh"],
                                 np.int16(1), np.int16(0))
    sigmamask = np.where(lagsigma < optiondict["sigmathresh"], np.int16(1),
                         np.int16(0))
    locationmask = lagmask + 0
    if includemask is not None:
        locationmask = locationmask * includemask
    if excludemask is not None:
        locationmask = locationmask * (1 - excludemask)
    locationmask = locationmask.astype(np.int16)
    print("location mask created")

    # first generate the refine mask
    locationfails = np.sum(1 - locationmask)
    ampfails = np.sum(1 - ampmask * locationmask)
    lagfails = np.sum(1 - delaymask * locationmask)
    sigmafails = np.sum(1 - sigmamask * locationmask)
    refinemask = locationmask * ampmask * delaymask * sigmamask
    if tide_stats.getmasksize(refinemask) == 0:
        print("ERROR: no voxels in the refine mask:")
        print(
            "\n	",
            locationfails,
            " locationfails",
            "\n	",
            ampfails,
            " ampfails",
            "\n	",
            lagfails,
            " lagfails",
            "\n	",
            sigmafails,
            " sigmafails",
        )
        if (includemask is None) and (excludemask is None):
            print("\nRelax ampthresh, delaythresh, or sigmathresh - exiting")
        else:
            print(
                "\nChange include/exclude masks or relax ampthresh, delaythresh, or sigmathresh - exiting"
            )
        return 0, None, None, locationfails, ampfails, lagfails, sigmafails

    if optiondict["cleanrefined"]:
        shiftmask = locationmask
    else:
        shiftmask = refinemask
    volumetotal = np.sum(shiftmask)
    reportstep = 1000

    # timeshift the valid voxels
    if optiondict["nprocs"] > 1:
        # define the consumer function here so it inherits most of the arguments
        def timeshift_consumer(inQ, outQ):
            while True:
                try:
                    # get a new message
                    val = inQ.get()

                    # this is the 'TERM' signal
                    if val is None:
                        break

                    # process and send the data
                    outQ.put(
                        _procOneVoxelTimeShift(
                            val,
                            fmridata[val, :],
                            lagstrengths[val],
                            R2[val],
                            lagtimes[val],
                            padtrs,
                            fmritr,
                            theprefilter,
                            optiondict["fmrifreq"],
                            refineprenorm=optiondict["refineprenorm"],
                            lagmaxthresh=optiondict["lagmaxthresh"],
                            refineweighting=optiondict["refineweighting"],
                            detrendorder=optiondict["detrendorder"],
                            offsettime=optiondict["offsettime"],
                            filterbeforePCA=optiondict["filterbeforePCA"],
                            psdfilter=optiondict["psdfilter"],
                            rt_floatset=rt_floatset,
                            rt_floattype=rt_floattype,
                        ))

                except Exception as e:
                    print("error!", e)
                    break

        data_out = tide_multiproc.run_multiproc(
            timeshift_consumer,
            inputshape,
            shiftmask,
            nprocs=optiondict["nprocs"],
            showprogressbar=True,
            chunksize=optiondict["mp_chunksize"],
        )

        # unpack the data
        psdlist = []
        for voxel in data_out:
            shiftedtcs[voxel[0], :] = voxel[1]
            weights[voxel[0], :] = voxel[2]
            if optiondict["psdfilter"]:
                psdlist.append(voxel[3])
        del data_out

    else:
        psdlist = []
        for vox in range(0, inputshape[0]):
            if (vox % reportstep == 0 or vox
                    == inputshape[0] - 1) and optiondict["showprogressbar"]:
                tide_util.progressbar(vox + 1,
                                      inputshape[0],
                                      label="Percent complete (timeshifting)")
            if shiftmask[vox] > 0.5:
                retvals = _procOneVoxelTimeShift(
                    vox,
                    fmridata[vox, :],
                    lagstrengths[vox],
                    R2[vox],
                    lagtimes[vox],
                    padtrs,
                    fmritr,
                    theprefilter,
                    optiondict["fmrifreq"],
                    refineprenorm=optiondict["refineprenorm"],
                    lagmaxthresh=optiondict["lagmaxthresh"],
                    refineweighting=optiondict["refineweighting"],
                    detrendorder=optiondict["detrendorder"],
                    offsettime=optiondict["offsettime"],
                    filterbeforePCA=optiondict["filterbeforePCA"],
                    psdfilter=optiondict["psdfilter"],
                    rt_floatset=rt_floatset,
                    rt_floattype=rt_floattype,
                )
                shiftedtcs[retvals[0], :] = retvals[1]
                weights[retvals[0], :] = retvals[2]
                if optiondict["psdfilter"]:
                    psdlist.append(retvals[3])
        print()

    if optiondict["psdfilter"]:
        print(len(psdlist))
        print(psdlist[0])
        print(np.shape(np.asarray(psdlist, dtype=rt_floattype)))
        averagepsd = np.mean(np.asarray(psdlist, dtype=rt_floattype), axis=0)
        stdpsd = np.std(np.asarray(psdlist, dtype=rt_floattype), axis=0)
        snr = np.nan_to_num(averagepsd / stdpsd)

    # now generate the refined timecourse(s)
    validlist = np.where(refinemask > 0)[0]
    refinevoxels = shiftedtcs[validlist, :]
    if bipolar:
        for thevoxel in range(len(validlist)):
            if lagstrengths[validlist][thevoxel] < 0.0:
                refinevoxels[thevoxel, :] *= -1.0
    refineweights = weights[validlist]
    weightsum = np.sum(refineweights, axis=0) / volumetotal
    averagedata = np.sum(refinevoxels, axis=0) / volumetotal
    if optiondict["cleanrefined"]:
        invalidlist = np.where((1 - ampmask) > 0)[0]
        discardvoxels = shiftedtcs[invalidlist]
        discardweights = weights[invalidlist]
        discardweightsum = np.sum(discardweights, axis=0) / volumetotal
        averagediscard = np.sum(discardvoxels, axis=0) / volumetotal
    if optiondict["dodispersioncalc"]:
        print("splitting regressors by time lag for phase delay estimation")
        laglist = np.arange(
            optiondict["dispersioncalc_lower"],
            optiondict["dispersioncalc_upper"],
            optiondict["dispersioncalc_step"],
        )
        dispersioncalcout = np.zeros((np.shape(laglist)[0], inputshape[1]),
                                     dtype=rt_floattype)
        fftlen = int(inputshape[1] // 2)
        fftlen -= fftlen % 2
        dispersioncalcspecmag = np.zeros((np.shape(laglist)[0], fftlen),
                                         dtype=rt_floattype)
        dispersioncalcspecphase = np.zeros((np.shape(laglist)[0], fftlen),
                                           dtype=rt_floattype)
        for lagnum in range(0, np.shape(laglist)[0]):
            lower = laglist[lagnum] - optiondict["dispersioncalc_step"] / 2.0
            upper = laglist[lagnum] + optiondict["dispersioncalc_step"] / 2.0
            inlagrange = np.where(
                locationmask * ampmask *
                np.where(lower < lagtimes, np.int16(1), np.int16(0)) *
                np.where(lagtimes < upper, np.int16(1), np.int16(0)))[0]
            print(
                "    summing",
                np.shape(inlagrange)[0],
                "regressors with lags from",
                lower,
                "to",
                upper,
            )
            if np.shape(inlagrange)[0] > 0:
                dispersioncalcout[lagnum, :] = tide_math.corrnormalize(
                    np.mean(shiftedtcs[inlagrange], axis=0),
                    detrendorder=optiondict["detrendorder"],
                    windowfunc=optiondict["windowfunc"],
                )
                (
                    freqs,
                    dispersioncalcspecmag[lagnum, :],
                    dispersioncalcspecphase[lagnum, :],
                ) = tide_math.polarfft(dispersioncalcout[lagnum, :],
                                       1.0 / fmritr)
            inlagrange = None
        tide_io.writenpvecs(
            dispersioncalcout,
            optiondict["outputname"] + "_dispersioncalcvecs_pass" +
            str(passnum) + ".txt",
        )
        tide_io.writenpvecs(
            dispersioncalcspecmag,
            optiondict["outputname"] + "_dispersioncalcspecmag_pass" +
            str(passnum) + ".txt",
        )
        tide_io.writenpvecs(
            dispersioncalcspecphase,
            optiondict["outputname"] + "_dispersioncalcspecphase_pass" +
            str(passnum) + ".txt",
        )
        tide_io.writenpvecs(
            freqs,
            optiondict["outputname"] + "_dispersioncalcfreqs_pass" +
            str(passnum) + ".txt",
        )

    if optiondict["pcacomponents"] < 0.0:
        pcacomponents = "mle"
    elif optiondict["pcacomponents"] >= 1.0:
        pcacomponents = int(np.round(optiondict["pcacomponents"]))
    elif optiondict["pcacomponents"] == 0.0:
        print("0.0 is not an allowed value for pcacomponents")
        sys.exit()
    else:
        pcacomponents = optiondict["pcacomponents"]
    icacomponents = 1

    if optiondict["refinetype"] == "ica":
        print("performing ica refinement")
        thefit = FastICA(n_components=icacomponents).fit(
            refinevoxels)  # Reconstruct signals
        print("Using first of ", len(thefit.components_), " components")
        icadata = thefit.components_[0]
        filteredavg = tide_math.corrnormalize(
            theprefilter.apply(optiondict["fmrifreq"], averagedata),
            detrendorder=optiondict["detrendorder"],
        )
        filteredica = tide_math.corrnormalize(
            theprefilter.apply(optiondict["fmrifreq"], icadata),
            detrendorder=optiondict["detrendorder"],
        )
        thepxcorr = pearsonr(filteredavg, filteredica)[0]
        print("ica/avg correlation = ", thepxcorr)
        if thepxcorr > 0.0:
            outputdata = 1.0 * icadata
        else:
            outputdata = -1.0 * icadata
    elif optiondict["refinetype"] == "pca":
        # use the method of "A novel perspective to calibrate temporal delays in cerebrovascular reactivity
        # using hypercapnic and hyperoxic respiratory challenges". NeuroImage 187, 154-165 (2019).
        print("performing pca refinement with pcacomponents set to",
              pcacomponents)
        try:
            thefit = PCA(n_components=pcacomponents).fit(refinevoxels)
        except ValueError:
            if pcacomponents == "mle":
                print(
                    "mle estimation failed - falling back to pcacomponents=0.8"
                )
                thefit = PCA(n_components=0.8).fit(refinevoxels)
            else:
                print("unhandled math exception in PCA refinement - exiting")
                sys.exit()
        print(
            "Using ",
            len(thefit.components_),
            " component(s), accounting for ",
            "{:.2f}% of the variance".format(100.0 * np.cumsum(
                thefit.explained_variance_ratio_)[len(thefit.components_) -
                                                  1]),
        )
        reduceddata = thefit.inverse_transform(thefit.transform(refinevoxels))
        if debug:
            print("complex processing: reduceddata.shape =", reduceddata.shape)
        pcadata = np.mean(reduceddata, axis=0)
        filteredavg = tide_math.corrnormalize(
            theprefilter.apply(optiondict["fmrifreq"], averagedata),
            detrendorder=optiondict["detrendorder"],
        )
        filteredpca = tide_math.corrnormalize(
            theprefilter.apply(optiondict["fmrifreq"], pcadata),
            detrendorder=optiondict["detrendorder"],
        )
        thepxcorr = pearsonr(filteredavg, filteredpca)[0]
        print("pca/avg correlation = ", thepxcorr)
        if thepxcorr > 0.0:
            outputdata = 1.0 * pcadata
        else:
            outputdata = -1.0 * pcadata
    elif optiondict["refinetype"] == "weighted_average":
        print("performing weighted averaging refinement")
        outputdata = np.nan_to_num(averagedata / weightsum)
    else:
        print("performing unweighted averaging refinement")
        outputdata = averagedata

    if optiondict["cleanrefined"]:
        thefit, R = tide_fit.mlregress(averagediscard, averagedata)
        fitcoff = rt_floatset(thefit[0, 1])
        datatoremove = rt_floatset(fitcoff * averagediscard)
        outputdata -= datatoremove
    print()
    print(
        "Timeshift applied to " + str(int(volumetotal)) + " voxels, " +
        str(len(validlist)) + " used for refinement:",
        "\n	",
        locationfails,
        " locationfails",
        "\n	",
        ampfails,
        " ampfails",
        "\n	",
        lagfails,
        " lagfails",
        "\n	",
        sigmafails,
        " sigmafails",
    )

    if optiondict["psdfilter"]:
        outputdata = tide_filt.transferfuncfilt(outputdata, snr)

    # garbage collect
    collected = gc.collect()
    print("Garbage collector: collected %d objects." % collected)

    return volumetotal, outputdata, refinemask, locationfails, ampfails, lagfails, sigmafails
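
A minimal, self-contained sketch of the "ica" refinement branch above: fit a
single-component FastICA on the matrix of time-shifted voxel timecourses, then
flip its sign so it correlates positively with the plain average regressor.
The toy refinevoxels array is only a stand-in for shiftedtcs[validlist, :];
this is not rapidtide's actual API.

import numpy as np
from scipy.stats import pearsonr
from sklearn.decomposition import FastICA

rng = np.random.RandomState(0)
refinevoxels = rng.randn(200, 300)        # (voxels, timepoints) stand-in
averagedata = refinevoxels.mean(axis=0)   # unweighted average regressor

thefit = FastICA(n_components=1, random_state=0).fit(refinevoxels)
icadata = thefit.components_[0]           # single temporal component

# FastICA's sign is arbitrary, so align it with the average regressor
if pearsonr(averagedata, icadata)[0] < 0.0:
    icadata = -icadata
outputdata = icadata
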
def perform_feature_engineering(train, test, config):

    for c in train.columns:
        if (len(train[c].value_counts()) == 2):
            if (train[c].mean() < config['SparseThreshold']):
                del train[c]
                del test[c]

    col = list(test.columns)
    if config['ID'] != True:
        col.remove('ID')

    # tSVD
    if (config['tSVD'] == True):
        tsvd = TruncatedSVD(n_components=config['n_comp'])
        tsvd_results_train = tsvd.fit_transform(train[col])
        tsvd_results_test = tsvd.transform(test[col])
        for i in range(1, config['n_comp'] + 1):
            train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
            test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]
    # PCA
    if (config['PCA'] == True):
        pca = PCA(n_components=config['n_comp'])
        pca2_results_train = pca.fit_transform(train[col])
        pca2_results_test = pca.transform(test[col])
        for i in range(1, config['n_comp'] + 1):
            train['pca_' + str(i)] = pca2_results_train[:, i - 1]
            test['pca_' + str(i)] = pca2_results_test[:, i - 1]
    # ICA
    if (config['ICA'] == True):
        ica = FastICA(n_components=config['n_comp'])
        ica2_results_train = ica.fit_transform(train[col])
        ica2_results_test = ica.transform(test[col])
        for i in range(1, config['n_comp'] + 1):
            train['ica_' + str(i)] = ica2_results_train[:, i - 1]
            test['ica_' + str(i)] = ica2_results_test[:, i - 1]

    # GRP
    if (config['GRP'] == True):
        grp = GaussianRandomProjection(n_components=config['n_comp'], eps=0.1)
        grp_results_train = grp.fit_transform(train[col])
        grp_results_test = grp.transform(test[col])
        for i in range(1, config['n_comp'] + 1):
            train['grp_' + str(i)] = grp_results_train[:, i - 1]
            test['grp_' + str(i)] = grp_results_test[:, i - 1]

    # SRP
    if (config['SRP'] == True):
        srp = SparseRandomProjection(n_components=config['n_comp'],
                                     dense_output=True,
                                     random_state=420)
        srp_results_train = srp.fit_transform(train[col])
        srp_results_test = srp.transform(test[col])
        for i in range(1, config['n_comp'] + 1):
            train['srp_' + str(i)] = srp_results_train[:, i - 1]
            test['srp_' + str(i)] = srp_results_test[:, i - 1]

    if config['magic'] == True:
        magic_mat = train[['ID', 'X0', 'y']]
        magic_mat = magic_mat.groupby(['X0'])['y'].mean()
        magic_mat = pd.DataFrame({
            'X0': magic_mat.index,
            'magic': list(magic_mat)
        })
        mean_magic = magic_mat['magic'].mean()
        train = train.merge(magic_mat, on='X0', how='left')
        test = test.merge(magic_mat, on='X0', how='left')
        test['magic'] = test['magic'].fillna(mean_magic)
    return train, test
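
A hypothetical driver for perform_feature_engineering(); the config keys mirror
the ones read above, the frames are small synthetic stand-ins, and the magic
branch is left off because it needs 'X0' and 'y' columns.

import numpy as np
import pandas as pd

rng = np.random.RandomState(42)
train = pd.DataFrame(rng.rand(100, 20), columns=['X%d' % i for i in range(20)])
test = pd.DataFrame(rng.rand(50, 20), columns=['X%d' % i for i in range(20)])
train['ID'] = np.arange(100)
test['ID'] = np.arange(50)

config = {
    'SparseThreshold': 0.01,   # drop near-constant binary columns
    'ID': False,               # exclude the ID column from the decompositions
    'n_comp': 5,
    'tSVD': True, 'PCA': True, 'ICA': True,
    'GRP': False, 'SRP': False,
    'magic': False,
}

train_fe, test_fe = perform_feature_engineering(train, test, config)
print(train_fe.shape, test_fe.shape)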
Example #32
def scatterPlot(xDF, yDF, algoName):
    # assumed signature for this fragment; the original def line is not shown
    tempDF = pd.DataFrame(data=xDF.loc[:, 0:1], index=xDF.index)
    tempDF = pd.concat((tempDF,yDF), axis=1, join="inner")
    tempDF.columns = ["First Vector", "Second Vector", "Label"]
    sns.lmplot(x="First Vector", y="Second Vector", hue="Label", \
               data=tempDF, fit_reg=False)
    ax = plt.gca()
    ax.set_title("Separation of Observations using "+algoName)

#----------------------------------------------------------------------------------------------------

# Independent Component Analysis
from sklearn.decomposition import FastICA

n_components = 25
algorithm = 'parallel'
whiten = True
max_iter = 100
random_state = 2018

fastICA = FastICA(n_components=n_components, algorithm=algorithm, \
                  whiten=whiten, max_iter=max_iter, random_state=random_state)

X_train_fastICA = fastICA.fit_transform(X_train)
X_train_fastICA = pd.DataFrame(data=X_train_fastICA, index=train_index)

X_validation_fastICA = fastICA.transform(X_validation)
X_validation_fastICA = pd.DataFrame(data=X_validation_fastICA, \
                                    index=validation_index)

scatterPlot(X_train_fastICA, y_train, "Independent Component Analysis")
plt.show()
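
The fitted fastICA object can map component scores back to the original
feature space, so a quick reconstruction check shows roughly how much of the
original signal the 25 independent components retain.  This assumes X_train is
the same numeric matrix that fastICA was fit on above.

import numpy as np

X_train_recon = fastICA.inverse_transform(X_train_fastICA.values)
residual = np.asarray(X_train) - X_train_recon
print("fraction of variance not captured: %.4f"
      % (residual.var() / np.asarray(X_train).var()))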
Example #33
                        n_observations=N + Ntest,
                        n_components_in_mixture=n_components_in_mixture,
                        n_sources=n_sources,
                        n_features=n_features,
                        **cifa_param)
    for data_generating_model in data_generating_models:
        for deviation in deviations:
            for dataset in range(n_datasets):
                data, reference = sess.run(
                    [data_tf[data_generating_model], reference_tf],
                    feed_dict={placeholder_deviation: deviation})

                if initial_direction.lower() == 'ica':
                    init_directions = fica.fit(data).mixing_.T
                    kmeans_cluster_centers = kmeans.fit(
                        fica.transform(data[:N])).cluster_centers_

                elif initial_direction.lower() == 'pca':
                    init_directions = pca.fit(data).components_.T
                    kmeans_cluster_centers = kmeans.fit(pca.transform(
                        data[:N])).cluster_centers_
                else:
                    init_directions = np.random.randn(
                        n_sources, n_features).astype('float64')
                    kmeans_cluster_centers = kmeans.fit(data[:N].dot(
                        init_directions.T)).cluster_centers_

                init_directions = init_directions / np.linalg.norm(
                    init_directions, axis=1, keepdims=True)
                current_data_variance = data[:N].var()
Example #34
def test_fastica_simple(add_noise, seed):
    # Test the FastICA algorithm on very simple data.
    rng = np.random.RandomState(seed)
    # scipy.stats uses the global RNG:
    n_samples = 1000
    # Generate two sources:
    s1 = (2 * np.sin(np.linspace(0, 100, n_samples)) > 0) - 1
    s2 = stats.t.rvs(1, size=n_samples)
    s = np.c_[s1, s2].T
    center_and_norm(s)
    s1, s2 = s

    # Mixing angle
    phi = 0.6
    mixing = np.array([[np.cos(phi), np.sin(phi)], [np.sin(phi),
                                                    -np.cos(phi)]])
    m = np.dot(mixing, s)

    if add_noise:
        m += 0.1 * rng.randn(2, 1000)

    center_and_norm(m)

    # function as fun arg
    def g_test(x):
        return x**3, (3 * x**2).mean(axis=-1)

    algos = ['parallel', 'deflation']
    nls = ['logcosh', 'exp', 'cube', g_test]
    whitening = [True, False]
    for algo, nl, whiten in itertools.product(algos, nls, whitening):
        if whiten:
            k_, mixing_, s_ = fastica(m.T,
                                      fun=nl,
                                      algorithm=algo,
                                      random_state=rng)
            assert_raises(ValueError,
                          fastica,
                          m.T,
                          fun=np.tanh,
                          algorithm=algo)
        else:
            pca = PCA(n_components=2, whiten=True, random_state=rng)
            X = pca.fit_transform(m.T)
            k_, mixing_, s_ = fastica(X,
                                      fun=nl,
                                      algorithm=algo,
                                      whiten=False,
                                      random_state=rng)
            assert_raises(ValueError, fastica, X, fun=np.tanh, algorithm=algo)
        s_ = s_.T
        # Check that the mixing model described in the docstring holds:
        if whiten:
            assert_almost_equal(s_, np.dot(np.dot(mixing_, k_), m))

        center_and_norm(s_)
        s1_, s2_ = s_
        # Check to see if the sources have been estimated
        # in the wrong order
        if abs(np.dot(s1_, s2)) > abs(np.dot(s1_, s1)):
            s2_, s1_ = s_
        s1_ *= np.sign(np.dot(s1_, s1))
        s2_ *= np.sign(np.dot(s2_, s2))

        # Check that we have estimated the original sources
        if not add_noise:
            assert_almost_equal(np.dot(s1_, s1) / n_samples, 1, decimal=2)
            assert_almost_equal(np.dot(s2_, s2) / n_samples, 1, decimal=2)
        else:
            assert_almost_equal(np.dot(s1_, s1) / n_samples, 1, decimal=1)
            assert_almost_equal(np.dot(s2_, s2) / n_samples, 1, decimal=1)

    # Test FastICA class
    _, _, sources_fun = fastica(m.T, fun=nl, algorithm=algo, random_state=seed)
    ica = FastICA(fun=nl, algorithm=algo, random_state=seed)
    sources = ica.fit_transform(m.T)
    assert_equal(ica.components_.shape, (2, 2))
    assert_equal(sources.shape, (1000, 2))

    assert_array_almost_equal(sources_fun, sources)
    assert_array_almost_equal(sources, ica.transform(m.T))

    assert_equal(ica.mixing_.shape, (2, 2))

    for fn in [np.tanh, "exp(-.5(x^2))"]:
        ica = FastICA(fun=fn, algorithm=algo)
        assert_raises(ValueError, ica.fit, m.T)

    assert_raises(TypeError, FastICA(fun=range(10)).fit, m.T)
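
The whitened-case assertion above corresponds to the estimator API as follows:
sources recovered by fit_transform map back to the observations through
ica.mixing_ and ica.mean_.  A small sketch with toy two-source data (not the
test fixtures used above):

import numpy as np
from sklearn.decomposition import FastICA

rng = np.random.RandomState(0)
t = np.linspace(0, 8, 2000)
S = np.c_[np.sin(2 * t), np.sign(np.cos(3 * t))]   # two independent sources
A = np.array([[1.0, 0.5], [0.5, 2.0]])             # mixing matrix
X = S.dot(A.T)                                     # observed mixtures

ica = FastICA(n_components=2, random_state=0)
S_est = ica.fit_transform(X)
X_back = S_est.dot(ica.mixing_.T) + ica.mean_
print("max reconstruction error:", np.abs(X - X_back).max())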
Example #35
    line2, = plt.plot(k_arr,
                      kurt_var,
                      color='b',
                      marker='o',
                      label='variance of kurtosis')
    plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
    plt.ylabel(' kurtosis')
    plt.xlabel('Number of components')
    plt.show()
    return None


kurt(X, y, 20)
ica = FastICA(n_components=11, random_state=0)
ica_2d = ica.fit_transform(X)
X_ica = ica.transform(X)
plt.scatter(ica_2d[:, 0],
            ica_2d[:, 1],
            c=y,
            cmap="RdGy",
            edgecolor="None",
            alpha=1,
            vmin=75,
            vmax=150)
plt.colorbar()
plt.title('ICA Scatter Plot')


def plot_samples(S, axis_list=None):
    plt.scatter(S[:, 0],
                S[:, 1],
Example #36
def DecomposedFeatures(train,
                       test,
                       total,
                       addtrain,
                       addtest,
                       n_components,
                       use_pca=0.0,
                       use_tsvd=0.0,
                       use_ica=0.0,
                       use_fa=0.0,
                       use_grp=0.0,
                       use_srp=0.0):
    N_COMP = int(n_components * train.shape[1]) + 1
    print("\nStart decomposition process...")
    train_decomposed = np.concatenate([addtrain], axis=1)
    test_decomposed = np.concatenate([addtest], axis=1)
    if use_pca > 0.0:
        print("PCA")
        N_COMP = int(use_pca * train.shape[1]) + 1
        pca = PCA(n_components=N_COMP,
                  whiten=True,
                  svd_solver="full",
                  random_state=42)
        pca_results = pca.fit(total)
        pca_results_train = pca.transform(train)
        pca_results_test = pca.transform(test)
        train_decomposed = np.concatenate(
            [pca_results_train, train_decomposed], axis=1)
        test_decomposed = np.concatenate([pca_results_test, test_decomposed],
                                         axis=1)

    if use_tsvd > 0.0:
        print("tSVD")
        N_COMP = int(use_tsvd * train.shape[1]) + 1
        tsvd = TruncatedSVD(n_components=N_COMP, random_state=42)
        tsvd_results = tsvd.fit(total)
        tsvd_results_train = tsvd.transform(train)
        tsvd_results_test = tsvd.transform(test)
        train_decomposed = np.concatenate(
            [tsvd_results_train, train_decomposed], axis=1)
        test_decomposed = np.concatenate([tsvd_results_test, test_decomposed],
                                         axis=1)

    if use_ica > 0.0:
        print("ICA")
        N_COMP = int(use_ica * train.shape[1]) + 1
        ica = FastICA(n_components=N_COMP, random_state=42)
        ica_results = ica.fit(total)
        ica_results_train = ica.transform(train)
        ica_results_test = ica.transform(test)
        train_decomposed = np.concatenate(
            [ica_results_train, train_decomposed], axis=1)
        test_decomposed = np.concatenate([ica_results_test, test_decomposed],
                                         axis=1)

    if use_fa > 0.0:
        print("FA")
        N_COMP = int(use_fa * train.shape[1]) + 1
        fa = FactorAnalysis(n_components=N_COMP, random_state=42)
        fa_results = fa.fit(total)
        fa_results_train = fa.transform(train)
        fa_results_test = fa.transform(test)
        train_decomposed = np.concatenate([fa_results_train, train_decomposed],
                                          axis=1)
        test_decomposed = np.concatenate([fa_results_test, test_decomposed],
                                         axis=1)

    if use_grp > 0.0:
        print("GRP")
        N_COMP = int(use_grp * train.shape[1]) + 1
        grp = GaussianRandomProjection(n_components=N_COMP,
                                       eps=0.1,
                                       random_state=42)
        grp_results = grp.fit(total)
        grp_results_train = grp.transform(train)
        grp_results_test = grp.transform(test)
        train_decomposed = np.concatenate(
            [grp_results_train, train_decomposed], axis=1)
        test_decomposed = np.concatenate([grp_results_test, test_decomposed],
                                         axis=1)

    if use_srp > 0.0:
        print("SRP")
        N_COMP = int(use_srp * train.shape[1]) + 1
        srp = SparseRandomProjection(n_components=N_COMP,
                                     dense_output=True,
                                     random_state=42)
        srp_results = srp.fit(total)
        srp_results_train = srp.transform(train)
        srp_results_test = srp.transform(test)
        train_decomposed = np.concatenate(
            [srp_results_train, train_decomposed], axis=1)
        test_decomposed = np.concatenate([srp_results_test, test_decomposed],
                                         axis=1)

    print("Append decomposition components together...")

    train_with_only_decomposed_features = pd.DataFrame(train_decomposed)
    test_with_only_decomposed_features = pd.DataFrame(test_decomposed)

    #for agg_col in ['sum', 'var', 'mean', 'median', 'std', 'weight_count', 'count_non_0', 'num_different', 'max', 'min']:
    #    train_with_only_decomposed_features[col] = train[col]
    #    test_with_only_decomposed_features[col] = test[col]

    # Remove any NA
    train_with_only_decomposed_features = train_with_only_decomposed_features.fillna(
        0)
    test_with_only_decomposed_features = test_with_only_decomposed_features.fillna(
        0)

    return train_with_only_decomposed_features, test_with_only_decomposed_features
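
A hypothetical call to DecomposedFeatures() with small synthetic frames; only
the PCA and ICA branches are switched on, each at roughly 10% of the original
dimensionality, and addtrain/addtest are just extra columns carried through
unchanged.

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
train = pd.DataFrame(rng.rand(200, 50))
test = pd.DataFrame(rng.rand(100, 50))
total = pd.concat([train, test], axis=0)   # decompositions are fit on train+test
addtrain = train.values[:, :1]             # passthrough columns
addtest = test.values[:, :1]

train_dec, test_dec = DecomposedFeatures(train, test, total, addtrain, addtest,
                                         n_components=0.1, use_pca=0.1,
                                         use_ica=0.1)
print(train_dec.shape, test_dec.shape)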
Example #37
class VAE_trainer():
    def __init__(self, dim_z=20, device="cuda"):
        # prepare cuda device
        self.device = torch.device(
            device if torch.cuda.is_available() else "cpu")
        #self.device = torch.device("cpu")
        # prepare dataset
        self.dataset = SoundDataset(transform=transforms.ToTensor(),
                                    mode='score')
        # define model
        self.model = VAE(self.dataset.data_size, dim_z).to(self.device)
        # define optimizer
        self.optimizer = optim.Adam(self.model.parameters(), lr=1e-3)
        self.dim_z = dim_z

    def load(self, key):
        self.dataset.load_npz('../data/sounds/raw/' + key + '.npz')
        self.dataset.normalize()

    def train(self, epoch, max_epoch):
        # train mode
        self.model.train()
        train_loss = 0
        train_loss_vae = 0
        train_loss_classifier = 0
        train_acc = 0
        for batch_idx, (x, y) in enumerate(self.train_loader):
            x, y = x.to(self.device), y.to(self.device)
            # zero the parameter gradients
            self.optimizer.zero_grad()

            # forward
            rec_x, pre_y, mu, logvar = self.model(x)
            loss_vae = self.model.loss_function_vae(rec_x, x, mu, logvar)
            loss_classifier = self.model.loss_function_classifier(pre_y, y)
            loss = loss_vae + loss_classifier
            # backward
            loss.backward()
            # update the parameter
            self.optimizer.step()
            # logging
            train_loss += loss.item()
            train_loss_vae += loss_vae.item()
            train_loss_classifier += loss_classifier.item()
            train_acc += self.model.acc(pre_y, y)
            if batch_idx % 20 == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, batch_idx * len(x), len(self.train_loader.dataset),
                    100. * batch_idx / len(self.train_loader),
                    loss.item() / len(x)))

        train_loss /= len(self.train_loader.dataset)
        train_loss_vae /= len(self.train_loader.dataset)
        train_loss_classifier /= len(self.train_loader.dataset)
        train_acc /= len(self.train_loader.dataset)
        print('====> Epoch: {} Average loss: {:.4f}'.format(epoch, train_loss))

        return train_loss, train_loss_vae, train_loss_classifier, train_acc

    def valid(self, epoch):
        # test mode
        self.model.eval()
        valid_loss = 0
        valid_loss_vae = 0
        valid_loss_classifier = 0
        valid_acc = 0
        # test mode
        with torch.no_grad():
            for i, (x, y) in enumerate(self.valid_loader):
                x, y = x.to(self.device), y.to(self.device)
                rec_x, pre_y, mu, logvar = self.model.valid(x)
                loss_vae = self.model.loss_function_vae(rec_x, x, mu, logvar)
                loss_classifier = self.model.loss_function_classifier(pre_y, y)
                loss = loss_vae + loss_classifier
                valid_loss += loss.item()
                valid_loss_vae += loss_vae.item()
                valid_loss_classifier += loss_classifier.item()
                valid_acc += self.model.acc(pre_y, y)

        valid_loss /= len(self.valid_loader.dataset)
        valid_loss_vae /= len(self.valid_loader.dataset)
        valid_loss_classifier /= len(self.valid_loader.dataset)
        valid_acc /= len(self.valid_loader.dataset)
        print('====> Validation set loss: {:.4f}'.format(valid_loss))

        return valid_loss, valid_loss_vae, valid_loss_classifier, valid_acc

    def auto_train(self, max_epoch, save_path=None):
        train_set, valid_set = torch.utils.data.random_split(
            self.dataset, [
                int(len(self.dataset) * 0.8),
                len(self.dataset) - int(len(self.dataset) * 0.8)
            ])
        self.train_loader = torch.utils.data.DataLoader(train_set,
                                                        batch_size=10,
                                                        shuffle=True)
        self.valid_loader = torch.utils.data.DataLoader(valid_set,
                                                        batch_size=10,
                                                        shuffle=True)

        train_loss = []
        train_loss_vae = []
        train_loss_classifier = []
        train_acc = []
        valid_loss = []
        valid_loss_vae = []
        valid_loss_classifier = []
        valid_acc = []
        for epoch in range(1, max_epoch):
            t_loss, t_loss_vae, t_loss_classifier, t_acc = self.train(
                epoch, max_epoch)
            v_loss, v_loss_vae, v_loss_classifier, v_acc = self.valid(epoch)
            train_loss.append(t_loss)
            train_loss_vae.append(t_loss_vae)
            train_loss_classifier.append(t_loss_classifier)
            train_acc.append(t_acc)
            valid_loss.append(v_loss)
            valid_loss_vae.append(v_loss_vae)
            valid_loss_classifier.append(v_loss_classifier)
            valid_acc.append(v_acc)
        # plot result
        if save_path is not None:
            fig, ax = plt.subplots(4, 1, figsize=(8, 16))
            ax[0].set_title('Loss')
            ax[1].set_title('VAE Loss')
            ax[2].set_title('Classifier Loss')
            ax[3].set_title('Accuracy')
            for i in range(3):
                ax[i].set_xlabel('Epochs')
                ax[i].set_ylabel('Loss')
            ax[0].plot(range(1, max_epoch), train_loss, label="train")
            ax[0].plot(range(1, max_epoch), valid_loss, label="validation")
            ax[1].plot(range(1, max_epoch), train_loss_vae, label="train")
            ax[1].plot(range(1, max_epoch), valid_loss_vae, label="validation")
            ax[2].plot(range(1, max_epoch),
                       train_loss_classifier,
                       label="train")
            ax[2].plot(range(1, max_epoch),
                       valid_loss_classifier,
                       label="validation")
            ax[3].set_xlabel('Epochs')
            ax[3].set_ylabel('Accuracy')
            ax[3].plot(range(1, max_epoch), train_acc, label="train")
            ax[3].plot(range(1, max_epoch), valid_acc, label="validation")
            for i in range(3):
                ax[i].legend()
            plt.tight_layout()
            plt.savefig(save_path + '/loss.png')
            plt.close()

    def save_weight(self, save_path='../result/VAE-score/model/vae'):
        torch.save(self.model.state_dict(), save_path)

    def load_weight(self, load_path='../result/VAE-score/model/vae'):
        self.model.load_state_dict(torch.load(load_path))

    def plot_z(self, save_path='../result/VAE-score/model/result.png'):
        # print z all data
        loader = torch.utils.data.DataLoader(self.dataset,
                                             batch_size=len(self.dataset),
                                             shuffle=False)
        all_z = []
        all_ans = []
        self.model.eval()
        with torch.no_grad():
            for i, (data, ans) in enumerate(loader):
                data = data.to(self.device)
                _, _, mu, logvar = self.model.forward(data)
                all_z = np.append(all_z, mu.to('cpu').clone().numpy())

        all_z = np.array(all_z).reshape(-1, self.model.z_shape)
        all_ans = self.dataset.ans

        # LDA
        #self.lda = LDA(n_components = 2)
        #self.lda.fit(all_z, all_ans)
        #lda_z = self.lda.transform(all_z)
        #lda_z = lda_z.transpose()

        #z_xrange = [np.min(lda_z[0]), np.max(lda_z[0])]
        #z_yrange = [np.min(lda_z[1]), np.max(lda_z[1])]
        #plot_z(lda_z[0], lda_z[1], all_ans, "z map", save_path.split('.png')[0] + '_LDA.png', z_xrange, z_yrange)
        #plot_z_each(lda_z, all_ans, self.dataset.filenames, '../data/succeed_list_sound.csv', "z map",
        #           save_path.split('.png')[0] + '_LDA_each.png', z_xrange, z_yrange)

        # ICA
        self.ica = FastICA(n_components=2)
        self.ica.fit(all_z)
        ica_z = self.ica.transform(all_z)
        ica_z = ica_z.transpose()

        z_xrange = [np.min(ica_z[0]), np.max(ica_z[0])]
        z_yrange = [np.min(ica_z[1]), np.max(ica_z[1])]
        plot_z(ica_z[0], ica_z[1], all_ans, "z map",
               save_path.split('.png')[0] + '_ICA.png', z_xrange, z_yrange)
        plot_z_each(ica_z, all_ans, self.dataset.filenames,
                    '../data/succeed_list_sound.csv', "z map",
                    save_path.split('.png')[0] + '_ICA_each.png', z_xrange,
                    z_yrange)
        return all_z, all_ans, ica_z.transpose()

    def reconstruct(self,
                    save_path='../result/VAE-score/reconstructed_sounds'):
        loader = torch.utils.data.DataLoader(self.dataset,
                                             batch_size=1,
                                             shuffle=False)
        self.model.eval()
        with torch.no_grad():
            for i, (x, y) in enumerate(loader):
                x = x.to(self.device)
                recon_x, _, _, _ = self.model.forward(x)
                recon_x = recon_x.to('cpu').clone().numpy()
                x = x.to('cpu').clone().numpy()
                x = x.reshape(3, -1)
                recon_x = recon_x.reshape(3, -1)
                # to png
                fig, ax = plt.subplots(2, 3, figsize=(24, 12))
                ax[0][0].set_title('L')
                ax[0][1].set_title('C')
                ax[0][2].set_title('R')
                ax[1][0].set_title('reconstructed L')
                ax[1][1].set_title('reconstructed C')
                ax[1][2].set_title('reconstructed R')
                time = range(len(x[0]))
                for j in range(3):
                    ax[0][j].set_ylim(0, 1)
                    ax[1][j].set_ylim(0, 1)
                    ax[0][j].plot(time, x[j], linewidth=1)
                    ax[1][j].plot(time, recon_x[j], linewidth=1)
                plt.tight_layout()
                plt.savefig(save_path + '/' +
                            self.dataset.filenames[i].split('.csv')[0] +
                            '.png')
                plt.close()
                # to csv
                save_data = pd.DataFrame(data=recon_x)
                save_data.to_csv(save_path + '/' + self.dataset.filenames[i],
                                 index=False)
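
The ICA step in plot_z() above can be sketched on its own: project a set of
latent vectors onto two independent components for a 2-D scatter plot.  The
toy all_z array stands in for the collected VAE means; plot_z/plot_z_each
themselves are not reproduced here.

import numpy as np
from sklearn.decomposition import FastICA

rng = np.random.RandomState(0)
all_z = rng.randn(300, 20)                 # stand-in for the collected mu vectors

ica = FastICA(n_components=2, random_state=0)
ica_z = ica.fit_transform(all_z).T         # shape (2, n_samples), as in plot_z
z_xrange = [np.min(ica_z[0]), np.max(ica_z[0])]
z_yrange = [np.min(ica_z[1]), np.max(ica_z[1])]
print(z_xrange, z_yrange)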
Example #38
class ICA(object):
    """M/EEG signal decomposition using Independent Component Analysis (ICA)

    This object can be used to estimate ICA components and then
    remove some from Raw or Epochs for data exploration or artifact
    correction.

    Parameters
    ----------
    n_components : int | float | None
        The number of components used for ICA decomposition. If int, it must
        be smaller than max_n_components. If None, all PCA components will be
        used. If float between 0 and 1, components will be selected by the
        cumulative percentage of explained variance.
    max_n_components : int | None
        The number of components used for PCA decomposition. If None, no
        dimension reduction will be applied and max_n_components will equal
        the number of channels supplied on decomposing data.
    noise_cov : None | instance of mne.cov.Covariance
        Noise covariance used for whitening. If None, channels are just
        z-scored.
    random_state : None | int | instance of np.random.RandomState
        np.random.RandomState to initialize the FastICA estimation.
        As the estimation is non-deterministic it can be useful to
        fix the seed to have reproducible results.
    algorithm : {'parallel', 'deflation'}
        Apply parallel or deflational algorithm for FastICA
    fun : string or function, optional. Default: 'logcosh'
        The functional form of the G function used in the
        approximation to neg-entropy. Could be either 'logcosh', 'exp',
        or 'cube'.
        You can also provide your own function. It should return a tuple
        containing the value of the function, and of its derivative, in the
        point.
    fun_args : dictionary, optional
        Arguments to send to the functional form.
        If empty and if fun='logcosh', fun_args will take value
        {'alpha' : 1.0}
    verbose : bool, str, int, or None
        If not None, override default verbose level (see mne.verbose).

    Attributes
    ----------
    last_fit : str
        Flag informing about which type was last fit.
    ch_names : list-like
        Channel names resulting from initial picking.
    n_components : int
        The number of components used for ICA decomposition.
    max_n_components : int
        The number of PCA dimensions computed.
    verbose : bool, str, int, or None
        See above.
    """
    @verbose
    def __init__(self, n_components, max_n_components=100, noise_cov=None,
                 random_state=None, algorithm='parallel', fun='logcosh',
                 fun_args=None, verbose=None):
        try:
            from sklearn.decomposition import FastICA  # to avoid strong dep.
        except ImportError:
            raise Exception('the scikit-learn package is missing and '
                            'required for ICA')
        self.noise_cov = noise_cov

        # sklearn < 0.11 does not support random_state argument for FastICA
        kwargs = {'algorithm': algorithm, 'fun': fun, 'fun_args': fun_args}

        if random_state is not None:
            aspec = inspect.getargspec(FastICA.__init__)
            if 'random_state' not in aspec.args:
                warnings.warn('random_state argument ignored, update '
                              'scikit-learn to version 0.11 or newer')
            else:
                kwargs['random_state'] = random_state

        if max_n_components is not None and n_components > max_n_components:
            raise ValueError('n_components must be smaller than '
                             'max_n_components')

        if isinstance(n_components, float):
            if not 0 < n_components <= 1:
                raise ValueError('For selecting ICA components by the '
                                 'explained variance of PCA components the'
                                 ' float value must be between 0.0 and 1.0 ')
            self._explained_var = n_components
            logger.info('Selecting pca_components via explained variance.')
        else:
            self._explained_var = 1.1
            logger.info('Selecting pca_components directly.')

        self._ica = FastICA(**kwargs)
        self.current_fit = 'unfitted'
        self.verbose = verbose
        self.n_components = n_components
        self.max_n_components = max_n_components
        self.ch_names = None
        self._mixing = None

    def __repr__(self):
        s = 'ICA '
        if self.current_fit == 'unfitted':
            msg = '(no'
        elif self.current_fit == 'raw':
            msg = '(raw data'
        else:
            msg = '(epochs'
        msg += ' decomposition, '

        s += msg + ('%s components' % str(self.n_components) if
               self.n_components else 'no dimension reduction') + ')'

        return s

    @verbose
    def decompose_raw(self, raw, picks=None, start=None, stop=None,
                      verbose=None):
        """Run the ICA decomposition on raw data

        Parameters
        ----------
        raw : instance of mne.fiff.Raw
            Raw measurements to be decomposed.
        picks : array-like
            Channels to be included. This selection remains throughout the
            initialized ICA session. If None only good data channels are used.
        start : int
            First sample to include (first is 0). If omitted, defaults to the
            first sample in data.
        stop : int
            First sample to not include. If omitted, data is included to the
            end.
        verbose : bool, str, int, or None
            If not None, override default verbose level (see mne.verbose).
            Defaults to self.verbose.

        Returns
        -------
        self : instance of ICA
            Returns the modified instance.
        """
        if self.current_fit != 'unfitted':
            raise RuntimeError('ICA decomposition has already been fitted. '
                               'Please start a new ICA session.')

        logger.info('Computing signal decomposition on raw data. '
                    'Please be patient, this may take some time')

        if picks is None:  # just use good data channels
            picks = pick_types(raw.info, meg=True, eeg=True, eog=False,
                               ecg=False, misc=False, stim=False,
                               exclude=raw.info['bads'])

        if self.max_n_components is None:
            self.max_n_components = len(picks)
            logger.info('Inferring max_n_components from picks.')

        self.ch_names = [raw.ch_names[k] for k in picks]

        data, self._pre_whitener = self._pre_whiten(raw[picks, start:stop][0],
                                                   raw.info, picks)

        to_ica, self._pca = self._prepare_pca(data, self.max_n_components)

        self._ica.fit(to_ica)
        self._mixing = self._ica.get_mixing_matrix().T
        self.current_fit = 'raw'

        return self

    @verbose
    def decompose_epochs(self, epochs, picks=None, verbose=None):
        """Run the ICA decomposition on epochs

        Parameters
        ----------
        epochs : instance of Epochs
            The epochs. The ICA is estimated on the concatenated epochs.
        picks : array-like
            Channels to be included relative to the channels already picked on
            epochs-initialization. This selection remains throughout the
            initialized ICA session.
        verbose : bool, str, int, or None
            If not None, override default verbose level (see mne.verbose).
            Defaults to self.verbose.

        Returns
        -------
        self : instance of ICA
            Returns the modified instance.
        """
        if self.current_fit != 'unfitted':
            raise RuntimeError('ICA decomposition has already been fitted. '
                               'Please start a new ICA session.')

        logger.info('Computing signal decomposition on epochs. '
                    'Please be patient, this may take some time')

        if picks is None:  # just use epochs good data channels and avoid
            picks = pick_types(epochs.info, include=epochs.ch_names,  # double
                               exclude=epochs.info['bads'])  # picking

        meeg_picks = pick_types(epochs.info, meg=True, eeg=True, eog=False,
                                ecg=False, misc=False, stim=False,
                                exclude=epochs.info['bads'])

        # filter out all the channels the raw wouldn't have initialized
        picks = np.intersect1d(meeg_picks, picks)

        self.ch_names = [epochs.ch_names[k] for k in picks]

        if self.max_n_components is None:
            self.max_n_components = len(picks)
            logger.info('Inferring max_n_components from picks.')

        data, self._pre_whitener = self._pre_whiten(
                                np.hstack(epochs.get_data()[:, picks]),
                                epochs.info, picks)

        to_ica, self._pca = self._prepare_pca(data, self.max_n_components)

        self._ica.fit(to_ica)
        self._mixing = self._ica.get_mixing_matrix().T
        self.current_fit = 'epochs'

        return self

    def get_sources_raw(self, raw, start=None, stop=None):
        """Estimate raw sources given the unmixing matrix

        Parameters
        ----------
        raw : instance of Raw
            Raw object to draw sources from.
        start : int
            First sample to include (first is 0). If omitted, defaults to the
            first sample in data.
        stop : int
            First sample to not include.
            If omitted, data is included to the end.

        Returns
        -------
        sources : array, shape = (n_components, n_times)
            The ICA sources time series.
        """
        if self._mixing is None:
            raise RuntimeError('No fit available. Please first fit ICA '
                               'decomposition.')

        return self._get_sources_raw(raw, start, stop)[0]

    def _get_sources_raw(self, raw, start, stop):
        picks = [raw.ch_names.index(k) for k in self.ch_names]
        data, _ = self._pre_whiten(raw[picks, start:stop][0], raw.info, picks)
        pca_data = self._pca.transform(data.T)
        raw_sources = self._ica.transform(pca_data[:, self._comp_idx]).T

        return raw_sources, pca_data

    def get_sources_epochs(self, epochs, concatenate=False):
        """Estimate epochs sources given the unmixing matrix

        Parameters
        ----------
        epochs : instance of Epochs
            Epochs object to draw sources from.
        concatenate : bool
            If true, epochs and time slices will be concatenated.

        Returns
        -------
        epochs_sources : ndarray of shape (n_epochs, n_sources, n_times)
            The sources for each epoch
        """
        if self._mixing is None:
            raise RuntimeError('No fit available. Please first fit ICA '
                               'decomposition.')

        return self._get_sources_epochs(epochs, concatenate)[0]

    def _get_sources_epochs(self, epochs, concatenate):

        picks = pick_types(epochs.info, include=self.ch_names,
                               exclude=epochs.info['bads'])

        # special case where epochs come picked but fit was 'unpicked'.
        if len(picks) != len(self.ch_names):
            raise RuntimeError('Epochs don\'t match fitted data: %i channels '
                               'fitted but %i channels supplied. \nPlease '
                               'provide Epochs compatible with '
                               'ica.ch_names' % (len(self.ch_names),
                                                  len(picks)))

        data, _ = self._pre_whiten(np.hstack(epochs.get_data()[:, picks]),
                                   epochs.info, picks)

        pca_data = self._pca.transform(data.T)
        sources = self._ica.transform(pca_data[:, self._comp_idx]).T
        sources = np.array(np.split(sources, len(epochs.events), 1))

        if concatenate:
            sources = np.hstack(sources)

        return sources, pca_data

    def export_sources(self, raw, picks=None, start=None, stop=None):
        """Export sources as raw object

        Parameters
        ----------
        raw : instance of Raw
            Raw object to export sources from.
        picks : array-like
            Channels to be included in addition to the sources. If None,
            artifact and stimulus channels will be included.
        start : int
            First sample to include (first is 0). If omitted, defaults to the
            first sample in data.
        stop : int
            First sample to not include. If omitted, data is included to the
            end.

        Returns
        -------
        out : instance of mne.Raw
            Container object for ICA sources

        """
        if not raw._preloaded:
            raise ValueError('raw data should be preloaded to have this '
                             'working. Please read raw data with '
                             'preload=True.')

        # include 'reference' channels for comparison with ICA
        if picks is None:
            picks = pick_types(raw.info, meg=False, eeg=False, misc=True,
                               ecg=True, eog=True, stim=True)

        # merge copied instance and picked data with sources
        out = raw.copy()
        out.fids = []
        sources = self.get_sources_raw(raw, start=start, stop=stop)
        out._data = np.r_[sources, raw[picks, start:stop][0]]

        # update first and last samples
        out.first_samp = raw.first_samp + (start if start else 0)
        out.last_samp = out.first_samp + stop if stop else raw.last_samp

        # set channel names and info
        ch_names = out.info['ch_names'] = []
        ch_info = out.info['chs'] = []
        for i in xrange(self.n_components):
            ch_names.append('ICA %03d' % (i + 1))
            ch_info.append(dict(ch_name='ICA %03d' % (i + 1), cal=1,
                logno=i + 1, coil_type=FIFF.FIFFV_COIL_NONE,
                kind=FIFF.FIFFV_MISC_CH, coord_Frame=FIFF.FIFFV_COORD_UNKNOWN,
                loc=np.array([0.,  0.,  0.,  1., 0.,  0.,  0.,  1.,
                              0.,  0.,  0.,  1.], dtype=np.float32),
                unit=FIFF.FIFF_UNIT_NONE, eeg_loc=None, range=1.0,
                scanno=i + 1, unit_mul=0, coil_trans=None))

        # re-append additionally picked ch_names
        ch_names += [raw.ch_names[k] for k in picks]
        # re-append additionally picked ch_info
        ch_info += [raw.info['chs'][k] for k in picks]

        # update number of channels
        out.info['nchan'] = len(picks) + self.n_components

        return out

    def plot_sources_raw(self, raw, order=None, start=None, stop=None,
                         n_components=None, source_idx=None, ncol=3, nrow=10,
                         show=True):
        """Create panel plots of ICA sources. Wrapper around viz.plot_ica_panel

        Parameters
        ----------
        raw : instance of mne.fiff.Raw
            Raw object to plot the sources from.
        order : ndarray | None.
            Index of length n_components. If None, plot will show the sources
            in the order as fitted.
            Example: arg_sort = np.argsort(np.var(sources)).
        start : int
            X-axis start index. If None from the beginning.
        stop : int
            X-axis stop index. If None to the end.
        n_components : int
            Number of components fitted.
        source_idx : array-like
            Indices for subsetting the sources.
        ncol : int
            Number of panel-columns.
        nrow : int
            Number of panel-rows.
        show : bool
            If True, plot will be shown, else just the figure is returned.

        Returns
        -------
        fig : instance of pyplot.Figure
        """

        sources = self.get_sources_raw(raw, start=start, stop=stop)

        if order is not None:
            if len(order) != sources.shape[0]:
                raise ValueError('order and sources have to be of the '
                                 'same length.')
            else:
                sources = sources[order]

        fig = plot_ica_panel(sources, start=0 if start is not None else start,
                             stop=(stop - start) if stop is not None else stop,
                             n_components=n_components, source_idx=source_idx,
                             ncol=ncol, nrow=nrow)
        if show:
            import matplotlib.pylab as pl
            pl.show()

        return fig

    def plot_sources_epochs(self, epochs, epoch_idx=None, order=None,
                            start=None, stop=None, n_components=None,
                            source_idx=None, ncol=3, nrow=10, show=True):
        """Create panel plots of ICA sources. Wrapper around viz.plot_ica_panel

        Parameters
        ----------
        epochs : instance of mne.Epochs
            Epochs object to plot the sources from.
        epoch_idx : int
            Index to plot particular epoch.
        order : ndarray | None.
            Index of length n_components. If None, plot will show the sources
            in the order as fitted.
            Example: arg_sort = np.argsort(np.var(sources)).
        start : int
            X-axis start index. If None from the beginning.
        stop : int
            X-axis stop index. If None to the end.
        n_components : int
            Number of components fitted.
        source_idx : array-like
            Indices for subsetting the sources.
        ncol : int
            Number of panel-columns.
        nrow : int
            Number of panel-rows.
        show : bool
            If True, plot will be shown, else just the figure is returned.

        Returns
        -------
        fig : instance of pyplot.Figure
        """
        sources = self.get_sources_epochs(epochs, concatenate=True if epoch_idx
                                          is None else False)
        source_dim = 1 if sources.ndim > 2 else 0
        if order is not None:
            if len(order) != sources.shape[source_dim]:
                raise ValueError('order and sources have to be of the '
                                 'same length.')
            else:
                sources = (sources[:, order] if source_dim
                           else sources[order])

        fig = plot_ica_panel(sources[epoch_idx], start=start, stop=stop,
                             n_components=n_components, source_idx=source_idx,
                             ncol=ncol, nrow=nrow)
        if show:
            import matplotlib.pylab as pl
            pl.show()

        return fig

    def find_sources_raw(self, raw, target=None, score_func='pearsonr',
                         start=None, stop=None):
        """Find sources based on own distribution or based on similarity to
        other sources or between source and target.

        Parameters
        ----------
        raw : instance of Raw
            Raw object to draw sources from.
        target : array-like | ch_name | None
            Signal to which the sources shall be compared. It has to be of
            the same shape as the sources. If some string is supplied, a
            routine will try to find a matching channel. If None, a score
            function expecting only one input-array argument must be used,
            for instance, scipy.stats.skew (default).
        score_func : callable | str label
            Callable taking as arguments either two input arrays
            (e.g. pearson correlation) or one input
            array (e. g. skewness) and returns a float. For convenience the
            most common score_funcs are available via string labels: Currently,
            all distance metrics from scipy.spatial and all functions from
            scipy.stats taking compatible input arguments are supported. These
            functions have been modified to support iteration over the rows of a
            2D array.
        start : int
            First sample to include (first is 0). If omitted, defaults to the
            first sample in data.
        stop : int
            First sample not to include. If omitted, data is included up to
            the end.

        Returns
        -------
        scores : ndarray
            scores for each source as returned from score_func
        """
        # auto source drawing
        sources = self.get_sources_raw(raw=raw, start=start, stop=stop)

        # auto target selection
        if target is not None:
            if hasattr(target, 'ndim'):
                if target.ndim < 2:
                    target = target.reshape(1, target.shape[-1])
            if isinstance(target, str):
                pick = _get_target_ch(raw, target)
                target, _ = raw[pick, start:stop]
            if sources.shape[1] != target.shape[1]:
                raise ValueError('Source and targets do not have the same '
                                 'number of time slices.')
            target = target.ravel()

        return _find_sources(sources, target, score_func)

    def find_sources_epochs(self, epochs, target=None, score_func='pearsonr'):
        """Find sources based on relations between source and target

        Parameters
        ----------
        epochs : instance of Epochs
            Epochs object to draw sources from.
        target : array-like | ch_name | None
            Signal to which the sources shall be compared. It has to be of
            the same shape as the sources. If some string is supplied, a
            routine will try to find a matching channel. If None, a score
            function expecting only one input-array argument must be used,
            for instance, scipy.stats.skew (default).
        score_func : callable | str label
            Callable taking as arguments either two input arrays
            (e.g. pearson correlation) or one input
            array (e. g. skewness) and returns a float. For convenience the
            most common score_funcs are available via string labels: Currently,
            all distance metrics from scipy.spatial and all functions from
            scipy.stats taking compatible input arguments are supported. These
            functions have been modified to support iteration over the rows of a
            2D array.

        Returns
        -------
        scores : ndarray
            scores for each source as returned from score_func
        """
        sources = self.get_sources_epochs(epochs=epochs)
        # auto target selection
        if target is not None:
            if hasattr(target, 'ndim'):
                if target.ndim < 3:
                    target = target.reshape(1, 1, target.shape[-1])
            if isinstance(target, str):
                pick = _get_target_ch(epochs, target)
                target = epochs.get_data()[:, pick]
            if sources.shape[2] != target.shape[2]:
                raise ValueError('Source and targets do not have the same '
                                 'number of time slices.')
            target = target.ravel()

        return _find_sources(np.hstack(sources), target, score_func)

    def pick_sources_raw(self, raw, include=None, exclude=None,
                         n_pca_components=64, start=None, stop=None,
                         copy=True):
        """Recompose raw data including or excluding some sources

        Parameters
        ----------
        raw : instance of Raw
            Raw object to pick to remove ICA components from.
        include : list-like | None
            The source indices to use. If None, all are used.
        exclude : list-like | None
            The source indices to remove. If None, all sources are kept.
        n_pca_components : int
            The number of PCA components to be unwhitened, where n_components
            is the lower bound and max_n_components the upper bound.
            If greater than self.n_components, the PCA components that were not
            supplied to the ICA will get re-attached. This can be used to take
            back the PCA dimension reduction.
        start : int | None
            The first time index to include.
        stop : int | None
            The first time index to exclude.
        copy : bool
            If True, return a modified copy of the raw instance; otherwise
            modify it in place.

        Returns
        -------
        raw : instance of Raw
            raw instance with selected ICA components removed
        """
        if not raw._preloaded:
            raise ValueError('raw data must be preloaded to use this method. '
                             'Please read the raw data with preload=True.')

        if self.current_fit != 'raw':
            raise ValueError('Currently no raw data fitted. '
                             'Please fit raw data first.')

        sources, pca_data = self._get_sources_raw(raw, start=start, stop=stop)
        recomposed = self._pick_sources(sources, pca_data, include, exclude,
                                        n_pca_components)

        if copy is True:
            raw = raw.copy()

        picks = [raw.ch_names.index(k) for k in self.ch_names]
        raw[picks, start:stop] = recomposed
        return raw

    def pick_sources_epochs(self, epochs, include=None, exclude=None,
                            n_pca_components=64, copy=True):
        """Recompose epochs

        Parameters
        ----------
        epochs : instance of Epochs
            Epochs object to pick to remove ICA components from.
        include : list-like | None
            The source indices to use. If None, all are used.
        exclude : list-like | None
            The source indices to remove. If None, all sources are kept.
        n_pca_components : int
            The number of PCA components to be unwhitened, where n_components
            is the lower bound and max_n_components the upper bound.
            If greater than self.n_components, the PCA components that were not
            supplied to the ICA will get re-attached. This can be used to take
            back the PCA dimension reduction.
        copy : bool
            If True, return a modified copy of the Epochs instance; otherwise
            modify it in place.

        Returns
        -------
        epochs : instance of Epochs
            Epochs with selected ICA components removed.
        """
        if not epochs.preload:
            raise ValueError('epochs must be preloaded to use this method. '
                             'Please load epochs with preload=True.')

        sources, pca_data = self._get_sources_epochs(epochs, True)
        picks = pick_types(epochs.info, include=self.ch_names,
                           exclude=epochs.info['bads'])

        if copy is True:
            epochs = epochs.copy()

        # put sources-dimension first for selection
        recomposed = self._pick_sources(sources, pca_data, include, exclude,
                                        n_pca_components)
        # restore epochs, channels, tsl order
        epochs._data[:, picks] = np.array(np.split(recomposed,
                                          len(epochs.events), 1))
        epochs.preload = True

        return epochs

    def _pre_whiten(self, data, info, picks):
        """Helper function"""
        if self.noise_cov is None:  # use standardization as whitener
            pre_whitener = np.std(data) ** -1
            data *= pre_whitener
        else:  # pick cov
            ncov = deepcopy(self.noise_cov)
            if ncov.ch_names != self.ch_names:
                ncov['data'] = ncov.data[picks][:, picks]
            assert data.shape[0] == ncov.data.shape[0]
            pre_whitener, _ = compute_whitener(ncov, info, picks)
            data = np.dot(pre_whitener, data)

        return data, pre_whitener

    def _prepare_pca(self, data, max_n_components):
        """ Helper Function """
        from sklearn.decomposition import RandomizedPCA

        # sklearn < 0.11 does not support random_state argument
        kwargs = {'n_components': max_n_components, 'whiten': False}

        aspec = inspect.getargspec(RandomizedPCA.__init__)
        if 'random_state' not in aspec.args:
            warnings.warn('RandomizedPCA does not support the random_state '
                          'argument. Update scikit-learn to version 0.11 '
                          'or newer to get reproducible results.')
        else:
            kwargs['random_state'] = 0

        pca = RandomizedPCA(**kwargs)
        pca_data = pca.fit_transform(data.T)

        if self._explained_var > 1.0:
            if self.n_components is not None:  # normal n case
                self._comp_idx = np.arange(self.n_components)
                to_ica = pca_data[:, self._comp_idx]
            else:  # None case
                to_ica = pca_data
                self.n_components = pca_data.shape[1]
                self._comp_idx = np.arange(self.n_components)
        else:  # float case
            expl_var = pca.explained_variance_ratio_
            self._comp_idx = (np.where(expl_var.cumsum() <
                                      self._explained_var)[0])
            to_ica = pca_data[:, self._comp_idx]
            self.n_components = len(self._comp_idx)

        return to_ica, pca

    def _pick_sources(self, sources, pca_data, include, exclude,
                      n_pca_components):
        """Helper function"""
        if not(self.n_components <= n_pca_components <= self.max_n_components):
            raise ValueError('n_pca_components must be between n_components'
                             ' and max_n_components.')

        if include not in (None, []):
            mute = [i for i in xrange(len(sources)) if i not in include]
            sources[mute, :] = 0.  # include via exclusion
        elif exclude not in (None, []):
            sources[exclude, :] = 0.  # just exclude

        # restore pca data
        mixing = self._mixing.copy()
        pca_restored = np.dot(sources.T, mixing)

        # re-append deselected pca dimension if desired
        if n_pca_components - self.n_components > 0:
            add_components = np.arange(self.n_components, n_pca_components)
            pca_reappend = pca_data[:, add_components]
            pca_restored = np.c_[pca_restored, pca_reappend]

        # restore sensor space data
        out = _inverse_t_pca(pca_restored, self._pca)

        # restore scaling
        pre_whitener = self._pre_whitener.copy()
        if self.noise_cov is None:  # revert standardization
            pre_whitener **= -1
            out *= pre_whitener
        else:
            out = np.dot(out, linalg.pinv(pre_whitener))

        return out.T
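Taken together, the methods above support a simple artifact-rejection workflow: score the sources, pick the offending ones, and recompose the data without them. The following is a minimal usage sketch, assuming `ica` is an already fitted instance of this class and `raw` is a preloaded recording covering `ica.ch_names`; the reference channel name is a placeholder, not a value from this document.

import numpy as np
from scipy.stats import skew

# Score each source by its own distribution (one-array score function).
skew_scores = ica.find_sources_raw(raw, target=None, score_func=skew)

# Score each source against a reference channel ('EOG 061' is a placeholder)
# using the 'pearsonr' string label documented above.
eog_scores = ica.find_sources_raw(raw, target='EOG 061', score_func='pearsonr')

# Exclude the source most correlated with the reference channel and
# recompose the data on a copy, keeping 64 PCA components.
bad = [int(np.argmax(np.abs(eog_scores)))]
raw_clean = ica.pick_sources_raw(raw, exclude=bad, n_pca_components=64,
                                 copy=True)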
Example #39
    plt.legend([b[0] for b in bars], cv_types)

    plt.show()


wine = load_wine()
X = np.array(wine.data, dtype=np.float32)
y = wine.target

scaler = StandardScaler()
X = scaler.fit_transform(X)

ica = ICA(n_components=4)
ica.fit(X)

X = ica.transform(X)

plot_bic(X)

gmm = mixture.GaussianMixture(n_components=2, covariance_type='tied')
gmm.fit(X)

gaussianGroups = [[], []]

for pt in X:
    res = gmm.predict(pt.reshape(1, -1))[0]
    gaussianGroups[res].append(pt)

print(gaussianGroups[1])
ax1.set_ylabel('Mean Cross Validation Accuracy')
ax1.axvline(gridSearch.best_estimator_.named_steps['ica'].n_components,
            linestyle=':',
            label='n_components chosen',
            linewidth=2)

plt.legend(prop=dict(size=12))
plt.title('Accuracy/kurtosis for ICA (best n_components = %d)' %
          gridSearch.best_estimator_.named_steps['ica'].n_components)
plt.show()

#Reducing the dimensions with optimal number of components
ica_new = FastICA(
    n_components=gridSearch.best_estimator_.named_steps['ica'].n_components)
ica_new.fit(X_train)
X_train_transformed = ica_new.transform(X_train)
X_test_transformed = ica_new.transform(X_test)

###############################################################################################################################
#Reconstruction Error

print("Calculating Reconstruction Error")

reconstruction_error = []
for comp in n_components:

    ica = FastICA(n_components=comp)
    X_transformed = ica.fit_transform(X_train)
    X_projected = ica.inverse_transform(X_transformed)
    reconstruction_error.append(((X_train - X_projected)**2).mean())
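To make the numbers collected above easier to read, they can be plotted against the candidate component counts. This is a small sketch under the assumption that `n_components` is the iterable of candidate values used in the loop above and that matplotlib is available as in the rest of the snippet.

import matplotlib.pyplot as plt

# reconstruction_error[i] is the mean squared error for n_components[i]
plt.figure()
plt.plot(list(n_components), reconstruction_error, marker='o')
plt.xlabel('Number of ICA components')
plt.ylabel('Mean squared reconstruction error')
plt.title('FastICA reconstruction error vs. number of components')
plt.show()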
Example #41
def extract_staining (img, mask, expected, white_threshold=.15, verbose=False):
	""" 
		Extract spatial distribution of the stain and pigment in the image
		
		Parameters
		----------
			
		img: array [height, width, 3]
			The RGB image to be decomposed. Range of values 0 - 255
		
		mask: array [height, width]
			The mask showing the embryo location. 
			
		expected: [(array [3], float), (array [3], float), (array [3], float)]
			List of pairs (RGB colour, confidence) for expected stain, 
			pigment and background colour respectively. If any of the 
			colours is not expected to be present, the corresponding RGB value 
			should be set to white ([255, 255, 255]).
			
		white_threshold: float, optional, default: .15
			Maximal length of a colour vector in CMY unit cube, which is 
			still considered white.
			
		verbose: Bool/str, optional, default: False
			Verbosity level:
			- False: silent run
			- True: report textual information
			- Path prefix: report textual information and report graphical
				information in verbose+<suffix>.jpg files
				
		
		Returns
		-------
		
		stain: array [height, width]
			Spatial distribution of the stain 
		
		confidence: float
			Confidence level that the expected stain is actually present 
			in the image [0..1]
			
		pigment: array [height, width]
			Spatial distribution of the pigment 

		colours: array [3, 3]
			RGB colours estimated from the image. If a colour is not
			present it is set to white
	"""
	
	img_ = img.copy()
	img = 1 - img/255.
	c = 1 - np.vstack([colour for colour, p in expected])/255.
	
	c_save = c.copy()

	norms = np.array([la.norm(c[0], 2), la.norm(c[1], 2), la.norm(c[2], 2)])
	
	_, d, _ = la.svd([clr/n for clr, n in zip(c, norms) if n > 1.e-2])
	if d[-1]/d[0] < .05 and c.shape[0] > 1:
		dist = []
		for i in range(1, c.shape[0]):
			comb = (0, i)
			if np.all(norms[list(comb)] > 0.01):
				dist += [(comb, la.norm(c[comb[0]]/norms[comb[0]] - c[comb[1]]/norms[comb[1]] ,2))]
		farthest = list(max(dist, key= lambda x: x[1])[0])
		norms_ = np.zeros_like(norms)
		norms_[farthest] = norms[farthest]
		norms = norms_
	
	c = np.array([clr/n for clr, n in zip(c, norms) if n > 1.e-2])
	
			
	prior = np.array([p for colour, p in expected])[norms > 1.e-2]

	X = img[(mask == 1)].reshape(-1, 3)
	np.random.shuffle(X)
	X = X[:500000] #
	non_white_x = np.sum(X**2, axis=-1) > white_threshold**2 
	
	if np.all(~non_white_x):
		c_est = np.ones((len(norms), c.shape[1]))
		c_est[norms > 1.e-2] -= c
		
		return np.zeros(img.shape[:2]), 0, np.zeros(img.shape[:2]), np.uint8(np.maximum(0, np.minimum(1, c_est))*255)
	
	#estimating number of independent components
	infos = []
	icamodels = []
	n_comp = 3 
	for j in range(n_comp):
		w0 = np.ones((1, 1))
		if j > 0:
			rot = np.ones((2, 2))*np.sqrt(0.5)
			rot[1, 0] *= -1
			w0 = np.eye(j + 1)
			for i in range(j):
				R = np.eye(j + 1)
				R[np.kron(np.arange(j + 1) != 2 - i, np.arange(j + 1) != 2 - i).reshape(R.shape)] = rot.ravel()
				w0 = w0.dot(R)

		ws = [np.eye(j + 1), w0]

		icas = []
		for k, w in enumerate(ws):
			ica = FastICA(j + 1, fun='exp', max_iter=500, w_init=w)
			
			res = ica.fit_transform(X)#
			
			if type(verbose) == str:
				res_ = ica.transform(img.reshape(-1, 3))
				
			ms = np.mean(res, axis=0)
			ss = np.std(res, axis=0)
			kln = 0
			for i, d in enumerate(res.T):
				q, bins = np.histogram(d, bins=50)
				q = np.asarray(q, dtype=float) + 1.
				q /= np.sum(q)
				 
				p = st.norm(ms[i], ss[i]).pdf(.5*(bins[:-1] + bins[1:]))
				p /= np.sum(p)
				
				kl = st.entropy(p, q) #
				
				kln += kl
				
			icas += [(kln, ica)]				
		
		info, ica = sorted(icas, key=lambda x : -x[0])[0]
		infos += [info/(j + 1)]
		icamodels += [ica]

	n_comp = min(c.shape[0], np.argmax(infos) + 1)
	rerun = True
	
	c_initial = c.copy()
	
	while rerun:
		rerun = False
		c = c_initial.copy()
		
		ica = icamodels[n_comp - 1]
		res = ica.transform(X)
		s0 = ica.transform(np.zeros((1, X.shape[1])))
		res -= s0

		adj_ica = icamodels[1] 
		dirs = adj_ica.mixing_.T.copy()
		dirs /= np.sqrt(np.sum(dirs**2, axis=-1))[:, np.newaxis]
		icas = np.cross(dirs[0], dirs[1])
		
		#deciding on which expected components are present
		models = []
		for ind in it.combinations(range(c.shape[0]), n_comp):
			S = np.array([[np.corrcoef(np.vstack([es, ex]))[0, 1] for es in res.T] for ex in (X.dot(la.pinv(c[list(ind)]))).T])
			sc = np.abs(la.det(S))
			acs = 0
			if n_comp == 2 and c.shape[0] > 2:
				acs = np.abs(icas.dot(np.cross(c[ind[0]], c[ind[1]])))
				
			models += [(sc + 10*acs, S, ind)]

		if c.shape[0] != n_comp:
			stain_score, corrs, best_ind = sorted([(sc, S, ind) for sc, S, ind in models if 0 in ind], key = lambda x: x[0])[-1]
		else:
			stain_score, corrs, best_ind = models[0]
			
		best_ind = sorted(list(best_ind))
		
		# adjusting expected colours
		if 1 in best_ind: 
			adj_ind = [0, 1]
			
			adj_ica = icamodels[1] 
			dirs = adj_ica.mixing_.T.copy()
			dirs /= np.sqrt(np.sum(dirs**2, axis=-1))[:, np.newaxis]
			icas = np.cross(dirs[0], dirs[1])
			
			rot_axis = c[adj_ind].mean(axis=0)
			rot_axis /= la.norm(rot_axis, 2)			
			
			angles = np.arange(-15., 16.)/180 * np.pi
			rotated = np.zeros((angles.shape[0], 3))
			rotCMY = np.zeros((angles.shape[0], len(adj_ind), 3))
		
			length = c[adj_ind].dot(rot_axis)
			project = c[adj_ind] - c[adj_ind].dot(rot_axis)[:, np.newaxis] * rot_axis[np.newaxis, :]
			e1 = project[np.argmax(np.sum(project**2, axis=1))].copy()
			e1 /= la.norm(e1, 2)
			e = np.array([e1, np.cross(rot_axis, e1)])
			project = e.dot(project.T)

			for i, a in enumerate(angles):
				A = e.T.dot(
					np.array([[np.cos(a), np.sin(a)]
							,[-np.sin(a), np.cos(a)]]))
				rotCMY[i] = (A.dot(project) + length[np.newaxis, :]*rot_axis[:, np.newaxis]).T
				rotated[i] = np.cross(rotCMY[i, 0], rotCMY[i, 1])

			acs = np.abs(rotated.dot(icas))
			
			c[adj_ind] = rotCMY[np.argmax(acs)]
			
			if verbose: 
				print "Adjusting expected colours: rotation angle = ", angles[np.argmax(acs)]/np.pi * 180
				
		
		# choosing the best decomposition
			
		ps = np.abs(corrs)
		P = np.zeros_like(ps)
		
		sh = np.array(P.shape) # - 1
		s = min(sh)
		best_score =  np.inf
		best_est = 0

		pr = prior[best_ind]

		X_ = X[non_white_x].T - ica.mean_[:, np.newaxis]

		dirs = ica.mixing_.T.copy()
		dirs /= np.sqrt(np.sum(dirs**2, axis=-1))[:, np.newaxis]
		dist2 = np.array([np.sum((np.eye(3) - ci[:, np.newaxis].dot(ci[np.newaxis, :])).dot(X_)**2, axis=0) for ci in dirs])
		comps_prjs_ = dirs.dot(X_)
		
		pw = np.exp(-dist2/(2*0.05**2))*comps_prjs_		
		
		for i0 in it.combinations(range(max(sh)), max(sh) - min(sh)):
			sl = [r for r in range(sh[0]) if r not in i0]
			for i1 in it.permutations(range(s), s):
				Q = np.zeros(sh)
				I = np.eye(s)
				Q[sl] = I[list(i1)]
				
				for i2 in range(s + 1):
					for i3 in it.combinations(range(s), i2):
						ones_ = np.ones(s)
						ones_[list(i3)] *= -1
						R = Q * ones_
				
						comps = R.dot(dirs) 
						
						comps_prjs = R.dot(comps_prjs_)		
						
						est = np.diag(comps_prjs[np.arange(comps_prjs.shape[0]), np.argmax(R.dot(pw), axis = -1)]).dot(comps) + ica.mean_ 
						est /= np.sqrt(np.sum(est**2, axis=-1))[:, np.newaxis]
						
						Dist = np.sqrt(np.sum((c[best_ind][(np.arange(len(best_ind)**2)%len(best_ind)).reshape((len(best_ind),)*2).T.ravel()] 
											- est[np.arange(len(best_ind)**2)%len(best_ind)])**2, axis=-1)).reshape((len(best_ind),)*2) 

						if Dist.shape[0] > 1:
							sc = np.mean(np.array([Dist[I[:, i4] == 1, i4]**2/np.mean(Dist[I[:, i4] == 0, i4]) for i4 in range(Dist.shape[0])]).ravel()*pr)
						else:
							sc = np.mean(np.array([Dist[I[:, i4] == 1, i4]**2 for i4 in range(Dist.shape[0])]).ravel()*pr)
							
							
						if sc < best_score:
							best_score = sc
							P = R
							best_est = est
				
		_, d, _ = la.svd(c)
		
		mean_check = np.maximum(0, (ica.mean_/la.norm(ica.mean_, 2)).dot(la.pinv(c)))
		mean_check = mean_check/np.sum(mean_check)
		
		if( d[-1]/d[0] > 0.05 and mean_check[1] > 5*mean_check[np.arange(mean_check.shape[0]) != 1].max()
			and np.all(np.abs(best_est - (ica.mean_/la.norm(ica.mean_, 2))[np.newaxis, :]) < white_threshold) ):
			conf = 0
			if 1 not in best_ind:
				best_ind = sorted(best_ind + [1])
				best_est_ = np.zeros((best_est.shape[0] + 1, best_est.shape[1]))
				best_est_[1:] = best_est
				best_est = best_est_
				n_comp += 1
			best_est[0] = np.zeros(3)
				
		#decomposing image and checking the result
		res = img.reshape(-1, 3).dot(la.pinv(best_est))
		
		_, d, _ = la.svd(best_est) 
		if n_comp == 3 and d[-1]/d[0] < 0.05 and la.norm(best_est[0], 2) > 0:
			infos[-1] = 0
			n_comp = np.argmax(infos) + 1
			rerun = True
			if verbose: 
				print "Rerun on unstable decomposition"
				print "singular values", d, d/d[0]
				print "infos", infos, n_comp
			
			continue

		if n_comp == 3 and len(np.where(np.vstack([c[1], best_est[1]]).dot(np.cross(best_est[0], best_est[2])) < 0)[0]) % 2 == 1:
			best_ind = [0, 2]
			best_est = best_est[best_ind]
			n_comp = 2
			res = img.reshape(-1, 3).dot(la.pinv(best_est))
			if verbose :
				print 'Dropping the pigment due to inconsistency'
		
		stain_m = c[ 0] - np.ones(3)/la.norm(np.ones(3), 2)
		stain_m /= la.norm(stain_m, 2)
		backg_m = c[-1] - np.ones(3)/la.norm(np.ones(3), 2)
		backg_m /= la.norm(backg_m, 2)
			
		
		def mode(data):
			h, b = np.histogram(data, bins=50)
			m = np.argmax(h)
			return .5*(b[m] + b[m+1])
		
		if( n_comp == 1 and mode(res[:, 0].reshape(img.shape[:2])[mask == 0]) > 1.0*mode(res[:, 0].reshape(img.shape[:2])[mask == 1]) and len(c) > 1
			and stain_m.dot(backg_m) < .95 ):
			infos[0] = 0
			n_comp = min(c.shape[0], np.argmax(infos) + 1)
			rerun = True
			if verbose: 
				print "Rerun on weak stain"
				print "stain_m.dot(backg_m)", stain_m.dot(backg_m)
			
			continue
				
		#Checking if pigment is confused with saturated stain
		if n_comp > 1 and 1 in best_ind:
			best_est_ = best_est.copy()
			best_est_[0] = np.ones(3)/np.sqrt(3)
			
			pigm_ = img.reshape(-1, 3).dot(la.pinv(best_est))
			pigm = pigm_[mask.ravel() == 1][:, 1]
			
			th = np.percentile(pigm_[mask.ravel() == 1, 1], 99)
			sqlens = (img**2).reshape(-1, 3).sum(axis=-1).reshape(img.shape[:2])
			darkest = img[(mask == 1) & (sqlens >= np.percentile(sqlens[mask == 1], 99.5))].reshape(-1, 3)
			
			pigm_ = img.reshape(-1, 3).dot(la.pinv(best_est_))

			_, d_, _ = la.svd(best_est_)
			
			dark_part = np.zeros(img.shape[:2])
			dark_part[sqlens >= np.sqrt(3)*(1 - white_threshold)] = sqlens[sqlens >= np.sqrt(3)*(1 - white_threshold)]
			dark_part[mask == 0] = 0
			
			plt.figure()
			ax=plt.subplot(121)
			toshow = np.zeros(img.shape[:2])
			toshow[mask == 1] = pigm
			ax.imshow(toshow)
			ax=plt.subplot(122)
			ax.imshow(dark_part)		
			
			if (d_[-1]/d_[0] < 0.01 or 
					np.corrcoef(np.vstack([pigm, dark_part[mask == 1].ravel()]))[0,1] > 0.75
					) and np.any(np.abs(1 - darkest.mean(axis=0)) < white_threshold):
				best_ind = [ind for ind in best_ind if ind != 1]
				best_est = best_est[best_ind]
				n_comp = len(best_ind)
				res = img.reshape(-1, 3).dot(la.pinv(best_est))
				if verbose:
					print 'Dropping pigment due to saturated stain'
			
		check = np.maximum(0, best_est[0].dot(la.pinv(c[best_ind])))
		conf = check[0]/np.sum(check)
	
	if type(verbose) == str:
		data = X
		steps = 15
		stainHist, _ = np.histogramdd(data, bins = [np.arange(np.min(data), np.max(data) + (np.max(data) - np.min(data))/steps, (np.max(data) - np.min(data))/steps)]*3)
		colors = []
		sizes = []
		for i in range(steps):
			for j in range(steps):
				for k in range(steps):
					colors += [[(i + 0.5)*((np.max(data) - np.min(data))/steps) + np.min(data)
							  , (j + 0.5)*((np.max(data) - np.min(data))/steps) + np.min(data)
							  , (k + 0.5)*((np.max(data) - np.min(data))/steps) + np.min(data)]]
					sizes += [stainHist[i, j, k]]

		colorsCMY = np.array(colors)
		sizes = np.array(sizes)
		colorsRGB = 1. - colorsCMY
		
		fig = plt.figure(figsize=(24, 20))
		ax = fig.add_subplot(111, projection='3d')
		ax.scatter(colorsCMY[:, 0][(sizes > 0) ]
				 , colorsCMY[:, 1][(sizes > 0) ]
				 , colorsCMY[:, 2][(sizes > 0) ]
				 , s=np.log(sizes[ (sizes > 0) ] + 1)*50
				 , c = colorsRGB[  (sizes > 0) ]) 
		limits = (ax.get_xlim(), ax.get_ylim(), ax.get_zlim())
		
		ax.scatter([ica.mean_[0]], [ica.mean_[1]], [ica.mean_[2]], c = 'k', marker='+')
		p0, p1 = (ica.mean_ - .67*ica.mixing_.dot(P.T).T[0]/la.norm(ica.mixing_.dot(P.T).T[0], 2)), (ica.mean_ + .67*ica.mixing_.dot(P.T).T[0]/la.norm(ica.mixing_.dot(P.T).T[0], 2)) 
		ax.plot([p0[0], p1[0]], [p0[1], p1[1]], [p0[2], p1[2]], c = 'k', linewidth=1) 
		if ica.mixing_.shape[1] > 1:
			p0, p1 = (ica.mean_ - .67*ica.mixing_.dot(P.T).T[1]/la.norm(ica.mixing_.dot(P.T).T[1], 2)), (ica.mean_ + .67*ica.mixing_.dot(P.T).T[1]/la.norm(ica.mixing_.dot(P.T).T[1], 2)) 
			ax.plot([p0[0], p1[0]], [p0[1], p1[1]], [p0[2], p1[2]], c = 'k', linewidth=1) 
		if ica.mixing_.shape[1] > 2:
			p0, p1 = (ica.mean_ - .67*ica.mixing_.dot(P.T).T[2]/la.norm(ica.mixing_.dot(P.T).T[2], 2)), (ica.mean_ + .67*ica.mixing_.dot(P.T).T[2]/la.norm(ica.mixing_.dot(P.T).T[2], 2)) 
			ax.plot([p0[0], p1[0]], [p0[1], p1[1]], [p0[2], p1[2]], c = 'k', linewidth=1)  
				
		ax.set_xlim(limits[0])
		ax.set_ylim(limits[1])
		ax.set_zlim(limits[2])
		ax.set_xlabel('C')
		ax.set_ylabel('M')
		ax.set_zlabel('Y')

		pca2 = PCA(2).fit(X)
		n = np.cross(pca2.components_[0], pca2.components_[1])
		n /= la.norm(n, 2)
		
		az = np.arctan2(n[0], n[1])*90./np.pi
		el = np.arccos(n[2])*90./np.pi
		ax.view_init(elev=el, azim=az)

		ver_name = verbose+'.independent_axes.jpg'
		fig.savefig(ver_name)
		
		print "saving", ver_name
		
		dirs = ica.mixing_.T.copy()
		dirs /= np.sqrt(np.sum(dirs**2, axis=-1))[:, np.newaxis]
		dist2 = np.array([np.sum((np.eye(3) - ci[:, np.newaxis].dot(ci[np.newaxis, :])).dot(colorsCMY.T - ica.mean_[:, np.newaxis])**2, axis=0) for ci in dirs])
		
		pw_ = np.exp(-dist2/(2*0.05**2))	
		
		pwth = 0.5
		
		fig = plt.figure(figsize=(24, 20))
		ax = fig.add_subplot(111, projection='3d')
		ax.scatter(colorsCMY[:, 0][(sizes > 0) & np.any(pw_ > pwth, axis=0)]
				 , colorsCMY[:, 1][(sizes > 0) & np.any(pw_ > pwth, axis=0)]
				 , colorsCMY[:, 2][(sizes > 0) & np.any(pw_ > pwth, axis=0)]
				 , s=np.log(sizes[ (sizes > 0) & np.any(pw_ > pwth, axis=0)] + 1)*50
				 , c = colorsRGB[  (sizes > 0) & np.any(pw_ > pwth, axis=0)]) 
		
		R = np.eye(P.shape[0])
		if P.shape[0] > n_comp:
			R[n_comp:] = 0

		R[0] *= -1
		
		if 0 in best_ind:
			est = np.diag(comps_prjs_[np.arange(comps_prjs_.shape[0]), np.argmax(R.dot(pw), axis = -1)]).dot(dirs) + ica.mean_ 
			est /= np.sqrt(np.sum(est**2, axis=-1))[:, np.newaxis]

			p0, p1 = np.zeros(3) , est[0] 
			ax.plot([p0[0], p1[0]], [p0[1], p1[1]], [p0[2], p1[2]], c = 1 - np.maximum(0, p1), linewidth=2) 
			if n_comp > 1:
				p0, p1 = np.zeros(3) , est[1] 
				ax.plot([p0[0], p1[0]], [p0[1], p1[1]], [p0[2], p1[2]], c = 1 - np.maximum(0, p1), linewidth=2) 
			if n_comp > 2:
				p0, p1 = np.zeros(3) , est[2] 
				ax.plot([p0[0], p1[0]], [p0[1], p1[1]], [p0[2], p1[2]], c = 1 - np.maximum(0, p1), linewidth=2)  

		ax.scatter([ica.mean_[0]], [ica.mean_[1]], [ica.mean_[2]], c = 'k', marker='+')
		p0, p1 = (ica.mean_ - .67*ica.mixing_.T[0]/la.norm(ica.mixing_.T[0], 2)), (ica.mean_ + .67*ica.mixing_.T[0]/la.norm(ica.mixing_.T[0], 2)) 
		ax.plot([p0[0], p1[0]], [p0[1], p1[1]], [p0[2], p1[2]], c = 'k', linewidth=1, alpha=.67) 
		if ica.mixing_.shape[1] > 1:
			p0, p1 = (ica.mean_ - .67*ica.mixing_.T[1]/la.norm(ica.mixing_.T[1], 2)), (ica.mean_ + .67*ica.mixing_.T[1]/la.norm(ica.mixing_.T[1], 2)) 
			ax.plot([p0[0], p1[0]], [p0[1], p1[1]], [p0[2], p1[2]], c = 'k', linewidth=1, alpha=.67) 
		if ica.mixing_.shape[1] > 2:
			p0, p1 = (ica.mean_ - .67*ica.mixing_.T[2]/la.norm(ica.mixing_.T[2], 2)), (ica.mean_ + .67*ica.mixing_.T[2]/la.norm(ica.mixing_.T[2], 2)) 
			ax.plot([p0[0], p1[0]], [p0[1], p1[1]], [p0[2], p1[2]], c = 'k', linewidth=1, alpha=.67)  

		class Arrow3D(FancyArrowPatch):
			def __init__(self, xs, ys, zs, *args, **kwargs):
				FancyArrowPatch.__init__(self, (0,0), (0,0), *args, **kwargs)
				self._verts3d = xs, ys, zs

			def draw(self, renderer):
				xs3d, ys3d, zs3d = self._verts3d
				xs, ys, zs = proj3d.proj_transform(xs3d, ys3d, zs3d, renderer.M)
				self.set_positions((xs[0],ys[0]),(xs[1],ys[1]))
				FancyArrowPatch.draw(self, renderer)

		p0, p1 = ica.mean_, (ica.mean_ + .15*ica.mixing_.dot(R.T).T[0]/la.norm(ica.mixing_.dot(R.T).T[0], 2))
		a = Arrow3D([p0[0], p1[0]], [p0[1], p1[1]], [p0[2], p1[2]], mutation_scale=20, lw=2, arrowstyle="-|>", color="b")
		ax.add_artist(a)
		if ica.mixing_.shape[1] > 1:
			p0, p1 = ica.mean_, (ica.mean_ + .15*ica.mixing_.dot(R.T).T[1]/la.norm(ica.mixing_.dot(R.T).T[1], 2)) 
			a = Arrow3D([p0[0], p1[0]], [p0[1], p1[1]], [p0[2], p1[2]], mutation_scale=20, lw=2, arrowstyle="-|>", color="b")
			ax.add_artist(a)
		if ica.mixing_.shape[1] > 2:
			p0, p1 = ica.mean_, (ica.mean_ + .15*ica.mixing_.dot(R.T).T[2]/la.norm(ica.mixing_.dot(R.T).T[2], 2)) 
			a = Arrow3D([p0[0], p1[0]], [p0[1], p1[1]], [p0[2], p1[2]], mutation_scale=20, lw=2, arrowstyle="-|>", color="b")
			ax.add_artist(a)
				
		ax.set_xlim(limits[0])
		ax.set_ylim(limits[1])
		ax.set_zlim(limits[2])
		ax.set_xlabel('C')
		ax.set_ylabel('M')
		ax.set_zlabel('Y')

		ax.view_init(elev=el, azim=az)

		ver_name = verbose+'.proposing_colours.jpg'
		fig.savefig(ver_name)
		
		print "saving", ver_name

		fig = plt.figure(figsize=(24, 20))
		ax = fig.add_subplot(111, projection='3d')
		
		R = P
		if P.shape[0] > n_comp:
			R[n_comp:] = 0
		
		est = best_est 

		p0, p1 = np.zeros(3) , est[0] 
		ax.plot([p0[0], p1[0]], [p0[1], p1[1]], [p0[2], p1[2]], c = 1 - np.maximum(0, p1), linewidth=2)
		if n_comp > 1:
			p0, p1 = np.zeros(3) , est[1] 
			ax.plot([p0[0], p1[0]], [p0[1], p1[1]], [p0[2], p1[2]], c = 1 - np.maximum(0, p1), linewidth=2) 
		if n_comp > 2:
			p0, p1 = np.zeros(3) , est[2] 
			ax.plot([p0[0], p1[0]], [p0[1], p1[1]], [p0[2], p1[2]], c = 1 - np.maximum(0, p1), linewidth=2)  

		ax.scatter([ica.mean_[0]], [ica.mean_[1]], [ica.mean_[2]], c = 'k', marker='+')
		p0, p1 = (ica.mean_ - .67*ica.mixing_.T[0]/la.norm(ica.mixing_.T[0], 2)), (ica.mean_ + .67*ica.mixing_.T[0]/la.norm(ica.mixing_.T[0], 2)) 
		ax.plot([p0[0], p1[0]], [p0[1], p1[1]], [p0[2], p1[2]], c = 'k', linewidth=1, alpha=.67) 
		if ica.mixing_.shape[1] > 1:
			p0, p1 = (ica.mean_ - .67*ica.mixing_.T[1]/la.norm(ica.mixing_.T[1], 2)), (ica.mean_ + .67*ica.mixing_.T[1]/la.norm(ica.mixing_.T[1], 2)) 
			ax.plot([p0[0], p1[0]], [p0[1], p1[1]], [p0[2], p1[2]], c = 'k', linewidth=1, alpha=.67) 
		if ica.mixing_.shape[1] > 2:
			p0, p1 = (ica.mean_ - .67*ica.mixing_.T[2]/la.norm(ica.mixing_.T[2], 2)), (ica.mean_ + .67*ica.mixing_.T[2]/la.norm(ica.mixing_.T[2], 2)) 
			ax.plot([p0[0], p1[0]], [p0[1], p1[1]], [p0[2], p1[2]], c = 'k', linewidth=1, alpha=.67)  

		p0, p1 = ica.mean_, (ica.mean_ + .15*ica.mixing_.dot(R.T).T[0]/la.norm(ica.mixing_.dot(R.T).T[0], 2))
		a = Arrow3D([p0[0], p1[0]], [p0[1], p1[1]], [p0[2], p1[2]], mutation_scale=20, lw=2, arrowstyle="-|>", color="b")
		ax.add_artist(a)
		if ica.mixing_.shape[1] > 1:
			p0, p1 = ica.mean_, (ica.mean_ + .15*ica.mixing_.dot(R.T).T[1]/la.norm(ica.mixing_.dot(R.T).T[1], 2)) 
			a = Arrow3D([p0[0], p1[0]], [p0[1], p1[1]], [p0[2], p1[2]], mutation_scale=20, lw=2, arrowstyle="-|>", color="b")
			ax.add_artist(a)
		if ica.mixing_.shape[1] > 2:
			p0, p1 = ica.mean_, (ica.mean_ + .15*ica.mixing_.dot(R.T).T[2]/la.norm(ica.mixing_.dot(R.T).T[2], 2)) 
			a = Arrow3D([p0[0], p1[0]], [p0[1], p1[1]], [p0[2], p1[2]], mutation_scale=20, lw=2, arrowstyle="-|>", color="b")
			ax.add_artist(a)
				
		ax.plot([0, c[best_ind[0], 0]], [0., c[best_ind[0], 1]], [0., c[best_ind[0], 2]], c = 1-c[best_ind[0]], ls='--', linewidth=2)
		if c[best_ind].shape[0] > 1:
			ax.plot([0, c[best_ind[1], 0]], [0., c[best_ind[1], 1]], [0., c[best_ind[1], 2]], c = 1-c[best_ind[1]], ls='--', linewidth=2)
		if c[best_ind].shape[0] > 2:
			ax.plot([0, c[best_ind[2], 0]], [0., c[best_ind[2], 1]], [0., c[best_ind[2], 2]], c = 1-c[best_ind[2]], ls='--', linewidth=2)

		ax.set_xlim(limits[0])
		ax.set_ylim(limits[1])
		ax.set_zlim(limits[2])
		ax.set_xlabel('C')
		ax.set_ylabel('M')
		ax.set_zlabel('Y')

		ax.view_init(elev=el, azim=az)

		ver_name = verbose+'.classifying_colours.jpg'
		fig.savefig(ver_name)
		
		print "saving", ver_name

	if verbose :
		print "infos", infos
		print "check", check
		print 'conf', conf
		
		
	pigm = np.zeros_like(res[:, 0])
	if 1 in best_ind:
		pigm = res[:, 1]
		
	c_est = np.ones((len(norms), c.shape[1]))
	c_est_ = np.ones_like(c)
	c_est_[best_ind] -= best_est
	c_est[norms > 1.e-2] = c_est_
	
	return res[:, 0].reshape(img.shape[:2]), conf, pigm.reshape(img.shape[:2]), np.uint8(np.maximum(0, np.minimum(1, c_est))*255)
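For reference, a call to `extract_staining` built only from the docstring above could look like the sketch below. All inputs are synthetic placeholders (a white image with a stained disc) invented for illustration, not data from the original project.

import numpy as np

# Synthetic 64x64 RGB image (0-255) with a circular "embryo" mask.
img = np.full((64, 64, 3), 255.0)
yy, xx = np.mgrid[:64, :64]
mask = (((yy - 32) ** 2 + (xx - 32) ** 2) < 20 ** 2).astype(np.uint8)
img[mask == 1] = [120.0, 80.0, 170.0]  # roughly purple "stain" inside the mask

# (RGB colour, confidence) for expected stain, pigment and background;
# the pigment is marked as not expected by setting it to white.
expected = [(np.array([110.0, 70.0, 160.0]), 1.0),
            (np.array([255.0, 255.0, 255.0]), 0.1),
            (np.array([250.0, 250.0, 250.0]), 0.5)]

stain, confidence, pigment, colours = extract_staining(
    img, mask, expected, white_threshold=0.15, verbose=False)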
Example #42
def fit_and_score(expression_filename, treatment_filename, golden_filename):

    expressions = csv_map(
        expression_filename,
        header_method=lambda j, entry: genecoder.encode(entry),
        entry_method=lambda i, j, entry, d: d.__setitem__((i, j), entry),
        cleanup_method=collect_in_array)

    from scipy.stats import pearsonr, spearmanr

    correlations = np.zeros((genecoder.total_seen(), genecoder.total_seen()),
                            dtype=np.float64)

    for i in range(genecoder.total_seen()):
        for j in range(i, genecoder.total_seen()):
            # correlate columns of each gene pair in expression matrix
            if i == j:
                correlations[i, i] = 1.0
                continue
            (r, pval) = pearsonr(expressions[:, i], expressions[:, j])
            correlations[i, j] = r
            correlations[j, i] = r

    # Build dimension-reduced model of correlation data

    from sklearn.decomposition import FastICA

    nmf = FastICA(n_components=120)

    transformed_correlations = nmf.fit_transform(correlations)

    # print(expressions)
    print('Expressions shape: ', expressions.shape)

    nne = NearestNeighbors(n_neighbors=5,
                           radius=0.1,
                           algorithm='auto',
                           metric='manhattan',
                           n_jobs=4)

    nne.fit(transformed_correlations)

    # Build nearest-neighbor index of chip treatments

    treatments = csv_map(treatment_filename,
                         row_method=treatment_row_method,
                         cleanup_method=collect_treatments)

    nnt = NearestNeighbors(n_neighbors=5,
                           radius=0.1,
                           algorithm='auto',
                           metric='jaccard',
                           n_jobs=4)

    nnt.fit(treatments)

    # print(treatments)
    print("Treatments shape: ", treatments.shape)

    sparse_goldens, golden_i_indices, golden_j_indices = csv_map(
        golden_filename,
        row_method=golden_row_method,
        cleanup_method=collect_goldens)

    goldens = sparse_goldens.toarray()

    # print(goldens)

    from sklearn.mixture import BayesianGaussianMixture

    def correlation_modes(ex):
        modes = BayesianGaussianMixture(n_components=3)
        modes.fit(ex.flatten().reshape(-1, 1))
        expression_centers = modes.means_
        (anticorrelated, uncorrelated, correlated) = sorted(expression_centers)
        return (anticorrelated, uncorrelated, correlated)

    (anticorrelated, uncorrelated, correlated) = [-1, 0, 1]
    print("Correlation level modes: ", anticorrelated, uncorrelated,
          correlated)

    # predict the goldens
    #  - compute overall correlation of gene expressions across all experiments
    #  - transform the correlation data to the nmf space
    #  - synthesize a probe vector by setting that gene's element to its max correlation level in the data and rest to zero
    #  - get nearest neighbors of probe in nmf and treatment spaces (must be near in both)
    #  - average together nmf representations of those rows
    #  - transform back to expression space and threshold; these are predictions
    #  - compute AUROC vs golden array

    predicted_correlations = zeros(
        (genecoder.total_seen(), genecoder.total_seen()), dtype=np.float64)
    predicted_relationships = zeros(
        (genecoder.total_seen(), genecoder.total_seen()), dtype=np.bool_)

    for i in range(genecoder.total_seen()):
        genevector = zeros((1, genecoder.total_seen()))
        genevector[0, i] = np.max(expressions[:, i])
        transformed_genevector = nmf.transform(genevector)
        common_inds = []
        ex_neighbors = 5
        t_neighbors = 3
        (nmf_dist, nmf_neighbor_inds) = nne.kneighbors(
            transformed_genevector, min(expressions.shape[0], ex_neighbors),
            True)
        # (cnd_dist, cnd_neighbor_inds) = nnt.kneighbors(treatments[nmf_neighbor_inds], min(treatments.shape[0], t_neighbors), True)
        # common_inds = np.intersect1d(nmf_neighbor_inds, cnd_neighbor_inds, assume_unique=False)
        common_inds = nmf_neighbor_inds

        rows_to_average = transformed_correlations.take(common_inds, axis=0)
        average_transformed_correlation = np.average(rows_to_average,
                                                     axis=1)[0]
        if i % 100 == 0:
            stdout.write("\nAveraging transformed expressions for row %d." % i)
            stdout.flush()
        else:
            stdout.write('.')
            stdout.flush()
        # print("Average transformed correlation for row %d: \n" % i, average_transformed_correlation.shape)
        average_correlation_prediction = nmf.inverse_transform(
            [average_transformed_correlation])
        # print("\nMax predicted correlation vector component: ", max(average_expression_prediction))
        predicted_correlations[i] = average_correlation_prediction

    golden_nonzero_count = np.count_nonzero(goldens.flatten())

    def topcomponents(vec, num_components=3):
        return sorted(enumerate(vec), key=lambda x: x[1],
                      reverse=True)[0:num_components]

    golden_i_set = set(golden_i_indices)
    golden_j_set = set(golden_j_indices)
    print("Golden i set size: %d" % len(golden_i_set))

    for j in range(predicted_correlations.shape[1]):
        for i in range(predicted_correlations.shape[0]):
            p = predicted_correlations[i, j]
            r = True if abs(correlated - p) < abs(uncorrelated - p) or abs(
                anticorrelated - p) < abs(uncorrelated - p) else False
            predicted_relationships[i, j] = r

    # print(predicted_relationships)

    auroc = roc_auc_score(
        goldens[golden_i_indices, golden_j_indices],
        predicted_relationships[golden_i_indices, golden_j_indices])
    print("AUROC: ", auroc)

    print('Golden nonzero count: ', golden_nonzero_count)
    print(
        'Prediction nonzero count on golden set: ',
        np.count_nonzero(predicted_relationships[golden_i_indices,
                                                 golden_j_indices]))
    print('Prediction nonzero count on all genes: ',
          np.count_nonzero(predicted_relationships.flatten()))
def main(addNoise = 0, savedir = None, doFastICA = False):
    N = 200
    tt = linspace(0, 10, N)

    # make sources
    s1 = 4 + cos(tt*5)
    s2 = tt % 2

    s1 -= mean(s1)
    s1 /= std(s1)
    s2 -= mean(s2)
    s2 /= std(s2)

    pyplot.figure(1)
    pyplot.subplot(4,1,1)
    pyplot.title('original sources')
    pyplot.plot(tt, s1, 'bo-')
    pyplot.subplot(4,1,2)
    pyplot.plot(tt, s2, 'bo-')

    A = array([[3, 1], [-2, .3]])

    S = vstack((s1, s2)).T
    #print 'S', S
    print 'kurt(s1) =', kurt(s1)
    print 'kurt(s2) =', kurt(s2)
    print ' negentropy(s1) =', negentropy(s1)
    print ' negentropy(s2) =', negentropy(s2)
    print ' logcosh10(s1) =', logcosh10(s1)
    print ' logcosh10(s2) =', logcosh10(s2)
    print ' logcosh15(s1) =', logcosh15(s1)
    print ' logcosh15(s2) =', logcosh15(s2)
    print ' logcosh20(s1) =', logcosh20(s1)
    print ' logcosh20(s2) =', logcosh20(s2)
    print ' negexp(s1) =', negexp(s1)
    print ' negexp(s2) =', negexp(s2)
    
    X = dot(S, A)

    if addNoise > 0:
        print 'Adding noise!'
        X += random.normal(0, addNoise, X.shape)
    
    #print 'X', X

    x1 = X[:,0]
    x2 = X[:,1]

    #print 'kurt(x1) =', kurt(x1)
    #print 'kurt(x2) =', kurt(x2)

    pyplot.subplot(4,1,3)
    pyplot.title('observed signal')
    pyplot.plot(tt, x1, 'ro-')
    pyplot.subplot(4,1,4)
    pyplot.plot(tt, x2, 'ro-')

    pyplot.figure(2)
    pyplot.subplot(4,1,1)
    pyplot.title('original sources')
    pyplot.hist(s1)
    pyplot.subplot(4,1,2)
    pyplot.hist(s2)
    pyplot.subplot(4,1,3)
    pyplot.title('observed signal')
    pyplot.hist(x1)
    pyplot.subplot(4,1,4)
    pyplot.hist(x2)

    pca = PCA(X)

    #W = pca.toWhitePC(X)
    W = pca.toZca(X)

    w1 = W[:,0]
    w2 = W[:,1]

    print 'kurt(w1) =', kurt(w1)
    print 'kurt(w2) =', kurt(w2)

    pyplot.figure(3)
    pyplot.subplot(4,2,1)
    pyplot.title('observed signal')
    pyplot.hist(x1)
    pyplot.subplot(4,2,3)
    pyplot.hist(x2)
    pyplot.subplot(2,2,2)
    pyplot.plot(x1, x2, 'bo')

    pyplot.subplot(4,2,5)
    pyplot.title('whitened observed signal')
    pyplot.hist(w1)
    pyplot.subplot(4,2,7)
    pyplot.hist(w2)
    pyplot.subplot(2,2,4)
    pyplot.plot(w1, w2, 'bo')

    # Compute kurtosis at different angles
    thetas = linspace(0, pi, 100)
    kurt1 = 0 * thetas
    for ii, theta in enumerate(thetas):
        kurt1[ii] = kurt(dot(rotMat(theta)[0,:], W.T).T)


    # functions of data
    minfnK    = lambda data: -kurt(data)**2
    minfnNEnt = lambda data: -negentropy(data)
    minfnLC10 = lambda data: -logcosh10(data)
    minfnLC15 = lambda data: -logcosh15(data)
    minfnLC20 = lambda data: -logcosh20(data)
    minfnNExp = lambda data: -negexp(data)

    # functions of the rotation angle, given W as the data
    minAngleFnK    = lambda theta: minfnK(dot(rotMat(theta)[0,:], W.T).T)
    minAngleFnNEnt = lambda theta: minfnNEnt(dot(rotMat(theta)[0,:], W.T).T)
    minAngleFnLC10 = lambda theta: minfnLC10(dot(rotMat(theta)[0,:], W.T).T)
    minAngleFnLC15 = lambda theta: minfnLC15(dot(rotMat(theta)[0,:], W.T).T)
    minAngleFnLC20 = lambda theta: minfnLC20(dot(rotMat(theta)[0,:], W.T).T)
    minAngleFnNExp = lambda theta: minfnNExp(dot(rotMat(theta)[0,:], W.T).T)

    #########
    # Chosen objective function. Change this line to change which objective is used.
    #########
    minDataFn = minfnK 

    minAngleFn = lambda theta: minDataFn(dot(rotMat(theta)[0,:], W.T).T)

    if doFastICA:
        # Use FastICA from sklearn
        #pdb.set_trace()
        from sklearn.decomposition import FastICA
        rng = random.RandomState(1)
        ica = FastICA(random_state = rng, whiten = False)
        ica.fit(W)
        Recon = ica.transform(W)  # Estimate the sources
        #S_fica /= S_fica.std(axis=0)   # (should already be done)
        Ropt = ica.get_mixing_matrix()
    else:
        # Manually fit angle using fmin_bfgs
        angle0 = 0
        xopt = fmin_bfgs(minAngleFn, angle0)
        xopt = xopt[0] % pi
        Ropt = rotMat(xopt)
        Recon = dot(W, Ropt.T)

    mnval = array([minAngleFn(aa) for aa in thetas])

    pyplot.figure(4)
    pyplot.title('objective vs. angle')
    #pyplot.plot(thetas, kurt1, 'bo-', thetas, mnval, 'k', xopt, minAngleFn(xopt), 'ko')
    pyplot.plot(thetas, mnval, 'b')
    if not doFastICA:
        pyplot.hold(True)
        pyplot.plot(xopt, minAngleFn(xopt), 'ko')

    pyplot.figure(5)
    pyplot.title('different gaussianness measures vs. angle')
    pyplot.subplot(6,1,1); pyplot.title('Kurt'); pyplot.plot(thetas, array([minAngleFnK(aa) for aa in thetas]))
    pyplot.subplot(6,1,2); pyplot.title('NegEnt'); pyplot.plot(thetas, array([minAngleFnNEnt(aa) for aa in thetas]))
    pyplot.subplot(6,1,3); pyplot.title('LogCosh10'); pyplot.plot(thetas, array([minAngleFnLC10(aa) for aa in thetas]))
    pyplot.subplot(6,1,4); pyplot.title('LogCosh15'); pyplot.plot(thetas, array([minAngleFnLC15(aa) for aa in thetas]))
    pyplot.subplot(6,1,5); pyplot.title('LogCosh20'); pyplot.plot(thetas, array([minAngleFnLC20(aa) for aa in thetas]))
    pyplot.subplot(6,1,6); pyplot.title('NegExp'); pyplot.plot(thetas, array([minAngleFnNExp(aa) for aa in thetas]))
    
    print 'kurt(r1) =', kurt(Recon[:,0])
    print 'kurt(r2) =', kurt(Recon[:,1])

    print
    print 'objective(s1) =', minDataFn(s1)
    print 'objective(s2) =', minDataFn(s2)
    print 'objective(w1) =', minDataFn(w1)
    print 'objective(w2) =', minDataFn(w2)
    print 'objective(r1) =', minDataFn(Recon[:,0])
    print 'objective(r2) =', minDataFn(Recon[:,1])
    print 'optimal theta:',
    if doFastICA:
        print '<not computed with FastICA>'
    else:
        print xopt, '(+pi/2 =', (xopt+pi/2)%pi, ')'
    print 'Optimal rotation matrix:\n', Ropt

    pyplot.figure(6)
    pyplot.subplot(4,1,1)
    pyplot.title('original sources')
    pyplot.plot(tt, s1, 'bo-')
    pyplot.subplot(4,1,2)
    pyplot.plot(tt, s2, 'bo-')
    pyplot.subplot(4,1,3)
    pyplot.title('reconstructed sources')
    pyplot.plot(tt, Recon[:,0], 'go-')
    pyplot.subplot(4,1,4)
    pyplot.plot(tt, Recon[:,1], 'go-')

    #pyplot.show()

    if savedir:
        figname = lambda ii : os.path.join(savedir, 'figure_%02d.png' % ii)
        for ii in range(6):
            pyplot.figure(ii+1)
            pyplot.savefig(figname(ii+1))
        print 'plots saved in', savedir
    else:
        import ipdb; ipdb.set_trace()
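The demo above depends on helpers (`kurt`, `negentropy`, `rotMat`, the `PCA` whitening class) that are not part of this excerpt. As a self-contained illustration of its core idea — whiten a 2-D mixture, then scan rotation angles and keep the one whose output is most non-Gaussian — here is a short NumPy sketch; the source signals are illustrative and plain PCA whitening stands in for the ZCA whitening used above.

import numpy as np

def excess_kurtosis(x):
    """Excess kurtosis of a 1-D signal (zero for a Gaussian)."""
    x = x - x.mean()
    return np.mean(x ** 4) / np.mean(x ** 2) ** 2 - 3.0

def rotation(theta):
    """2-D rotation matrix."""
    return np.array([[np.cos(theta), np.sin(theta)],
                     [-np.sin(theta), np.cos(theta)]])

# Two illustrative non-Gaussian sources, standardized and linearly mixed.
t = np.linspace(0, 10, 200)
S = np.column_stack([4 + np.cos(5 * t), t % 2])
S = (S - S.mean(axis=0)) / S.std(axis=0)
X = S @ np.array([[3.0, 1.0], [-2.0, 0.3]])

# PCA whitening: zero mean, identity covariance.
Xc = X - X.mean(axis=0)
U, s, Vt = np.linalg.svd(Xc, full_matrices=False)
W = U * np.sqrt(Xc.shape[0] - 1)

# Scan angles; the best one maximizes the squared excess kurtosis of the output.
thetas = np.linspace(0, np.pi, 100)
scores = [excess_kurtosis(W @ rotation(th)[0]) ** 2 for th in thetas]
best = thetas[int(np.argmax(scores))]
recon = W @ rotation(best).T  # estimated (whitened, rotated) sources
print('best angle:', best)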
Example #44
def main(mode):
    path = '/local/attale00/AFLW_ALL'
    path_ea = path+'/color128/'
    
    fileNames = utils.getAllFiles(path_ea);
    
    
    
    
    labs=utils.parseLabelFiles(path+'/labels/labels','mouth_opening',fileNames,cutoffSeq='.png',suffix='_face0.labels')
    
    
    
    testSet = fg.dataContainer(labs)
    
    testSetMirror = fg.dataContainer(labs)
    for f in range(len(testSetMirror.fileNames)):
        testSetMirror.fileNames[f]+='M'
    
    
    roi=(50,74,96,160)
 
 

    X=fg.getAllImagesFlat(path_ea,testSet.fileNames,(128,256),roi=roi)
    Y=fg.getAllImagesFlat(path+'/mirror128/',testSet.fileNames,(128,256),roi=roi)
    Z=np.concatenate((X,Y),axis=0)
    # perform ICA
    ica = FastICA(n_components=100,whiten=True)
    ica.fit(Z)
    meanI=np.mean(Z,axis=0)
    X1=X-meanI
    Y1=Y-meanI    
    data=ica.transform(X1)
    datam=ica.transform(Y1)
    filters=ica.components_
    for i in range(len(fileNames)):
        testSet.data[i].extend(data[i,:])
        testSetMirror.data[i].extend(datam[i,:])


    strel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE,(3,3))
    #fg.getHogFeature(testSet,roi,path=path_ea,ending='.png',extraMask = None,orientations = 3, cells_per_block=(6,2),maskFromAlpha=False)
    #fg.getColorHistogram(testSet,roi,path=path_ea,ending='.png',colorspace='lab',bins=10)

  
    #pca
#    n_samples, n_features = X.shape
#
#    mean_ = np.mean(X, axis=0)
#    X -= mean_
#    U, S, V = linalg.svd(X)
#    explained_variance_ = (S ** 2) / n_samples
#    explained_variance_ratio_ = (explained_variance_ /explained_variance_.sum())
#    K=V / S[:, np.newaxis] * np.sqrt(n_samples)
#    filters=K[:100]
#    data=np.dot(X,filters.T)    
    
    testSet.addContainer(testSetMirror)
            
    
    testSet.targetNum=map(utils.mapMouthLabels2Two,testSet.target)
    rf=classifierUtils.standardRF(max_features = np.sqrt(len(testSet.data[0])),min_split=5,max_depth=40)
    if mode in ['s','v']:
        print 'Classifying with loaded classifier'
        _classifyWithOld(path,testSet,mode)
    elif mode in ['c']:
        print 'cross validation of data'
        classifierUtils.dissectedCV(rf,testSet)
    elif mode in ['save']:
        print 'saving new classifier'
        _saveRF(testSet,rf,filters=filters,meanI=meanI)
    else:
        print 'not doing anything'
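In the `save` branch above, `_saveRF` is handed the ICA filters (`ica.components_`) and the mean image so the features can later be recomputed without refitting. A minimal sketch of how those two saved arrays might be reapplied to new images follows; the file paths are placeholders and the new images are assumed to be preprocessed exactly like the training data (same resize, ROI and flattening).

import numpy as np

# Placeholders for the arrays persisted at training time.
filters = np.load('ica_filters.npy')  # (n_components, n_pixels) unmixing rows
meanI = np.load('ica_meanI.npy')      # (n_pixels,) mean training image

def ica_features(flat_images):
    """Project flattened, identically preprocessed images onto the filters."""
    flat_images = np.asarray(flat_images, dtype=np.float64)
    return np.dot(flat_images - meanI, filters.T)

# feats = ica_features(X_new)  # X_new: (n_images, n_pixels)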
Example #45
def nn2(xs, ys, xs_test, ys_test, n_components, clf_constructor):
    ks = [0 for _ in range(10)]
    cataccs = [0 for _ in range(10)]

    ys = [to_categorical(ys[0]), to_categorical(ys[1])]
    ys_test = [to_categorical(ys_test[0]), to_categorical(ys_test[1])]

    for i in range(2):
        shape = np.shape(xs[i])[1]
        n_components[i] = shape
        model = utils.create_adult_model(
            shape, 2) if i == 0 else utils.create_wine_model(shape, 5)
        model.fit(xs[i][:10000],
                  ys[i][:10000],
                  batch_size=50,
                  epochs=10,
                  verbose=False)
        cataccs[i] = model.evaluate(xs_test[i], ys_test[i],
                                    verbose=False)[1] * 100

    for k in range(2, 11):
        try:
            clf = clf_constructor(n_clusters=k)
        except TypeError:
            clf = clf_constructor(n_components=k)
        for i in range(2):
            pca = PCA(n_components=n_components[2 + i])
            transformed = pca.fit_transform(xs[i])
            transformed_test = pca.transform(xs_test[i])
            predict = to_categorical(clf.fit_predict(transformed[:10000]))
            predict_test = to_categorical(clf.predict(
                transformed_test[:10000]))
            input_dims = [n_components[2 + i], k]
            model = utils.create_mi_adult_model(
                input_dims, 2) if i == 0 else utils.create_mi_wine_model(
                    input_dims, 5)
            model.fit([transformed[:10000], predict],
                      ys[i][:10000],
                      batch_size=50,
                      epochs=10,
                      verbose=False)
            catacc = model.evaluate([transformed_test, predict_test],
                                    ys_test[i],
                                    verbose=False)[1] * 100
            if catacc > cataccs[2 + i]:
                ks[2 + i] = k
                cataccs[2 + i] = catacc

            ica = FastICA(n_components=n_components[4 + i])
            transformed = ica.fit_transform(xs[i])
            transformed_test = ica.transform(xs_test[i])
            predict = to_categorical(clf.fit_predict(transformed[:10000]))
            predict_test = to_categorical(clf.predict(
                transformed_test[:10000]))
            input_dims = [n_components[4 + i], k]
            model = utils.create_mi_adult_model(
                input_dims, 2) if i == 0 else utils.create_mi_wine_model(
                    input_dims, 5)
            model.fit([transformed[:10000], predict],
                      ys[i][:10000],
                      batch_size=50,
                      epochs=10,
                      verbose=False)
            catacc = model.evaluate([transformed_test, predict_test],
                                    ys_test[i],
                                    verbose=False)[1] * 100
            if catacc > cataccs[4 + i]:
                ks[4 + i] = k
                cataccs[4 + i] = catacc

            if i == 1:
                rp = GaussianRandomProjection(eps=0.95)
                transformed = rp.fit_transform(xs[i])
                transformed_test = rp.transform(xs_test[i])
                predict = to_categorical(clf.fit_predict(transformed[:10000]))
                predict_test = to_categorical(
                    clf.predict(transformed_test[:10000]))
                input_dims = [np.shape(transformed)[1], k]
                model = utils.create_mi_wine_model(input_dims, 5)
                model.fit([transformed[:10000], predict],
                          ys[i][:10000],
                          batch_size=50,
                          epochs=10,
                          verbose=False)
                catacc = model.evaluate([transformed_test, predict_test],
                                        ys_test[i],
                                        verbose=False)[1] * 100
                if catacc > cataccs[6 + i]:
                    ks[6 + i] = k
                    cataccs[6 + i] = catacc

            encoder, vae = utils.create_vae(
                np.shape(xs[i])[1], n_components[8 + i])
            vae.fit(xs[i], batch_size=50, epochs=10, verbose=False)
            transformed = encoder.predict(xs[i], verbose=False)
            transformed_test = encoder.predict(xs_test[i], verbose=False)
            predict = to_categorical(clf.fit_predict(transformed[:10000]))
            predict_test = to_categorical(clf.predict(
                transformed_test[:10000]))
            input_dims = [n_components[8 + i], k]
            model = utils.create_mi_adult_model(
                input_dims, 2) if i == 0 else utils.create_mi_wine_model(
                    input_dims, 5)
            model.fit([transformed[:10000], predict],
                      ys[i][:10000],
                      batch_size=50,
                      epochs=10,
                      verbose=False)
            catacc = model.evaluate([transformed_test, predict_test],
                                    ys_test[i],
                                    verbose=False)[1] * 100
            if catacc > cataccs[8 + i]:
                ks[8 + i] = k
                cataccs[8 + i] = catacc

    plot.style.use('seaborn-darkgrid')
    plot.title('Influence of feature transformation on the NN accuracy')
    color = []
    for _ in range(5):
        color.append('tab:blue')
        color.append('tab:orange')
    x = []
    count = 1
    for _ in range(5):
        x.append(count)
        count += 0.5
        x.append(count)
        count += 1
    plot.bar(x, cataccs, color=color, width=0.75)
    x = []
    count = 1.25
    for _ in range(5):
        x.append(count)
        count += 1.5
    plot.xticks(x, ['None', 'PCA', 'ICA', 'RP', 'VAE'])
    plot.xlabel('Feature transformation method')
    plot.ylabel('Categorical accuracy (%)')
    plot.show()
Example #46
n_comp = 12

# tSVD
tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
tsvd_results_train = tsvd.fit_transform(train.drop(["y"], axis=1))
tsvd_results_test = tsvd.transform(test)

# PCA
pca = PCA(n_components=n_comp, random_state=420)
pca2_results_train = pca.fit_transform(train.drop(["y"], axis=1))
pca2_results_test = pca.transform(test)

# ICA
ica = FastICA(n_components=n_comp, random_state=420)
ica2_results_train = ica.fit_transform(train.drop(["y"], axis=1))
ica2_results_test = ica.transform(test)

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results_train = grp.fit_transform(train.drop(["y"], axis=1))
grp_results_test = grp.transform(test)

# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
srp_results_train = srp.fit_transform(train.drop(["y"], axis=1))
srp_results_test = srp.transform(test)

# Append decomposition components to datasets (only the PCA columns are appended
# in this snippet; a sketch for the remaining decompositions follows below)
for i in range(1, n_comp+1):
    train['pca_' + str(i)] = pca2_results_train[:,i-1]
    test['pca_' + str(i)] = pca2_results_test[:, i-1]
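Only the PCA columns are appended above; a minimal sketch, assuming the same column-naming pattern is wanted for the other decomposition outputs already computed (ica2_*, tsvd_*, grp_*, srp_*):

# Sketch (not part of the original snippet): append the remaining decomposition outputs
for i in range(1, n_comp+1):
    train['ica_' + str(i)] = ica2_results_train[:, i-1]
    test['ica_' + str(i)] = ica2_results_test[:, i-1]
    train['tsvd_' + str(i)] = tsvd_results_train[:, i-1]
    test['tsvd_' + str(i)] = tsvd_results_test[:, i-1]
    train['grp_' + str(i)] = grp_results_train[:, i-1]
    test['grp_' + str(i)] = grp_results_test[:, i-1]
    train['srp_' + str(i)] = srp_results_train[:, i-1]
    test['srp_' + str(i)] = srp_results_test[:, i-1]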
示例#47
0
# PCA on Test Set
X_test_PCA = pca.transform(X_test)
X_test_PCA = pd.DataFrame(data=X_test_PCA, index=X_test.index)

X_test_PCA_inverse = pca.inverse_transform(X_test_PCA)
X_test_PCA_inverse = pd.DataFrame(data=X_test_PCA_inverse, \
                                  index=X_test.index)

scatterPlot(X_test_PCA, y_test, "PCA")

anomalyScoresPCA = anomalyScores(X_test, X_test_PCA_inverse)
preds = plotResults(y_test, anomalyScoresPCA, True)

# Independent Component Analysis on Test Set
X_test_fastICA = fastICA.transform(X_test)
X_test_fastICA = pd.DataFrame(data=X_test_fastICA, index=X_test.index)

X_test_fastICA_inverse = fastICA.inverse_transform(X_test_fastICA)
X_test_fastICA_inverse = pd.DataFrame(data=X_test_fastICA_inverse, \
                                      index=X_test.index)

scatterPlot(X_test_fastICA, y_test, "Independent Component Analysis")

anomalyScoresFastICA = anomalyScores(X_test, X_test_fastICA_inverse)
plotResults(y_test, anomalyScoresFastICA)

X_test_miniBatchDictLearning = miniBatchDictLearning.transform(X_test)
X_test_miniBatchDictLearning = \
    pd.DataFrame(data=X_test_miniBatchDictLearning, index=X_test.index)
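The helper anomalyScores used above is not defined in this snippet; a minimal sketch, assuming it scores each row by its normalized squared reconstruction error (the name and behavior are inferred from how it is called here):

# Sketch of the assumed helper; not part of the original snippet.
import numpy as np
import pandas as pd

def anomalyScores(originalDF, reducedDF):
    # sum of squared differences per sample, rescaled to [0, 1]
    loss = np.sum((np.array(originalDF) - np.array(reducedDF)) ** 2, axis=1)
    loss = pd.Series(data=loss, index=originalDF.index)
    return (loss - loss.min()) / (loss.max() - loss.min())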
示例#48
0
joblib.dump(sparse_pca_model, 'sparse_pca_model.pkl')
joblib.dump(sparse_pca_X_new, 'sparse_pca_X_new.pkl')
print sparse_pca_model

kernel_pca = KernelPCA(n_components=50)
kernel_pca_model = kernel_pca.fit(kernel_pca_data)
kernel_X_new = kernel_pca.fit_transform(X)
joblib.dump(kernel_pca_model, 'kernel_pca_model.pkl')
joblib.dump(kernel_X_new, 'kernel_X_new.pkl')

fast_ica = FastICA(n_components=None)
fast_ica_start = time.time()
fast_ica_model = fast_ica.fit(fast_ica_data)
fast_ica_end = time.time()
print 'fast_ica fit time', fast_ica_end - fast_ica_start
fast_ica_X_new = fast_ica.transform(X)
joblib.dump(fast_ica_model, 'fast_ica_model.pkl')
joblib.dump(fast_ica_X_new, 'fast_ica_X_new.pkl')
print fast_ica_model

'''
nmf = NMF(n_components=None)
nmf_start = time.time()
#nmf_model = nmf.fit(nmf_data)
nmf_X_new = nmf.fit_transform(X)
nmf_end = time.time()
print 'nmf fit time', nmf_end - nmf_start
#joblib.dump(nmf_model, 'nmf_model.pkl')
joblib.dump(nmf_X_new, 'nmf_X_new.pkl')
print nmf_model
'''
def main(mode):
    path = '/local/attale00/AFLW_ALL/'
    path_ea = '/local/attale00/AFLW_cropped/cropped3/'
#    
    fileNames = utils.getAllFiles(path_ea);
#    minr = 10000;
#    for f in fileNames:
#        im = cv2.imread(path_ea+f,-1)
#        if im.shape[0]!=40 or im.shape[1]!=120:
#            print f
#            print im.shape
#        minr = minr if im.shape[0]>= minr else im.shape[0]
#    
#    print minr
#    
    
    labs=utils.parseLabelFiles(path+'/labels/labels','mouth_opening',fileNames,cutoffSeq='.png',suffix='_face0.labels')
    
    
    
    testSet = fg.dataContainer(labs)
    testSet.targetNum=map(utils.mapMouthLabels2Two,testSet.target)
    
    roi=(0,37,0,115)
    roi=None    
    #roi=(44,84,88,168)    
    
    
#    eM=np.load('/home/attale00/Desktop/mouthMask.npy')
#    m=cv2.resize(np.uint8(eM),(256,256));
#    strel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE,(3,3))
#    dil = cv2.dilate(m,strel)
#    
#    m=dil>0;


            
 
    #X=fg.getAllImagesFlat(path_ea,testSet.fileNames,(40,120),roi=roi)
   
# perform ICA 
    names_open = []
    names_closed = []
    for i,f in enumerate(testSet.fileNames):
        if testSet.targetNum[i] == 0:
            names_closed.append(f)
        elif testSet.targetNum[i] == 1:
            names_open.append(f)
    Xopen = fg.getAllImagesFlat(path_ea,names_open,(40,120))
    XClosed = fg.getAllImagesFlat(path_ea,names_closed,(40,120))
    
    if mode not in ['s','v']:
        icaopen = FastICA(n_components=100,whiten=True)
        icaopen.fit(Xopen)
        meanIopen=np.mean(Xopen,axis=0)
        X1open=Xopen-meanIopen
        dataopen=icaopen.transform(X1open)
        filtersopen=icaopen.components_
        plottingUtils.showICAComponents(filtersopen,(40,120),4,4)
        icaclosed = FastICA(n_components=100,whiten=True)
        icaclosed.fit(XClosed)
        meanIclosed=np.mean(XClosed,axis=0)
        X1closed=XClosed-meanIclosed
        dataclosed=icaclosed.transform(X1closed)
        filtersclosed=icaclosed.components_        
        plottingUtils.showICAComponents(filtersclosed,(40,120),4,4)
        
        plt.show()
        
    elif mode in ['s','v']:
        W=np.load('/home/attale00/Desktop/classifiers/patches/filterMP1.npy')
        m=np.load('/home/attale00/Desktop/classifiers/patches/meanIMP1.npy')
        X1=X-m
        data=np.dot(X1,W.T)    
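    # Note: `data` is only defined in the 's'/'v' branch above, and `X` is only available if
    # the commented-out fg.getAllImagesFlat call is re-enabled; the loop below therefore
    # assumes that branch ran. The 'save' branch likewise expects `filters` and `meanI`,
    # which are never set in this function as written.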
    
    for i in range(len(fileNames)):
            testSet.data[i].extend(data[i,:])


    strel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE,(3,3))
    #fg.getHogFeature(testSet,roi,path=path_ea,ending='.png',extraMask = None,orientations = 5, cells_per_block=(3,3),pixels_per_cell=(24,8),maskFromAlpha=False)
    #fg.getColorHistogram(testSet,roi,path=path_ea,ending='.png',colorspace='lab',bins=20)

  
    #pca
#    n_samples, n_features = X.shape
#
#    mean_ = np.mean(X, axis=0)
#    X -= mean_
#    U, S, V = linalg.svd(X)
#    explained_variance_ = (S ** 2) / n_samples
#    explained_variance_ratio_ = (explained_variance_ /explained_variance_.sum())
#    K=V / S[:, np.newaxis] * np.sqrt(n_samples)
#    filters=K[:100]
#    data=np.dot(X,filters.T)    
    
   
            
    
    
    rf=classifierUtils.standardRF(max_features = 27,min_split=13,max_depth=40)
    #rf = svm.NuSVC()
    #rf = linear_model.SGDClassifier(loss='perceptron', eta0=1, learning_rate='constant', penalty=None)    
    if mode in ['s','v']:
        print 'Classifying with loaded classifier'
        _classifyWithOld(path,testSet,mode)
    elif mode in ['c']:
        print 'cross validation of data'
        classifierUtils.dissectedCV(rf,testSet)
    elif mode in ['save']:
        print 'saving new classifier'
        _saveRF(testSet,rf,filters=filters,meanI=meanI)
    else:
        print 'not doing anything'
示例#50
0
X = pd.read_csv('stroop_data_698_698.csv', header=None)
y = np.vstack((np.ones((698, 1)), np.zeros((698, 1))))

rng = np.random.RandomState(4)

X_, X_test, y_, y_test = train_test_split(X, y, test_size=0.2, random_state=rng)

X_train, X_cros, y_train, y_cros = train_test_split(X_, y_, test_size=0.25, random_state=rng)

ica = FastICA(random_state=rng, n_components=50, max_iter=10000)
X_ica = ica.fit(X_train).transform(X_train)

X_ica /= X_ica.std(axis=0)

# Preparing crossvalidation data for each component
X_cros_ica = ica.transform(X_cros)

X_cros_ica /= X_cros_ica.std(axis=0)

components = []

logreg = linear_model.LogisticRegression(C=1e5)

for i in range(0, 30):

    target_component = X_ica[:, i].reshape(X_ica.shape[0], 1)

    logreg.fit(target_component, y_train.ravel())

    X_cros_final = X_cros_ica[:, i].reshape(X_cros_ica.shape[0], 1)
示例#51
0
def test_fastica_simple(add_noise, global_random_seed, global_dtype):
    # Test the FastICA algorithm on very simple data.
    rng = np.random.RandomState(global_random_seed)
    n_samples = 1000
    # Generate two sources:
    s1 = (2 * np.sin(np.linspace(0, 100, n_samples)) > 0) - 1
    s2 = stats.t.rvs(1, size=n_samples, random_state=global_random_seed)
    s = np.c_[s1, s2].T
    center_and_norm(s)
    s = s.astype(global_dtype)
    s1, s2 = s

    # Mixing angle
    phi = 0.6
    mixing = np.array([[np.cos(phi), np.sin(phi)], [np.sin(phi), -np.cos(phi)]])
    mixing = mixing.astype(global_dtype)
    m = np.dot(mixing, s)

    if add_noise:
        m += 0.1 * rng.randn(2, 1000)

    center_and_norm(m)

    # function as fun arg
    def g_test(x):
        return x**3, (3 * x**2).mean(axis=-1)

    algos = ["parallel", "deflation"]
    nls = ["logcosh", "exp", "cube", g_test]
    whitening = ["arbitrary-variance", "unit-variance", False]
    for algo, nl, whiten in itertools.product(algos, nls, whitening):
        if whiten:
            k_, mixing_, s_ = fastica(
                m.T, fun=nl, whiten=whiten, algorithm=algo, random_state=rng
            )
            with pytest.raises(ValueError):
                fastica(m.T, fun=np.tanh, whiten=whiten, algorithm=algo)
        else:
            pca = PCA(n_components=2, whiten=True, random_state=rng)
            X = pca.fit_transform(m.T)
            k_, mixing_, s_ = fastica(
                X, fun=nl, algorithm=algo, whiten=False, random_state=rng
            )
            with pytest.raises(ValueError):
                fastica(X, fun=np.tanh, algorithm=algo)
        s_ = s_.T
        # Check that the mixing model described in the docstring holds:
        if whiten:
            # XXX: exact reconstruction to standard relative tolerance is not
            # possible. This is probably expected when add_noise is True but we
            # also need a non-trivial atol in float32 when add_noise is False.
            #
            # Note that the 2 sources are non-Gaussian in this test.
            atol = 1e-5 if global_dtype == np.float32 else 0
            assert_allclose(np.dot(np.dot(mixing_, k_), m), s_, atol=atol)

        center_and_norm(s_)
        s1_, s2_ = s_
        # Check to see if the sources have been estimated
        # in the wrong order
        if abs(np.dot(s1_, s2)) > abs(np.dot(s1_, s1)):
            s2_, s1_ = s_
        s1_ *= np.sign(np.dot(s1_, s1))
        s2_ *= np.sign(np.dot(s2_, s2))

        # Check that we have estimated the original sources
        if not add_noise:
            assert_allclose(np.dot(s1_, s1) / n_samples, 1, atol=1e-2)
            assert_allclose(np.dot(s2_, s2) / n_samples, 1, atol=1e-2)
        else:
            assert_allclose(np.dot(s1_, s1) / n_samples, 1, atol=1e-1)
            assert_allclose(np.dot(s2_, s2) / n_samples, 1, atol=1e-1)

    # Test FastICA class
    _, _, sources_fun = fastica(
        m.T, fun=nl, algorithm=algo, random_state=global_random_seed
    )
    ica = FastICA(fun=nl, algorithm=algo, random_state=global_random_seed)
    sources = ica.fit_transform(m.T)
    assert ica.components_.shape == (2, 2)
    assert sources.shape == (1000, 2)

    assert_allclose(sources_fun, sources)
    assert_allclose(sources, ica.transform(m.T))

    assert ica.mixing_.shape == (2, 2)

    ica = FastICA(fun=np.tanh, algorithm=algo)
    with pytest.raises(ValueError):
        ica.fit(m.T)
示例#52
0
class TICAEmbedding(TimeSeriesEmbedding):
    """Embed time series using tICA
    
    Parameters
    ----------
    
    time_lag : int
        The number of time steps to lag coordinates before embedding
    """
    def __init__(self, *args, time_lag=10, **kwargs):
        super().__init__(*args, **kwargs)
        self.time_lag = time_lag
        if time_lag > 0:
            self.model = tICA(n_components=self.n_latent, lag_time=time_lag)
        elif time_lag == 0:
            self.model = FastICA(n_components=self.n_latent,
                                 random_state=self.random_state)
        else:
            raise ValueError(
                "Time delay parameter must be greater than or equal to zero.")

    def fit(self, X, y=None, subsample=None):
        """Fit the model with a time series X

        Parameters
        ----------
        X : array-like, shape (n_timepoints, n_features)
            Training data, where n_timepoints is the number of timepoints
            and n_features is the number of features.

        y : None
            Ignored variable.

        subsample : int or None
            If set to an integer, a random number of timepoints is selected
            equal to that integer

        Returns
        -------
        None
            The underlying model is fit in place; call `transform` to obtain
            the embedded coordinates.
        """
        # Make hankel matrix from dataset
        Xs = standardize_ts(X)
        X_train = hankel_matrix(Xs, self.time_window)

        if subsample:
            self.train_indices, X_train = resample_dataset(
                X_train, subsample, random_state=self.random_state)
        if self.time_lag > 0:
            self.model.fit([np.reshape(X_train, (X_train.shape[0], -1))])
        else:
            self.model.fit(np.reshape(X_train, (X_train.shape[0], -1)))

    def transform(self, X, y=None):
        X_test = hankel_matrix(standardize_ts(X), self.time_window)
        X_test = np.reshape(X_test, (X_test.shape[0], -1))
        if self.time_lag > 0:
            X_new = self.model.transform([X_test])[0]
        else:
            X_new = self.model.transform(X_test)
        return X_new
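A minimal usage sketch for TICAEmbedding, assuming the surrounding package provides standardize_ts and hankel_matrix, and that the TimeSeriesEmbedding base class accepts n_latent, time_window, and random_state in its constructor (these argument names are assumptions):

# Hypothetical usage; the base-class constructor arguments are assumed.
import numpy as np

X = np.random.randn(2000, 3)  # toy multivariate time series
emb = TICAEmbedding(n_latent=2, time_window=10, time_lag=0)  # time_lag=0 falls back to FastICA
emb.fit(X)
X_new = emb.transform(X)
print(X_new.shape)  # roughly (n_timepoints, 2) after the Hankel windowing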
start_time = time.time()

# Load the data
from income_data import X_train, X_test, y_train, y_test

# Scale the data
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
X_toTransform = X_train

# Reduce Dimensionality
projection = ProjectionAlgorithm(n_components=29)
X_transformed = projection.fit_transform(X_train)
X_testTransformed = projection.transform(X_test)

# Run em clustering with 2 clusters and plot
cluster = GaussianMixture(random_state=0, n_components=91).fit(X_transformed)
cluster_labels = cluster.predict(X_transformed)
X_transformed = np.dot(X_transformed, np.transpose(cluster.means_))
X_testTransformed = np.dot(X_testTransformed, np.transpose(cluster.means_))

# Define the classifier
nn = MLPClassifier(solver='lbfgs',
                   random_state=1,
                   alpha=0.005,
                   hidden_layer_sizes=3)
grid_params = {'alpha': [0.005], 'hidden_layer_sizes': [3]}
clf = GridSearchCV(nn, param_grid=grid_params, cv=3)
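The snippet stops after defining the grid search; a sketch of the presumably intended final step, training on the cluster-augmented features and scoring the held-out set:

# Sketch; not part of the original snippet.
clf.fit(X_transformed, y_train)
print('Best params:', clf.best_params_)
print('Test accuracy:', clf.score(X_testTransformed, y_test))
print('Elapsed: %.1fs' % (time.time() - start_time))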
class Preprocess:

    def __init__(self, pca_model=None, all_dat=None):

        if pca_model is not None:
            self.pca = joblib.load(pca_model)  # try 'eco_full_pca.pkl'

        self.full_tab = pd.read_json("../data.json")
        self.full_tab["rem_nrg"] = self.full_tab.apply(lambda x: self.remaining_energy(x.score), axis=1)

        if all_dat is not None:
            self.all_dat = joblib.load(all_dat)  # try 'all_games.pkl'
            drop = np.any(self.all_dat, axis=1)
            self.all_dat = self.all_dat[drop]
            self.full_tab = pd.read_json("../data.json")[drop]
            self.full_tab["rem_nrg"] = self.full_tab.apply(lambda x: self.remaining_energy(x.score), axis=1)
            self.proj = None
        # print os.system('pwd')

    @staticmethod
    def remaining_energy(consumption):
        max_batt = 0.55
        # consumption = np.linspace(0,2000000)
        # print consumption
        if consumption == -1:
            return 0
        else:
            return 100-(consumption/36000/max_batt)


    def totuple(self, a):
        try:
            return tuple(self.totuple(i) for i in a)
        except TypeError:
            return a

    def full_vec(self, pos, sig, size):
        series=np.zeros((size,), dtype=np.int)
        try:
            for i,x in enumerate(pos[:-1]):
                series[x:pos[i+1]] = sig[i]
        except Exception:
            pass
        #print series
        return series

    def get_json(self, file):

        with open(file) as json_data:
            data = json.load(json_data)

        self.dat=pd.DataFrame.from_dict(data['alluser_control'])
        self.dat["series"] = self.dat.apply(lambda x: self.totuple(self.full_vec(x['x'], x['sig'], 18160)),
                                  axis=1, raw=True)
        self.all_dat=np.empty((2391,18160))
        for i,x in enumerate(self.dat.x):
            self.all_dat[i,:]=self.full_vec(x, self.dat.sig[i], 18160)
        joblib.dump(self.all_dat, '../all_games.pkl')

    def train_pca(self, ndim=30):  # uses complete data-set
        # self.pca = TruncatedSVD(n_components=ndim)
        self.pca = FastICA(n_components=ndim)
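        # note: despite the attribute name, this fits FastICA (independent components), not PCA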
        self.pca.fit(self.all_dat)
        joblib.dump(self.pca, '../eco_full_pca.pkl')  # save for later importing

    def ready_player_one(self, place):
        # place must be less than 7.
        top6 = [78, 122, 166, 70, 67, 69] #best players
        m1, m2, m3, m4, m5, m6 = [self.full_tab.userid.values == i for i in top6]
        masks = [m1, m2, m3, m4, m5, m6]
        X = self.all_dat[masks[place-1]]
        y = self.full_tab["rem_nrg"].values[masks[place-1]]

        X_pca = self.pca.transform(X)
        X_pca = np.vstack((X_pca.T, self.full_tab["finaldrive"].values[masks[place-1]])).T
        return (X_pca, y)

    def ready_bad_player(self):
        # mask = [self.full_tab.userid.values == 1]  # gets mediocre score (~12 plays
        mask = [self.full_tab.userid.values == 79]  # gets zero score (~12 plays)

        X = self.all_dat[mask]
        y = self.full_tab["rem_nrg"].values[mask]

        X_pca = self.pca.transform(X)
        X_pca = np.vstack((X_pca.T, self.full_tab["finaldrive"].values[mask])).T
        return (X_pca, y)

    def prep_by_id(self, play_no):
        id_no = self.full_tab['userid'][self.full_tab['id'] == play_no].values[0]
        # print id_no
        mask_a = self.full_tab.userid.values == id_no
        mask_b = self.full_tab.id.values <= play_no
        mask = np.logical_and(mask_a, mask_b)
        X = self.all_dat[mask]
        y = self.full_tab["rem_nrg"].values[mask]

        X_pca = self.pca.transform(X)
        X_pca = np.vstack((X_pca.T, self.full_tab["finaldrive"].values[mask])).T
        return (X_pca, y)
示例#55
0
s1 = np.sin(_x)  # speaker 1's signal
s2 = _x % (np.pi) * k1 * k2 + (np.pi - _x % (np.pi)) * k1 * k3  # speaker 2's signal
x1 = 0.4 * s1 + 0.5 * s2  # recording 1
x2 = 1.2 * s1 - 0.3 * s2  # recording 2

plt.subplot(121)
plt.plot(_x, s1, label='s1')
plt.plot(_x, s2, label='s2')
plt.legend()
plt.subplot(122)
plt.plot(_x, x1, label='x1')
plt.plot(_x, x2, label='x2')
plt.legend()
plt.show()

# Separate the independent sources s1 and s2 from the mixed signals x1 and x2

X = np.stack((x1, x2), axis=1)  # stack the two recordings into one matrix

fica = FastICA(n_components=2)  # instantiate the FastICA estimator
fica.fit(X)

X_ica = fica.transform(X)  # the estimated independent components
print(X_ica.shape)  # (1000, 2)

plt.plot(_x, X_ica[:, 0], label='independent component 1')
plt.plot(_x, X_ica[:, 1], label='independent component 2')
plt.legend()
plt.show()
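ICA recovers sources only up to permutation, sign, and scale; a quick sanity check (a sketch, not in the original snippet) correlates each recovered component with the true sources s1 and s2 defined above:

# Sketch: |correlation| close to 1 means a source was recovered up to sign/scale.
import numpy as np
S = np.stack((s1, s2), axis=1)
corr = np.corrcoef(np.hstack((S, X_ica)).T)[:2, 2:]
print(np.round(np.abs(corr), 2))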
def main(mode):
    path = '/local/attale00/AFLW_ALL/'
    path_ea = '/local/attale00/AFLW_cropped/cropped3/'
#    
    fileNames = utils.getAllFiles(path_ea);
#    minr = 10000;
#    for f in fileNames:
#        im = cv2.imread(path_ea+f,-1)
#        if im.shape[0]!=40 or im.shape[1]!=120:
#            print f
#            print im.shape
#        minr = minr if im.shape[0]>= minr else im.shape[0]
#    
#    print minr
#    
    
    labs=utils.parseLabelFiles(path+'/labels/labels','mouth_opening',fileNames,cutoffSeq='.png',suffix='_face0.labels')
    
    
    
    testSet = fg.dataContainer(labs)
    
    
    roi=(0,37,0,115)
    roi=None
    filters = None
    meanI = None    
    #roi=(44,84,88,168)    
    
    
#    eM=np.load('/home/attale00/Desktop/mouthMask.npy')
#    m=cv2.resize(np.uint8(eM),(256,256));
#    strel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE,(3,3))
#    dil = cv2.dilate(m,strel)
#    
#    m=dil>0;


    components = 150
 
    X=fg.getAllImagesFlat(path_ea,testSet.fileNames,(40,120),roi=roi)
#    X=fg.getAllImagesFlat(path_ea,testSet.fileNames,(120,40),roi=roi,resizeFactor = .5)
# 
# perform ICA
    if mode not in ['s','v']:
        ica = FastICA(n_components=components,whiten=True)
        ica.fit(X)
        meanI=np.mean(X,axis=0)
        X1=X-meanI
        data=ica.transform(X1)
        filters=ica.components_
        
    elif mode in ['s','v']:
        W=np.load('/home/attale00/Desktop/classifiers/patches/filterMP1.npy')
        m=np.load('/home/attale00/Desktop/classifiers/patches/meanIMP1.npy')
        X1=X-m
        data=np.dot(X1,W.T)    
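        # projecting the mean-centred images onto the saved unmixing filters W is
        # roughly what ica.transform() would compute for new data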
    
    for i in range(len(fileNames)):
            testSet.data[i].extend(data[i,:])
    #orientations = 2

    #strel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE,(3,3))
    #fg.getHogFeature(testSet,roi,path=path_ea,ending='.png',extraMask = None,orientations = 5, cells_per_block=(3,3),pixels_per_cell=(24,8),maskFromAlpha=False)
    #fg.getColorHistogram(testSet,(0,40,40,80),path=path_ea,ending='.png',colorspace='lab',bins=bins)
    #fg.getImagePatchStat(testSet,path=path_ea,patchSize=(4,12))
    #fg.getImagePatchStat(testSet,path='/local/attale00/AFLW_cropped/mouth_img_error/',patchSize=(4,12))
  
    #pca
#    n_samples, n_features = X.shape
#
#    mean_ = np.mean(X, axis=0)
#    X -= mean_
#    U, S, V = linalg.svd(X)
#    explained_variance_ = (S ** 2) / n_samples
#    explained_variance_ratio_ = (explained_variance_ /explained_variance_.sum())
#    K=V / S[:, np.newaxis] * np.sqrt(n_samples)
#    filters=K[:100]
#    data=np.dot(X,filters.T)    
    
   
            
    
    testSet.targetNum=map(utils.mapMouthLabels2Two,testSet.target)
    rf=classifierUtils.standardRF(max_features = 23,min_split=15,max_depth=70,n_estimators=150)
    #rf=classifierUtils.standardRF(max_features = 5,min_split=12,max_depth=45)
    #rf = svm.NuSVC()
    #rf = linear_model.SGDClassifier(loss='perceptron', eta0=1, learning_rate='constant', penalty=None)    
    if mode in ['s','v']:
        print 'Classifying with loaded classifier'
        _classifyWithOld(path,testSet,mode)
    elif mode in ['c']:
        print 'cross validation of data'
        rValues = classifierUtils.dissectedCV(rf,testSet)
        pickle.dump(rValues,open('patches_cv_ica_{}'.format(components),'w'))
    elif mode in ['save']:
        print 'saving new classifier'
        _saveRF(testSet,rf,filters=filters,meanI=meanI)
    else:
        print 'not doing anything'
示例#57
0
print "Extracting the top %d eigenfaces from %d faces" % (n_components,
                                                          X_train.shape[0])
t0 = time()
# pca
# pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)
# ica
pca = FastICA(n_components=n_components, whiten=True).fit(X_train)
print "done in %0.3fs" % (time() - t0)

#print 'First:'+str(pca.explained_variance_ratio_[0])
#print 'Second:'+str(pca.explained_variance_ratio_[1])

eigenfaces = pca.components_.reshape((n_components, h, w))

print "Projecting the input data on the eigenfaces orthonormal basis"
t0 = time()
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print "done in %0.3fs" % (time() - t0)

###############################################################################
# Train a SVM classification model

print "Fitting the classifier to the training set"
t0 = time()
param_grid = {
    'C': [1e3, 5e3, 1e4, 5e4, 1e5],
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}
# for sklearn version 0.16 or prior, the class_weight parameter value is 'auto'
clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
clf = clf.fit(X_train_pca, y_train)
def test_fastica_simple(add_noise, seed):
    # Test the FastICA algorithm on very simple data.
    rng = np.random.RandomState(seed)
    # scipy.stats uses the global RNG:
    n_samples = 1000
    # Generate two sources:
    s1 = (2 * np.sin(np.linspace(0, 100, n_samples)) > 0) - 1
    s2 = stats.t.rvs(1, size=n_samples)
    s = np.c_[s1, s2].T
    center_and_norm(s)
    s1, s2 = s

    # Mixing angle
    phi = 0.6
    mixing = np.array([[np.cos(phi), np.sin(phi)],
                       [np.sin(phi), -np.cos(phi)]])
    m = np.dot(mixing, s)

    if add_noise:
        m += 0.1 * rng.randn(2, 1000)

    center_and_norm(m)

    # function as fun arg
    def g_test(x):
        return x ** 3, (3 * x ** 2).mean(axis=-1)

    algos = ['parallel', 'deflation']
    nls = ['logcosh', 'exp', 'cube', g_test]
    whitening = [True, False]
    for algo, nl, whiten in itertools.product(algos, nls, whitening):
        if whiten:
            k_, mixing_, s_ = fastica(m.T, fun=nl, algorithm=algo,
                                      random_state=rng)
            assert_raises(ValueError, fastica, m.T, fun=np.tanh,
                          algorithm=algo)
        else:
            pca = PCA(n_components=2, whiten=True, random_state=rng)
            X = pca.fit_transform(m.T)
            k_, mixing_, s_ = fastica(X, fun=nl, algorithm=algo, whiten=False,
                                      random_state=rng)
            assert_raises(ValueError, fastica, X, fun=np.tanh,
                          algorithm=algo)
        s_ = s_.T
        # Check that the mixing model described in the docstring holds:
        if whiten:
            assert_almost_equal(s_, np.dot(np.dot(mixing_, k_), m))

        center_and_norm(s_)
        s1_, s2_ = s_
        # Check to see if the sources have been estimated
        # in the wrong order
        if abs(np.dot(s1_, s2)) > abs(np.dot(s1_, s1)):
            s2_, s1_ = s_
        s1_ *= np.sign(np.dot(s1_, s1))
        s2_ *= np.sign(np.dot(s2_, s2))

        # Check that we have estimated the original sources
        if not add_noise:
            assert_almost_equal(np.dot(s1_, s1) / n_samples, 1, decimal=2)
            assert_almost_equal(np.dot(s2_, s2) / n_samples, 1, decimal=2)
        else:
            assert_almost_equal(np.dot(s1_, s1) / n_samples, 1, decimal=1)
            assert_almost_equal(np.dot(s2_, s2) / n_samples, 1, decimal=1)

    # Test FastICA class
    _, _, sources_fun = fastica(m.T, fun=nl, algorithm=algo,
                                random_state=seed)
    ica = FastICA(fun=nl, algorithm=algo, random_state=seed)
    sources = ica.fit_transform(m.T)
    assert_equal(ica.components_.shape, (2, 2))
    assert_equal(sources.shape, (1000, 2))

    assert_array_almost_equal(sources_fun, sources)
    assert_array_almost_equal(sources, ica.transform(m.T))

    assert_equal(ica.mixing_.shape, (2, 2))

    for fn in [np.tanh, "exp(-.5(x^2))"]:
        ica = FastICA(fun=fn, algorithm=algo)
        assert_raises(ValueError, ica.fit, m.T)

    assert_raises(TypeError, FastICA(fun=range(10)).fit, m.T)
示例#59
0
def main():
    # First, extract the physiological data eta_data and beta_data ---------------------------

    # change the working directory to where the data files live
    os.chdir(DATAPATH)
    # prepare a DataFrame to collect the data
    data_df = pd.DataFrame([])

    for i_sub in range(len(FILENAME_LIST)):
        # load the per-sheet data into DataFrames (this data is already standardized)
        mean_df = pd.read_excel(FILENAME_LIST[i_sub],
                                sheet_name="mean").drop("Statistics", axis=1)
        max_df = pd.read_excel(FILENAME_LIST[i_sub],
                               sheet_name="max").drop("Statistics", axis=1)
        min_df = pd.read_excel(FILENAME_LIST[i_sub],
                               sheet_name="min").drop("Statistics", axis=1)
        std_df = pd.read_excel(FILENAME_LIST[i_sub],
                               sheet_name="std").drop("Statistics", axis=1)
        # arrange [mean, max, min, std] side by side (horizontally)
        df = pd.concat([
            mean_df,
            max_df.drop("Task", axis=1),
            min_df.drop("Task", axis=1),
            std_df.drop("Task", axis=1)
        ],
                       axis=1,
                       sort=False)
        # stack each subject's results vertically (standardizing along the way)
        data_df = data_df.append(df)

    # drop the task number
    data2_df = data_df.drop(["Task"], axis=1)
    data2_df.columns = COLUMNS48

    # standardize the data column-wise (standardization of the already-standardized values)
    stan_data = scipy.stats.zscore(data2_df, axis=0)
    # convert back to a DataFrame
    stan_data_df = pd.DataFrame(stan_data, columns=COLUMNS48)

    # extract the viscosity and stiffness columns separately
    eta_data_df = stan_data_df.iloc[:, [0, 6, 12, 18, 24, 30, 36, 42]]
    beta_data_df = stan_data_df.iloc[:, [1, 7, 13, 19, 25, 31, 37, 43]]
    eta_data = eta_data_df.values
    beta_data = beta_data_df.values

    # -------------------------------------------------------------------------

    # Next, extract the subjective-evaluation data q_stan_data ------------------------------
    os.chdir(DATAPATH2)  # change the working directory to where the data files live
    # prepare a DataFrame to collect the data
    q_data_df = pd.DataFrame([])
    for i_sub in range(len(FILENAME_LIST2)):
        # load the data into a DataFrame
        q_df = pd.read_excel(FILENAME_LIST2[i_sub])
        # stack each subject's results vertically (standardizing along the way)
        q_data_df = q_data_df.append(arrange_data(q_df, i_sub))

    # drop the task number and the stimulation type
    q_data2_df = q_data_df.drop(["No", "Stimulation"], axis=1)

    # standardize the data column-wise (standardization of the already-standardized values)
    q_stan_data = scipy.stats.zscore(q_data2_df, axis=0)
    # -------------------------------------------------------------------------

    # prepare an ndarray of the stimulation types
    odor = q_data_df["Stimulation"].values.tolist()
    odor = np.reshape(odor, (len(odor), 1))  # stimulation type

    # run PCA (the return values are ndarrays)
    un_score, non_score = mypca2(q_stan_data, eta_data, beta_data, odor)

    # run ICA
    # prepare the data
    ica_data = np.vstack((un_score[:, [1, 2]], non_score[:, [1, 2]]))
    # independent component analysis with FastICA
    ica = FastICA()
    ica.fit(ica_data)
    Uica = ica.components_.T
    Aica = ica.transform(ica_data).T

    Uica = Uica / np.sqrt((Uica**2).sum(axis=0))

    un = Aica[:, 0:len(un_score)]
    non = Aica[:, len(un_score):]
    # draw the scatter plot
    plt.figure(figsize=(5, 5))
    plt.scatter(un[0],
                un[1],
                s=80,
                c=[0.4, 0.6, 0.9],
                alpha=0.8,
                linewidths="1",
                edgecolors=[0, 0, 0])
    plt.scatter(non[0],
                non[1],
                s=80,
                c=[0.5, 0.5, 0.5],
                alpha=0.8,
                linewidths="1",
                edgecolors=[0, 0, 0])
    plt.title("ICA scatter", fontsize=18)

    # show the figure
    plt.tight_layout()  # avoid overlapping titles
    plt.show()

    # correlation analysis between PC1 of the subjective scores and the ICA result of the physiological data
    # set the figure size
    plt.figure(figsize=(10, 5))

    # first subplot
    plt.subplot(1, 2, 1)
    # plot the points
    plt.scatter(un_score[:, 0],
                un[0],
                s=80,
                c=[0.4, 0.6, 0.9],
                alpha=0.8,
                linewidths="1",
                edgecolors=[0, 0, 0])
    plt.scatter(non_score[:, 0],
                non[0],
                s=80,
                c=[0.5, 0.5, 0.5],
                alpha=0.8,
                linewidths="1",
                edgecolors=[0, 0, 0])
    plt.title("scatter", fontsize=18)
    plt.xlabel("Emotion_PC1", fontsize=18)
    plt.ylabel("ICA_1", fontsize=18)
    correlation_analysis(np.hstack((un_score[:, 0], non_score[:, 0])),
                         np.hstack((un[0], non[0])))

    # second subplot
    plt.subplot(1, 2, 2)
    # plot the points
    plt.scatter(un_score[:, 0],
                un[1],
                s=80,
                c=[0.4, 0.6, 0.9],
                alpha=0.8,
                linewidths="1",
                edgecolors=[0, 0, 0])
    plt.scatter(non_score[:, 0],
                non[1],
                s=80,
                c=[0.5, 0.5, 0.5],
                alpha=0.8,
                linewidths="1",
                edgecolors=[0, 0, 0])
    plt.title("scatter", fontsize=18)
    plt.xlabel("Emotion_PC1", fontsize=18)
    plt.ylabel("ICA_2", fontsize=18)
    correlation_analysis(np.hstack((un_score[:, 0], non_score[:, 0])),
                         np.hstack((un[1], non[1])))

    # show the figure
    plt.tight_layout()  # avoid overlapping titles
    plt.show()
    """