Example #1
def scikit_pca(model, rel_wds, plot_lims, title, cluster="kmeans"):
    """
    Given a word2vec model and a clustering method (choice of "kmeans" or
    "spectral"), make a plot of all word vectors in the model.
    """
    X, keys = make_data_matrix(model)

    for i, key in enumerate(keys):
        X[i,] = model[key]

    if cluster == "kmeans":
        k_means = KMeans(n_clusters=8)
        labels = k_means.fit_predict(X)

    elif cluster == "spectral":
        sp_clust = SpectralClustering()
        labels = sp_clust.fit_predict(X)

    # PCA
    X_std = StandardScaler().fit_transform(X)
    sklearn_pca = PCA(n_components=2)
    X_transf = sklearn_pca.fit_transform(X_std)

    scatter_plot(X_transf[:,0], X_transf[:,1],  rel_wds, labels, title, keys, plot_lims)

    return sklearn_pca.explained_variance_ratio_
Example #2
def plot_data(data, has_label=True):
	import numpy as np
	import pandas as pd
	import seaborn as sns
	import matplotlib.pyplot as plt
	from sklearn.manifold import TSNE
	from sklearn.decomposition import PCA

	if not has_label:
		data = data.copy()
		data['label'] = np.zeros(len(data))

	LIMIT = 4000
	if data.shape[0] > LIMIT:
		dt = data.sample(n=LIMIT, replace=False)
		X = dt.iloc[:, :-1]
		labels = dt.iloc[:, -1]
	else:
		X = data.iloc[:, :-1]
		labels = data.iloc[:, -1]

	tsne_model = TSNE(n_components=2, random_state=0)
	np.set_printoptions(suppress=True)
	points1 = tsne_model.fit_transform(X)
	df1 = pd.DataFrame(data=np.column_stack([points1,labels]), columns=["x","y","class"])
	sns.lmplot("x", "y", data=df1, hue='class', fit_reg=False, palette=sns.color_palette('colorblind'))
	sns.plt.title('TNSE')

	pca = PCA(n_components=2)
	pca.fit(X)
	points2 = pca.transform(X)
	df2 = pd.DataFrame(data=np.column_stack([points2,labels]), columns=["x","y","class"])
	sns.lmplot("x", "y", data=df2, hue='class', fit_reg=False, palette=sns.color_palette('colorblind'))
	sns.plt.title('PCA')
Example #3
def _analyze_pca(csv_filepath):
    """
    Analyze how much data can be compressed.

    Parameters
    ----------
    csv_filepath : str
        Path relative to dataset_path to a CSV file which points to images
    """
    from sklearn.decomposition import PCA
    import itertools as it

    symbol_id2index = generate_index(csv_filepath)
    data, y = load_images(csv_filepath, symbol_id2index, one_hot=False)
    data = data.reshape(data.shape[0], data.shape[1] * data.shape[2])
    pca = PCA()
    pca.fit(data)
    sum_ = 0.0
    done_values = [None, None, None]
    done_points = [False, False, False]
    chck_points = [0.9, 0.95, 0.99]
    for counter, el in enumerate(pca.explained_variance_ratio_):
        sum_ += el
        for check_point, done, i in zip(chck_points, done_points, it.count()):
            if not done and sum_ >= check_point:
                done_points[i] = counter + 1  # number of components, not the index
                done_values[i] = sum_
    for components, variance in zip(done_points, done_values):
        print("%i components explain %0.2f of the variance" %
              (components, variance))
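The loop above walks explained_variance_ratio_ by hand to find how many components reach 90/95/99% of the variance. As a side note, scikit-learn can do the same selection directly when n_components is given as a float in (0, 1); a minimal sketch, using a random matrix purely as a stand-in for the flattened image data:

import numpy as np
from sklearn.decomposition import PCA

X_demo = np.random.rand(200, 50)  # stand-in for the flattened images
for target in (0.90, 0.95, 0.99):
    pca_demo = PCA(n_components=target, svd_solver='full')
    pca_demo.fit(X_demo)
    print("%i components explain %0.2f of the variance" %
          (pca_demo.n_components_, pca_demo.explained_variance_ratio_.sum()))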
Example #4
def fit_pca(trajs):
    print('fitting PCA...')
    pca = PCA(2, copy=True, whiten=False)
    X = np.vstack(list(trajs.values()))
    pca.fit(X)
    print('done')
    return pca
Example #5
    def pcafunction(dataList,countList,nameList):
        from sklearn.decomposition import PCA
        import matplotlib.pyplot as plt

        pcadataArray = np.array(dataList)
        pcaCountArray = np.array(countList)
        pca = PCA(n_components=2)
        X = pca.fit(pcadataArray).transform(pcadataArray)
        
        pcaNameList = []
        
        for i in range(0,len(nameList)):
            if nameList[i] not in pcaNameList:
                pcaNameList.append(nameList[i])

        print('explained variance ratio (first two components): %s'
              % str(pca.explained_variance_ratio_))
        
        plt.plot(X[pcaCountArray == 0, 0], X[pcaCountArray == 0, 1], 'or',
                 X[pcaCountArray == 1, 0], X[pcaCountArray == 1, 1], '^b',
                 X[pcaCountArray == 2, 0], X[pcaCountArray == 2, 1], 'sg'
                 )
        plt.xlabel('PC1 (explained variance ratio: ' + str(pca.explained_variance_ratio_[0])+')',fontsize=14)
        plt.ylabel('PC2 (explained variance ratio: ' + str(pca.explained_variance_ratio_[1])+')',fontsize=14)
        plt.legend((str(pcaNameList[0]),str(pcaNameList[1])),loc='best',fontsize=14)
        plt.title('PCA',fontsize=16)
Example #6
def add_tsne_features(x_train, x_test):
    print('add_tsne_features <<')

    x_train_data = x_train.data_
    x_test_data = x_test.data_

    x = np.vstack((x_train_data, x_test_data))

    print('applying pca...')
    pca = PCA(n_components=25)
    x_pca = pca.fit_transform(x)

    print('applying t-SNE...')
    tsne_model = TSNE(n_components=2, random_state=0)
    x_tsne = tsne_model.fit_transform(x_pca)
    x_train_data = np.hstack((x_train_data, x_tsne[:x_train_data.shape[0], :]))
    x_test_data = np.hstack((x_test_data, x_tsne[-x_test_data.shape[0]:, :]))

    assert(x_train.columns_ == x_test.columns_)
    columns = x_train.columns_ + ['tsne_1', 'tsne_2']
    x_train = DataSet(x_train.ids_, columns, x_train_data)
    x_test = DataSet(x_test.ids_, columns, x_test_data)

    print('add_tsne_features >>')
    return x_train, x_test
Example #7
def test_pca():
    # PCA on dense arrays
    X = iris.data

    for n_comp in np.arange(X.shape[1]):
        pca = PCA(n_components=n_comp, svd_solver='full')

        X_r = pca.fit(X).transform(X)
        np.testing.assert_equal(X_r.shape[1], n_comp)

        X_r2 = pca.fit_transform(X)
        assert_array_almost_equal(X_r, X_r2)

        X_r = pca.transform(X)
        X_r2 = pca.fit_transform(X)
        assert_array_almost_equal(X_r, X_r2)

        # Test get_covariance and get_precision
        cov = pca.get_covariance()
        precision = pca.get_precision()
        assert_array_almost_equal(np.dot(cov, precision),
                                  np.eye(X.shape[1]), 12)

    # test explained_variance_ratio_ == 1 with all components
    pca = PCA(svd_solver='full')
    pca.fit(X)
    assert_almost_equal(pca.explained_variance_ratio_.sum(), 1.0, 3)
Example #8
def compute_pca(data2d):
    """
    Compute PCA using sklearn
    :param data2d: 2d array. PCA will be computed on non-zeros values.
    :return:
        coordsrc: 2d array: centered non-zero coordinates
        pca: object: PCA result.
        centermass: 2x1 array: 2d coordinates of the center of mass
    """
    # round it and make it int (otherwise we end up with values like 1e-7)
    data2d = data2d.round().astype(int)
    # get non-zero coordinates, and transpose to obtain nx2 dimensions
    coordsrc = np.array(data2d.nonzero()).T
    # get center of mass
    centermass = coordsrc.mean(0)
    # center data
    coordsrc = coordsrc - centermass
    # normalize data
    coordsrc /= coordsrc.std()
    # Performs PCA
    from sklearn.decomposition import PCA
    pca = PCA(n_components=2, copy=False, whiten=False)
    pca.fit(coordsrc)
    # pca_score = pca.explained_variance_ratio_
    # V = pca.components_
    return coordsrc, pca, centermass
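A quick usage sketch for compute_pca above (assuming numpy is imported as np, as in the function): build a small elongated blob and inspect the principal axes of its non-zero pixels.

toy = np.zeros((20, 20))
toy[5:15, 8:12] = 1                      # tall, narrow blob
coordsrc, pca, centermass = compute_pca(toy)
print(centermass)                        # 2d centre of mass of the blob
print(pca.components_[0])                # axis of largest spread (vertical here)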
Example #9
def make_tsne_plot(model, rel_wds, plot_lims, title):

    dim = 30
    X, keys = make_data_matrix(model)

    # first we actually do PCA to reduce the
    # dimensionality to make tSNE easier to calculate
    X_std = StandardScaler().fit_transform(X)
    sklearn_pca = PCA(n_components=dim)
    X = sklearn_pca.fit_transform(X_std)

    # do downsample
    k = 5000
    sample = []
    important_words = []
    r_wds = [word[0] for word in rel_wds]
    for i, key in enumerate(keys):
        if key in r_wds:
            sample.append(i)
    sample = np.concatenate((np.array(sample),
                np.random.choice(len(keys), k-10, replace = False),
             ))
    X = X[sample,:]
    keys = [keys[i] for i in sample]



    # Do tSNE
    tsne = TSNE(n_components=2, random_state=0, metric="cosine")
    X_transf = tsne.fit_transform(X)

    k_means = KMeans(n_clusters=8)
    labels = k_means.fit_predict(X_transf)

    scatter_plot(X_transf[:,0], X_transf[:,1],  rel_wds, labels, title, keys, plot_lims)
Example #10
def pca(df, n_components=2, mean_center=False, *args, **kwargs):
    if not sklearn:
        raise ImportError('This library depends on scikit-learn (sklearn) to perform PCA analysis')
        
    from sklearn.decomposition import PCA

    df = df.copy()
    
    # We have to zero fill, nan errors in PCA
    df[ np.isnan(df) ] = 0

    if mean_center:
        mean = np.mean(df.values, axis=0)
        df = df - mean

    pca = PCA(n_components=n_components, *args, **kwargs)
    pca.fit(df.values.T)

    scores = pd.DataFrame(pca.transform(df.values.T)).T
    scores.index =  ['Principal Component %d' % (n+1) for n in range(0, scores.shape[0])]
    scores.columns = df.columns

    weights = pd.DataFrame(pca.components_).T
    weights.index = df.index
    weights.columns =  ['Weights on Principal Component %d' % (n+1) for n in range(0, weights.shape[1])]
       
    return scores, weights
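A minimal usage sketch for the pca() helper above, assuming pandas and numpy are imported as pd and np and that the module-level sklearn flag it checks is truthy. The frame is random data laid out the way the function expects: features as rows, samples as columns.

df_demo = pd.DataFrame(np.random.rand(5, 10),
                       index=['feature%d' % i for i in range(5)],
                       columns=['sample%d' % i for i in range(10)])
scores, weights = pca(df_demo, n_components=2, mean_center=True)
print(scores.shape)    # (2, 10): one row per principal component
print(weights.shape)   # (5, 2): one weight per feature per component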
Example #11
    def fit(self, x, y, i=0):
        # if gaussian processes are being used, data dimensionality needs to be reduced before fitting
        if self.method[i] == 'GP':
            if self.reduce_dim == 'FastICA':
                print('Reducing dimensionality with ICA')
                do_ica = FastICA(n_components=self.n_components)
                self.do_reduce_dim = do_ica.fit(x)
            if self.reduce_dim == 'PCA':
                print('Reducing dimensionality with PCA')
                do_pca = PCA(n_components=self.n_components)
                self.do_reduce_dim = do_pca.fit(x)

            x = self.do_reduce_dim.transform(x)
        #try:
            print('Training model...')
        try:
            self.model.fit(x, y)
            self.goodfit = True
            print(self.model)
        except:
            self.goodfit = False
            if self.method[i] == 'GP':
                print('Model failed to train! (For GP this does not always indicate a problem, especially for low numbers of components.)')
                pass
            else:
                print('Model failed to train!')
                traceback.print_stack()

        if self.ransac:
            self.outliers = np.logical_not(self.model.inlier_mask_)
            print(str(np.sum(self.outliers)) + ' outliers removed with RANSAC')
Example #12
def main():
	
	inp=np.loadtxt('../../out_files/bivar_regress.txt', usecols=(1, 2, 3))

	X=inp[:,[1, 2]]
	
	ncomp=int(sys.argv[3])
	
	pca=PCA(n_components=ncomp)

	pca.fit(X)
	
	l=pca.transform(X)
	print "Doing an \t"+str(ncomp)+"\t component PCA \n\n----------------"
	
	#linear regression fit
	res=sm.OLS(inp[:,0], l).fit()
	
	t2_new=float(sys.argv[1])
	err_t2_new=float(sys.argv[2])
		
	#array for 1000 realisations with slope and slope error -0.0264 and 0.004
	ar=np.array([(rn(-0.0264, 0.004)*rn(pca.transform([rn(t2_new, err_t2_new)]), 0.85)+rn(np.mean(inp[:,0]), 0.07))/rn(2.0, 0.3) for k in range(1000)])
	
	print "The estimated L_max is\t "+ str(np.mean(ar)) 
	print "The error from the PCA is\t "+ str(np.std(ar))
	print  "Standard error on y mean is \t "+ str(np.std(inp[:,0])/np.sqrt(len(inp[:,0])))
	print "Error by bootstrapping is \t"+ str(np.std(boots(inp[:,0])))
class Transformer:
    def __init__(self, use_PCA=True):
        self._clf = DecisionTreeClassifier(min_samples_leaf=10)
        self._idx = None
        self._scaler = StandardScaler()
        self._trans = PCA('mle')
        self._use_PCA = use_PCA

    def fit(self, X, y):
        X = np.array(X)
        self._clf.fit(X, y)

        self._idx = [i for i in range(len(self._clf.feature_importances_))
                     if self._clf.feature_importances_[i] > 0]

        new_set = [X[i][self._idx] for i in range(len(X))]

#        new_set = self._scaler.fit_transform(new_set)

        if self._use_PCA:
            new_set = self._trans.fit_transform(new_set)
        return new_set

    def transform(self, features):
        features = features[self._idx]
#        features = self._scaler.transform(features.astype(float))
        if self._use_PCA:
            features = self._trans.transform(features)
        return features
Example #14
    def __init__(self):
        super(RegressionDriver, self).__init__()

        if REGRESSOR == "LOG":
            self.driver = LogisticRegression()
        elif REGRESSOR == "RFR":
            self.driver = RandomForestRegressor(n_estimators=N_ESTIMATORS, n_jobs=N_JOBS)
        elif REGRESSOR == "GBR":
            self.driver = GradientBoostingClassifier(n_estimators=300, max_depth=5, learning_rate=0.05)
        elif REGRESSOR == "PCA":
            self.driver = PCA(n_components=1)
        else:
            raise Exception("Regressor: %s not supported." % REGRESSOR)

        genuineX = []
        forgeryX = []

        genuineY = []
        forgeryY = []

        # Training process
        for sigs in self.train_set:
            personTrain = PersonTraining(sigs)
            genuine, forgery = personTrain.calc_train_set()
            genuineX.extend(genuine)
            forgeryX.extend(forgery)

        # To adjust PCA result, 0 means genuine and 1 means forgery
        genuineY = [0.0] * len(genuineX)
        forgeryY = [1.0] * len(forgeryX)

        trainX = genuineX + forgeryX
        trainY = genuineY + forgeryY

        self.driver.fit(trainX, trainY)
Example #15
def load_bipolar_cells(micronsPerDeg=50.):
    ''' Returns list of tuples (space, spatial receptive field)
    '''

    data_path, this_filename = os.path.split(__file__)
    file_name1 = data_path + '/data/B1.txt'
    file_name2 = data_path + '/data/B2.txt'
    data_b1    = np.loadtxt(file_name1, delimiter="\t") # 50 time x 100 space
    data_b2    = np.loadtxt(file_name2, delimiter="\t") # 50 time x 100 space
    data_b     = [data_b1, data_b2]

    # get spacing for all bipolar spatial receptive fields
    spatialDelta = 0.022 # mm

    # since receptive fields are noisy, use PCA
    spatial_rfs = []
    for b in data_b:
        pca = PCA(n_components=2)
        pca.fit(b)

        b_pca      = pca.components_[0]
        sign_of_pc = -1 * np.sign(b_pca[abs(b_pca) == np.max(abs(b_pca))])
        space      = get_space(b_pca, spatialDelta, micronsPerDeg)

        spatial_rfs.append((space, sign_of_pc * b_pca))

    return spatial_rfs
Example #16
def Ploting3D(data, n_dimension=3):
    pca = PCA(n_components = n_dimension)
    colors = ['r', 'g', 'b', 'm']
    labels = ['label_1', 'label_2', 'label_3', 'label_4']
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')

    idx = [0, len(data[0])]
    combined = np.array(data[0])

    # Combined all data
    for i in range(1, len(data)):
        combined = np.insert(combined, len(combined), data[i], axis=0)
        idx.append(idx[i]+len(data[i]))

    combined = pca.fit_transform(combined)

    for i in range(len(data)):
        ax.scatter(combined[idx[i]:idx[i+1], 0], combined[idx[i]:idx[i+1], 1], combined[idx[i]:idx[i+1], 2], c=colors[i], marker='o', s=70)


    ax.set_xlabel('1st_component')
    ax.set_ylabel('2nd_component')
    ax.set_zlabel('3rd_component')

    ax.set_xlim3d(-100, 100)
    ax.set_ylim3d(-60, 50)
    ax.set_zlim3d(-60, 50)

    plt.show()
Example #17
 def pc_analysis(self):
     pca_result = {}
     pca = PCA(n_components=2)
     pca_result['result'] = self.pca_reduced = pca.fit_transform(self.player_stats, self.player_value)
     pca_result['ratios'] = pca.explained_variance_ratio_
     pca_result['components'] = pca.components_
     return pca_result
Example #18
def load_ganglion_cells(micronsPerDeg=50., pca_mode='space'):
    ''' Returns list of tuples (space, spatial receptive field)
    '''

    data_path, this_filename = os.path.split(__file__)
    filename = data_path + '/data/allGC.txt'
    data_gc   = np.loadtxt(filename, delimiter="\t")
    data_gc   = data_gc.reshape((100,80,28))
    nCells    = data_gc.shape[2]

    # get spacing for spatial receptive fields
    spatialDelta = 0.027 # mm

    # since receptive fields are noisy, use PCA
    spatial_rfs = []
    for n in range(nCells):
        pca = PCA(n_components=2)
        if pca_mode == 'space':
            pca.fit(data_gc[:,:,n])
            g_pca = pca.components_[0]
        elif pca_mode == 'time':
            pca.fit(data_gc[:,:,n].T)
            g_pca = np.dot(data_gc[:,:,n].T, pca.components_[0])


        sign_of_pc = -1 * np.sign(g_pca[abs(g_pca) == np.max(abs(g_pca))])
        space      = get_space(g_pca, spatialDelta, micronsPerDeg)

        spatial_rfs.append((space, sign_of_pc * g_pca))

    return spatial_rfs
Example #19
def main():
  x = 10
  y = 10
  steps = 10000
  history = []
  world = np.array([i for i in range(625)])
  world.resize((25, 25))
  for _ in range(steps):
    active = getActive(world, x, y)
    assert len(active) == 25, "{}, {}: {}".format(x, y, active)
    history.append(active)
    x, y = getNewLocation(x, y, 25, 2, False)
  correlation = computeCorrelation(history)

  #plt.imshow(correlation, cmap="hot", interpolation="nearest")
  #plt.show()

  pca = PCA(n_components=25)
  pca.fit(correlation)
  print('components')
  print(pca.components_)
  #negativeMask = (pca.components_ < 0)
  #pca.components_[negativeMask] = 0
  print('transform:')
  transform = pca.transform(correlation)
  #negativeMask = (transform < 0)
  #transform[negativeMask] = 0
  print(transform.shape)

  for i in range(25):
    plt.imshow(transform[:,i].reshape((25, 25)), cmap="hot", interpolation="nearest")
    plt.show()
Example #20
def pca(inF,MIN):
    df = pd.read_table(inF, header=0)
    dc = list(df.columns)
    dc[0]='GeneID'
    df.columns = dc
    
    print(df.shape)
    sel = True 
    for i in range(4, df.shape[1]-1):
        sel = (df.iloc[:,i] < MIN) & (df.iloc[:,i+1] < MIN)
    df = df.loc[~sel,:]
    print(df.shape)
    
    X = df.iloc[:,4:df.shape[1]].values.T
    y = df.columns[4:df.shape[1]].values
    X_std = StandardScaler().fit_transform(X)
    
    #pca = PCA(n_components=2)
    pca = PCA()
    Y_sklearn = pca.fit_transform(X_std)
    
    
    fig = plt.figure()
    plt.style.use('ggplot')
    #plt.style.use('seaborn-whitegrid')
    ax = fig.add_subplot(111)
    for lab, col in zip(y,['r','g','b','c'] + sns.color_palette("cubehelix", df.shape[1]-4-4)):
        ax.scatter(Y_sklearn[y==lab, 0],Y_sklearn[y==lab, 1],label=lab,c=col, s=80)
    
    
    ax.set_xlabel('Principal Component 1 : %.2f'%(pca.explained_variance_ratio_[0]*100) + '%')
    ax.set_ylabel('Principal Component 2 : %.2f'%(pca.explained_variance_ratio_[1]*100) + '%')
    ax.legend(loc='upper right', prop={'size':8})
    plt.tight_layout()
    plt.savefig(inF + '-MIN' + str(MIN) + '.pdf')
Example #21
def plot_2d_results(X, y, preds):
    pca = PCA(n_components=2)
    X_r = pca.fit(X).transform(X)

    # Plot scatter
    plt.figure()
    cs = "cm"
    cats = [1, -1]
    target_names = ["positive", "negative"]
    for c, i, target_name in zip(cs, cats, target_names):
        plt.scatter(X_r[y == i, 0], X_r[y == i, 1], c=c, label=target_name)
    plt.legend()
    plt.title("PCA of 2d data")
    plt.savefig("figures/data-scatter.png")

    # Plot mispredictions
    plt.figure()
    diff = np.array([1 if y[i] == preds[i] else 0 for i in range(len(y))])
    cs = "rg"
    cats = [0, 1]
    target_names = ["incorrect", "correct"]
    for c, i, target_name in zip(cs, cats, target_names):
        plt.scatter(X_r[diff == i, 0], X_r[diff == i, 1], c=c, label=target_name)
        plt.legend()
        plt.title("PCA of correct/incorrect predictions")
    # plt.show()
    plt.savefig("figures/residual-scatter.png")
def pca_variance(df):  # inputs are original data frame
    df_pca = PCA()
    df_pca.fit(df)
    ratio = df_pca.explained_variance_ratio_
    components = [('component'+str(x)) for x in range(1, (df.shape[1]+1))]
    df2 = pd.Series(ratio, index = components)
    return df2
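A quick usage sketch for pca_variance above, assuming pandas and numpy are imported as pd and np; the frame is random data for illustration only.

demo = pd.DataFrame(np.random.rand(100, 4), columns=['a', 'b', 'c', 'd'])
print(pca_variance(demo))   # Series indexed component1..component4, summing to 1.0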
Example #23
def classification_level_SGDReg_pipeline(classifications_DF):
   X = classifications_DF.iloc[:,3:89]
   #assign the target (session length) to y and convert to int
   y_actual = classifications_DF.iloc[:,2:3].astype(float)

   #scaling the data for feature selection
   X_scaled = preprocessing.scale(X)

   X_scaled_train, X_scaled_test, y_actual_train, y_actual_test = train_test_split(X_scaled, y_actual, test_size=0.5, random_state=0)

   pca_selection = PCA(n_components=2)

   X_features = pca_selection.fit(X_scaled_train).transform(X_scaled_train)

   SGDReg = SGDRegressor(alpha=0.0001)

   # Do grid search over k, n_components and SVR parameters:
   pipeline = Pipeline([('pca', pca_selection),('SGDReg',SGDReg)])

   tuned_params = dict(pca__n_components=[5,30,40,50],
                     SGDReg__alpha=[0.1,0.01,0.001,0.0001,0.00001],
                     SGDReg__l1_ratio=[.05, .15, .5, .7, .9, .95, .99, 1],
                     SGDReg__penalty=['l2','l1','elasticnet'])

   grid_search = GridSearchCV(pipeline, param_grid=tuned_params, scoring='neg_mean_squared_error', cv=3, verbose=10)
   grid_search.fit(X_scaled_train, y_actual_train['session_length'].values)
   print(grid_search.best_estimator_)
   y_true, y_pred = y_actual_test['session_length'].values,grid_search.best_estimator_.predict(X_scaled_test)
   print "Mean squared error:"+str(mean_squared_error(y_true,y_pred))
   pd.DataFrame(y_true, y_pred).to_csv("SGDReg_pred_true.csv")
Example #24
def sentence_to_vec(sentence_list: List[Sentence], embedding_size: int, a: float=1e-3):
    sentence_set = []
    for sentence in sentence_list:
        vs = np.zeros(embedding_size)  # add all word2vec values into one vector for the sentence
        sentence_length = sentence.len()
        for word in sentence.word_list:
            a_value = a / (a + get_word_frequency(word.text))  # smooth inverse frequency, SIF
            vs = np.add(vs, np.multiply(a_value, word.vector))  # vs += sif * word_vector

        vs = np.divide(vs, sentence_length)  # weighted average
        sentence_set.append(vs)  # add to our existing re-calculated set of sentences

    # calculate PCA of this sentence set
    pca = PCA(n_components=embedding_size)
    pca.fit(np.array(sentence_set))
    u = pca.components_[0]  # the PCA vector
    u = np.multiply(u, np.transpose(u))  # u x uT

    # pad the vector?  (occurs if we have less sentences than embeddings_size)
    if len(u) < embedding_size:
        for i in range(embedding_size - len(u)):
            u = np.append(u, 0)  # add needed extension for multiplication below

    # resulting sentence vectors, vs = vs -u x uT x vs
    sentence_vecs = []
    for vs in sentence_set:
        sub = np.multiply(u,vs)
        sentence_vecs.append(np.subtract(vs, sub))

    return sentence_vecs
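The comments above describe the SIF post-processing step of removing each sentence vector's component along the first principal direction (vs = vs - u uT vs). A minimal, self-contained sketch of just that removal step, written with an explicit unit-norm projection rather than the element-wise form used above; the matrix is random and stands in for the weighted-average sentence vectors.

import numpy as np
from sklearn.decomposition import PCA

S = np.random.rand(20, 300)               # 20 sentence vectors of dimension 300
u = PCA(n_components=1).fit(S).components_[0]
u = u / np.linalg.norm(u)                 # unit-length first principal component
S_sif = S - np.outer(S @ u, u)            # subtract each vector's projection on u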
Example #25
def test_feature_union_weights():
    # test feature union with transformer weights
    iris = load_iris()
    X = iris.data
    y = iris.target
    pca = PCA(n_components=2, svd_solver='randomized', random_state=0)
    select = SelectKBest(k=1)
    # test using fit followed by transform
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    # test using fit_transform
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    X_fit_transformed = fs.fit_transform(X, y)
    # test it works with transformers missing fit_transform
    fs = FeatureUnion([("mock", Transf()), ("pca", pca), ("select", select)],
                      transformer_weights={"mock": 10})
    X_fit_transformed_wo_method = fs.fit_transform(X, y)
    # check against expected result

    # We use a different pca object to control the random_state stream
    assert_array_almost_equal(X_transformed[:, :-1], 10 * pca.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
    assert_array_almost_equal(X_fit_transformed[:, :-1],
                              10 * pca.fit_transform(X))
    assert_array_equal(X_fit_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
    assert_equal(X_fit_transformed_wo_method.shape, (X.shape[0], 7))
Example #26
def pca(inF,MIN):
    df = pd.read_table(inF, header=0)
    dc = list(df.columns)
    dc[0]='GeneID'
    df.columns = dc
    
    print(df.shape)
    sel = ~((df.iloc[:,2] < MIN) & (df.iloc[:,3]< MIN) & (df.iloc[:,4]< MIN) & (df.iloc[:,5]< MIN) & (df.iloc[:,6]< MIN) & (df.iloc[:,7]< MIN) & (df.iloc[:,8]< MIN) & (df.iloc[:,9]< MIN))
    df = df.loc[sel,:]
    print(df.shape)
    
    X = df.iloc[:,2:df.shape[1]].values.T
    y = df.columns[2:df.shape[1]].values
    X_std = StandardScaler().fit_transform(X)
    
    #pca = PCA(n_components=2)
    pca = PCA()
    Y_sklearn = pca.fit_transform(X_std)
    
    
    fig = plt.figure()
    plt.style.use('ggplot')
    #plt.style.use('seaborn-whitegrid')
    ax = fig.add_subplot(111)
    for lab, col in zip(y,('red','red', 'green','green', 'blue','blue','m','m')):
        ax.scatter(Y_sklearn[y==lab, 0],Y_sklearn[y==lab, 1],label=lab,c=col, s=80)
    
    
    ax.set_xlabel('Principal Component 1 : %.2f'%(pca.explained_variance_ratio_[0]*100) + '%')
    ax.set_ylabel('Principal Component 2 : %.2f'%(pca.explained_variance_ratio_[1]*100) + '%')
    ax.legend(loc='lower right', prop={'size':8})
    plt.tight_layout()
    plt.savefig(inF + '-RNASeq-MIN' + str(MIN) + '.pdf')
def reduced_dimension(posture):
    i_user = 1
    session = 1
    while i_user <= 31:
        currentdirectory = os.getcwd()  # get the directory.
        parentdirectory = os.path.abspath(currentdirectory + "/../..")  # Get the parent directory(2 levels up)
        path = parentdirectory + '\Output Files\Reduced Dimensional Dataset/posture-'+str(posture)+'/GenuineUser'+str(i_user)+''
        if not os.path.exists(path):
            os.makedirs(path)

        while session <= 8:
            data = np.genfromtxt("../../Output Files/E2-Genuine User-Session Split/Posture-"+str(posture)+"/GenuineUser-"+str(i_user)+"/1-"+str(i_user)+"-"+str(posture)+"-"+str(session)+".csv", dtype=float, delimiter=",")

            userinformation = data[:, [0,1,2,3,4]]
            sample_train = data[:, [5,6,7,8,9,10,11,13,15,16,17,18,19,20,21]]
            scaler = preprocessing.MinMaxScaler().fit(sample_train)
            sample_train_scaled = scaler.transform(sample_train)

            pca = PCA(n_components=7)
            sample_train_pca = pca.fit(sample_train_scaled).transform(sample_train_scaled)

            completedata = np.column_stack((userinformation, sample_train_pca))


            np.savetxt("../../Output Files/Reduced Dimensional Dataset/Posture-"+str(posture)+"/GenuineUser"+str(i_user)+"/1-"+str(i_user)+"-"+str(posture)+"-"+str(session)+".csv", completedata, delimiter=',')

            session += 1
        session = 1
        i_user += 1
def pca_project(vecs, n_components=2, whiten=False):
    pca = PCA(n_components=n_components, whiten=whiten)
    vecs_projected = pca.fit_transform(vecs)
    print("=== PCA projection ===")
    print(pca.explained_variance_ratio_)
    print("chosen explained: %.2f" % np.sum(pca.explained_variance_ratio_[:n_components]))
    return vecs_projected
Example #29
def feature_scaled_nn_acc(mds, type):
    train, validation = validation_split(mds)
    # Multiply by 1 to convert to bool
    y_train = train['Up'] * 1
    X_train = train.drop('Up', axis=1)
    y_validation = validation['Up'] * 1
    X_validation = validation.drop('Up', axis=1)
    pre = PCA(n_components=19, whiten=True)
    X_train_pca = pre.fit_transform(X_train)
    X_validation_pca = pre.transform(X_validation)  # reuse the PCA fitted on the training set
    model = create_model(X_train_pca.shape[1], type)
    # Convert to Keras format
    y_train = to_categorical(y_train.values)
    y_validation = to_categorical(y_validation.values)
    model.fit(X_train_pca, y_train, epochs=5, batch_size=16)
    time.sleep(0.1)
    # Fit and guess
    guess_train = model.predict_classes(X_train_pca)
    guess_train = to_categorical(guess_train)

    guess_validation = model.predict_classes(X_validation_pca)
    guess_validation = to_categorical(guess_validation)

    train_acc = accuracy_score(y_train, guess_train)
    validation_acc = accuracy_score(y_validation, guess_validation)
    print "\n neural net train accuracy is {}".format(train_acc)
    print "\n neural net validation accuracy is {}".format(validation_acc)
    return guess_validation
Example #30
def cluster_kmeans():
    from sklearn.cluster import KMeans
    from sklearn.decomposition import PCA
    # import sklearn.decomposition.pca

    limit = 10000
    # X,real_labels=data_dict.get_training_set()
    filepath = '/home/wenjusun/bigdata/data/adult-income/adult.data'
    record_list = data_parser.parse_file_fetch_records(filepath, limit)
    X = np.array(data_parser.records_to_vector(record_list, enable_label=False))

    pca_estimator = PCA(n_components=1)

    X=pca_estimator.fit_transform(X)

    kmeans_model = KMeans(n_clusters=4).fit(X)
    labels = kmeans_model.labels_
    # print kmeans_model.cluster_centers_
    # print labels[:100]
    print(len(X), len(labels))
    print(labels[:40])
    # print array(real_labels)

    # count=0
    # for xLabel,eLabel in zip(X[-1],labels):
    #     if xLabel==eLabel:
    #         count +=1
    #
    # print "count=%d,ratio:%f" %(count,1.0*count/len(labels))
    # print np.sum(labels)
    plt.figure(1)
    plt.scatter(X,labels)
    plt.show()
Example #31
                 n_init=10,
                 random_state=0)

# create 'cluster' column
matrix['cluster'] = cluster.fit_predict(matrix[matrix.columns[1:]])
matrix.head()
# Code ends here

# --------------
# import packages
from sklearn.decomposition import PCA

# Code starts here

# initialize pca object with 2 components
pca = PCA(n_components=2, random_state=0)

# create 'x' and 'y' columns denoting observation locations in decomposed form
pca_points = pca.fit_transform(matrix[matrix.columns[1:]])
matrix['x'] = pca_points[:, 0]
matrix['y'] = pca_points[:, 1]
# dataframe to visualize clusters by customer names
clusters = matrix.iloc[:, [0, 33, 34, 35]]

# visualize clusters
clusters.plot.scatter(x='x', y='y', c='cluster', colormap='viridis')

# Code ends here

# --------------
# Code starts here
Example #32
    # print(f"x.shape:{x.shape}")
    # print(f"y.shape:{y.shape}")

    #1. load the data

    x_train,x_test,y_train,y_test=tts(x,y,train_size=0.8)

    # scale
    scale = StandardScaler()
    x_train = scale.fit_transform(x_train)
    x_test = scale.transform(x_test)

    #PCA

    pca = PCA(pca_n)
    x_train=pca.fit_transform(x_train)
    x_test=pca.transform(x_test)


    # multi-class labels, so one-hot encode with to_categorical

    y_train = np_utils.to_categorical(y_train)
    y_test = np_utils.to_categorical(y_test)

    # print(f"x_train.shape:{x_train.shape}")
    # print(f"y_train.shape:{y_train.shape}")
    
    
    
    
Example #33
clf = get_classifier(classifier_name, params)

# Classification
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=1234)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)

st.write("Classifier: {}".format(classifier_name))
st.write("Accuracy = {}".format(acc))

# Plot
pca = PCA(2)
X_projected = pca.fit_transform(X)

x1 = X_projected[:, 0]
x2 = X_projected[:, 1]

fig = plt.figure()
plt.scatter(x1, x2, c=y, alpha=0.8, cmap="viridis")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.colorbar()

st.pyplot(fig)
Example #34
        if random.random() < percent_training:
            training_labels.append(sample['label'])
            training_continuous.append(sample['continuous_features'])
            training_discrete.append(sample['discrete_features'])
            if sample['label']:
                percent_training_popular += 1
        else:
            testing_labels.append(sample['label'])
            testing_continuous.append(sample['continuous_features'])
            testing_discrete.append(sample['discrete_features'])

    percent_training_popular /= float(len(training_labels))

    training_continuous = np.array(training_continuous)
    testing_continuous = np.array(testing_continuous)
    pca = PCA(n_components=num_components)
    pca.fit(training_continuous)
    training_continuous = training_continuous.dot(np.transpose(
        pca.components_))
    testing_continuous = testing_continuous.dot(np.transpose(pca.components_))

    # pprint.pprint(pca.components_)

    cont_nb = GaussianNB()
    disc_nb = MultinomialNB()

    cont_nb = cont_nb.fit(training_continuous, training_labels)
    disc_nb = disc_nb.fit(training_discrete, training_labels)
    # Combine discrete and continuous by multiplying and handling double prior

    cont_pred = cont_nb.predict_proba(testing_continuous)
Example #35
print(peak)
evoked.plot(window_title="Evoked")



####################PCA & ICA######################

from mne.decoding import UnsupervisedSpatialFilter
from sklearn.decomposition import PCA, FastICA
import matplotlib.pyplot as plt
X = epochs.get_data()


#the number of channels == 30
print("==============PCA==================")
pca = UnsupervisedSpatialFilter(PCA(30), average=False)
pca_data = pca.fit_transform(X)
ev = mne.EvokedArray(np.mean(pca_data, axis=0),
                     mne.create_info(30, epochs.info['sfreq'],
                                     ch_types='eeg'), tmin=tmin)
ev.plot(show=False, window_title="PCA")



print("==============ICA==================")
ica = UnsupervisedSpatialFilter(FastICA(30), average=False)
ica_data = ica.fit_transform(X)
ev1 = mne.EvokedArray(np.mean(ica_data, axis=0),
                      mne.create_info(30, epochs.info['sfreq'],
                                      ch_types='eeg'), tmin=tmin)
ev1.plot(show=False, window_title='ICA')
def main_function(testCaseNumber):
    t1 = cv2.getTickCount()

    #Defining constants
    basePath = "./Images/"
    print "Example Number : ", testCaseNumber
    tNo = "1"
    pNo = "2"
    testCaseNumber = str(testCaseNumber)
    trainingImagePath = basePath + testCaseNumber + "/" + tNo + ".jpg"
    grayscaleImagePath = basePath + testCaseNumber + "/" + pNo + "G.jpg"
    outputImagePath = basePath + testCaseNumber + "/output.jpg"
    k = 5
    try:
        os.stat("./../temp/" + testCaseNumber + "/")
    except:
        os.mkdir("./../temp/" + testCaseNumber + "/")
    #Reading Training Image
    trainingImage = cv2.imread(trainingImagePath)
    trainingImage = cv2.cvtColor(trainingImage, cv2.COLOR_BGR2LAB)
    m, n, _ = trainingImage.shape
    print "Color Quantization : "
    #Preprocessing variable from image
    l = trainingImage[:, :, 0]
    a = trainingImage[:, :, 1]
    b = trainingImage[:, :, 2]

    scaler = preprocessing.MinMaxScaler()
    pca = PCA(32)

    qab, centroid = quantization(a, b, k)
    print centroid
    # with open('./../temp/'+testCaseNumber+'/centroids', 'w') as csvfile:
    # 	writer = csv.writer(csvfile)
    # 	[writer.writerow(r) for r in centroid]

    t2 = cv2.getTickCount()
    t = (t2 - t1) / cv2.getTickFrequency()
    print "Time for quantization : ", t, " seconds"

    print "Feature extraction : "
    feat, classes = getKeyPointFeatures(l, qab)
    print "Length of feature descriptor before PCA : ", len(feat[0])
    feat = scaler.fit_transform(feat)
    feat = pca.fit_transform(feat)
    print "Length of feature descriptor after PCA : ", len(feat[0])

    t3 = cv2.getTickCount()
    t = (t3 - t2) / cv2.getTickFrequency()
    print "Time for feature extraction : ", t, " seconds"

    print "Training : "
    svm_classifier = train(feat, classes, k)
    t4 = cv2.getTickCount()
    t = (t4 - t3) / cv2.getTickFrequency()
    print "Time for training: ", t, " seconds"

    print "Prediction : "
    grayscaleImage = cv2.imread(grayscaleImagePath, 0)
    outputImage, probabilityValues = predict(svm_classifier, grayscaleImage,
                                             centroid, scaler, pca)
    #Writing temporary objects to disk
    #Remove later
    #cv2.imwrite("./../temp/"+testCaseNumber+"/labTempOut.jpg",outputImage)
    #outputTempImageBGR = cv2.cvtColor(outputImage,cv2.COLOR_LAB2BGR)
    #cv2.imwrite("./../temp/"+testCaseNumber+"/BGRTempOut.jpg",outputTempImageBGR)
    #with open('./../temp/'+testCaseNumber+'/probVal', 'w') as csvfile:
    #	writer = csv.writer(csvfile)
    #	[writer.writerow(r) for r in probabilityValues]

    outputImage = postProcess(outputImage, centroid, probabilityValues)

    t5 = cv2.getTickCount()
    t = (t5 - t4) / cv2.getTickFrequency()
    print "Time for prediction : ", t, " seconds"
    t = (t5 - t1) / cv2.getTickFrequency()
    print "Total time : ", t, " seconds"
    outputImage = cv2.cvtColor(outputImage, cv2.COLOR_LAB2BGR)
    trainingImage = cv2.cvtColor(trainingImage, cv2.COLOR_LAB2BGR)
    cv2.imwrite(outputImagePath, outputImage)
    cv2.imshow("Training", trainingImage)
    cv2.imshow("Original", grayscaleImage)
    cv2.imshow("Predicted", outputImage)
    cv2.waitKey()
    cv2.destroyAllWindows()
Example #37
# turn each image into an r,g,b vector and run PCA on the 1000 data points

import os
import cv2
import numpy as np
from keras.models import load_model
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from keras.layers import Activation,Conv2D,Dense,Flatten,MaxPooling2D,Dropout
from keras.models import Sequential,load_model
from keras.utils.np_utils import to_categorical



pca=PCA(n_components=48)


img_file_name_list=os.listdir("./face_scratch_image/")
print(len(img_file_name_list))

# keep only the files that OpenCV can actually read
img_file_name_list = [
    name for name in img_file_name_list
    if cv2.imread(os.path.join("./face_scratch_image", name)) is not None
]
#print(img_file_name_list[0:2])


X_train=np.array([])
mushrooms = mushrooms[:150]

tree = CobwebTree()
mushrooms_no_class = [{a: mushroom[a] for a in mushroom
                       if a != 'classification'} for mushroom in mushrooms]

clusters = next(cluster(tree, mushrooms_no_class))

mushroom_class = [mushroom[a] for mushroom in mushrooms for a in mushroom
                  if a == 'classification']
ari = adjusted_rand_score(clusters, mushroom_class)

dv = DictVectorizer(sparse=False)
mushroom_X = dv.fit_transform(mushrooms_no_class)

pca = PCA(n_components=2)
mushroom_2d_x = pca.fit_transform(mushroom_X)

colors = ['b', 'g', 'r', 'y', 'k', 'c', 'm']
clust_set = {v: i for i, v in enumerate(list(set(clusters)))}
class_set = {v: i for i, v in enumerate(list(set(mushroom_class)))}

for class_idx, class_label in enumerate(class_set):
    x = [v[0] for i, v in enumerate(
        mushroom_2d_x) if mushroom_class[i] == class_label]
    y = [v[1] for i, v in enumerate(
        mushroom_2d_x) if mushroom_class[i] == class_label]
    c = [colors[clust_set[clusters[i]]] for i, v in enumerate(mushroom_2d_x) if
         mushroom_class[i] == class_label]
    plt.scatter(x, y, color=c, marker=r"$ {} $".format(
        class_label[0]), label=class_label)
 def model_selection(self):
     """
     hyperparameter tuning is performed using GridSearchCV
     technique uses cross-validation when applying the default values of a 5-fold cross validation 
     as a means of splitting the training data into a training and validation sets.
     model score is representen with the R-squared metrics
     """
     models = []
     models_1 = ["Ridge","Lasso","LinearRegression","PoissonRegressor"]
     models_2 = ["RandomForestRegressor","GradientBoostingRegressor"]
     model_3 = ["SVR"]
     models += models_1 + models_2 + model_3
     models_dictionary = {"Ridge":Ridge(),"Lasso":Lasso(),"LinearRegression":LinearRegression(fit_intercept=True),
                          "RandomForestRegressor":RandomForestRegressor(random_state=0),"GradientBoostingRegressor":GradientBoostingRegressor(random_state=0),
                         "SVR":SVR(epsilon=0.5),"PoissonRegressor":PoissonRegressor(max_iter=200)}
     models_score = {}
     
     
     # Tuning of parameters for regression by cross-validation
     # (the number of cross-validation folds is 5)
     
     for model in models:
         if model in models_1:
             
             pipe = Pipeline([
             ('scaler', StandardScaler()),
             ('reduce_dim', PCA()),
             ('regressor', models_dictionary[model])
             ])
             pipe = pipe.fit(self.X_train, self.y_train)
             n_features_to_test = np.arange(1, 13)
             alpha_to_test = 2.0**np.arange(-6, +6)
         
             if model == "LinearRegression":
                 params = {'reduce_dim__n_components': n_features_to_test,
                 'scaler' : [StandardScaler(), RobustScaler()]}
             else:
                 params = {'reduce_dim__n_components': n_features_to_test,
                 'regressor__alpha': alpha_to_test,
                 'scaler' : [StandardScaler(), RobustScaler()]}
             gridsearch = GridSearchCV(pipe, params, verbose=1).fit(self.X_train, self.y_train)
             
         elif model in models_2:
             
             if model == "RandomForestRegressor":
               
                 
                 model_estimator =models_dictionary[model]
                 params={'n_estimators':[20,30,40,60,80,100], 'max_depth': 
                 [5,10,15,20],'max_features':[2,5,8]}
                 
                  
             else:
                 model_estimator =  models_dictionary[model]
                 
                 params = {'learning_rate': [0.01,0.02,0.03,0.04],
                 'subsample'    : [0.9, 0.5, 0.2, 0.1],
                 'n_estimators' : [20,30,40,60,80,100],
                 'max_depth'    : [4,6,8,10]
                  }
             
             gridsearch = GridSearchCV(estimator = model_estimator,param_grid = params,n_jobs=-1).fit(self.X_train, self.y_train)
         else:
             parameters = {'gamma': [1e-4, 1e-3, 0.01, 0.1, 0.2, 0.5, 0.6, 0.9],'C': [1, 2.5, 5,7.5,10,15]}
             gridsearch = GridSearchCV(models_dictionary[model], parameters).fit(self.X_train, self.y_train)
          
         print(" Results from Grid Search:",model)
         print("\n The best estimator across ALL searched params:\n",gridsearch.best_estimator_)
         print("\n The best score across ALL searched params:\n",gridsearch.best_score_)
         print("\n The best parameters across ALL searched params:\n",gridsearch.best_params_)
         print('\n Final score is: ', gridsearch.score(self.X_test, self.y_test))
         print("")
         models_score[model] = gridsearch.score(self.X_test, self.y_test)
     self.models_score = models_score
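As the docstring above notes, the linear models are tuned through a scaler + PCA + regressor pipeline, with GridSearchCV addressing each step's parameters as <step_name>__<parameter>. A minimal self-contained sketch of that convention on a synthetic regression problem, mirroring the step names used above (the grid values are illustrative only):

from sklearn.datasets import make_regression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

X_demo, y_demo = make_regression(n_samples=200, n_features=13, noise=0.1, random_state=0)
pipe_demo = Pipeline([('scaler', StandardScaler()),
                      ('reduce_dim', PCA()),
                      ('regressor', Ridge())])
grid_demo = {'reduce_dim__n_components': [2, 5, 10],   # forwarded to the PCA step
             'regressor__alpha': [0.1, 1.0, 10.0]}     # forwarded to the Ridge step
search_demo = GridSearchCV(pipe_demo, grid_demo, cv=5).fit(X_demo, y_demo)
print(search_demo.best_params_, round(search_demo.best_score_, 3))   # default score is R-squared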
	ax.set_title(modelname, fontsize = 20)         #title
	targets = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
	colors = ['r', 'g', 'b']
	for target, color in zip(targets,colors): #new iterator with both variables combined (random)
		indicesToKeep = finalDf['target'] == target
		ax.scatter(finalDf.loc[indicesToKeep,'component 1'], finalDf.loc[indicesToKeep, 'component 2'] , c = color , s = 20)
	ax.legend(targets)
	ax.grid()

df.corr()#correlation

##PCA (linear)
#Transform a higher-dimensional set of possibly correlated features
#into a lower-dimensional set of linearly uncorrelated features.
from sklearn.decomposition import PCA
pca = PCA(n_components=2)#,whiten=True,random_state=20
#n_components - Number of components to keep
#When True (False by default) the components_ vectors are multiplied by the square root of n_samples 
#and then divided by the singular values to ensure uncorrelated outputs with unit component-wise 
#variances.
pc = pca.fit_transform(x)
plotgraph('PCA',pc)


#Nonlinear kernelPCA
from sklearn.decomposition import KernelPCA
kpca = KernelPCA(n_components = 2, kernel = 'rbf')
#kernel=linear,rbf(radial basis function),sigmoid,cosine
xkpca = kpca.fit_transform(x)
plotgraph('kernel pca',xkpca)
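The comments above contrast linear PCA with kernel PCA. A small illustrative sketch on a synthetic dataset where the difference shows up clearly: concentric circles stay entangled after linear PCA (which can only rotate them) but separate along the first component after an RBF kernel PCA. The gamma value is an arbitrary choice for this toy data.

import matplotlib.pyplot as plt
from sklearn.datasets import make_circles
from sklearn.decomposition import PCA, KernelPCA

Xc, yc = make_circles(n_samples=400, factor=0.3, noise=0.05, random_state=0)
x_lin = PCA(n_components=2).fit_transform(Xc)
x_rbf = KernelPCA(n_components=2, kernel='rbf', gamma=10).fit_transform(Xc)
fig, (ax1, ax2) = plt.subplots(1, 2)
ax1.scatter(x_lin[:, 0], x_lin[:, 1], c=yc); ax1.set_title('PCA')
ax2.scatter(x_rbf[:, 0], x_rbf[:, 1], c=yc); ax2.set_title('kernel PCA (rbf)')
plt.show()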
Example #41
def runSVM():
    # scale
    test_size = 0.4
    C = 1.0
    kernel = 'rbf'
    cancer_dim = 10
    dna_dim = 180
    for scale in params_svm['scale']:
        X_train, X_test, y_train, y_test = train_test_split(cancer_X, cancer_y, test_size=test_size)
        if scale:
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.fit_transform(X_test)
        start = time.time()
        model = SVC(C=C, kernel=kernel)
        model.fit(X_train, y_train)
        score = model.score(X_test, y_test)
        end = time.time()
        with open('cancer_scale.txt', 'a') as f:
            f.write("scale=%s, test_size=%f, C=%f, kernel=%s, cancer_dim=%d, score=%f, time=%fs\n" % (str(scale), test_size, C, kernel, cancer_dim, score, end - start))
        X_train, X_test, y_train, y_test = train_test_split(dna_X, dna_y, test_size=test_size)
        if scale:
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.fit_transform(X_test)
        start = time.time()
        model = SVC(C=C, kernel=kernel)
        model.fit(X_train, y_train)
        score = model.score(X_test, y_test)
        end = time.time()
        with open('dna_scale.txt', 'a') as f:
            f.write("scale=%s, test_size=%f, C=%f, kernel=%s, dna_dim=%d, score=%f, time=%fs\n" % (str(scale), test_size, C, kernel, dna_dim, score, end - start))
    
    scale = False
    # test_size
    C = 1.0
    kernel = 'rbf'
    cancer_dim = 10
    dna_dim = 180
    for test_size in params_svm['test_size']:
        X_train, X_test, y_train, y_test = train_test_split(cancer_X, cancer_y, test_size=test_size)
        if scale:
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.fit_transform(X_test)
        start = time.time()
        model = SVC(C=C, kernel=kernel)
        model.fit(X_train, y_train)
        score = model.score(X_test, y_test)
        end = time.time()
        with open('cancer_test_size.txt', 'a') as f:
            f.write("scale=%s, test_size=%f, C=%f, kernel=%s, cancer_dim=%d, score=%f, time=%fs\n" % (str(scale), test_size, C, kernel, cancer_dim, score, end - start))
        X_train, X_test, y_train, y_test = train_test_split(dna_X, dna_y, test_size=test_size)
        if scale:
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.fit_transform(X_test)
        start = time.time()
        model = SVC(C=C, kernel=kernel)
        model.fit(X_train, y_train)
        score = model.score(X_test, y_test)
        end = time.time()
        with open('dna_test_size.txt', 'a') as f:
            f.write("scale=%s, test_size=%f, C=%f, kernel=%s, dna_dim=%d, score=%f, time=%fs\n" % (str(scale), test_size, C, kernel, dna_dim, score, end - start))
    
    scale = False
    test_size = 0.4
    # C
    kernel = 'rbf'
    cancer_dim = 10
    dna_dim = 180
    for C in params_svm['C']:
        X_train, X_test, y_train, y_test = train_test_split(cancer_X, cancer_y, test_size=test_size)
        if scale:
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.fit_transform(X_test)
        start = time.time()
        model = SVC(C=C, kernel=kernel)
        model.fit(X_train, y_train)
        score = model.score(X_test, y_test)
        end = time.time()
        with open('cancer_C.txt', 'a') as f:
            f.write("scale=%s, test_size=%f, C=%f, kernel=%s, cancer_dim=%d, score=%f, time=%fs\n" % (str(scale), test_size, C, kernel, cancer_dim, score, end - start))
        X_train, X_test, y_train, y_test = train_test_split(dna_X, dna_y, test_size=test_size)
        if scale:
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.fit_transform(X_test)
        start = time.time()
        model = SVC(C=C, kernel=kernel)
        model.fit(X_train, y_train)
        score = model.score(X_test, y_test)
        end = time.time()
        with open('dna_C.txt', 'a') as f:
            f.write("scale=%s, test_size=%f, C=%f, kernel=%s, dna_dim=%d, score=%f, time=%fs\n" % (str(scale), test_size, C, kernel, dna_dim, score, end - start))
    
    scale = False
    test_size = 0.4
    C = 1.0
    # kernel
    cancer_dim = 10
    dna_dim = 180
    for kernel in params_svm['kernel']:
        X_train, X_test, y_train, y_test = train_test_split(cancer_X, cancer_y, test_size=test_size)
        if scale:
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.fit_transform(X_test)
        start = time.time()
        model = SVC(C=C, kernel=kernel)
        model.fit(X_train, y_train)
        score = model.score(X_test, y_test)
        end = time.time()
        with open('cancer_kernel.txt', 'a') as f:
            f.write("scale=%s, test_size=%f, C=%f, kernel=%s, cancer_dim=%d, score=%f, time=%fs\n" % (str(scale), test_size, C, kernel, cancer_dim, score, end - start))
        X_train, X_test, y_train, y_test = train_test_split(dna_X, dna_y, test_size=test_size)
        if scale:
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.fit_transform(X_test)
        start = time.time()
        model = SVC(C=C, kernel=kernel)
        model.fit(X_train, y_train)
        score = model.score(X_test, y_test)
        end = time.time()
        with open('dna_kernel.txt', 'a') as f:
            f.write("scale=%s, test_size=%f, C=%f, kernel=%s, dna_dim=%d, score=%f, time=%fs\n" % (str(scale), test_size, C, kernel, dna_dim, score, end - start))
    
    scale = False
    test_size = 0.4
    C = 1.0
    kernel = 'rbf'
    # cancer_dim
    # dna_dim
    for cancer_dim in params_svm['cancer_dim']:
        X_train, X_test, y_train, y_test = train_test_split(cancer_X, cancer_y, test_size=test_size)
        if scale:
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.fit_transform(X_test)
        start = time.time()
        if cancer_dim != 10:
            pca = PCA(n_components=cancer_dim)
            X_train_new = pca.fit_transform(X_train)
            X_test_new = pca.transform(X_test)
        else:
            X_train_new = X_train
            X_test_new = X_test
        if scale:
            scaler = StandardScaler()
            X_train_new = scaler.fit_transform(X_train_new)
            X_test_new = scaler.fit_transform(X_test_new)
        model = SVC(C=C, kernel=kernel)
        model.fit(X_train_new, y_train)
        score = model.score(X_test_new, y_test)
        end = time.time()
        with open('cancer_dim.txt', 'a') as f:
            f.write("scale=%s, test_size=%f, C=%f, kernel=%s, cancer_dim=%d, score=%f, time=%fs\n" % (str(scale), test_size, C, kernel, cancer_dim, score, end - start))
    for dna_dim in params_svm['dna_dim']:
        X_train, X_test, y_train, y_test = train_test_split(dna_X, dna_y, test_size=test_size)
        if scale:
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.fit_transform(X_test)
        start = time.time()
        if dna_dim != 180:
            pca = PCA(n_components=dna_dim)
            X_train_new = pca.fit_transform(X_train)
            X_test_new = pca.transform(X_test)
        else:
            X_train_new = X_train
            X_test_new = X_test
        if scale:
            scaler = StandardScaler()
            X_train_new = scaler.fit_transform(X_train_new)
            X_test_new = scaler.fit_transform(X_test_new)
        model = SVC(C=C, kernel=kernel)
        model.fit(X_train_new, y_train)
        score = model.score(X_test_new, y_test)
        end = time.time()
        with open('dna_dim.txt', 'a') as f:
            f.write("scale=%s, test_size=%f, C=%f, kernel=%s, dna_dim=%d, score=%f, time=%fs\n" % (str(scale), test_size, C, kernel, dna_dim, score, end - start))
    def Score(self):
        ## data 
        ########################################################################
        model_1 = RandomForestRegressor(max_depth=15,random_state=0)
        model_2 = LinearRegression(fit_intercept=True)
        model_3 = Ridge(alpha=5)
        model_4 = Lasso(alpha=10)
        model_5 = SVR(C=2.5, epsilon=0.5)
        model_6 = GradientBoostingRegressor(random_state=0)
        model_7 = PoissonRegressor()
        
        

        MSE = []
        R2 = []
        for mymodels in [model_1,model_2,model_3,model_4,model_5,model_6,model_7]:
            model_pipeline = Pipeline(steps=[('pre_processing',self.pre_process),('scaler', StandardScaler()),('reduce_dim', PCA()),
                                 ('model', mymodels)
                                 ])
            model_pipeline.fit(self.X_train,self.y_train)
            MSE.append(mean_squared_error(self.y_train,model_pipeline.predict(self.X_train))**0.5)
            R2.append(r2_score(self.y_train,model_pipeline.predict(self.X_train)))
    
        print(np.round(MSE,2))   
        print(np.round(R2,2))
activity = pd.read_csv('./evaluate/thirtydays_final.csv', delimiter=',')

#activity1 = pd.read_csv('./datas/new_novins.csv',  delimiter = ',')

#pd.to_datetime(activity['date'])
activity.dropna(inplace=True)
del activity['date']

# convert to standard form

s = StandardScaler().fit_transform(activity)
normalize = pd.DataFrame(data=s)
print(normalize.head())

#do the PCA
pca = PCA(n_components=3)
prin_Comp = pca.fit_transform(s)
prin_CompDf = pd.DataFrame(data=prin_Comp,
                           columns=['prin_comp1', 'prin_comp2', 'prin_comp3'])

prin_CompDf.head()

# Join the label to the data and un-comment 'for label data below'
# pca_data = pd.concat([prin_CompDf, activity[['0']]], axis = 1)
# print(pca_data.head(5))

# for no-label data
# normalize.to_csv('./datas/normalize.csv')
prin_CompDf.to_csv('./evaluate/thirtydays_feature.csv')
plt.semilogy(prin_CompDf, '--o')
plt.title('Feature after PCA')
Example #44
def runMLP():
    # scale
    test_size = 0.4
    layers = (100, )
    activation = 'relu'
    alpha = 0.0001
    lr = 'adaptive'
    cancer_dim = 10
    dna_dim = 180
    for scale in params_mlp['scale']:
        X_train, X_test, y_train, y_test = train_test_split(cancer_X, cancer_y, test_size=test_size)
        if scale:
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.fit_transform(X_test)
        start = time.time()
        model = MLPClassifier(hidden_layer_sizes=layers, activation=activation, alpha=alpha, learning_rate=lr, max_iter=1000)
        model.fit(X_train, y_train)
        score = model.score(X_test, y_test)
        end = time.time()
        with open('cancer_scale.txt', 'a') as f:
            f.write("scale=%s, test_size=%f, layers=%s, activation=%s, alpha=%f, lr=%s, cancer_dim=%d, score=%f, time=%fs\n" % (str(scale), test_size, str(layers), activation, alpha, lr, cancer_dim, score, end - start))
        X_train, X_test, y_train, y_test = train_test_split(dna_X, dna_y, test_size=test_size)
        if scale:
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.fit_transform(X_test)
        start = time.time()
        model = MLPClassifier(hidden_layer_sizes=layers, activation=activation, alpha=alpha, learning_rate=lr, max_iter=1000)
        model.fit(X_train, y_train)
        score = model.score(X_test, y_test)
        end = time.time()
        with open('dna_scale.txt', 'a') as f:
            f.write("scale=%s, test_size=%f, layers=%s, activation=%s, alpha=%f, lr=%s, dna_dim=%d, score=%f, time=%fs\n" % (str(scale), test_size, str(layers), activation, alpha, lr, dna_dim, score, end - start))
    
    scale = False
    # test_size
    layers = (100, )
    activation = 'relu'
    alpha = 0.0001
    lr = 'adaptive'
    cancer_dim = 10
    dna_dim = 180
    for test_size in params_mlp['test_size']:
        X_train, X_test, y_train, y_test = train_test_split(cancer_X, cancer_y, test_size=test_size)
        if scale:
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.fit_transform(X_test)
        start = time.time()
        model = MLPClassifier(hidden_layer_sizes=layers, activation=activation, alpha=alpha, learning_rate=lr, max_iter=1000)
        model.fit(X_train, y_train)
        score = model.score(X_test, y_test)
        end = time.time()
        with open('cancer_test_size.txt', 'a') as f:
            f.write("scale=%s, test_size=%f, layers=%s, activation=%s, alpha=%f, lr=%s, cancer_dim=%d, score=%f, time=%fs\n" % (str(scale), test_size, str(layers), activation, alpha, lr, cancer_dim, score, end - start))
        X_train, X_test, y_train, y_test = train_test_split(dna_X, dna_y, test_size=test_size)
        if scale:
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)
        start = time.time()
        model = MLPClassifier(hidden_layer_sizes=layers, activation=activation, alpha=alpha, learning_rate=lr, max_iter=1000)
        model.fit(X_train, y_train)
        score = model.score(X_test, y_test)
        end = time.time()
        with open('dna_test_size.txt', 'a') as f:
            f.write("scale=%s, test_size=%f, layers=%s, activation=%s, alpha=%f, lr=%s, dna_dim=%d, score=%f, time=%fs\n" % (str(scale), test_size, str(layers), activation, alpha, lr, dna_dim, score, end - start))
    
    scale = False
    test_size = 0.4
    # layers
    activation = 'relu'
    alpha = 0.0001
    lr = 'adaptive'
    cancer_dim = 10
    dna_dim = 180
    for layers in params_mlp['layers']:
        X_train, X_test, y_train, y_test = train_test_split(cancer_X, cancer_y, test_size=test_size)
        if scale:
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)
        start = time.time()
        model = MLPClassifier(hidden_layer_sizes=layers, activation=activation, alpha=alpha, learning_rate=lr, max_iter=1000)
        model.fit(X_train, y_train)
        score = model.score(X_test, y_test)
        end = time.time()
        with open('cancer_layers.txt', 'a') as f:
            f.write("scale=%s, test_size=%f, layers=%s, activation=%s, alpha=%f, lr=%s, cancer_dim=%d, score=%f, time=%fs\n" % (str(scale), test_size, str(layers), activation, alpha, lr, cancer_dim, score, end - start))
        X_train, X_test, y_train, y_test = train_test_split(dna_X, dna_y, test_size=test_size)
        if scale:
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)
        start = time.time()
        model = MLPClassifier(hidden_layer_sizes=layers, activation=activation, alpha=alpha, learning_rate=lr, max_iter=1000)
        model.fit(X_train, y_train)
        score = model.score(X_test, y_test)
        end = time.time()
        with open('dna_layers.txt', 'a') as f:
            f.write("scale=%s, test_size=%f, layers=%s, activation=%s, alpha=%f, lr=%s, dna_dim=%d, score=%f, time=%fs\n" % (str(scale), test_size, str(layers), activation, alpha, lr, dna_dim, score, end - start))
    
    scale = False
    test_size = 0.4
    layers = (100, )
    # activation
    alpha = 0.0001
    lr = 'adaptive'
    cancer_dim = 10
    dna_dim = 180
    for activation in params_mlp['activation']:
        X_train, X_test, y_train, y_test = train_test_split(cancer_X, cancer_y, test_size=test_size)
        if scale:
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)
        start = time.time()
        model = MLPClassifier(hidden_layer_sizes=layers, activation=activation, alpha=alpha, learning_rate=lr, max_iter=1000)
        model.fit(X_train, y_train)
        score = model.score(X_test, y_test)
        end = time.time()
        with open('cancer_activation.txt', 'a') as f:
            f.write("scale=%s, test_size=%f, layers=%s, activation=%s, alpha=%f, lr=%s, cancer_dim=%d, score=%f, time=%fs\n" % (str(scale), test_size, str(layers), activation, alpha, lr, cancer_dim, score, end - start))
        X_train, X_test, y_train, y_test = train_test_split(dna_X, dna_y, test_size=test_size)
        if scale:
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)
        start = time.time()
        model = MLPClassifier(hidden_layer_sizes=layers, activation=activation, alpha=alpha, learning_rate=lr, max_iter=1000)
        model.fit(X_train, y_train)
        score = model.score(X_test, y_test)
        end = time.time()
        with open('dna_activation.txt', 'a') as f:
            f.write("scale=%s, test_size=%f, layers=%s, activation=%s, alpha=%f, lr=%s, dna_dim=%d, score=%f, time=%fs\n" % (str(scale), test_size, str(layers), activation, alpha, lr, dna_dim, score, end - start))
    
    scale = False
    test_size = 0.4
    layers = (100, )
    activation = 'relu'
    # alpha
    lr = 'adaptive'
    cancer_dim = 10
    dna_dim = 180
    for alpha in params_mlp['alpha']:
        X_train, X_test, y_train, y_test = train_test_split(cancer_X, cancer_y, test_size=test_size)
        if scale:
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)
        start = time.time()
        model = MLPClassifier(hidden_layer_sizes=layers, activation=activation, alpha=alpha, learning_rate=lr, max_iter=1000)
        model.fit(X_train, y_train)
        score = model.score(X_test, y_test)
        end = time.time()
        with open('cancer_alpha.txt', 'a') as f:
            f.write("scale=%s, test_size=%f, layers=%s, activation=%s, alpha=%f, lr=%s, cancer_dim=%d, score=%f, time=%fs\n" % (str(scale), test_size, str(layers), activation, alpha, lr, cancer_dim, score, end - start))
        X_train, X_test, y_train, y_test = train_test_split(dna_X, dna_y, test_size=test_size)
        if scale:
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)
        start = time.time()
        model = MLPClassifier(hidden_layer_sizes=layers, activation=activation, alpha=alpha, learning_rate=lr, max_iter=1000)
        model.fit(X_train, y_train)
        score = model.score(X_test, y_test)
        end = time.time()
        with open('dna_alpha.txt', 'a') as f:
            f.write("scale=%s, test_size=%f, layers=%s, activation=%s, alpha=%f, lr=%s, dna_dim=%d, score=%f, time=%fs\n" % (str(scale), test_size, str(layers), activation, alpha, lr, dna_dim, score, end - start))
    
    scale = False
    test_size = 0.4
    layers = (100, )
    activation = 'relu'
    alpha = 0.0001
    # lr
    cancer_dim = 10
    dna_dim = 180
    for lr in params_mlp['lr']:
        X_train, X_test, y_train, y_test = train_test_split(cancer_X, cancer_y, test_size=test_size)
        if scale:
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)
        start = time.time()
        model = MLPClassifier(hidden_layer_sizes=layers, activation=activation, alpha=alpha, learning_rate=lr, max_iter=1000)
        model.fit(X_train, y_train)
        score = model.score(X_test, y_test)
        end = time.time()
        with open('cancer_lr.txt', 'a') as f:
            f.write("scale=%s, test_size=%f, layers=%s, activation=%s, alpha=%f, lr=%s, cancer_dim=%d, score=%f, time=%fs\n" % (str(scale), test_size, str(layers), activation, alpha, lr, cancer_dim, score, end - start))
        X_train, X_test, y_train, y_test = train_test_split(dna_X, dna_y, test_size=test_size)
        if scale:
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)
        start = time.time()
        model = MLPClassifier(hidden_layer_sizes=layers, activation=activation, alpha=alpha, learning_rate=lr, max_iter=1000)
        model.fit(X_train, y_train)
        score = model.score(X_test, y_test)
        end = time.time()
        with open('dna_lr.txt', 'a') as f:
            f.write("scale=%s, test_size=%f, layers=%s, activation=%s, alpha=%f, lr=%s, dna_dim=%d, score=%f, time=%fs\n" % (str(scale), test_size, str(layers), activation, alpha, lr, dna_dim, score, end - start))
    
    scale = False
    test_size = 0.4
    layers = (100, )
    activation = 'relu'
    alpha = 0.0001
    lr = 'adaptive'
    # cancer_dim
    # dna_dim
    for cancer_dim in params_mlp['cancer_dim']:
        X_train, X_test, y_train, y_test = train_test_split(cancer_X, cancer_y, test_size=test_size)
        if scale:
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)
        start = time.time()
        if cancer_dim != 10:
            pca = PCA(n_components=cancer_dim)
            X_train_new = pca.fit_transform(X_train)
            X_test_new = pca.transform(X_test)
        else:
            X_train_new = X_train
            X_test_new = X_test
        if scale:
            scaler = StandardScaler()
            X_train_new = scaler.fit_transform(X_train_new)
            X_test_new = scaler.transform(X_test_new)
        model = MLPClassifier(hidden_layer_sizes=layers, activation=activation, alpha=alpha, learning_rate=lr, max_iter=1000)
        model.fit(X_train_new, y_train)
        score = model.score(X_test_new, y_test)
        end = time.time()
        with open('cancer_dim.txt', 'a') as f:
            f.write("scale=%s, test_size=%f, layers=%s, activation=%s, alpha=%f, lr=%s, cancer_dim=%d, score=%f, time=%fs\n" % (str(scale), test_size, str(layers), activation, alpha, lr, cancer_dim, score, end - start))
    for dna_dim in params_mlp['dna_dim']:
        X_train, X_test, y_train, y_test = train_test_split(dna_X, dna_y, test_size=test_size)
        if scale:
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)
        start = time.time()
        if dna_dim != 180:
            pca = PCA(n_components=dna_dim)
            X_train_new = pca.fit_transform(X_train)
            X_test_new = pca.transform(X_test)
        else:
            X_train_new = X_train
            X_test_new = X_test
        if scale:
            scaler = StandardScaler()
            X_train_new = scaler.fit_transform(X_train_new)
            X_test_new = scaler.transform(X_test_new)
        model = MLPClassifier(hidden_layer_sizes=layers, activation=activation, alpha=alpha, learning_rate=lr, max_iter=1000)
        model.fit(X_train_new, y_train)
        score = model.score(X_test_new, y_test)
        end = time.time()
        with open('dna_dim.txt', 'a') as f:
            f.write("scale=%s, test_size=%f, layers=%s, activation=%s, alpha=%f, lr=%s, dna_dim=%d, score=%f, time=%fs\n" % (str(scale), test_size, str(layers), activation, alpha, lr, dna_dim, score, end - start))
Пример #45
0
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.linear_model import RidgeCV

COLUMN_NAMES = ["GRE", "TOEFL", "University Rating", "SOP", "Recommendation", "CGPA", "Research"]


A = np.loadtxt("Admissions.csv", delimiter=",")
X = A[:, 0:-1] # Independent variables
y = A[:, -1] # Dependent variable (chance of acceptance)

print(X.shape)

# Plot PCA
plt.figure(figsize=(10, 10))
pca = PCA(n_components=2)
Y = pca.fit_transform(X)
plt.scatter(Y[:, 0], Y[:, 1], c=y)
plt.colorbar()

# Perform cross-validated ridge regression
clf = RidgeCV(alphas=[1e-2, 1e-1, 1, 10]).fit(X, y)
print(clf.score(X, y))

# Do a scatterplot of predicted versus actual
ypred = clf.predict(X)  # model predictions (coefficients plus intercept)
plt.figure(figsize=(8, 8))
plt.scatter(y, ypred)
plt.xlabel("Actual Chance")
plt.ylabel("Predicted Chance")
Пример #46
0
def reconstructRF():
    """
    run KFOLD method for random forest regression 
    """
    #import packages
    import os
    import numpy as np
    import pandas as pd
    #from sklearn import metrics
    #from scipy import stats
    #import seaborn as sns
    #import matplotlib.pyplot as plt
    #from sklearn.model_selection import KFold
    from datetime import datetime
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import StandardScaler
    
    
   #defining directories    
    dir_in = "/lustre/fs0/home/mtadesse/merraAllLagged"
    dir_out = "/lustre/fs0/home/mtadesse/rfReconstruction"
    surge_path = "/lustre/fs0/home/mtadesse/05_dmax_surge_georef"

    # #load KFOLD result csv file
    # os.chdir('F:\\06_eraint_results\\sonstig')
    # kf_dat = pd.read_csv('eraint_randForest_kfold.csv')
    # #edit the tg names to be usable later on
    # editName = lambda x: x.split('.csv')[0]
    # kf_dat['tg'] = pd.DataFrame(list(map(editName, kf_dat['tg'])), columns= ['tg'])
    
    
    
    #cd to the lagged predictors directory
    os.chdir(dir_in)
    

    x = 129
    y = 130

    #looping through 
    for tg in range(x,y):
        
        os.chdir(dir_in)

        tg_name = os.listdir()[tg]
        print(tg, tg_name)
        
        #load predictor
        pred = pd.read_csv(tg_name)
        pred.drop('Unnamed: 0', axis = 1, inplace = True)
        
        #add squared and cubed wind terms (as in WPI model)
        pickTerms = lambda x: x.startswith('wnd')
        wndTerms = pred.columns[list(map(pickTerms, pred.columns))]
        wnd_sqr = pred[wndTerms]**2
        wnd_cbd = pred[wndTerms]**3
        pred = pd.concat([pred, wnd_sqr, wnd_cbd], axis = 1)

        #standardize predictor data
        dat = pred.iloc[:,1:]
        scaler = StandardScaler()
        print(scaler.fit(dat))
        dat_standardized = pd.DataFrame(scaler.transform(dat), \
                                        columns = dat.columns)
        pred_standardized = pd.concat([pred['date'], dat_standardized], axis = 1)
        
    
        #load surge data
        os.chdir(surge_path)
        surge = pd.read_csv(tg_name)
        surge.drop('Unnamed: 0', axis = 1, inplace = True)
        
        #remove duplicated surge rows
        surge.drop(surge[surge['ymd'].duplicated()].index, axis = 0, inplace = True)
        surge.reset_index(inplace = True)
        surge.drop('index', axis = 1, inplace = True)
        
        
        #adjust surge time format to match that of pred
        time_str = lambda x: str(datetime.strptime(x, '%Y-%m-%d'))
        surge_time = pd.DataFrame(list(map(time_str, surge['ymd'])), columns = ['date'])
        time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
        surge_new = pd.concat([surge_time, surge[['surge', 'lon', 'lat']]], axis = 1)
    
        #merge predictors and surge to find common time frame
        pred_surge = pd.merge(pred_standardized, surge_new.iloc[:,:2], on='date', how='right')
        pred_surge.sort_values(by = 'date', inplace = True)
        
        #find rows that have nans and remove them
        row_nan = pred_surge[pred_surge.isna().any(axis =1)]
        pred_surge.drop(row_nan.index, axis = 0, inplace = True)
        pred_surge.reset_index(inplace = True)
        pred_surge.drop('index', axis = 1, inplace = True)
        
        
        #in case pred and surge don't overlap
        if pred_surge.shape[0] == 0:
            print('-'*80)
            print("Predictors and Surge don't overlap")
            print('-'*80)
            continue
        
     
        pred_surge['date'] = pd.DataFrame(list(map(time_stamp, \
                                                   pred_surge['date'])), \
                                          columns = ['date'])
        
        #prepare data for training/testing
        X = pred_surge.iloc[:,1:-1]
        y = pd.DataFrame(pred_surge['surge'])
        y = y.reset_index()
        y.drop(['index'], axis = 1, inplace = True)
        
        #apply PCA
        #get the number of PCs used during validation
        # pc_num = kf_dat.loc[kf_dat['tg'] == tg_name]['num_95pcs']
        pca = PCA(0.95)
        pca.fit(X)
        X_pca = pca.transform(X)
        
        
        # #apply 10 fold cross validation
        # kf = KFold(n_splits=10, random_state=29)
        
        # metric_corr = []; metric_rmse = []; #combo = pd.DataFrame(columns = ['pred', 'obs'])
        # for train_index, test_index in kf.split(X):
        #     X_train, X_test = X_pca[train_index], X_pca[test_index]
        #     y_train, y_test = y['surge'][train_index], y['surge'][test_index]
            
        #     #train regression model
        #     rf = RandomForestRegressor(n_estimator = 50, min_samples_leaf = 1)
        #     lm.fit(X_train, y_train)
            
        #     #predictions
        #     predictions = lm.predict(X_test)
        #     # pred_obs = pd.concat([pd.DataFrame(np.array(predictions)), \
        #     #                       pd.DataFrame(np.array(y_test))], \
        #     #                      axis = 1)
        #     # pred_obs.columns = ['pred', 'obs']
        #     # combo = pd.concat([combo, pred_obs], axis = 0)    
            
        #     #evaluation matrix - check p value
        #     if stats.pearsonr(y_test, predictions)[1] >= 0.05:
        #         print("insignificant correlation!")
        #         continue
        #     else:
        #         #print(stats.pearsonr(y_test, predictions))
        #         metric_corr.append(stats.pearsonr(y_test, predictions)[0])
        #         #print(np.sqrt(metrics.mean_squared_error(y_test, predictions)))
        #         metric_rmse.append(np.sqrt(metrics.mean_squared_error(y_test, predictions)))
            
        
        # #number of years used to train/test model
        # num_years = np.ceil((pred_surge['date'][pred_surge.shape[0]-1] -\
        #                       pred_surge['date'][0]).days/365)
        
        longitude = surge['lon'][0]
        latitude = surge['lat'][0]
        num_pc = X_pca.shape[1] #number of principal components
        # corr = np.mean(metric_corr)
        # rmse = np.mean(metric_rmse)
        
        # print('num_year = ', num_years, ' num_pc = ', num_pc ,'avg_corr = ',\
        #       np.mean(metric_corr), ' -  avg_rmse (m) = ', \
        #       np.mean(metric_rmse), '\n')
        
        #%%
        #surge reconstruction
        pred_for_recon = pred[~pred.isna().any(axis = 1)]
        pred_for_recon = pred_for_recon.reset_index().drop('index', axis = 1)
        
        
        #standardize predictor data
        dat = pred_for_recon.iloc[:,1:]
        scaler = StandardScaler()
        print(scaler.fit(dat))
        dat_standardized = pd.DataFrame(scaler.transform(dat), \
                                        columns = dat.columns)
        pred_standardized = pd.concat([pred_for_recon['date'], dat_standardized], axis = 1)
        
        X_recon = pred_standardized.iloc[:, 1:]
        
        #apply PCA
        pca = PCA(num_pc) #use the same number of PCs used for training
        pca.fit(X_recon)
        X_pca_recon = pca.transform(X_recon)
    
        #%%
        #model preparation
        #defining the rf model with number of trees and minimum leaves
        rf = RandomForestRegressor(n_estimators=50, min_samples_leaf=1, \
                                   random_state = 29)
        rf.fit(X_pca, y)
        
        #get prediction interval
        def pred_ints(model, X_pca_recon, percentile = 95):
            """
            function to construct prediction interval
            taking into account the result of each 
            regression tree
            """
            err_down = [];
            err_up = [];
            preds= [];
            
            for pred in model.estimators_:
                preds.append(pred.predict(X_pca_recon))
            preds = np.vstack(preds).T
            err_down = np.percentile(preds, (100 - percentile)/2., axis = 1, \
                                     keepdims = True)
            err_up = np.percentile(preds, 100 - (100 - percentile)/2., axis =1, \
                                   keepdims = True)
        
            return err_down.reshape(-1), err_up.reshape(-1)
        
        
        #compute 95% prediction intervals
        err_down, err_up = pred_ints(rf, X_pca_recon, percentile = 95);
        #reconstructed surge goes here
        truth = rf.predict(X_pca_recon);
        
        correct = 0.;
        for i, val in enumerate(truth):
            if err_down[i] <= val <= err_up[i]:
                correct +=1
        print(correct*100/len(truth), '\n')
        
        
        #final dataframe
        final_dat = pd.concat([pred_standardized['date'], \
                               pd.DataFrame([truth, err_down, err_up]).T], axis = 1)
        final_dat['lon'] = longitude
        final_dat['lat'] = latitude
        final_dat.columns = ['date', 'surge_reconstructed', 'pred_int_lower',\
                             'pred_int_upper', 'lon', 'lat']
        
        #plot - optional
        # time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
        # final_dat['date'] = pd.DataFrame(list(map(time_stamp, final_dat['date'])), columns = ['date'])
        # surge['date'] = pd.DataFrame(list(map(time_stamp, surge['date'])), columns = ['date'])
        # sns.set_context('notebook', font_scale = 2)
        # plt.figure()
        # plt.plot(final_dat['date'], final_dat['mean'], color = 'green')
        # plt.scatter(surge['date'], surge['surge'], color = 'blue')
        #prediction intervals
        # plt.plot(final_dat['date'], final_dat['obs_ci_lower'], color = 'red',  linestyle = "--", lw = 0.8)
        # plt.plot(final_dat['date'], final_dat['obs_ci_upper'], color = 'red',  linestyle = "--", lw = 0.8)
        #confidence intervals
        # plt.plot(final_dat['date'], final_dat['mean_ci_upper'], color = 'black',  linestyle = "--", lw = 0.8)
        # plt.plot(final_dat['date'], final_dat['mean_ci_lower'], color = 'black',  linestyle = "--", lw = 0.8)

        #save df as csv - in case of interruption
        os.chdir(dir_out)
        final_dat.to_csv(tg_name)
        
        #cd to dir_in
        os.chdir(dir_in)
Пример #47
0
def fit_pca(fv):
    scaled_new_fv = scale_data(fv)
    pca = PCA()
    pca.fit(scaled_new_fv)          # fit on the scaled feature vectors
    pca_fv = pca.transform(scaled_new_fv)
    return pca_fv
Пример #48
0
#
# License: BSD 3 clause

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest

iris = load_iris()

X, y = iris.data, iris.target

# This dataset is way too high-dimensional. Better do PCA:
pca = PCA(n_components=2)

# Maybe some original features where good, too?
selection = SelectKBest(k=1)

# Build estimator from PCA and Univariate selection:

combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])

# Use combined features to transform dataset:
X_features = combined_features.fit(X, y).transform(X)
print("Combined space has", X_features.shape[1], "features")

svm = SVC(kernel="linear")

# Do grid search over k, n_components and C:
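
# (Sketch of the grid search announced above, not part of the original snippet;
#  the parameter names assume the "pca"/"univ_select" step names given to the
#  FeatureUnion and a Pipeline wrapping combined_features and svm.)
pipeline = Pipeline([("features", combined_features), ("svm", svm)])

param_grid = dict(features__pca__n_components=[1, 2, 3],
                  features__univ_select__k=[1, 2],
                  svm__C=[0.1, 1, 10])

grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=1)
grid_search.fit(X, y)
print(grid_search.best_estimator_)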
Пример #49
0
sns.heatmap(correlation, annot=True)
plt.show()

print('end')

from sklearn.cluster import KMeans

kmeans_model = KMeans(n_clusters=5, random_state=1)
good_columns = nba._get_numeric_data().dropna(axis=1)
kmeans_model.fit(good_columns)
labels = kmeans_model.labels_
print(labels)

from sklearn.decomposition import PCA

pca_2 = PCA(2)
plot_columns = pca_2.fit_transform(good_columns)
plt.scatter(x=plot_columns[:, 0], y=plot_columns[:, 1], c=labels)
plt.show()

# Find player LeBron
LeBron = good_columns.loc[nba['Player'] == 'LeBron James\\jamesle01', :]

#Find player Durant
Durant = good_columns.loc[nba['Player'] == 'Kevin Durant\\duranke01', :]

#print the players
print(LeBron)
print(Durant)

#Change the dataframes to a list
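# (The snippet stops here; one plausible continuation of the comment above,
#  turning the single-row frames into plain Python lists.)
lebron_list = LeBron.values.flatten().tolist()
durant_list = Durant.values.flatten().tolist()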
Пример #50
0
# -*- coding: utf-8 -*-
"""
Created on Thu Jun  6 15:36:36 2019

@author: KIIT
"""

import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
df=pd.read_csv('crime_data.csv')
features=df.iloc[:,[1,2,4]].values
pca=PCA(n_components=2)
features=pca.fit_transform(features)
kmeans = KMeans(n_clusters = 3, init = 'k-means++', random_state = 0)
pred_cluster = kmeans.fit_predict(features)
plt.scatter(features[pred_cluster == 0, 0], features[pred_cluster == 0, 1], c = 'blue', label = 'LowCrime')
plt.scatter(features[pred_cluster == 1, 0], features[pred_cluster == 1, 1], c = 'red', label = 'MedCrime')
plt.scatter(features[pred_cluster == 2, 0], features[pred_cluster == 2, 1], c = 'green', label = 'HighCrime')
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], c = 'yellow', label = 'Centroids')
plt.title('Crime Data')
plt.xlabel('P1 Features')
plt.ylabel('P2 Features')
plt.legend()
plt.show()




Пример #51
0
def pca2d_OnClickFit(atoms, colors):
    from sklearn.decomposition import PCA
    from matplotlib.lines import Line2D
    #from numpy import arange, cos, linspace, pi, sin, random
    from scipy.interpolate import splprep, splev

    ### Calculate eigenvectors
    pca = PCA(n_components=2)
    X_reduced = pca.fit_transform(atoms)

    # draw a scatter plot of the generated values
    fig = plt.figure(figsize=(20, 16))
    ax = fig.add_subplot(111)

    # legend elements
    base_colors = [
        'blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'indigo',
        'burlywood', 'darksalmon', 'darkviolet'
    ]
    legend_elements = [
        Line2D([0], [0],
               marker='o',
               color=color,
               label=label,
               markerfacecolor=color,
               markersize=10)
        for color, label in zip(base_colors,
                                ['5%', '10%', '20%', '30%', '40%', '50%',
                                 '60%', '70%', '80%', '90%', '100%'])
    ]
    plt.legend(loc='upper left', handles=legend_elements, frameon=False)
    plt.title(
        'Para-aminobenzamidine Morphology - Principal Components Analysis 5-atoms'
    )

    # extract the scatterplot drawing in a separate function so we can re-use the code
    ax.scatter(X_reduced[:, 0],
               X_reduced[:, 1],
               c=colors,
               alpha=0.8,
               edgecolor='none',
               s=50,
               picker=True)

    # define the behaviour -> what happens when you pick a dot on the scatterplot by clicking close to it
    def onpick(event):
        # Create embedded figure
        # spline parameters

        s = 3.0  # smoothness parameter
        k = 2  # spline order

        figi = plt.figure()
        for subplotnum, dataind in enumerate(event.ind):
            ax = figi.add_subplot(111, projection='3d')
            data = atoms[dataind].reshape(12, 3)
            tckp, u = splprep([data[:, 0], data[:, 1], data[:, 2]],
                              s=s,
                              k=k,
                              nest=-1)  # find the knot points
            xnew, ynew, znew = splev(np.linspace(0, 1, 400), tckp)
            print(xnew, ynew, znew)
            ax.plot(data[:, 0], data[:, 1], data[:, 2], 'o', c=colors[dataind])
            ax.plot(xnew, ynew, znew, 'r-', label='fit', c='k')
            ax.set_xlabel('X Axis')
            ax.set_ylabel('Y Axis')
            ax.set_zlabel('Z Axis')
            ax.set_title('Spacer Arm Coordinates')
        figi.show()

    # connect the click handler function to the scatterplot
    fig.canvas.mpl_connect('pick_event', onpick)
    plt.show()
    fig.savefig('pca2d-interactive.png')
Пример #52
0
def validateRF():
    """
    run KFOLD method for regression 
    """
    
    #defining directories    
    dir_in = "/lustre/fs0/home/mtadesse/merraAllLagged"
    dir_out = "/lustre/fs0/home/mtadesse/merraRFValidation"
    surge_path = "/lustre/fs0/home/mtadesse/05_dmax_surge_georef"

    
    #cd to the lagged predictors directory
    os.chdir(dir_in)
    
    
    x = 66
    y = 67
    
    #empty dataframe for model validation
    df = pd.DataFrame(columns = ['tg', 'lon', 'lat', 'num_year', \
                                 'num_95pcs','corrn', 'rmse'])
    
    #looping through 
    for tg in range(x,y):
        
        os.chdir(dir_in)
        
        #filter only .csv files
        tgNames = []
        for file in glob.glob("*.csv"):
            tgNames.append(file)
            

        tg_name = sorted(tgNames)[tg]
        print(tg_name)
        
        
        ##########################################
        #check if this tg is already taken care of
        ##########################################
        os.chdir(dir_out)
        if os.path.isfile(tg_name):
            print("this tide gauge is already taken care of")
            return "file already analyzed!"
        

        os.chdir(dir_in)
        
        
        #load predictor
        pred = pd.read_csv(tg_name)
        pred.drop('Unnamed: 0', axis = 1, inplace = True)
        
        #add squared and cubed wind terms (as in WPI model)
        pickTerms = lambda x: x.startswith('wnd')
        wndTerms = pred.columns[list(map(pickTerms, pred.columns))]
        wnd_sqr = pred[wndTerms]**2
        wnd_cbd = pred[wndTerms]**3
        pred = pd.concat([pred, wnd_sqr, wnd_cbd], axis = 1)

        #standardize predictor data
        dat = pred.iloc[:,1:]
        scaler = StandardScaler()
        print(scaler.fit(dat))
        dat_standardized = pd.DataFrame(scaler.transform(dat), \
                                        columns = dat.columns)
        pred_standardized = pd.concat([pred['date'], dat_standardized], axis = 1)
        
    
        #load surge data
        os.chdir(surge_path)
        surge = pd.read_csv(tg_name)
        surge.drop('Unnamed: 0', axis = 1, inplace = True)
        
        #remove duplicated surge rows
        surge.drop(surge[surge['ymd'].duplicated()].index, axis = 0, inplace = True)
        surge.reset_index(inplace = True)
        surge.drop('index', axis = 1, inplace = True)
        
        
        #adjust surge time format to match that of pred
        time_str = lambda x: str(datetime.strptime(x, '%Y-%m-%d'))
        surge_time = pd.DataFrame(list(map(time_str, surge['ymd'])), columns = ['date'])
        time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
        surge_new = pd.concat([surge_time, surge[['surge', 'lon', 'lat']]], axis = 1)
    
        #merge predictors and surge to find common time frame
        pred_surge = pd.merge(pred_standardized, surge_new.iloc[:,:2], on='date', how='right')
        pred_surge.sort_values(by = 'date', inplace = True)
        
        #find rows that have nans and remove them
        row_nan = pred_surge[pred_surge.isna().any(axis =1)]
        pred_surge.drop(row_nan.index, axis = 0, inplace = True)
        pred_surge.reset_index(inplace = True)
        pred_surge.drop('index', axis = 1, inplace = True)
        
        
        #in case pred and surge don't overlap
        if pred_surge.shape[0] == 0:
            print('-'*80)
            print("Predictors and Surge don't overlap")
            print('-'*80)
            continue
        
     
        pred_surge['date'] = pd.DataFrame(list(map(time_stamp, \
                                                   pred_surge['date'])), \
                                          columns = ['date'])
        
        #prepare data for training/testing
        X = pred_surge.iloc[:,1:-1]
        y = pd.DataFrame(pred_surge['surge'])
        y = y.reset_index()
        y.drop(['index'], axis = 1, inplace = True)
        
        #apply PCA
        pca = PCA(.95)
        pca.fit(X)
        X_pca = pca.transform(X)
        
        #apply 10 fold cross validation
        kf = KFold(n_splits=10, shuffle=True, random_state=29)
        
        metric_corr = []; metric_rmse = []; #combo = pd.DataFrame(columns = ['pred', 'obs'])
        for train_index, test_index in kf.split(X):
            X_train, X_test = X_pca[train_index], X_pca[test_index]
            y_train, y_test = y['surge'][train_index], y['surge'][test_index]
            
            #train regression model
            rf= RandomForestRegressor(n_estimators = 50, random_state = 101, \
                                      min_samples_leaf = 1)
            rf.fit(X_train, y_train)
            
            #predictions
            predictions = rf.predict(X_test)
            # pred_obs = pd.concat([pd.DataFrame(np.array(predictions)), \
            #                       pd.DataFrame(np.array(y_test))], \
            #                      axis = 1)
            # pred_obs.columns = ['pred', 'obs']
            # combo = pd.concat([combo, pred_obs], axis = 0)    
            
            #evaluation matrix - check p value
            if stats.pearsonr(y_test, predictions)[1] >= 0.05:
                print("insignificant correlation!")
                continue
            else:
                print(stats.pearsonr(y_test, predictions))
                metric_corr.append(stats.pearsonr(y_test, predictions)[0])
                print(np.sqrt(metrics.mean_squared_error(y_test, predictions)))
                print()
                metric_rmse.append(np.sqrt(metrics.mean_squared_error(y_test, predictions)))
            
        
        #number of years used to train/test model
        num_years = (pred_surge['date'][pred_surge.shape[0]-1] -\
                             pred_surge['date'][0]).days/365
        longitude = surge['lon'][0]
        latitude = surge['lat'][0]
        num_pc = X_pca.shape[1] #number of principal components
        corr = np.mean(metric_corr)
        rmse = np.mean(metric_rmse)
        
        print('num_year = ', num_years, ' num_pc = ', num_pc ,'avg_corr = ',np.mean(metric_corr), ' -  avg_rmse (m) = ', \
              np.mean(metric_rmse), '\n')
        
        #original size and pca size of matrix added
        new_df = pd.DataFrame([tg_name, longitude, latitude, num_years, num_pc, corr, rmse]).T
        new_df.columns = ['tg', 'lon', 'lat', 'num_year', \
                                 'num_95pcs','corrn', 'rmse']
        df = pd.concat([df, new_df], axis = 0)
        
        
        #save df as csv - in case of interruption
        os.chdir(dir_out)
        df.to_csv(tg_name)
Пример #53
0
def test_CNN_model_Cifar(num_classes, model_name, x_train_bis, y_train_bis,
                         x_test_bis, y_test_bis, x_other, y_other, 
                         training = False, plot = True):
    print("Testing CNN Model : {} on Cifar10 ...".format(model_name))
    # --------------------------------------
    # CNN network definition
    # --------------------------------------
    
    # Network params
      
    conv_depth_1 = 100
    kernel_size_1 = 3
    
    conv_depth_2 = 100 
    kernel_size_2 = 3
    pool_size_2 = 2
    
    conv_depth_3 = 200 
    kernel_size_3 = 3
    
    conv_depth_4 = 200 
    kernel_size_4 = 3
    
    conv_depth_5 = 400 
    kernel_size_5 = 3
    pool_size_5 = 2
    
    hidden_size_1 = 600
    
    weight_penalty = 0.0001 
    
    
    model = Sequential()
    
    model.add(Conv2D(conv_depth_1, (kernel_size_1, kernel_size_1), padding='same',
                     input_shape=x_train_bis.shape[1:]))
    model.add(Activation('relu'))
    
    model.add(Conv2D(conv_depth_2, (kernel_size_2, kernel_size_2), padding='same'))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(pool_size_2, pool_size_2)))
    model.add(Dropout(0.3))
    
    model.add(Conv2D(conv_depth_3, (kernel_size_3, kernel_size_3), padding='same', 
                     kernel_regularizer=regularizers.l2(weight_penalty)))
    model.add(Activation('relu'))
    
    model.add(Conv2D(conv_depth_4, (kernel_size_4, kernel_size_4), padding='same',
                     kernel_regularizer=regularizers.l2(weight_penalty)))
    model.add(Activation('relu'))
    
    model.add(Conv2D(conv_depth_5, (kernel_size_5, kernel_size_5), padding='same', 
                     kernel_regularizer=regularizers.l2(weight_penalty)))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(pool_size_5, pool_size_5)))
    model.add(Dropout(0.3))
    
    model.add(Flatten())
    model.add(Dropout(0.5))
    model.add(Dense(hidden_size_1, 
                    kernel_regularizer=regularizers.l2(weight_penalty)))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    
    model.add(Dense(num_classes))
    model.add(Activation('softmax'))
    
    # Training on whole dataset
    if (training):
        learning_rate = 0.0001
        batch_size = 32    
        num_epochs = 50
        opt = Adam(lr=learning_rate, beta_1=0.9, beta_2=0.999, amsgrad=False)
        #opt = RMSprop(lr=learning_rate,decay=1e-6)
        
        model.compile(loss='categorical_crossentropy', 
                       optimizer=opt, metrics=['accuracy'])
        history = model.fit(x_train_bis, y_train_bis, verbose = True, 
                            epochs = num_epochs, batch_size=batch_size,
                            validation_split=0.2, shuffle = True)
        
        
        model.save(model_name)
    
    # Loading the model:
    if not training:
        print("Loading model...")
        model = load_model(model_name)
        print("Done.")
    # Test
    print("Evaluating Model...")
    print("Performance on train:", model.evaluate(x_train_bis, y_train_bis))
    print("Performance on test:", model.evaluate(x_test_bis, y_test_bis))
    
    ##### FEATURE EXTRACTION #####
    print("Extracting Feature for some Examples...")
    intermediate_layer_model = Model(inputs=model.input,
                                     outputs=model.layers[-3].output)
    
    results = intermediate_layer_model.predict(x_other)
    
    # Extraction of some examples
    nb_examples = 700
    state = True
    i = 0
    results_per_class = [0]*(10-num_classes)
    while state:
      x = intermediate_layer_model.predict(np.array([x_other[i]]))[0]
      y = y_other[i,0]
      if type(results_per_class[y - num_classes]) == type(0): 
        results_per_class[y - num_classes] = [x]
      else:
        if len(results_per_class[y - num_classes]) < nb_examples:
          results_per_class[y - num_classes].append(x)
        else:
          stop = True
          for elem in results_per_class:
            if type(elem) == type(0): 
              stop = False
              break
            if len(elem) < nb_examples:
              stop = False
              break
          if stop: state = False
      i += 1
    results_per_class = np.array(results_per_class)
    print("Done.")
    
    # Plotting the last feature for the new classes with dimension reduction 
    # methods:
    if (plot):
        print("Ploting results with PCA and TSNE...")
        N = 3000
        feat_cols = [ 'index'+str(i) for i in range(results.shape[1]) ]
        df = pd.DataFrame(results,columns=feat_cols)
        df['y'] = y_other
        df['label'] = df['y'].apply(lambda i: str(i))
        print('Size of the dataframe: {}'.format(df.shape))
        
        ########### PCA ##########
        # np.random.seed(42)
        rndperm = np.random.permutation(df.shape[0])
        df_subset = df.loc[rndperm[:N],:].copy()
        data_subset = df_subset[feat_cols].values
        pca = PCA(n_components=3)
        pca_result = pca.fit_transform(data_subset)
        df_subset['pca-one'] = pca_result[:,0]
        df_subset['pca-two'] = pca_result[:,1] 
        df_subset['pca-three'] = pca_result[:,2]
        
        print('Explained variation per principal component: {}'.format(pca.explained_variance_ratio_))
        print('Size of the dataframe: {}'.format(df_subset.shape))
        ax = plt.figure(figsize=(16,10)).add_subplot(projection='3d')
        ax.scatter(
            xs=df_subset["pca-one"],
            ys=df_subset["pca-two"],
            zs=df_subset["pca-three"],
            c=df_subset["y"], cmap='tab10'
        )
        ax.set_xlabel('pca-one')
        ax.set_ylabel('pca-two')
        ax.set_zlabel('pca-three')
        plt.show()
        
        ######## T-SNE Method ########
        data_subset = df_subset[feat_cols].values
        time_start = time.time()
        tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
        tsne_results = tsne.fit_transform(data_subset)
        print('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))
        df_subset['tsne-2d-one'] = tsne_results[:,0]
        df_subset['tsne-2d-two'] = tsne_results[:,1]
        
        plt.figure(figsize=(16,10))
        sns.scatterplot(
            x="tsne-2d-one", y="tsne-2d-two",hue="y",
            palette=sns.color_palette("hls", 10-num_classes),
            data=df_subset, legend="full", alpha=0.3)
        plt.show()
        print("Done.")
    
    # Prediction evaluation on new classes:
    
    print("Computing the confusion matrix for one-shot learning on 300 different references...")
    # Tests:
    confusion_matrix = np.zeros((10-num_classes, 10 - num_classes), dtype = int)
    # rows: predicted class
    # columns: reference class
    nb_of_different_references = 100
    accuracies = np.zeros((10-num_classes, nb_of_different_references))
    for p in range(nb_of_different_references):
      index_ref = rd.randint(0, nb_examples-1)
      references = results_per_class[:,index_ref]
      for i in range(10-num_classes):
        nb_mistakes = 0
        k = 0
        for elem in results_per_class[i,:]:
          if k != index_ref:
            index = compute_nearest_neighbor(elem, references)
            confusion_matrix[index, i] += 1
            if index != i:
              nb_mistakes += 1
          k += 1
        accuracies[i, p] = 1 - nb_mistakes/(nb_examples-1)  
    for num_class in range(10-num_classes):
      print("Class n°", num_class+num_classes,"accuracy:", 
            np.mean(accuracies[num_class,:]))
    print("Confusion Matrix:")
    print(confusion_matrix)
    print("Done.")
    
    print("Computing the confusion matrix for 3-shots learning on 100 different references...")
    confusion_matrix = np.zeros((10-num_classes, 10 - num_classes), dtype = int)
    # rows: predicted class
    # columns: reference class
    nb_of_different_references = 100
    accuracies = np.zeros((10-num_classes, nb_of_different_references))
    for p in range(nb_of_different_references):
      index_ref = rd.randint(0, nb_examples-3)
      references = results_per_class[:,index_ref] + results_per_class[:, index_ref +1 ] + \
      results_per_class[:,index_ref+2]
      references /= 3
      for i in range(10-num_classes):
        nb_mistakes = 0
        k = 0
        for elem in results_per_class[i,:]:
          if k != index_ref:
            index = compute_nearest_neighbor(elem, references)
            confusion_matrix[index, i] += 1
            if index != i:
              nb_mistakes += 1
          k += 1
        accuracies[i, p] = 1 - nb_mistakes/(nb_examples-1)  
    for num_class in range(10-num_classes):
      print("Class n°", num_class+num_classes,"accuracy:", 
            np.mean(accuracies[num_class,:]))
    print("Confusion Matrix:")
    print(confusion_matrix)
    print("Done. End Testing.")
Пример #54
0
    )
    parser.add_argument("--verbose", type=int, default=0)
    parser.add_argument(
        "--pca-components",
        type=int,
        default=50,
        help="Number of principal components for preprocessing.",
    )
    args = parser.parse_args()

    print("Used number of threads: {}".format(_openmp_effective_n_threads()))
    X, y = load_data(order=args.order)

    if args.pca_components > 0:
        t0 = time()
        X = PCA(n_components=args.pca_components).fit_transform(X)
        print(
            "PCA preprocessing down to {} dimensions took {:0.3f}s".format(
                args.pca_components, time() - t0
            )
        )

    methods = []

    # Put TSNE in methods
    tsne = TSNE(
        n_components=2,
        init="pca",
        perplexity=args.perplexity,
        verbose=args.verbose,
        n_iter=1000,
Пример #55
0
df = pd.DataFrame(data=datas, columns=['x1', 'x2', 'x3', 'x4', 'x5'])
axes = pd.plotting.scatter_matrix(df,
                                  alpha=0.9,
                                  figsize=(7, 7),
                                  c='blue',
                                  s=80)
plt.tight_layout()
plt.savefig('scatter_matrix.png')
plt.show()

fig = plt.figure(figsize=(7, 7))
d = spc(datas) + Tsquare_single()
print(d)

d2 = PCA(n_components=5)

datas_standard = (datas - np.mean(datas, axis=0)) / np.std(datas, axis=0)

transformed = d2.fit_transform(datas_standard)

Lambda = d2.explained_variance_

UCL = 3 * sqrt(Lambda[0])
LCL = -3 * sqrt(Lambda[0])

fig = plt.figure(figsize=(7, 7))
plt.plot(transformed[:, 0],
         marker='.',
         markersize=15,
         linestyle='--')
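# (Not in the original, which is cut off above: the UCL/LCL limits computed from
#  the first principal component would typically be drawn on the same chart.)
plt.axhline(UCL, color='red')
plt.axhline(LCL, color='red')
plt.title('Control chart of the first principal component')
plt.show()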
Пример #56
0
        cluster_df = new_df.loc[df['Cluster']==i]
        stratified_data = pd.concat([stratified_data, cluster_df.sample(frac=.25)])
    stratified_data = stratified_data.drop('Cluster', axis =1)
    pca = PCA(n_components=3)
    pca.fit_transform(stratified_data)
    loadings = pd.DataFrame(pca.components_.T, columns=['PC1', 'PC2', 'PC3'], index=stratified_data.columns)
    load= loadings.apply(np.square)
    load["Sum_of_Squares"]= load.apply(np.sum, axis=1)
    load = load.sort_values(by=['Sum_of_Squares'], ascending=False )
    top2PCA = list(zip(stratified_data.values[:,3], stratified_data.values[:,4],stratified_data.values[:,6]))
    return json.dumps(top2PCA)




pca = PCA(n_components=17,random_state=0)

original_df = pca.fit_transform(StandardScaler().fit_transform(df))
PCA_Array = pca.explained_variance_ratio_
calculate_percent = toPercent(PCA_Array)
cumulative_sum_var = cumulative_sum(calculate_percent)

random_df = pca.fit_transform(StandardScaler().fit_transform(randomSampledData()))
PCA_Array_random = pca.explained_variance_ratio_
calculate_percent_random = toPercent(PCA_Array_random)
cumulative_sum_random = cumulative_sum(calculate_percent_random)

stratified_df = pca.fit_transform(StandardScaler().fit_transform(stratified_data()))
PCA_Array_stratified = pca.explained_variance_ratio_
calculate_percent_stratified = toPercent(PCA_Array_stratified)
cumulative_sum_stratified = cumulative_sum(calculate_percent_stratified)
Пример #57
0
def pca2d(atoms, colors):
    from sklearn.decomposition import PCA
    from matplotlib.lines import Line2D

    ### Calculate eigenvectors
    pca = PCA(n_components=2)
    X_reduced = pca.fit_transform(atoms)
    plt.title('PCA2D Clustering')
    plt.scatter(X_reduced[:, 0],
                X_reduced[:, 1],
                c=colors,
                alpha=0.8,
                edgecolor='none')

    base_colors = [
        'blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'indigo',
        'burlywood', 'darksalmon', 'darkviolet'
    ]
    legend_elements = [
        Line2D([0], [0],
               marker='o',
               color=color,
               label=label,
               markerfacecolor=color,
               markersize=10)
        for color, label in zip(base_colors,
                                ['5%', '10%', '20%', '30%', '40%', '50%',
                                 '60%', '70%', '80%', '90%', '100%'])
    ]
    plt.legend(loc='upper left', handles=legend_elements, frameon=False)
    plt.savefig('pca2d.png')
    plt.show()
Пример #58
0
# logistic regression
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

# naive_bayes
# see D:\opensource\scrapy-work\wolf_nlp\算法学习笔记\NLP汉语自然语言处理原理与实践-读书笔记/20180424-bayes.md
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
# collinearity problems can appear
# repeated words: the multinomial model counts a word each time it occurs, the Bernoulli model counts it once,
# and a mixed model counts it once for sentence probabilities but multiple times when collecting statistics
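# (Illustrative sketch, not part of the original notes: the same toy count matrix
#  fed to MultinomialNB, which uses the raw counts, and to BernoulliNB, which
#  binarizes them first.)
import numpy as np
toy_counts = np.array([[3, 0, 1], [0, 2, 0], [1, 1, 4]])
toy_labels = [0, 1, 0]
print(MultinomialNB().fit(toy_counts, toy_labels).predict(toy_counts))
print(BernoulliNB(binarize=0.5).fit(toy_counts, toy_labels).predict(toy_counts))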

# dimensionality reduction algorithms: t-SNE, PCA
# the features t-SNE keeps are more representative, i.e. they best reflect the differences between samples
# t-SNE runs extremely slowly; PCA is comparatively fast
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
data_pca = PCA(n_components=50).fit_transform(data)
data_pca_tsne = TSNE(n_components=2).fit_transform(data_pca)

# build a bag-of-words model
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer
vectorizer = CountVectorizer(max_features=1000, ngram_range=(1, 3))  # keep unigrams, bigrams and trigrams for the bag-of-words model
train_data_features = vectorizer.fit_transform(raw_texts).toarray()  # encode the corpus; raw_texts is a placeholder for the training documents

# data preprocessing
# feature scaling / standardization
from sklearn.preprocessing import MinMaxScaler,StandardScaler,scale
# one-hot encoding
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import minmax_scale
X_scaled = minmax_scale(X)  # minmax_scale is a plain function; X is a placeholder for the feature matrix
Пример #59
0
          % (name, (time() - t0), estimator.inertia_,
             metrics.homogeneity_score(labels, estimator.labels_),
             metrics.completeness_score(labels, estimator.labels_),
             metrics.v_measure_score(labels, estimator.labels_),
             metrics.adjusted_rand_score(labels, estimator.labels_),
             metrics.adjusted_mutual_info_score(labels,  estimator.labels_),
             metrics.silhouette_score(data, estimator.labels_,
                                      metric='euclidean',
                                      sample_size=sample_size)))
bench_k_means(KMeans(init='k-means++', n_clusters=n_digits, n_init=10),
            name="k-means++", data=data)
bench_k_means(KMeans(init='random', n_clusters=n_digits, n_init=10),
              name="random", data=data)
# in this case the seeding of the centers is deterministic, hence we run the
# kmeans algorithm only once with n_init=1
pca = PCA(n_components=n_digits).fit(data)
bench_k_means(KMeans(init=pca.components_, n_clusters=n_digits, n_init=1),
              name="PCA-based",
              data=data)
print(79 * '_')

reduced_data = PCA(n_components=2).fit_transform(data)
kmeans = KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
kmeans.fit(reduced_data)

# Step size of the mesh. Decrease to increase the quality of the VQ.
h = .02     # point in the mesh [x_min, x_max]x[y_min, y_max].

# Plot the decision boundary. For that, we will assign a color to each
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
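
# (Not part of the snippet, which is cut off here: the usual continuation builds
#  the mesh, predicts a cluster for every mesh point and paints the regions;
#  assumes numpy as np and matplotlib.pyplot as plt are imported at module level.)
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.figure(1)
plt.clf()
plt.imshow(Z, interpolation='nearest',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()),
           cmap=plt.cm.Paired, aspect='auto', origin='lower')
plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
plt.show()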
Пример #60
0
def principal_component_analysis(n):
    pca = PCA(n_components=n)
    data = pca.fit_transform(x)
    predictions = cross_val_predict(clf, data, y, cv=10)
    print(metrics.r2_score(y, predictions))
    simple_plot(predictions, "PCA n=" + str(n))
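
# (Usage sketch, not in the original: comparing a few target dimensionalities,
#  assuming the globals x, y, clf and the helpers used above are defined.)
for n in (2, 5, 10):
    principal_component_analysis(n)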