def PLS(mol_a, mol_i):
    X = []
    Y = []
    for mol in mol_a:
        b = np.array([
            mol.x - center[0], mol.y - center[1], mol.z - center[2],
            mol.dg, mol.dh, mol.tds
        ])
        X.append(b)
        Y.append(np.array([1, 1]))
    for mol in mol_i:
        b = np.array([
            mol.x - center[0], mol.y - center[1], mol.z - center[2],
            mol.dg, mol.dh, mol.tds
        ])
        X.append(b)
        Y.append(np.array([0, 0]))
    pls2 = PLSRegression(n_components=2)
    x_scores, y_scores = pls2.fit_transform(X, Y)
    plt.figure(10, figsize=(5, 5))
    plt.scatter(x_scores[:, 0], x_scores[:, 1])
    plt.scatter(y_scores[:, 0], y_scores[:, 1])
    plt.show()
    print(np.shape(x_scores))
    print(np.shape(y_scores))
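# Hypothetical driver for the PLS helper above: it assumes each molecule
# exposes x/y/z coordinates plus dg/dh/tds fields and that a module-level
# `center` triple exists, so both are fabricated here purely for illustration.
from collections import namedtuple
import numpy as np

Mol = namedtuple("Mol", "x y z dg dh tds")
center = (0.0, 0.0, 0.0)
rng = np.random.default_rng(0)
actives = [Mol(*rng.normal(size=6)) for _ in range(15)]
inactives = [Mol(*rng.normal(loc=1.0, size=6)) for _ in range(15)]
PLS(actives, inactives)  # plots x- and y-scores for actives vs. inactives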
def pls_da(df, n):
    X = df.iloc[:, 0:-1]
    y = df.iloc[:, -1]
    # y_class = pd.get_dummies(y)
    plsda = PLSRegression(n_components=n)
    reduced_x = plsda.fit_transform(X, y)
    return reduced_x[0]
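# A minimal usage sketch for pls_da above. The layout (features in all but the
# last column, the class label in the last column) and the synthetic frame are
# assumptions for illustration; imports mirror what the function needs.
import numpy as np
import pandas as pd
from sklearn.cross_decomposition import PLSRegression

rng = np.random.default_rng(0)
demo = pd.DataFrame(rng.normal(size=(60, 5)), columns=[f"f{i}" for i in range(5)])
demo["label"] = rng.integers(0, 2, size=60)
scores = pls_da(demo, n=2)  # X-scores, shape (60, 2)
print(scores.shape)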
def train_model(model_name, X, y, save=True):
    # window_sizes = [128, 256, 512, 1024]
    # angles = ["right_shoulder", "left_shoulder", "right_elbow", "left_elbow",
    #           "right_hip", "left_hip", "right_knee", "left_knee"]
    pls = PLSRegression(n_components=5)
    X = pls.fit_transform(X, y)[0]
    if "lscp" in model_name:
        model = construct_lscp()
    elif "xgbod" in model_name:
        model = construct_xgbod()
    elif "simple-mean" in model_name:
        model = construct_simple_aggregator("average")
    elif "simple-max" in model_name:
        model = construct_simple_aggregator("maximization")
    else:
        # the original fell through to a NameError on unknown names
        raise ValueError("unknown model_name: " + model_name)
    model.fit(X, y)
    model_dict = {"pls": pls, "model": model}
    if save:
        if not os.path.exists("saved_models"):
            os.mkdir("saved_models")
        save_path = os.path.join("saved_models", model_name + ".joblib")
        joblib.dump(model_dict, save_path)
    return model_dict
def PLS_DA(datos):
    global pls_bi
    datos_bi = datos[(datos['etiqueta'] == 5) | (datos['etiqueta'] == 6)]
    X_bi = savgol_filter(datos_bi.values[:, 2:], 15, polyorder=3, deriv=0)
    y_biP = datos_bi["etiqueta"].values
    y_bi = (y_biP == 6).astype('uint8')
    pls_bi = PLSRegression(n_components=2)
    X_pls = pls_bi.fit_transform(X_bi, y_bi)[0]
    labplot = ["60/40 ratio", "50/50 ratio"]
    unique = list(set(y_bi))
    colors = [plt.cm.jet(float(i) / max(unique)) for i in unique]
    with plt.style.context(('ggplot')):
        plt.figure(figsize=(12, 10))
        for i, u in enumerate(unique):
            col = np.expand_dims(np.array(colors[i]), axis=0)
            x = [X_pls[j, 0] for j in range(len(X_pls[:, 0])) if y_bi[j] == u]
            y = [X_pls[j, 1] for j in range(len(X_pls[:, 1])) if y_bi[j] == u]
            plt.scatter(x, y, c=col, s=100, edgecolors='k', label=str(u))
        plt.xlabel('Latent variable 1')
        plt.ylabel('Latent variable 2')
        plt.legend(labplot, loc='lower left')
        plt.title('PLS cross decomposition')
        plt.show()
    X_entreno, X_prueba, y_entreno, y_prueba = train_test_split(
        X_bi, y_bi, test_size=0.2, random_state=19)
    pls_bi = PLSRegression(n_components=2)
    pls_bi.fit(X_entreno, y_entreno)
    y_prediccion1 = pls_bi.predict(X_prueba)[:, 0]
    prediccion_binaria1 = (y_prediccion1 > 0.5).astype('uint8')
    print(prediccion_binaria1, y_prueba)
    precision = []
    A = []
    m = 0
    cvalor = KFold(n_splits=40, shuffle=True, random_state=19)
    for train, test in cvalor.split(X_bi):
        y_prediccion = PLS_DA1(X_bi[train, :], y_bi[train], X_bi[test, :])
        A.append(y_prediccion)
        precision.append(accuracy_score(y_bi[test], y_prediccion))
        m = m + 1
    # the original message said "10 splits" although KFold uses 40
    print("Average accuracy over 40 folds: ", np.array(precision).mean())
    return prediccion_binaria1, precision
def test_sanity_check_pls_regression():
    # Sanity check for PLSRegression
    # The results were checked against the R-packages plspm, misOmics and pls
    d = load_linnerud()
    X = d.data
    Y = d.target

    pls = PLSRegression(n_components=X.shape[1])
    X_trans, _ = pls.fit_transform(X, Y)

    # FIXME: one would expect y_trans == pls.y_scores_ but this is not
    # the case.
    # xref: https://github.com/scikit-learn/scikit-learn/issues/22420
    assert_allclose(X_trans, pls.x_scores_)

    expected_x_weights = np.array([
        [-0.61330704, -0.00443647, 0.78983213],
        [-0.74697144, -0.32172099, -0.58183269],
        [-0.25668686, 0.94682413, -0.19399983],
    ])

    expected_x_loadings = np.array([
        [-0.61470416, -0.24574278, 0.78983213],
        [-0.65625755, -0.14396183, -0.58183269],
        [-0.51733059, 1.00609417, -0.19399983],
    ])

    expected_y_weights = np.array([
        [+0.32456184, 0.29892183, 0.20316322],
        [+0.42439636, 0.61970543, 0.19320542],
        [-0.13143144, -0.26348971, -0.17092916],
    ])

    expected_y_loadings = np.array([
        [+0.32456184, 0.29892183, 0.20316322],
        [+0.42439636, 0.61970543, 0.19320542],
        [-0.13143144, -0.26348971, -0.17092916],
    ])

    assert_array_almost_equal(np.abs(pls.x_loadings_), np.abs(expected_x_loadings))
    assert_array_almost_equal(np.abs(pls.x_weights_), np.abs(expected_x_weights))
    assert_array_almost_equal(np.abs(pls.y_loadings_), np.abs(expected_y_loadings))
    assert_array_almost_equal(np.abs(pls.y_weights_), np.abs(expected_y_weights))

    # The R / Python difference in the signs should be consistent across
    # loadings, weights, etc.
    x_loadings_sign_flip = np.sign(pls.x_loadings_ / expected_x_loadings)
    x_weights_sign_flip = np.sign(pls.x_weights_ / expected_x_weights)
    y_weights_sign_flip = np.sign(pls.y_weights_ / expected_y_weights)
    y_loadings_sign_flip = np.sign(pls.y_loadings_ / expected_y_loadings)
    assert_array_almost_equal(x_loadings_sign_flip, x_weights_sign_flip)
    assert_array_almost_equal(y_loadings_sign_flip, y_weights_sign_flip)
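# The sign-flip bookkeeping above generalizes: two PLS fits agree only up to a
# per-component sign. A small helper (illustrative, not part of the test suite)
# that aligns signs before comparing weight or loading matrices:
import numpy as np

def align_signs(a, b):
    """Flip each column of b so its sign convention matches a."""
    signs = np.sign(np.sum(a * b, axis=0))
    return b * signs

# e.g. np.allclose(expected_x_weights, align_signs(expected_x_weights, pls.x_weights_))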
def ELM_test(split, source_name, target_name):
    source_path = os.path.join(data_path, source_name + "_SURF_L10.mat")
    target_path = os.path.join(data_path, target_name + "_SURF_L10.mat")
    train_x_ss, train_y_ss, train_x_ts, train_y_ts, test_xs, test_ys = load_mmdt_split(
        split, source_path, target_path)
    accs = []
    for i in range(20):
        print("####### " + str(i + 1) + " #######")
        train_x_s, train_y_s = np.squeeze(train_x_ss[i]), np.squeeze(train_y_ss[i])
        train_x_t, train_y_t = np.squeeze(train_x_ts[i]), np.squeeze(train_y_ts[i])
        test_x, test_y = np.squeeze(test_xs[i]), np.squeeze(test_ys[i])
        # source_dict, target_dict = {}, {}
        # source_dict['fts'] = np.append(train_x_s, train_x_t, axis=0)
        # source_dict['labels'] = np.append(train_y_s, train_y_t, axis=0).T
        # savemat("./others/source.mat", source_dict)
        # target_dict['fts'] = test_x
        # target_dict['labels'] = test_y.T
        # savemat("./others/target.mat", target_dict)
        # train_x_s = (train_x_s - np.mean(train_x_s, 1, keepdims=True)) / np.std(train_x_s, 1, keepdims=True)
        # train_x_t = (train_x_t - np.mean(train_x_t, 1, keepdims=True)) / np.std(train_x_t, 1, keepdims=True)
        # test_x = (test_x - np.mean(test_x, 1, keepdims=True)) / np.std(test_x, 1, keepdims=True)
        train_y_s = LabelTransform(train_y_s)
        train_y_t = LabelTransform(train_y_t)
        test_y = LabelTransform(test_y)
        pca = PCA(dim)
        pls = PLSRegression(dim)
        reduc = "pca"
        if reduc == "pls":
            train_x_s, _ = pls.fit_transform(train_x_s, train_y_s)
            paced = pca.fit_transform(np.row_stack([train_x_t, test_x]))
            train_x_t, test_x = paced[0:30, :], paced[30:, :]
        elif reduc == "pca":
            paced = pca.fit_transform(np.row_stack([train_x_s, train_x_t, test_x]))
            if source_name == "amazon":
                train_x_s, train_x_t, test_x = paced[0:200, :], paced[200:230, :], paced[230:, :]
            else:
                train_x_s, train_x_t, test_x = paced[0:80, :], paced[80:110, :], paced[110:, :]
        else:
            raise NotImplementedError

        # train model
        net = ELM(train_x_s, train_y_s, test_x, test_y, 1000)
        net.ParamInit()
        net.Activation('relu')
        net.TrainELM('Lp', 0.01)
        net.TrainAccuracy('relu')
        net.TestAccuracy('relu')
        net.printf()
        accs.append(net.TestAcc)
    return np.mean(accs) * 100, np.std(accs) * 100 / np.sqrt(len(accs))
def Pls(df, df2, string):
    pls2 = PLSRegression(n_components=2)
    (xs, ys) = pls2.fit_transform(df, df2)
    # flatten to 1-D so the column assignment below is well-formed
    t = df2.values.ravel()
    principalDf = pd.DataFrame(data=xs, columns=['pls 1', 'pls 2'])
    pls = cross_decomposition.PLSRegression(n_components=10)
    pls.fit(df, df2)
    variance = np.var(pls.x_scores_, axis=0)
    principalDf[string] = t
    return principalDf, variance
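# Hedged example driving Pls above with synthetic frames; df2 is assumed to be
# a single-column numeric response, and the function's own dependencies
# (pandas, numpy, sklearn's cross_decomposition module) are assumed imported.
import numpy as np
import pandas as pd

rng = np.random.default_rng(2)
feats = pd.DataFrame(rng.normal(size=(50, 12)))
resp = pd.DataFrame({"target": feats.iloc[:, 0] + rng.normal(size=50)})
scores_df, variance = Pls(feats, resp, "target")
print(scores_df.head())
print(variance)  # per-component variance of the 10 X-scores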
from sklearn.metrics import mean_squared_error, r2_score

# In[136]:
# Split data into train and test with a 50-50 ratio
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=None)

# In[137]:
pls = PLSRegression(n_components=27)
# fit on the training features and *targets*; the original passed X_test as y
X_pls = pls.fit_transform(X_train, y_train)
x2 = pls.transform(x)

# In[138]:
x2 = pd.DataFrame(x2)
print(x2)
# x2 = NormalizeData(x2)
# print(X_pls)
# two_arrays = X_pls
# datapls = np.hstack(two_arrays)
# np.savetxt('lungcancerpls111.csv', datapls, delimiter=',')
class PLSClassifier(BaseEstimator, ClassifierMixin):
    __name__ = 'MultiLayeredPLS'

    def __init__(self, estimator=None, n_iter=1500, eps=1e-6, n_comp=10, mode='regression'):
        warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")
        self.n_iter = n_iter
        self.eps = eps
        self.n_comp = n_comp
        self.mode = mode
        self.estimator = estimator
        self.estimator_ = None
        self.pls = None

    def fit(self, X, y):
        # if X is not np.array or y is not np.array:
        #     print('x and y must be of type np.array')
        #     raise ValueError
        if X.shape[0] != y.shape[0]:
            raise ValueError()
        if self.estimator is None:
            self.estimator_ = LinearRegression()
        else:
            # clone the user-supplied estimator, not the (still None) fitted attribute
            self.estimator_ = sklearn.base.clone(self.estimator)
        self.classes_, target = np.unique(y, return_inverse=True)
        target[target == 0] = -1
        if self.mode == 'canonical':
            self.pls = PLSCanonical(n_components=self.n_comp, scale=True,
                                    max_iter=self.n_iter, tol=self.eps)
        elif self.mode == 'regression':
            self.pls = PLSRegression(n_components=self.n_comp, scale=True,
                                     max_iter=self.n_iter, tol=self.eps)
        proj_x, proj_y = self.pls.fit_transform(X, target)
        self.estimator_.fit(proj_x, target)
        return self

    def predict_value(self, x):
        resp = self.decision_function(x)
        if resp.ndim == 1:
            ans = np.zeros(resp.shape, dtype=np.int32)
            ans[resp > 0] = self.classes_[1]
            ans[resp <= 0] = self.classes_[0]
        else:
            ans = self.classes_[np.argmax(resp, axis=1)]
        return ans

    def predict_confidence(self, x):
        resp = self.decision_function(x)
        return resp[0]

    def decision_function(self, x):
        x = np.array(x).reshape((1, -1))
        proj = self.pls.transform(x)
        resp = self.estimator_.predict(proj)
        return resp

    def predict_proba(self, x):
        resp = self.decision_function(x)
        # clamp to [-1, 1] element-wise, then map onto [0, 1]
        # (the original np.min(-1, resp) / np.max(1, resp) passed resp as the
        # axis argument and shifted in the wrong direction)
        resp = np.clip(resp, -1, 1)
        resp = (resp + 1) / 2
        # resp = np.exp(resp)
        # for r in range(len(resp)):
        #     resp[r] /= np.sum(resp[r])
        return resp
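# A hedged usage sketch for PLSClassifier on synthetic binary data. It assumes
# the class's own imports (numpy, warnings, sklearn.base, LinearRegression,
# PLSRegression/PLSCanonical, BaseEstimator, ClassifierMixin) are in scope.
import numpy as np

rng = np.random.default_rng(1)
X_demo = rng.normal(size=(40, 12))
y_demo = (X_demo[:, 0] + X_demo[:, 1] > 0).astype(int)
clf = PLSClassifier(n_comp=3).fit(X_demo, y_demo)
# decision_function reshapes its input to one row, so score a sample at a time
print(clf.predict_value(X_demo[0]))
print(clf.predict_proba(X_demo[0]))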
class PLS(Model):
    # X represents the features, Y represents the labels
    X = None
    Y = None
    prediction = None
    model = None

    def __init__(self, X=None, Y=None, n_components=2, type='regressor', cfg=False):
        self.name = 'PLS'
        if X is not None:
            self.X = X
        if Y is not None:
            self.Y = Y
        self.type = type
        self.cfg = cfg
        self.n_components = n_components
        self.model = PLSRegression(n_components=n_components)

    def fit(self, X=None, Y=None):
        if X is not None:
            self.X = X
        if Y is not None:
            self.Y = Y
        print('PLS Train started............')
        self.model.fit(self.X, self.Y)
        print('PLS completed..........')
        return self.model

    def fit_transform(self, X=None, Y=None):
        if X is not None:
            self.X = X
        if Y is not None:
            self.Y = Y
        print('PLS Train/Transform started............')
        # PLSRegression needs the targets to fit; keep only the X-scores
        # (the original called fit_transform(self.X) without Y, which fails)
        self.X = self.model.fit_transform(self.X, self.Y)[0]
        print('PLS completed..........')
        self.X = pd.DataFrame(self.X)
        return self.X

    def predict(self, test_features):
        print('Prediction started............')
        self.predictions = self.model.predict(test_features)
        print('Prediction completed..........')
        return self.predictions

    def save(self):
        if self.cfg:
            with open('pls_configs.txt', 'w') as f:
                f.write(json.dumps(self.model.get_params()))
        print('No models will be saved for PLS')

    def featureImportance(self):
        # if X_headers is None:
        #     X_headers = list(self.X)
        # feature_importance_ = zip(self.model.coef_.reshape(1, -1)[0], X_headers)
        # feature_importance = set(feature_importance_)
        return self.model.coef_

    def getAccuracy(self, test_labels, predictions, origin=0, hitmissr=0.8):
        correct = 0
        df = pd.DataFrame(data=predictions.flatten())
        for i in range(len(df)):
            if 1 - abs(df.values[i] - test_labels.values[i]) / abs(df.values[i]) >= hitmissr:
                correct = correct + 1
        return float(correct) / len(df)

    def getConfusionMatrix(self, test_labels, predictions, label_headers):
        return 'No Confusion Matrix for Regression'

    def getRSquare(self, test_labels, predictions, mode='single'):
        df = pd.DataFrame(data=predictions.flatten())
        if self.type == 'regressor':
            if mode == 'multiple':
                errors = r2_score(test_labels, df, multioutput='variance_weighted')
            else:
                errors = r2_score(test_labels, df)
            return errors
        else:
            return 'No RSquare for Classification'

    def getMSE(self, test_labels, predictions):
        df = pd.DataFrame(data=predictions.flatten())
        if self.type == 'regressor':
            return mean_squared_error(test_labels, df)
        else:
            return 'No MSE for Classification'

    def getMAPE(self, test_labels, predictions):
        df = pd.DataFrame(data=predictions.flatten())
        if self.type == 'regressor':
            errors = np.mean(np.abs((test_labels - df.values) / test_labels)) * 100
            return errors.values[0]
        else:
            return 'No MAPE for Classification'

    def getRMSE(self, test_labels, predictions):
        df = pd.DataFrame(data=predictions.flatten())
        if self.type == 'regressor':
            return sqrt(mean_squared_error(test_labels, df))
        else:
            return 'No RMSE for Classification'
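# A minimal usage sketch for the PLS wrapper above on synthetic regression
# data; it assumes the module-level imports the class relies on (pandas,
# numpy, PLSRegression, mean_squared_error, r2_score, sqrt, json) are present.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
X_demo = pd.DataFrame(rng.normal(size=(100, 6)))
y_demo = pd.DataFrame(2.0 * X_demo.iloc[:, 0] + rng.normal(scale=0.1, size=100))
wrapper = PLS(X=X_demo, Y=y_demo, n_components=2)
wrapper.fit()
preds = wrapper.predict(X_demo)
print(wrapper.getRMSE(y_demo, preds))
print(wrapper.getRSquare(y_demo, preds))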
    ax.scatter(X_r[y == i, 0], X_r[y == i, 1], X_r[y == i, 2], c=c)
ax.set_xlabel('X Label')
ax.set_ylabel('Y Label')
ax.set_zlabel('Z Label')
plt.axis('equal')
ax.set_xlim([-1000, 4000])
ax.set_ylim([-1000, 4000])
ax.set_zlim([-1000, 4000])
plt.show()

# part b
PLS1 = PLS(n_components=3)
number_map = {"M": 0, "B": 1}
# list comprehension: np.array(map(...)) is a Python 2 idiom and yields a
# 0-d object array under Python 3
numeric_y = np.array([number_map[label] for label in y])
result = PLS1.fit_transform(x, numeric_y)
X_r = result[0]
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
for c, i, target_name in zip("rb", target_names, target_names):
    ax.scatter(X_r[y == i, 0], X_r[y == i, 1], X_r[y == i, 2], c=c)
ax.set_xlabel('X Label')
ax.set_ylabel('Y Label')
ax.set_zlabel('Z Label')
plt.axis('equal')
plt.show()

validation = data[:100]
test = data[100:200]
train = data[200:]
def plot_projections(holder, labels, preprocess_lda='PCA',
                     class_name='Antioxidants', only_pca=False,
                     binarize_class=True, standardize=True, cluster=True,
                     return_distances=False):
    '''
    holder should be a dictionary with df's as values and fp-filenames as keys
    labels should be a mapping of DrugCombID: ATC_class
    '''
    fp_names = [
        'fps_e3fp_1024bit', 'fps_morgan_1024bit', 'fps_topo_1024bit',
        'fps_infomax_new', 'fps_VAE_256bit_new', 'fps_VAE_16bit_new',
        'fps_transformer_1024bit_new', 'fps_transformer_64bit_new',
        'fps_gae_64bit_new'
    ]
    if only_pca:
        from sklearn.decomposition import PCA
        df = dict()
        for ind, i in enumerate(fp_names):
            df_cluster = holder[i].copy()
            df_cluster = df_cluster.loc[df_cluster.index.isin(labels.keys())]
            df_cluster = df_cluster[~df_cluster.index.duplicated(keep='last')]
            if standardize:
                from mlxtend.preprocessing import standardize as st
                classes = df_cluster.index.copy()
                df_cluster.reset_index(inplace=True, drop=True)
                df_cluster = st(df_cluster)
            else:
                classes = df_cluster.index.copy()
            pca = PCA(n_components=2)
            temp = pca.fit_transform(df_cluster)
            df[ind] = pd.DataFrame(index=df_cluster.index, data=temp)
            df[ind]['classes'] = classes
            df[ind]['classes'] = df[ind]['classes'].map(labels)
        title = 'PCA'
    else:  # to LDA
        from mlxtend.feature_extraction import LinearDiscriminantAnalysis as LDA
        from sklearn.preprocessing import LabelEncoder
        # binary case: https://stats.stackexchange.com/questions/178587/why-is-the-rank-of-covariance-matrix-at-most-n-1/180366#180366
        df = dict()
        for ind, i in enumerate(fp_names):
            df_cluster = holder[i].copy()
            df_cluster = df_cluster.loc[df_cluster.index.isin(labels.keys())]
            df_cluster = df_cluster[~df_cluster.index.duplicated(keep='last')]
            if standardize:
                from sklearn.preprocessing import MinMaxScaler
                classes = df_cluster.index.copy()
                df_cluster.reset_index(inplace=True, drop=True)
                mms = MinMaxScaler()
                # keep df_cluster's own columns (the original referenced
                # df.columns, which is the dict defined above)
                df_cluster = pd.DataFrame(data=mms.fit_transform(df_cluster),
                                          index=df_cluster.index,
                                          columns=df_cluster.columns)
            else:
                classes = df_cluster.index.copy()
            df_cluster['classes'] = classes
            df_cluster['classes'] = df_cluster['classes'].map(labels)
            if binarize_class:
                # the original concatenated the literal string 'class_name'
                df_cluster.loc[df_cluster.classes != class_name, 'classes'] = 'not ' + class_name
            # change labels from str to int
            enc = LabelEncoder()
            real_classes = df_cluster.loc[:, 'classes']
            df_cluster.loc[:, 'classes'] = enc.fit_transform(df_cluster['classes'])
            classes = df_cluster.pop('classes')
            if preprocess_lda == 'PLS':
                from sklearn.cross_decomposition import PLSRegression
                pls = PLSRegression(n_components=10, scale=False)
                temp = pls.fit_transform(df_cluster.values, classes.values)[0]
            elif preprocess_lda == 'PCA':
                from sklearn.decomposition import PCA
                pca = PCA(n_components=0.95, svd_solver='full', whiten=False)
                temp = pca.fit_transform(df_cluster.values)
            elif preprocess_lda == 'kernelPCA':
                from sklearn.decomposition import KernelPCA
                pca = KernelPCA(kernel="rbf", gamma=5)
                temp = pca.fit_transform(df_cluster.values)
            elif preprocess_lda == 'NONE':
                temp = df_cluster.values
            # lda
            lda = LDA(n_discriminants=2)
            lda.fit(temp, classes.values)
            temp = lda.transform(temp)
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    'ignore',
                    'Casting complex values to real discards the imaginary part')
                # in case of complex numbers; np.float was removed in NumPy >= 1.24
                temp = temp.astype(float)
            df[ind] = pd.DataFrame(index=df_cluster.index, columns=[0, 1], data=temp)
            df[ind]['classes'] = real_classes
        title = 'LDA'

    sns.set_context(context='talk')
    sns.set_style('dark')
    sns.set_style({'font.family': 'serif', 'font.sans-serif': ['Helvetica']})
    fig, ((ax1, ax2, ax3), (ax4, ax5, ax6), (ax7, ax8, ax9)) = plt.subplots(
        3, 3, figsize=(13, 14))
    cm = plt.cm.get_cmap('Spectral')
    my_cmap = cm(np.linspace(0, 1, len(np.unique(df[ind]['classes']))), alpha=0.6)
    if return_distances:
        distances = dict()
        sil_scores = dict()
        chs_scores = dict()
    for ax_n, key, x, name in zip(
            [ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8, ax9],
            df.keys(), df.values(),
            ['E3FP', 'Morgan_300', 'Topo_1024', 'Infomax', 'VAE_256',
             'VAE_16', 'Trans_1024', 'Trans_64', 'GAE_64']):
        if not binarize_class:
            for ind, i in enumerate(np.unique(x['classes'])):
                color = my_cmap[ind]
                marker = '.'
                if i == class_name:
                    color = 'black'
                    marker = ','
                ax_n.scatter(
                    x.loc[x.classes == i, 0], x.loc[x.classes == i, 1],
                    marker=marker,
                    label=i + f' (n={str(len(x.loc[x.classes==i, 0]))}) vs Rest ({str(len(x.loc[x.classes!=i, 0]))})',
                    color=color)
                ax_n.title.set_text(name)
        else:
            ax_n.scatter(x.loc[:, 0], x.loc[:, 1], marker='.')
            ax_n.scatter(
                x.loc[x.classes == class_name, 0],
                x.loc[x.classes == class_name, 1],
                marker=',',
                label=class_name + f' (n={str(len(x.loc[x.classes==class_name, 0]))}) vs rest (n={str(len(x.loc[x.classes!=class_name, 0]))})',
                color='darkorange')
            ax_n.title.set_text(name)
        if cluster:
            from sklearn.cluster import KMeans
            from scipy.spatial.distance import pdist
            from sklearn.metrics import silhouette_score as sil
            from sklearn.metrics import calinski_harabasz_score as chs
            km = KMeans(init='k-means++', n_clusters=1, n_init=10)
            km.fit(x.loc[x.classes != class_name, [0, 1]])
            km1 = KMeans(init='k-means++', n_clusters=1, n_init=10)
            km1.fit(x.loc[x.classes == class_name, [0, 1]])
            ax_n.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1],
                         marker='X', color='darkblue', s=100, linewidth=3)
            ax_n.scatter(km1.cluster_centers_[:, 0], km1.cluster_centers_[:, 1],
                         marker='X', color='red', s=100, linewidth=3)
            d = round(pdist([km.cluster_centers_[0], km1.cluster_centers_[0]],
                            metric='euclidean')[0], 3)
            d_sc = round(sil(x.loc[:, [0, 1]], x['classes']), 3)
            d_chs = round(chs(x.loc[:, [0, 1]], x['classes']), 3)
            if return_distances:
                cl_name = class_name + ' ' + name
                distances[cl_name] = d
                sil_scores[cl_name] = d_sc
                chs_scores[cl_name] = d_chs
            name = name + '\n|d:' + str(d) + '|sil:' + str(d_sc) + '|chs:' + str(d_chs)
            ax_n.title.set_text(name)
    for ax in [ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8, ax9]:
        ax.set_xticks([])
        ax.set_yticks([])
    labels = ax_n.get_legend_handles_labels()[1]
    if only_pca:
        fig.suptitle(labels[0] + "\n classified with: " + title)
    else:
        fig.suptitle(labels[0] + "\n classified with: " + title +
                     f', preprocessed with: {preprocess_lda}')
    fig.tight_layout()
    if not return_distances:
        return fig
    return fig, distances, sil_scores, chs_scores
def plot_single_projection(holder, labels, class_name='Antioxidants',
                           fp_name='fps_e3fp_1024bit', standardize=True,
                           preprocess_lda='PCA'):
    '''
    holder should be a dictionary with df's as values and fp-filenames as keys
    labels should be a mapping of DrugCombID: ATC_class
    '''
    from mlxtend.preprocessing import standardize as st
    from sklearn.preprocessing import LabelEncoder
    from sklearn.cluster import KMeans
    # with sklearn's LDA a dummy class would be needed to keep 2 components after transformation
    from mlxtend.feature_extraction import LinearDiscriminantAnalysis
    from scipy.spatial.distance import pdist

    df_cluster = holder[fp_name].copy()
    df_cluster = df_cluster.loc[df_cluster.index.isin(labels.keys())]
    df_cluster = df_cluster[~df_cluster.index.duplicated(keep='last')]
    if standardize:
        classes = df_cluster.index.copy()
        df_cluster.reset_index(inplace=True, drop=True)
        df_cluster = st(df_cluster)
    else:
        classes = df_cluster.index.copy()
    # our classes are mapped to the index via the labels dictionary
    df_cluster['classes'] = classes
    df_cluster['classes'] = df_cluster['classes'].map(labels)
    # the original concatenated the literal string 'class_name' here
    df_cluster.loc[df_cluster.classes != class_name, 'classes'] = 'not ' + class_name
    # dummy = [0] * (df_cluster.shape[1] - 1) + ['dummy']
    # df_cluster.loc[df_cluster.shape[0]] = dummy
    # change labels from str to int
    enc = LabelEncoder()
    real_classes = df_cluster.loc[:, 'classes']
    df_cluster.loc[:, 'classes'] = enc.fit_transform(df_cluster['classes'])
    classes = df_cluster.pop('classes')
    if preprocess_lda == 'PLS':
        from sklearn.cross_decomposition import PLSRegression
        pls = PLSRegression(n_components=10, scale=False)
        temp = pls.fit_transform(df_cluster.values, classes.values)[0]
    elif preprocess_lda == 'PCA':
        from sklearn.decomposition import PCA
        pca = PCA(n_components=0.95, svd_solver='full', whiten=False)
        temp = pca.fit_transform(df_cluster.values)
    elif preprocess_lda == 'kernelPCA':
        from sklearn.decomposition import KernelPCA
        pca = KernelPCA(kernel="rbf", gamma=5)
        temp = pca.fit_transform(df_cluster.values)
    elif preprocess_lda == 'NONE':
        temp = df_cluster.values
    elif preprocess_lda == 'NCA':
        from sklearn.neighbors import NeighborhoodComponentsAnalysis
        nca = NeighborhoodComponentsAnalysis()
        temp = nca.fit_transform(df_cluster.values, classes.values)
    # lda = LinearDiscriminantAnalysis(solver='eigen', shrinkage='auto')
    # lda.fit(temp, classes.values)
    # temp1 = lda.transform(temp)
    lda = LinearDiscriminantAnalysis(n_discriminants=2)
    lda.fit(temp, classes.values)
    temp = lda.transform(temp)
    with warnings.catch_warnings():
        warnings.filterwarnings(
            'ignore', 'Casting complex values to real discards the imaginary part')
        # in case of complex numbers; np.float was removed in NumPy >= 1.24
        temp = temp.astype(float)
    df = pd.DataFrame(index=df_cluster.index, columns=[0, 1], data=temp)
    df['classes'] = real_classes
    km = KMeans(init='k-means++', n_clusters=1, n_init=10)
    km.fit(df.loc[df.classes != class_name, [0, 1]])
    km1 = KMeans(init='k-means++', n_clusters=1, n_init=10)
    km1.fit(df.loc[df.classes == class_name, [0, 1]])
    d = pdist([km.cluster_centers_[0], km1.cluster_centers_[0]])
    d = str(round(d[0], 3))
    fig, ax = plt.subplots(figsize=(6, 6))
    ax.scatter(df.loc[df.classes != class_name, 0],
               df.loc[df.classes != class_name, 1], marker=',', color='grey')
    ax.scatter(df.loc[df.classes == class_name, 0],
               df.loc[df.classes == class_name, 1], marker=',', color='orange')
    ax.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1],
               marker='X', color='green', linewidths=30)
    ax.scatter(km1.cluster_centers_[:, 0], km1.cluster_centers_[:, 1],
               marker='X', color='red', linewidths=30)
    fig.suptitle(class_name + ' ' + d)
    return fig
Axes3D(plt.figure()).scatter(Xiso[:, 0], Xiso[:, 1], alpha=.3)

#%% t-SNE
tsne = TSNE(n_components=2, n_iter=250)
Xtsne = tsne.fit_transform(X[:500, :200])
Axes3D(plt.figure()).scatter(Xtsne[:, 0], Xtsne[:, 1], alpha=.3)

#%% PC Regression
lin_reg = LinearRegression()
scores = cross_val_score(lin_reg, X95[:, :10], Y)
scores.mean()

#%% Partial Least Squares
from sklearn.cross_decomposition import PLSRegression
pls = PLSRegression(n_components=10)
Xpls, Ypls = pls.fit_transform(X, Y)

#%% Visualization with labeling
import ggplot as gg
df1['x1'], df1['x2'] = Xpca[:, 0], Xpca[:, 1]
chart = gg.ggplot(df1, gg.aes(x='x1', y='x2', color='has_cites')) \
    + gg.geom_point(size=10, alpha=.8)
chart.show()

#%% PLS transformation
df1['x1'], df1['x2'] = Xpls[:, 0], Xpls[:, 1]
chart = gg.ggplot(df1, gg.aes(x='x1', y='x2', color='has_cites')) \
    + gg.geom_point(size=10, alpha=.8)
chart.show()

#%% Feature Selection with Elastic Net
scaler = StandardScaler()
Xscale = scaler.fit_transform(X)
def Granger_Causality_Pred(num):
    # loading dataset
    df = pd.read_csv("Y:\\Dropbox\\Dropbox (MIT)\\Robinhood Trading\\Stock Data\\broader_stock.csv")
    df = df.set_index(pd.to_datetime(df['Date']))
    df.drop(['Date'], axis=1, inplace=True)
    pct_df = df.pct_change().shift(1).iloc[2:]

    # set up global variables
    leader_tick_dict = {}
    perf_res = {}
    w_mktre = (1 + pct_df['SPY_Close']).resample('W').prod() - 1
    _ = 0

    # identify leaders
    for tick in pct_df.columns[::3][(num * 100):((num + 1) * 100)]:
        # picking leaders for each stock
        target_arr = pct_df[tick].dropna()
        w_target = (1 + target_arr).resample('W').prod() - 1
        Y = w_target.shift(-1)
        leader_set = []
        for leader in pct_df.columns[::3]:
            if leader != tick:
                leader_arr = pct_df[leader].dropna()
                w_leader = (1 + leader_arr).resample('W').prod() - 1
                tempreg_dta = pd.concat([Y, w_target, w_mktre, w_leader], axis=1).dropna()
                tempreg_dta.columns = ['Y', 'Y-1', 'Mkt', 'Lead']
                if tempreg_dta.shape[0] >= 36 * 4:
                    ols = sm.OLS(tempreg_dta['Y'].iloc[-36 * 4:],
                                 sm.add_constant(tempreg_dta[['Y-1', 'Mkt', 'Lead']].iloc[-36 * 4:]))
                    res = ols.fit(cov_type='HC0')
                    leader_sig = res.pvalues[3]
                elif tempreg_dta.shape[0] >= 12 * 4:
                    ols = sm.OLS(tempreg_dta['Y'].iloc[-12 * 4:],
                                 sm.add_constant(tempreg_dta[['Y-1', 'Mkt', 'Lead']].iloc[-12 * 4:]))
                    res = ols.fit(cov_type='HC0')
                    leader_sig = res.pvalues[3]
                else:
                    leader_sig = 1
                if leader_sig <= 1e-3:
                    leader_set.append(leader)
        leader_tick_dict[tick] = leader_set

        # evaluate performance
        leader = leader_tick_dict[tick]
        if len(leader) > 1:
            # simple average
            avg_signal = ((1 + pct_df[leader_tick_dict[tick]]).resample('W').prod() - 1).mean(axis=1)
            # only evaluate on the short-term period
            val_avg = pd.concat([w_target.shift(1).iloc[-12 * 4:],
                                 avg_signal.iloc[-12 * 4:]], axis=1).dropna().values
            # metrics
            mu_avg = mean_squared_error(val_avg[:, 0], val_avg[:, 1]) * 100
            acc_avg = accuracy_score((val_avg[:, 0] > 0).astype(int),
                                     (val_avg[:, 1] > 0).astype(int))
            perf_res[tick] = [mu_avg, acc_avg]
        _ += 1
        print("{}/100".format(_))

    perf_pls = {}
    N = len(leader_tick_dict.keys())
    count = 0
    for tick in leader_tick_dict.keys():
        leader = leader_tick_dict[tick]
        if len(leader) > 1:
            leader_arr = df[leader_tick_dict[tick]]
            target_arr = pct_df[tick].dropna()
            w_target = (1 + target_arr).resample('W').prod() - 1
            pls_set = []
            for col in leader_arr.columns:
                ind_arr = []
                for t in range(leader_arr.shape[0] - 300, leader_arr.shape[0]):
                    macd = MACD(leader_arr[col], 5, t)
                    booling = BoolingerBands(leader_arr[col], 5, t)
                    volcof = Vol_Coefficient(leader_arr[col], 5, t)
                    anvol = AnnVol(leader_arr[col], 5, t)
                    phl = Price_High_Low(leader_arr[col], 5, t)
                    prev = PriceReverse(leader_arr[col], 5, t)
                    # rsi = RelativeStrengh(leader_arr[col], 5, t)
                    ind_arr.append([macd, booling, volcof, anvol, phl, prev])
                w_X = pd.DataFrame(data=ind_arr, index=leader_arr.index[-300:]).resample('W').mean()
                temp_dta = pd.concat([w_target, w_X], axis=1).dropna().values[-12 * 4:, ]
                pls = PLSRegression(n_components=1)
                pls_x = pls.fit_transform(X=temp_dta[:, 1:], y=temp_dta[:, :1])[0]
                pls_set.append(pls_x)
            pls_X = np.column_stack(pls_set)
            signal = np.mean(pls_X, axis=1)
            actual = w_target.iloc[-49:-1]
            mu_pls = mean_squared_error(actual, signal) * 100
            acc_pls = accuracy_score((actual > 0).astype(int), (signal > 0).astype(int))
            perf_pls[tick] = [mu_pls, acc_pls]
            count += 1
            print("{}/{}".format(count, N))

    avg_res = pd.DataFrame(perf_res).T
    pls_res = pd.DataFrame(perf_pls).T
    ttl_res = pd.concat([pls_res, avg_res], axis=1)
    ttl_res.columns = ['MSE_PLS', 'ACC_PLS', 'MSE_AVG', 'ACC_AVG']
    ttl_res.to_csv('Granger_Causality_Res%s.csv' % num)
# -----PLS testing--------------------------------------------------------------------------------
if __name__ == '__main__':
    path_to_data = os.path.join(str(Path.home()), 'Deformetrica',
                                'deterministic_atlas_ct',
                                'output_separate_tmp10_def10_prttpe13_corrected',
                                'Decomposition')
    data_filename = 'Momenta_Table.csv'
    data, target = load_iris(return_X_y=True)
    data = data[0:80, 0:3]
    target = target[0:80]
    pls = PLSBinaryClassification(dataset_filename=data_filename,
                                  dataset_path=path_to_data, X=data, y=target)
    pls.decompose_with_pls(method='da')
    plsr = PLSRegression(3, scale=False)
    x_plsr, y_plsr = plsr.fit_transform(pls.X_centered, pls.y)
    plt.scatter(plsr.x_scores_[pls.y == 1, 0], plsr.x_scores_[pls.y == 1, 1],
                c='red', marker='d')
    plt.scatter(plsr.x_scores_[pls.y == -1, 0], plsr.x_scores_[pls.y == -1, 1],
                c='blue', marker='x')
    x = np.linspace(-2, 2, 100)
    print('W:\n {}'.format(pls.W))
    print('xw:\n {}'.format(plsr.x_weights_))
    print('T:\n {}'.format(pls.T))
    print('Xload:\n {}'.format(plsr.x_loadings_.T @ plsr.x_loadings_))
    print('P:\n {}'.format(pls.P))
    print('q:\n {}'.format(pls.q))
    print('----------------')
    print('yload:\n {}'.format(plsr.y_loadings_))
x_axis = np.arange(1, np.linalg.matrix_rank(X) + 1)
plt.scatter(x_axis, cummulative_variance_explained)
plt.plot(x_axis, cummulative_variance_explained)
plt.title("Scree Plot")
plt.xlabel("Number of latent vectors used")
plt.ylabel("Percentage of variance explained")
plt.xticks(x_axis, x_axis)
plt.yticks()
plt.show()

# compare to sklearn package results to verify accuracy
import numpy as np
np.set_printoptions(threshold=np.inf)
from sklearn.cross_decomposition import PLSRegression
from sklearn.preprocessing import StandardScaler
import pandas as pd
import matplotlib.pyplot as plt

X = [[1, 5, 10], [2, 4, 8], [3, 4, 8], [4, 5, 10]]
y = [41, 49, 69, 65]
X = StandardScaler().fit_transform(X)  # population stdev
# StandardScaler expects a 2-D array, so reshape the 1-D target first
y = StandardScaler().fit_transform(np.array(y).reshape(-1, 1))  # population stdev

pls1 = PLSRegression(n_components=2)
scores = pls1.fit_transform(X, y)
T = pls1.x_scores_
W = pls1.x_weights_
P = pls1.y_loadings_  # note: this attribute holds the Y-loadings, not the X-loadings
y_pred = pls1.predict(X)
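# A follow-up consistency check, hedged: for sklearn's PLSRegression the
# training x-scores should match transform() applied to the same training
# data, so the tuple returned by fit_transform can be cross-checked directly.
import numpy.testing as npt

npt.assert_allclose(scores[0], T, atol=1e-8)
npt.assert_allclose(scores[0], pls1.transform(X), atol=1e-8)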
plt.quiver(u[0, 0], u[1, 0], color='k', edgecolor='k', lw=1, scale=0.1, figure=fig)
plt.quiver(-u[1, 0], u[0, 0], color='k', edgecolor='k', lw=1, scale=0.4, figure=fig)

#%% PLS2
lda = LDA()
nComponents = np.arange(1, nFeatures, 8)
pls2Scores = np.zeros((2, len(nComponents)))  # np.alen was removed from NumPy
for i, n in enumerate(nComponents):
    pls2 = PLSRegression(n_components=n)
    pls2.fit(dataTrain, Ytrain)
    dataTrainT = pls2.transform(dataTrain)
    dataTestT = pls2.transform(dataTest)
    pls2Scores[:, i] = util.classify(dataTrainT, dataTestT, labelsTrain, labelsTest)

pls2 = PLSRegression(n_components=2)
xtPLS, yt = pls2.fit_transform(dataTrain, Ytrain)
uPLS = pls2.x_weights_

#%% Canonical Correlation Analysis
nComponents = np.arange(1, nClasses + 1)
cca = CCA(n_components=nClasses)
cca.fit(dataTrain, Ytrain)
dataTrainT = cca.transform(dataTrain)
dataTestT = cca.transform(dataTest)
ccaScores = np.zeros((2, len(nComponents)))
for i, n in enumerate(nComponents):
    ccaScores[:, i] = util.classify(dataTrainT[:, 0:n], dataTestT[:, 0:n], labelsTrain, labelsTest)

#%% Linear Discriminant Analysis
nComponents = np.arange(1, nClasses + 1)
reduced_college_train_x = pcr_opt.transform(college_train_x)
lrm = LinearRegression()
lrm.fit(reduced_college_train_x, college_train_y)
print("\nPCR RMSE (M = " + str(opt_m) + ")")
print(rmse(lrm, reduced_college_test_x, college_test_y))

#%% PLS
from sklearn.cross_decomposition import PLSRegression

pls_components = range(1, 18)
cv_pls = np.array([])
for m in pls_components:
    pls = PLSRegression(n_components=m)
    transformed_college_train_x = pls.fit_transform(college_train_x, college_train_y)[0]
    pls_this_rmse = rmse_cv(LinearRegression(), transformed_college_train_x, college_train_y).mean()
    cv_pls = np.append(cv_pls, pls_this_rmse)

min_m = pls_components[np.argmin(cv_pls)]
cv_pls = pd.Series(cv_pls, index=pls_components)
cv_pls.plot(title="PLSRegression Cross Validation")
plt.xlabel("Number of Components (M)")
plt.ylabel("Root Mean Square Error")
if show_plots_flag:
    plt.show()

best_pls = PLSRegression(n_components=min_m)
transformed_college_train_x = best_pls.fit_transform(college_train_x, college_train_y)[0]
plt.figure()
for c, i, target_name in zip("rgb",
                             ["Iris-setosa", "Iris-versicolor", "Iris-virginica"],
                             target_names):
    plt.scatter(X_r[y == i, 0], X_r[y == i, 1], c=c, label=target_name)
plt.legend()
plt.title('PCA of IRIS dataset')
plt.axis('equal')
plt.show()

# PLS1
PLS1 = PLS(n_components=2)
X = df.values[:, :4]  # DataFrame.as_matrix() was removed in pandas 1.0
y = np.array([number_map[label] for label in df.values[:, 4]])
string_map = {-1.2206555615733703: "Iris-setosa",
              0: "Iris-versicolor",
              1.2206555615733703: "Iris-virginica"}
result = PLS1.fit_transform(X, y)
# y-scores come back as an (n, 1) array, so flatten before the lookup
y = np.array([string_map[value] for value in result[1].ravel()])
target_names = ["Iris-setosa", "Iris-versicolor", "Iris-virginica"]
for c, i, target_name in zip("rgb",
                             ["Iris-setosa", "Iris-versicolor", "Iris-virginica"],
                             target_names):
    plt.scatter(result[0][y == i, 0], result[0][y == i, 1], c=c, label=target_name)
plt.legend()
plt.title('PLS1 of IRIS dataset')
plt.axis('equal')
plt.show()

# PLS2
PLS2 = PLS(n_components=2)
X = df.values[:, :4]
y = np.array([number_map[label] for label in df.values[:, 4]])
one_hot_y = np.zeros((len(y), 3))
# Make predictions using an SVM with PCA and PLS
pca_error = 0
pls_error = 0
n_folds = 10
svc = LinearSVC()
# KFold's n_folds constructor argument became n_splits plus a .split() method in sklearn 0.18
for train_inds, test_inds in KFold(n_splits=n_folds).split(X):
    X_train, X_test = X[train_inds], X[test_inds]
    y_train, y_test = y[train_inds], y[test_inds]

    # Use PCA and then classify using an SVM
    X_train2 = pca.fit_transform(X_train)
    X_test2 = pca.transform(X_test)
    svc.fit(X_train2, y_train)
    y_pred = svc.predict(X_test2)
    pca_error += zero_one_loss(y_test, y_pred)

    # Use PLS and then classify using an SVM
    X_train2, y_train2 = pls.fit_transform(X_train, y_train)
    X_test2 = pls.transform(X_test)
    svc.fit(X_train2, y_train)
    y_pred = svc.predict(X_test2)
    pls_error += zero_one_loss(y_test, y_pred)

print(pca_error / n_folds)
print(pls_error / n_folds)
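# An alternative sketch of the same comparison using sklearn Pipelines with
# cross_val_score. The component counts and the accuracy-based loss are
# illustrative assumptions, not part of the original experiment; y is assumed
# to be numeric (e.g. 0/1), which PLSRegression requires as a transformer.
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score

pca_pipe = Pipeline([("reduce", PCA(n_components=10)), ("clf", LinearSVC())])
pls_pipe = Pipeline([("reduce", PLSRegression(n_components=10)), ("clf", LinearSVC())])
print(1 - cross_val_score(pca_pipe, X, y, cv=10).mean())  # mean 0-1 loss
print(1 - cross_val_score(pls_pipe, X, y, cv=10).mean())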
plots = height * width
# fig, axes = plt.subplots(height, width, figsize=(20, 20), sharex=True, sharey=True)
colors = {0: "g", 1: "r"}

df = pd.read_json(DATA_PATH + "left_shoulder.json")
df_features = pd.DataFrame(df.data.tolist())

plsr = PLSRegression(n_components=2)
X = df_features
y = df["label"]
principal_components = plsr.fit_transform(X, y)
principal_df = pd.DataFrame(data=principal_components[0],
                            columns=["component 1", "component 2"])
principal_df = pd.concat([principal_df, df[["label"]]], axis=1)
principal_df = pd.concat([principal_df, df[["id"]]], axis=1)

# axes[i // 2][i % 2].plot(np.cumsum(pca.explained_variance_ratio_))
healthy_id = list(set(principal_df.loc[df["label"] == 0]["id"]))
impaired_id = list(set(principal_df.loc[df["label"] == 1]["id"]))
healthy_ids = np.random.choice(healthy_id, plots // 2)
impaired_ids = np.random.choice(impaired_id, plots // 2)
all_ids = np.append(healthy_ids, impaired_ids)
# for i in range(len(all_ids)):