from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import roc_auc_score


def lda_training(X, y):
    """
    Linear Discriminant Analysis model training.

    Estimates the test error and computes the training score.

    :param X: feature matrix, shape (n_samples, n_features)
    :param y: binary target labels, shape (n_samples,)
    :return: the fitted LinearDiscriminantAnalysis model
    """
    estimator = LinearDiscriminantAnalysis()
    estimated_test_error = estimate_test_error(estimator, X, y)
    print("Estimated test error for model {} :\n\t{}".format(
        estimator.get_params(), estimated_test_error))

    current_model = LinearDiscriminantAnalysis()
    current_model.fit(X, y)
    y_pred = current_model.predict(X)
    # roc_auc_score is a score (higher is better), not an error rate.
    score = roc_auc_score(y, y_pred)
    print("Training ROC AUC for model {} :\n\t{}".format(
        current_model.get_params(), score))
    return current_model
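# `estimate_test_error` is not defined in this snippet. A minimal sketch of
# what it might look like, assuming it reports 1 minus the mean
# cross-validated accuracy (the metric and CV scheme are assumptions, not
# the original code):
from sklearn.model_selection import cross_val_score


def estimate_test_error(estimator, X, y, cv=5):
    # Estimate the generalization error as 1 - mean CV accuracy.
    scores = cross_val_score(estimator, X, y, cv=cv)
    return 1.0 - scores.mean()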
from sklearn.datasets import load_iris


def iris_data():
    # Define dataset.
    # X, y = make_classification(n_samples=1000, n_features=10, n_informative=10,
    #                            n_redundant=0, random_state=1)
    X, y = load_iris(return_X_y=True)
    print(X, y)

    # Define model.
    model = LinearDiscriminantAnalysis()
    print(model.get_params(deep=True))
    # simple_prediction(X, y, model)

    solvers = ["svd", "lsqr", "eigen"]
    row = [0.1, 3.5, 4.2, 100]  # note: defined but never used below
    for solver in solvers:
        result = hyper_parameters(X, y, model, solver)
        pr_class = result.predict([[10, 25, 30, 40]])
        print(pr_class)
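# `hyper_parameters` is not defined in this snippet. A plausible sketch,
# assuming it grid-searches the shrinkage setting for the requested solver
# and returns the best fitted estimator (all parameter choices below are
# illustrative assumptions):
from sklearn.model_selection import GridSearchCV


def hyper_parameters(X, y, model, solver):
    grid = {"solver": [solver]}
    if solver in ("lsqr", "eigen"):
        # Only these two solvers support shrinkage.
        grid["shrinkage"] = [None, "auto", 0.1, 0.5, 0.9]
    search = GridSearchCV(model, grid, cv=5)
    search.fit(X, y)
    return search.best_estimator_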
from sklearn.datasets import make_classification


def LDA_as_reduction():
    # Use LDA both as a classifier and as a supervised dimensionality reducer.
    X, y = make_classification(n_samples=10, n_features=6, n_informative=6,
                               n_redundant=0, random_state=2, n_classes=3)
    model = LinearDiscriminantAnalysis(n_components=2)
    print(X, y)
    print(model.get_params(deep=True))

    model.fit(X, y)
    print(model.predict([[2, 4, 5, -1, 0, 4]]))

    # Project the data onto the two discriminant axes.
    X_trans = model.transform(X)
    print(X_trans)

    # Refit on the 2-D projection and classify points in the reduced space.
    model.fit(X_trans, y)
    print(model.predict([[2, 4]]))
    print(model.predict([[-3, 5]]))
line = ax.plot(xx, yy, color='black', linewidth=2)
plt.scatter(xp1, yp1, color='red', s=3)
plt.scatter(xp2, yp2, color='green', s=3)
plt.xlabel('x')
plt.ylabel('y')
plt.title('Fisher Discriminant Analysis')
plt.show()
print('Accuracy Score of Fisher Discriminant Analysis: %f'
      % accuracy_score(y_test, pred))

# LDA method
LDA = LinearDiscriminantAnalysis()
LDA.fit(X_train, y_train)
pred = LDA.predict(X_test)
para = LDA.get_params()

# Decision boundary w1 . x + w0 = 0, built from the class means and the
# pooled covariance matrix computed earlier.
w1 = np.dot(np.linalg.inv(cov_m), (avg1 - avg2).T)
w0 = -0.5 * np.dot((avg1 + avg2).T, w1)
xx = np.arange(-10, 40, 0.01)
yy = -xx * w1[0] / w1[1] - w0

# Points predicted as class 1
xp1 = X_test.iloc[list(np.where(np.array(pred) == 1)[0]), 0]
yp1 = X_test.iloc[list(np.where(np.array(pred) == 1)[0]), 1]
# Points predicted as class 2
xp2 = X_test.iloc[list(np.where(np.array(pred) == 2)[0]), 0]
yp2 = X_test.iloc[list(np.where(np.array(pred) == 2)[0]), 1]
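# For comparison, the same kind of boundary can be read off the fitted sklearn
# model. A minimal sketch assuming the binary, two-feature setting above: the
# LDA decision boundary is coef_ . x + intercept_ = 0, so solving for the
# second coordinate gives the line directly.
yy_lda = -(LDA.coef_[0][0] * xx + LDA.intercept_[0]) / LDA.coef_[0][1]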
y = diagnostic[:trainingSetLength, 1:]  # target values (expected output for X)
y = np.transpose(y).astype('int')       # astype casts every label to int

trainingSet = extractedFeatures[:trainingSetLength]

lda = LinearDiscriminantAnalysis()
# Tell the algorithm which sample in X belongs to which class labelled in y.
lda.fit(trainingSet, y[0])

# Save the fitted model and its parameters to disk.
lda_params = lda.get_params()
params_lda = 'params_lda.sav'
filename_lda = 'lda_model.sav'
# testSet = extractedFeatures[trainingSetLength:trainingSetLength + 10]
# prediction = lda.predict(testSet)
with open(filename_lda, 'wb') as f:
    pickle.dump(lda, f)
with open(params_lda, 'wb') as f:
    pickle.dump(lda_params, f)

#%% TEST CLASSIFICATION - QDA
excelAddress = 'C:\\Users\\theor\\Downloads\\Ground_truth_ISIC_1.xlsx'
trainingSetLength = 500
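# Back to the LDA model pickled above: a short sketch of reloading it later
# for prediction (the test-set slicing mirrors the commented-out lines in
# that block and is an assumption):
with open('lda_model.sav', 'rb') as f:
    lda_loaded = pickle.load(f)
# prediction = lda_loaded.predict(extractedFeatures[trainingSetLength:trainingSetLength + 10])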
x_nd = d_no_dummies.drop(['salary_bin'], axis=1).values.astype(float)
x_numeric = d_no_dummies[['age', 'fnlwgt', 'education_nbr', 'capital_gain',
                          'capital_loss', 'hours_per_week']].values.astype(float)

# Split data into training and test sets; be sure to stratify since this is
# for classification.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, stratify=y, random_state=seed)
x_nd_train, x_nd_test, y_nd_train, y_nd_test = train_test_split(
    x_nd, y, test_size=0.3, stratify=y, random_state=seed)
xn_train, xn_test, yn_train, yn_test = train_test_split(
    x_numeric, y, test_size=0.3, stratify=y, random_state=seed)

######
# LDA
# Run LDA for classification.
# Note: if n_components=None, all components are kept.
lda = LinearDiscriminantAnalysis(n_components=None, solver='svd')
lda.fit(x_train, y_train)
print(lda.get_params())
print('Priors:', lda.priors_)  # class prior probabilities
print('Classification Accuracy:', lda.score(x_train, y_train))
# Percentage of between-class variance explained by each linear discriminant.
print('Explained variance:', lda.explained_variance_ratio_)

######
# Evaluating the model on new data
# Make income predictions for the validation set.
post_lda = lda.predict(x_test)
post_lda = post_lda.reshape(post_lda.shape[0], 1)
print('Classification Accuracy:', lda.score(x_test, y_test))

# Confusion matrix
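# The snippet ends at the confusion-matrix step; a minimal continuation,
# assuming the standard sklearn helper (the original code is not shown):
from sklearn.metrics import confusion_matrix

# Rows are true classes, columns are predicted classes; ravel() undoes the
# column reshape applied to post_lda above.
print(confusion_matrix(y_test, post_lda.ravel()))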
# STEP XX: APPLYING LDA
# Block 01: apply LDA
# Variance explained by each of the linear discriminants
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

lda = LDA()
lda.get_params().keys()

X_train_lda_new = lda.fit_transform(X_train_lda, Y_train)
X_test_lda_new = lda.transform(X_test_lda)

print('Original Number of Features:', X_train_lda.shape[1])
print('Reduced Number of Features:', X_train_lda_new.shape[1])
print('Original Number of Features:', X_test_lda.shape[1])
print('Reduced Number of Features:', X_test_lda_new.shape[1])

from pprint import pprint
print('Parameters Currently In Use:\n')
pprint(lda.get_params())

explained_variance_lda = lda.explained_variance_ratio_
for i in explained_variance_lda:
    print(format(i * 100, 'f'))

plt.figure(1, figsize=(14, 7))
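# The plot that follows plt.figure is truncated. A plausible continuation,
# assuming it charts the explained-variance ratios computed above (the chart
# type and styling are assumptions, not the original figure):
plt.bar(range(1, len(explained_variance_lda) + 1), explained_variance_lda)
plt.xlabel('Linear Discriminant')
plt.ylabel('Explained Variance Ratio')
plt.show()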
triples = np.zeros((len(scaled_df_x), len(scaled_df_x), 3))
triple_array = []
labeled_array = []
# Flatten the three scaled grids into one (x, y, z) sample per cell, with the
# matching label from labeled_mat.
for i in range(len(scaled_df_x)):
    for j in range(len(scaled_df_x)):
        triples[i, j, 0] = scaled_df_x[i][j]
        triples[i, j, 1] = scaled_df_y[i][j]
        triples[i, j, 2] = scaled_df_z[i][j]
        triple_array.append(
            [scaled_df_x[i][j], scaled_df_y[i][j], scaled_df_z[i][j]])
        labeled_array.append(labeled_mat[i][j])

clf = LinearDiscriminantAnalysis(store_covariance=True)
clf.fit(triple_array, labeled_array)
plt.figure()
score = clf.score(triple_array, labeled_array)
params = clf.get_params()
print("Accuracy:", score)
print("coef:", clf.coef_)
print("Covariance matrix:", clf.covariance_)
print("Explained Variance Ratio:", clf.explained_variance_ratio_)
print("Means:", clf.means_)
print(params)

# Squared per-axis differences between each unordered pair of class
# coefficient vectors.
pair_set = set()
for i in range(len(clf.coef_)):
    for j in range(len(clf.coef_)):
        if (i, j) not in pair_set and (j, i) not in pair_set and i != j:
            pair_set.add((i, j))
            x_difference = (clf.coef_[i][0] - clf.coef_[j][0]) ** 2
            y_difference = (clf.coef_[i][1] - clf.coef_[j][1]) ** 2
            z_difference = (clf.coef_[i][2] - clf.coef_[j][2]) ** 2
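            # The original continues past this point; presumably the squared
            # per-axis differences are combined into a Euclidean distance
            # between the two coefficient vectors (an assumption):
            distance = (x_difference + y_difference + z_difference) ** 0.5
            print("Distance between coef vectors", i, "and", j, ":", distance)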
recall = cross_val_score(model, X_train, y_train, cv=kfold,
                         scoring='recall').mean()
f1_score = cross_val_score(model, X, y, cv=kfold, scoring='f1_weighted').mean()
auc_score = cross_val_score(model, X, y, cv=kfold, scoring='roc_auc').mean()
delta = time.time() - start_time
print('{}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.2f} secs'.format(
    name, accuracy, precision, recall, f1_score, auc_score, delta))

clf = LinearDiscriminantAnalysis(store_covariance=True).fit(X_train, y_train)
print(clf.get_params(deep=True))
print(X_train.head())
X2 = clf.transform(X)
print(X2[:5])  # transform returns an ndarray, so slice instead of .head()

# Disabled scratch block left over from a separate exercise:
"""
with open(input_file, "r") as fr:
    n = int(fr.readline())
    for i in range(0, n):
        a, b = map(int, fr.readline().strip().split())
        print(a, b)

output_file = "D:\\Container\\Python Projects\\2020-09-30 Yandex ML Contest\\output_A.txt"
if (yandex):
    output_file = "output.txt"
with open(output_file, "w") as fw:
    fw.write(str.format("{0:.6f}", a))
my_ground_truth == my_pred)

# In[103]:

# Let's use sklearn to check that our solution is correct.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

clf = LinearDiscriminantAnalysis()
clf.fit(new_data[:, 1:], labels)
# LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
#                            solver='eigen', store_covariance=False, tol=0.0001)
print(clf.get_params())

predictions = clf.predict(new_data[:, 1:])
print(predictions)
errors = sum(labels != predictions)
error_rate = errors / len(predictions) * 100
print("Error Rate is: ", error_rate, "%")
print("\nAs can be seen, our solution is right!")

# 2. Consider the Logistic Regression as discussed in the class. Assume now
#    that the cost of erring an observation from class 1 is cost1 and the cost
#    of erring observations from class 0 is cost0. How would you modify the
#    goal function, gradient and Hessian matrix (slides 11 and 12 in week 5)?
#
#    Change the code provided (or developed by you) in the class to receive as
#    input the vector of costs. Test your code with the following script:
#
#    trainC1 = mvnrnd([21 21], [1 0; 0 1], 1000);
#    trainC0 = mvnrnd([23 23], [1 0; 0 1], 20);
class LDA(object):
    def __init__(self, solver="svd", shrinkage=None, priors=None,
                 n_components=None, store_covariance=False, tol=1e-4):
        """
        :param solver: string, optional: "svd", "lsqr" or "eigen". The default,
            "svd", does not compute the covariance matrix and suits data with
            many features; "lsqr" is least squares and can be combined with
            shrinkage; "eigen" is eigendecomposition and can also be combined
            with shrinkage.
        :param shrinkage: str or float, optional. Default None; "auto" for
            automatic shrinkage, or a float in [0, 1] for a fixed shrinkage
            parameter.
        :param priors: array, optional, shape (n_classes,). Class priors.
        :param n_components: int, optional. Number of components, default None.
        :param store_covariance: bool, optional. Only used with "svd";
            additionally computes the class covariance matrix.
        :param tol: float, default 1e-4. Threshold used for rank estimation
            in the SVD solver.
        """
        self.model = LinearDiscriminantAnalysis(
            solver=solver, shrinkage=shrinkage, priors=priors,
            n_components=n_components, store_covariance=store_covariance,
            tol=tol)

    def fit(self, x, y):
        self.model.fit(X=x, y=y)

    def transform(self, x):
        return self.model.transform(X=x)

    def fit_transform(self, x, y):
        return self.model.fit_transform(X=x, y=y)

    def get_params(self, deep=True):
        return self.model.get_params(deep=deep)

    def set_params(self, **params):
        self.model.set_params(**params)

    def decision_function(self, x):
        return self.model.decision_function(X=x)

    def predict(self, x):
        return self.model.predict(X=x)

    def predict_log_proba(self, x):
        return self.model.predict_log_proba(X=x)

    def predict_proba(self, x):
        return self.model.predict_proba(X=x)

    def score(self, x, y, sample_weight=None):
        return self.model.score(X=x, y=y, sample_weight=sample_weight)

    def get_attributes(self):
        # The attributes below only exist after the model has been fitted.
        coef = self.model.coef_                  # weight vector(s)
        intercept = self.model.intercept_        # intercept term(s)
        covariance = self.model.covariance_      # covariance matrix
        explained_variance_ratio = self.model.explained_variance_ratio_
        means = self.model.means_                # class means
        priors = self.model.priors_              # class priors, sum to 1, shape (n_classes,)
        scalings = self.model.scalings_          # shape (rank, n_classes - 1); feature scaling
        xbar = self.model.xbar_                  # overall mean
        classes = self.model.classes_            # class labels
        return (coef, intercept, covariance, explained_variance_ratio,
                means, priors, scalings, xbar, classes)
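# A minimal usage sketch of the wrapper above on synthetic data (the dataset
# and every parameter choice here are illustrative assumptions):
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, n_features=8, n_informative=4,
                           n_classes=3, random_state=0)
model = LDA(solver="eigen", shrinkage="auto", n_components=2)
model.fit(X, y)
print(model.get_params())
print(model.score(X, y))
print(model.transform(X)[:3])  # first three samples in the reduced space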