import numpy as np
from scipy.stats import multivariate_normal
from sklearn.mixture import GaussianMixture
from rpy2.robjects import r  # R bridge; fusedlasso1d() comes from R's genlasso package


def xai_feature(self, samp_num, option='None'):
    """Extract the important features from the input data.

    Args:
        samp_num: number of perturbed samples used for the explanation
        option: if "Fixed", pin the start-point column to its original value

    Returns:
        fea: indices of the extracted important features
    """
    half_tl = self.tl // 2
    # Random number of positions to deactivate in each perturbed sample.
    sample = np.random.randint(1, self.tl + 1, samp_num)
    print("sample:", sample)
    features_range = range(self.tl + 1)

    data_explain = np.copy(self.trunc_data).reshape(1, self.trunc_data.shape[0])
    data_sampled = np.copy(self.trunc_data_test)

    for size in sample:
        # Zero out a random subset of positions in the truncated window.
        inactive = np.random.choice(features_range, size, replace=False)
        tmp_sampled = np.copy(self.trunc_data)
        tmp_sampled[inactive] = 0
        tmp_sampled = tmp_sampled.reshape(1, self.trunc_data.shape[0])
        data_explain = np.concatenate((data_explain, tmp_sampled), axis=0)

        # Write the perturbed window back into a copy of the full sequence,
        # clamping the window at the sequence boundaries.
        data_sampled_mutate = np.copy(self.data)
        if self.real_sp < half_tl:
            data_sampled_mutate[0, 0:tmp_sampled.shape[1]] = tmp_sampled
        elif self.real_sp >= self.seq_len - half_tl:
            data_sampled_mutate[0, (self.seq_len - tmp_sampled.shape[1]):self.seq_len] = tmp_sampled
        else:
            data_sampled_mutate[0, (self.real_sp - half_tl):(self.real_sp + half_tl + 1)] = tmp_sampled
        data_sampled = np.concatenate((data_sampled, data_sampled_mutate), axis=0)

    if option == "Fixed":
        print("Fix start points")
        data_sampled[:, self.real_sp] = self.start

    # Model prediction at the start point for every perturbed sequence.
    label_sampled = self.model.predict(data_sampled, verbose=0)[:, self.real_sp, 1]
    label_sampled = label_sampled.reshape(label_sampled.shape[0], 1)

    # Fit a two-component Gaussian mixture to the perturbed samples and compute
    # the (unnormalized) responsibilities r_ik by hand, adding a small ridge to
    # each covariance for numerical stability.
    GMM = GaussianMixture(n_components=2).fit(data_explain)
    means = GMM.means_
    covariances = GMM.covariances_
    r_ik = np.zeros((samp_num + 1, 2))
    for k, (m, c) in enumerate(zip(means, covariances)):
        reg_cov = 5e-5 * np.identity(self.tl + 1)
        c = c + reg_cov
        multi_normal = multivariate_normal(mean=m, cov=c)
        r_ik[:, k] = GMM.weights_[k] * multi_normal.pdf(data_explain)

    # Find the best component for each sample, keep only the samples assigned
    # to the same component as the original (unperturbed) window, and fit beta
    # on that component of the mixture regression model.
    res = np.argmax(r_ik, axis=1)
    best_component_idx = res[0]
    idx = np.where(res == best_component_idx)[0]
    X = r.matrix(data_explain[idx], nrow=len(idx), ncol=self.tl + 1)
    Y = r.matrix(label_sampled[idx], nrow=len(idx), ncol=1)

    n = r.nrow(X)
    p = r.ncol(X)
    # Solve the 1-D fused lasso with the R genlasso library and read the
    # coefficients off at lambda = sqrt(n * log(p)) as importance scores.
    results = r.fusedlasso1d(y=Y, X=X)
    result = np.array(r.coef(results, np.sqrt(n * np.log(p)))[0])[:, -1]

    # Sort positions by importance (descending) and map them from window
    # coordinates back to sequence coordinates around the start point.
    importance_score = np.argsort(result)[::-1]
    print('importance_score ...', importance_score)
    self.fea = (importance_score - self.tl // 2) + self.real_sp
    self.fea = self.fea[np.where(self.fea < 200)]  # 200 appears to be a hardcoded sequence length
    self.fea = self.fea[np.where(self.fea >= 0)]
    print('self.fea ...', self.fea)
    return self.fea
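# The `r` handle used above is presumably rpy2's `robjects.r` with the R
# genlasso package loaded (it provides fusedlasso1d() and coef()). A minimal
# setup sketch under that assumption, not confirmed by the source:
from rpy2.robjects import r, numpy2ri

numpy2ri.activate()      # let rpy2 convert numpy arrays to R matrices
r('library(genlasso)')   # fusedlasso1d() and coef() come from genlasso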
plot_pca_2d(x_pca, yhat)
plot_pca_3d(x_pca, yhat, elevacao, azimute)
plot_componentes(df_pca_componentes)
#
# Saving the model's result to a dataframe/csv
#
df_algoritimos["dbscan"] = yhat
#
# Running Gaussian Mixture with k set by the slider
#
st.title("Gaussian Mixture")
modelo = GaussianMixture(n_components=k, random_state=42, covariance_type="spherical")
modelo.fit(x)
yhat = modelo.predict(x)
#
# Displaying the 2D and 3D plots and the components
#
plot_pca_2d(x_pca, yhat)
plot_pca_3d(x_pca, yhat, elevacao, azimute)
plot_componentes(df_pca_componentes)
#
# Saving the model's result to a dataframe/csv
#
df_algoritimos["GMM"] = yhat
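# The snippet above assumes `k`, `x`, `x_pca`, `elevacao`, and `azimute` come
# from earlier in the Streamlit app. A minimal sketch of that setup; the
# widget labels and ranges here are hypothetical, not from the source:
import streamlit as st

k = st.sidebar.slider("Number of clusters (k)", min_value=2, max_value=15, value=3)
elevacao = st.sidebar.slider("3D plot elevation", 0, 90, 30)
azimute = st.sidebar.slider("3D plot azimuth", 0, 360, 45)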
import numpy as np
import torch
import torch.nn as nn
from sklearn.mixture import GaussianMixture

# eval_loader, args, CE, LDAM_DRE and get_per_cls_wei are module-level globals.


def eval_train(model, all_loss, epoch):
    model.eval()
    losses = torch.zeros(50000)  # one loss slot per training sample; dataset size is hardcoded
    Prob_Output = []
    with torch.no_grad():
        for batch_idx, (inputs, targets, index) in enumerate(eval_loader):
            inputs, targets = inputs.cuda(), targets.cuda()
            outputs = model(inputs)
            # The margin is subtracted during training; at evaluation the raw
            # outputs are used directly (scale factor 1: no regularization is
            # needed at prediction time).
            Prob_Output_temp = nn.functional.softmax(1 * outputs, dim=1)
            Prob_Output.append(Prob_Output_temp)
            if args.noise_mode == 'asym_two_unbalanced_classes':
                if args.LDAM_DRW:
                    weight = get_per_cls_wei(epoch)
                    # Per-sample LDAM loss after warm-up and co-guess
                    # co-finetune (the small-loss trick).
                    loss, _ = LDAM_DRE(outputs, targets, weight, reduction='none')
                else:
                    loss = CE(outputs, targets)
            else:
                loss = CE(outputs, targets)
            for b in range(inputs.size(0)):
                losses[index[b]] = loss[b]  # index[b] maps each loss back to its sample

    Prob_Output_tensor = torch.cat(tuple(Prob_Output), dim=0)
    losses = (losses - losses.min()) / (losses.max() - losses.min())
    all_loss.append(losses)

    if args.r == 0.9:  # average loss over the last 5 epochs to improve convergence stability
        history = torch.stack(all_loss)
        input_loss = history[-5:].mean(0)
        input_loss = input_loss.reshape(-1, 1)
    else:
        input_loss = losses.reshape(-1, 1)

    # Fit a two-component GMM to the per-sample losses; the component with the
    # smaller mean corresponds to clean (noise-free) samples.
    gmm = GaussianMixture(n_components=2, max_iter=10, tol=1e-2, reg_covar=5e-4)
    gmm.fit(input_loss)
    prob = gmm.predict_proba(input_loss)
    # Probability that each sample belongs to the small-loss component, i.e.
    # the probability that the sample is clean.
    prob = prob[:, gmm.means_.argmin()]
    return prob, all_loss, Prob_Output_tensor
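# Hypothetical downstream use of the clean-probabilities (the 0.5 threshold is
# an assumption, not from the source): split the training set into clean and
# noisy subsets, as in DivideMix-style pipelines.
prob, all_loss, prob_outputs = eval_train(model, all_loss, epoch)
pred_clean = prob > 0.5
clean_idx = np.where(pred_clean)[0]    # likely clean samples
noisy_idx = np.where(~pred_clean)[0]   # likely noisy samples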
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics, mixture
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split

# RAND (random seed), X_train, gmm_js and SelBest are assumed to be defined
# elsewhere in the module.


def plain_em(training_data):
    # BIC curve over 1..20 components.
    n_components = np.arange(1, 21)
    models = [
        mixture.GaussianMixture(n, covariance_type='full', random_state=RAND).fit(training_data)
        for n in n_components
    ]
    plt.plot(n_components, [m.bic(training_data) for m in models], label='BIC')
    plt.legend(loc='best')
    plt.xlabel('n_components')
    plt.show()
    print("Best Components Score is 17")

    # Silhouette scores, averaged over repeated fits:
    # https://towardsdatascience.com/gaussian-mixture-model-clusterization-how-to-select-the-number-of-components-clusters-553bef45f6e4
    n_clusters = np.arange(2, 10)
    sils = []
    sils_err = []
    iterations = 20
    for n in n_clusters:
        tmp_sil = []
        for _ in range(iterations):
            print(n, len(tmp_sil))  # progress
            gmm = GaussianMixture(n, n_init=2)
            labels = gmm.fit_predict(X_train)
            sil = metrics.silhouette_score(X_train, labels, metric='euclidean')
            tmp_sil.append(sil)
        val = np.mean(np.array(tmp_sil))
        err = np.std(tmp_sil)
        sils.append(val)
        sils_err.append(err)
    plt.errorbar(n_clusters, sils, yerr=sils_err)
    plt.title("Silhouette Scores EM Churn", fontsize=20)
    plt.xticks(n_clusters)
    plt.xlabel("N. of clusters")
    plt.ylabel("Score")
    plt.show()

    # Stability check: distance between GMMs fit on disjoint halves of the data.
    n_clusters = np.arange(2, 10)
    iterations = 20
    results = []
    res_sigs = []
    for n in n_clusters:
        dist = []
        for iteration in range(iterations):
            train, test = train_test_split(X_train, test_size=0.5)
            gmm_train = GaussianMixture(n, n_init=2).fit(train)
            gmm_test = GaussianMixture(n, n_init=2).fit(test)
            dist.append(gmm_js(gmm_train, gmm_test))
        selec = SelBest(np.array(dist), int(iterations / 5))
        result = np.mean(selec)
        res_sig = np.std(selec)
        results.append(result)
        res_sigs.append(res_sig)
    plt.errorbar(n_clusters, results, yerr=res_sigs)
    plt.title("Distance between Train and Test GMMs", fontsize=20)
    plt.xticks(n_clusters)
    plt.xlabel("N. of clusters")
    plt.ylabel("Distance")
    plt.show()
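# `gmm_js` and `SelBest` are not defined in this file; they come from the
# article linked above. A sketch consistent with that source, under the
# assumptions that gmm_js is a Monte Carlo Jensen-Shannon distance and SelBest
# keeps the k smallest values:
import numpy as np


def gmm_js(gmm_p, gmm_q, n_samples=10**5):
    # Monte Carlo estimate of the Jensen-Shannon distance between two fitted GMMs.
    X, _ = gmm_p.sample(n_samples)
    log_p_X = gmm_p.score_samples(X)
    log_q_X = gmm_q.score_samples(X)
    log_mix_X = np.logaddexp(log_p_X, log_q_X)

    Y, _ = gmm_q.sample(n_samples)
    log_p_Y = gmm_p.score_samples(Y)
    log_q_Y = gmm_q.score_samples(Y)
    log_mix_Y = np.logaddexp(log_p_Y, log_q_Y)

    kl_p = log_p_X.mean() - (log_mix_X.mean() - np.log(2))  # KL(P || M)
    kl_q = log_q_Y.mean() - (log_mix_Y.mean() - np.log(2))  # KL(Q || M)
    return np.sqrt((kl_p + kl_q) / 2)


def SelBest(arr, k):
    # Keep the k smallest values of arr (the most stable distance estimates).
    return arr[np.argsort(arr)[:k]]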
from sklearn.mixture import GaussianMixture


def GaussianModel(embeddings):
    # Fit a single Gaussian to the embeddings and return the per-sample
    # log-likelihood under that fit.
    gmm = GaussianMixture(n_components=1, reg_covar=1e-05)
    gmm.fit(embeddings)
    log_likelihood = gmm.score_samples(embeddings)
    return log_likelihood
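# Hypothetical usage (the 100 x 16 embedding shape is an assumption, not from
# the source): embeddings with low log-likelihood can be flagged as outliers.
import numpy as np

rng = np.random.default_rng(0)
embeddings = rng.normal(size=(100, 16))
scores = GaussianModel(embeddings)
outliers = np.argsort(scores)[:5]  # the 5 least likely embeddings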