Example #1
    def xai_feature(self, samp_num, option='None'):
        """Extract the important features from the input data.

        Args:
            samp_num: number of perturbed samples used for the explanation
            option: if "Fixed", the start position is held at its original value
        Returns:
            fea: indices of the extracted important features
        """
        cen = self.seq_len // 2
        half_tl = self.tl // 2
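        # for each of the samp_num perturbations, draw how many positions will be zeroed out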
        sample = np.random.randint(1, self.tl + 1, samp_num)
        print("sample:", sample)
        features_range = range(self.tl + 1)
        data_explain = np.copy(self.trunc_data).reshape(
            1, self.trunc_data.shape[0])
        #data_sampled = np.copy(self.data)
        data_sampled = np.copy(self.trunc_data_test)
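        # data_explain collects the perturbed truncated windows (inputs for the
        # surrogate fit); data_sampled collects the corresponding full-length
        # sequences that are fed to the model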
        for i, size in enumerate(sample, start=1):
            inactive = np.random.choice(features_range, size, replace=False)
            #print '\ninactive --->',inactive
            tmp_sampled = np.copy(self.trunc_data)
            tmp_sampled[inactive] = 0
            #tmp_sampled[inactive] = np.random.choice(range(257), size, replace = False)
            #print "trunc_data.shape", self.trunc_data.shape
            tmp_sampled = tmp_sampled.reshape(1, self.trunc_data.shape[0])
            data_explain = np.concatenate((data_explain, tmp_sampled), axis=0)
            #print "data_explain.shape", data_explain.shape
            data_sampled_mutate = np.copy(self.data)
            if self.real_sp < half_tl:
                data_sampled_mutate[0, 0:tmp_sampled.shape[1]] = tmp_sampled
            elif self.real_sp >= self.seq_len - half_tl:
                data_sampled_mutate[0, (
                    self.seq_len -
                    tmp_sampled.shape[1]):self.seq_len] = tmp_sampled
            else:
                data_sampled_mutate[0, (self.real_sp -
                                        half_tl):(self.real_sp + half_tl +
                                                  1)] = tmp_sampled
            data_sampled = np.concatenate((data_sampled, data_sampled_mutate),
                                          axis=0)

        if option == "Fixed":
            print("Fix start points")
            data_sampled[:, self.real_sp] = self.start
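        # model predictions for every sampled input: probability of class 1 at position real_sp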
        label_sampled = self.model.predict(data_sampled,
                                           verbose=0)[:, self.real_sp, 1]
        label_sampled = label_sampled.reshape(label_sampled.shape[0], 1)
        # Mixture model fitting
        GMM = GaussianMixture(n_components=2).fit(data_explain)
        #print GMM.converged_

        means = GMM.means_
        covariances = GMM.covariances_
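        # r_ik[i, k]: unnormalized responsibility of GMM component k for sample i
        # (the original window plus the samp_num perturbations)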
        r_ik = np.zeros((samp_num + 1, 2))

        for k, (m, c) in enumerate(zip(means, covariances)):
            reg_cov = 5e-5 * np.identity(self.tl + 1)
            c = c + reg_cov
            multi_normal = multivariate_normal(mean=m, cov=c)
            r_ik[:, k] = GMM.weights_[k] * multi_normal.pdf(data_explain)
        res = np.argmax(r_ik, axis=1)

        # find the index for the best component
        best_component_idx = res[0]

        # fitting beta according to best component of mixture regression model

        # get the data for this component
        idx = np.where(res == best_component_idx)[0]
        X = r.matrix(data_explain[idx], nrow=len(idx), ncol=self.tl + 1)
        Y = r.matrix(label_sampled[idx], nrow=len(idx), ncol=1)

        n = r.nrow(X)
        p = r.ncol(X)

        # solve the 1-D fused lasso via the R library and read off the coefficient
        # path evaluated at sqrt(n * log(p)) as importance scores
        results = r.fusedlasso1d(y=Y, X=X)
        result = np.array(r.coef(results, np.sqrt(n * np.log(p)))[0])[:, -1]

        # sort positions by importance score (descending) and map them back onto
        # the original sequence coordinates
        importance_score = np.argsort(result)[::-1]
        print('importance_score ...', importance_score)
        self.fea = (importance_score - self.tl // 2) + self.real_sp
        self.fea = self.fea[np.where(self.fea < 200)]
        self.fea = self.fea[np.where(self.fea >= 0)]
        print('self.fea ...', self.fea)
        return self.fea
Example #2
#
# Displaying the 2D and 3D plots and the components
#
plot_pca_2d(x_pca,yhat)
plot_pca_3d(x_pca,yhat,elevacao,azimute)
plot_componentes(df_pca_componentes)

#
# saving the model's result to a dataframe/csv
#
df_algoritimos["dbscan"] = yhat

#
# Running Gaussian Mixture with k set by the slider
#
st.title("Gaussian Mixture")

modelo = GaussianMixture(n_components=k, random_state=42, covariance_type="spherical")
modelo.fit(x)
yhat = modelo.predict(x)

#
# Displaying the 2D and 3D plots and the components
#
plot_pca_2d(x_pca,yhat)
plot_pca_3d(x_pca,yhat,elevacao,azimute)
plot_componentes(df_pca_componentes)

#
# saving the model's result to a dataframe/csv
#
df_algoritimos["GMM"] = yhat
Example #3
def eval_train(model, all_loss, epoch):
    model.eval()
    losses = torch.zeros(50000)
    Prob_Output = []
    with torch.no_grad():
        for batch_idx, (inputs, targets, index) in enumerate(eval_loader):
            inputs, targets = inputs.cuda(), targets.cuda()
            outputs = model(inputs)
            # During training a value (margin) is subtracted from the outputs;
            # at evaluation time the raw outputs are used directly, so no such
            # adjustment is applied before the softmax.
            Prob_Output_temp = nn.functional.softmax(outputs, dim=1)
            Prob_Output.append(Prob_Output_temp)

            if args.noise_mode == 'asym_two_unbalanced_classes':
                if args.LDAM_DRW:
                    weight = get_per_cls_wei(epoch)
                    # per-sample loss for the small-loss trick, used after
                    # warm-up and co-guess / co-finetune
                    loss, _ = LDAM_DRE(outputs, targets, weight,
                                       reduction='none')
                else:
                    loss = CE(outputs, targets)
            else:
                loss = CE(outputs, targets)

            #loss = CE(outputs, targets)

            for b in range(inputs.size(0)):
                # index[b] maps each per-sample loss back to its dataset index
                losses[index[b]] = loss[b]
    Prob_Output_tensor = torch.cat(tuple(Prob_Output), dim=0)

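    # min-max normalize the per-sample losses to [0, 1] before fitting the GMM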
    losses = (losses - losses.min()) / (losses.max() - losses.min())
    all_loss.append(losses)

    if args.r == 0.9:  # average loss over last 5 epochs to improve convergence stability
        history = torch.stack(all_loss)
        input_loss = history[-5:].mean(0)
        input_loss = input_loss.reshape(-1, 1)
    else:
        input_loss = losses.reshape(-1, 1)

    # fit a two-component GMM to the loss
    gmm = GaussianMixture(n_components=2,
                          max_iter=10,
                          tol=1e-2,
                          reg_covar=5e-4)
    gmm.fit(input_loss)
    prob = gmm.predict_proba(input_loss)
    # probability of belonging to the small-loss component, i.e. the probability
    # that the sample is clean (noise-free)
    prob = prob[:, gmm.means_.argmin()]
    return prob, all_loss, Prob_Output_tensor
Example #4
def plain_em(training_data):
    n_components = np.arange(1, 21)
    models = [
        mixture.GaussianMixture(n, covariance_type='full',
                                random_state=RAND).fit(training_data)
        for n in n_components
    ]

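    # lower BIC is better: it trades off goodness of fit against model complexity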
    plt.plot(n_components, [m.bic(training_data) for m in models], label='BIC')
    plt.legend(loc='best')
    plt.xlabel('n_components')
    plt.show()

    print("Best number of components is 17")

    # https://towardsdatascience.com/gaussian-mixture-model-clusterization-how-to-select-the-number-of-components-clusters-553bef45f6e4
    n_clusters = np.arange(2, 10)
    sils = []
    sils_err = []
    iterations = 20
    for n in n_clusters:
        tmp_sil = []
        for _ in range(iterations):
            print(n, len(tmp_sil))
            gmm = GaussianMixture(n, n_init=2)
            labels = gmm.fit_predict(training_data)
            sil = metrics.silhouette_score(training_data, labels, metric='euclidean')
            tmp_sil.append(sil)
        val = np.mean(np.array(tmp_sil))
        err = np.std(tmp_sil)
        sils.append(val)
        sils_err.append(err)
    plt.errorbar(n_clusters, sils, yerr=sils_err)
    plt.title("Silhouette Scores EM Churn", fontsize=20)
    plt.xticks(n_clusters)
    plt.xlabel("N. of clusters")
    plt.ylabel("Score")
    plt.show()

    n_clusters = np.arange(2, 10)
    iterations = 20
    results = []
    res_sigs = []
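    # stability check: fit GMMs on two disjoint halves of the data and measure how
    # far apart they are; gmm_js (defined elsewhere) is assumed to return a
    # Jensen-Shannon-style distance between the two fitted mixtures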
    for n in n_clusters:
        dist = []

        for iteration in range(iterations):
            train, test = train_test_split(training_data, test_size=0.5)

            gmm_train = GaussianMixture(n, n_init=2).fit(train)
            gmm_test = GaussianMixture(n, n_init=2).fit(test)
            dist.append(gmm_js(gmm_train, gmm_test))
        selec = SelBest(np.array(dist), int(iterations / 5))
        result = np.mean(selec)
        res_sig = np.std(selec)
        results.append(result)
        res_sigs.append(res_sig)

    plt.errorbar(n_clusters, results, yerr=res_sigs)
    plt.title("Distance between Train and Test GMMs", fontsize=20)
    plt.xticks(n_clusters)
    plt.xlabel("N. of clusters")
    plt.ylabel("Distance")
    plt.show()
Example #5
from sklearn.mixture import GaussianMixture


def GaussianModel(embeddings):
    gmm = GaussianMixture(n_components=1, reg_covar=1e-05)
    gmm.fit(embeddings)

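    # score_samples returns the log-likelihood of each embedding under the fitted Gaussian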
    log_likelihood = gmm.score_samples(embeddings)
    return log_likelihood