Example #1
File: mlp.py Project: JakeMick/sk-mlp
def test_classification():
    from sklearn.datasets import load_digits
    from sklearn.cross_validation import KFold
    from sklearn.metrics import normalized_mutual_info_score
    digits = load_digits()
    X, y = digits.data, digits.target
    folds = 3
    cv = KFold(y.shape[0], folds)
    total = 0.0
    oo_score_bag = []
    for tr, te in cv:
        mlp = MLPClassifier(use_dropout=True, n_hidden=200, lr=1.)
        print(mlp)
        mlp.fit(X[tr], y[tr], max_epochs=100, staged_sample=X[te])
        t = normalized_mutual_info_score(mlp.predict(X[te]), y[te])
        print("Fold training accuracy: %f" % t)
        total += t
        this_score = []
        for i in mlp.oo_score:
            this_score.append(normalized_mutual_info_score(i, y[te]))
        oo_score_bag.append(this_score)
    from matplotlib import pyplot as plt
    plt.plot(oo_score_bag[0])
    plt.show()

    print("training accuracy: %f" % (total / float(folds)))
Example #2
def evaluate_label(A, H, W, corr, K):
    label = H.argmax(axis=1)
    km = KMeans(K)
    label2 = km.fit_predict(H)
    nmi = normalized_mutual_info_score(label, corr)
    nmi2 = normalized_mutual_info_score(label2, corr)
    print("NMI by argmax: " + str(nmi))
    print("NMI by kmeans: " + str(nmi2))
    A = np.matrix(A)
    W = np.matrix(W)
    H = np.matrix(H)
    loss = np.power(A - W * H.T, 2).sum()
    print(loss)
    return nmi, nmi2, loss
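A hypothetical call to evaluate_label with synthetic factors, assuming numpy as np, sklearn.cluster.KMeans and normalized_mutual_info_score are already imported (the function needs all three). Because A is built exactly as W·Hᵀ, the reported reconstruction loss should be near zero:

import numpy as np
rng = np.random.RandomState(0)
K = 3
W = np.abs(rng.randn(100, K))        # basis, one row per data row
H = np.abs(rng.randn(50, K))         # encodings, one row per data column
A = W.dot(H.T)                       # data matrix, reconstructed exactly
corr = rng.randint(0, K, size=50)    # ground-truth labels for the columns
nmi, nmi2, loss = evaluate_label(A, H, W, corr, K)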
  def loss_augmented_fit(self, feat, y, loss_mult):
    """Fit K-Medoids to the provided data."""
    self._check_init_args()
    # Check that the array is good and attempt to convert it to
    # Numpy array if possible.
    feat = self._check_array(feat)
    # Apply distance metric to get the distance matrix.
    pdists = pairwise_distance_np(feat)

    num_data = feat.shape[0]
    candidate_ids = list(range(num_data))
    candidate_scores = np.zeros(num_data,)
    subset = []

    k = 0
    while k < self.n_clusters:
      candidate_scores = []
      for i in candidate_ids:
        # push i to subset.
        subset.append(i)
        marginal_cost = -1.0 * np.sum(np.min(pdists[:, subset], axis=1))
        loss = 1.0 - metrics.normalized_mutual_info_score(
            y, self._get_cluster_ics(pdists, subset))
        candidate_scores.append(marginal_cost + loss_mult * loss)
        # remove i from subset.
        subset.pop()

      # push i_star to subset.
      i_star = candidate_ids[np.argmax(candidate_scores)]
      subset.append(i_star)
      # remove i_star from candidate indices.
      candidate_ids.remove(i_star)
      k += 1

    # Expose labels_ which are the assignments of
    # the training data to clusters.
    self.labels_ = self._get_cluster_ics(pdists, subset)
    # Expose cluster centers, i.e. medoids.
    self.cluster_centers_ = feat.take(subset, axis=0)
    # Expose indices of chosen cluster centers.
    self.center_ics_ = subset
    # Expose the score = -\sum_{i \in V} min_{j \in S} || x_i - x_j ||
    self.score_ = np.float32(-1.0) * self._get_facility_distance(pdists, subset)
    self.score_aug_ = self.score_ + loss_mult * (
        1.0 - metrics.normalized_mutual_info_score(
            y, self._get_cluster_ics(pdists, subset)))
    self.score_aug_ = self.score_aug_.astype(np.float32)
    # Expose the chosen cluster indices.
    self.subset_ = subset
    return self
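A toy illustration of the greedy step above (independent of the class): for each candidate medoid i, the facility-location term is the negative sum of every point's distance to its nearest medoid once i joins the current subset.

import numpy as np
pdists = np.array([[0., 1., 4.],
                   [1., 0., 4.],
                   [4., 4., 0.]])
subset = [2]                                    # medoid already chosen
scores = [-np.sum(np.min(pdists[:, subset + [i]], axis=1)) for i in (0, 1)]
print(scores)   # [-1.0, -1.0]: adding point 0 or point 1 helps equally here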
Example #4
    def get_normalized_mutual_info(standard_file, prediction_file, isjson=False, isint=False):
        """Get normalized mutual information (NMI) [Strehl2002]_.

        Parameters
        ----------
        standard_file   : str
            The ground truth or standard filename.
        prediction_file : str
            The analyzed or predicted filename.
        isjson          : bool
            If True, standard_file is read as a JSON file.
        isint           : bool
            If True, the values in prediction_file are read as integers.

        Returns
        -------
        normalized_mutual_info  : float
            Normalized mutual information score.

        References
        ----------
        .. [Strehl2002] Alexander Strehl and Joydeep Ghosh. Cluster ensembles A knowledge reuse framework
                        for combining multiple partitions. Journal of Machine Learning Research,
                        3(Dec):583-617, 2002.
        """
        if isjson:
            standard_data = AbstractionUtility.read_json(standard_file)
            standard_labels = standard_data.values()
        else:
            standard_labels = ExternalEvaluation.get_evaluated(standard_file)

        prediction_labels = ExternalEvaluation.get_evaluated(prediction_file, isint=isint)
        normalized_mutual_info = metrics.normalized_mutual_info_score(standard_labels, prediction_labels)

        return normalized_mutual_info
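As a quick, self-contained reminder of what the underlying metric does (independent of the file-reading helpers above): NMI compares two flat label assignments and is invariant to how the cluster IDs are named.

from sklearn import metrics
truth = [0, 0, 1, 1, 2, 2]
pred = [2, 2, 0, 0, 1, 1]     # same partition, different cluster IDs
print(metrics.normalized_mutual_info_score(truth, pred))   # 1.0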
 def evaluate(self):
     ARI = round(metrics.adjusted_rand_score(self.labels, self.pred), 4)
     AMI = round(metrics.adjusted_mutual_info_score(self.labels, self.pred), 4)
     NMI = round(metrics.normalized_mutual_info_score(self.labels, self.pred), 4)
     print("Adjusted Rand index:", "%.4f" % ARI)
     print("Adjusted Mutual Information:", "%.4f" % AMI)
     print("Normalized Mutual Information:", "%.4f" % NMI)
  def pam_augmented_fit(self, feat, y, loss_mult):
    pam_max_iter = 5
    self._check_init_args()
    feat = self._check_array(feat)
    pdists = pairwise_distance_np(feat)
    self.loss_augmented_fit(feat, y, loss_mult)
    print('PAM -1 (before PAM): score: %f, score_aug: %f' % (
        self.score_, self.score_aug_))
    # Initialize from loss augmented facility location
    subset = self.center_ics_
    for iter_ in range(pam_max_iter):
      # update the cluster assignment
      cluster_ics = self._get_cluster_ics(pdists, subset)
      # update the medoid for each clusters
      self._augmented_update_medoid_ics_in_place(pdists, y, cluster_ics, subset,
                                                 loss_mult)
      self.score_ = np.float32(-1.0) * self._get_facility_distance(
          pdists, subset)
      self.score_aug_ = self.score_ + loss_mult * (
          1.0 - metrics.normalized_mutual_info_score(
              y, self._get_cluster_ics(pdists, subset)))
      self.score_aug_ = self.score_aug_.astype(np.float32)
      print('PAM iter: %d, score: %f, score_aug: %f' % (iter_, self.score_,
                                                        self.score_aug_))

    self.center_ics_ = subset
    self.labels_ = cluster_ics
    return self
Example #7
def compare(method1, method2, fig=False):
    X1 = np.load('{0}_{1}_X_2d.npy'.format(species, method1))
    X2 = np.load('{0}_{1}_X_2d.npy'.format(species, method2))
    
    print 'n_cluster\tHomo\tCompl\tNMI\tARI'
    for i in range(2, 6):
        clust1 = Clustering(species, method1, X1, None, n_clusters=i)
        clust2 = Clustering(species, method2, X2, None, n_clusters=i)
        
        clust1.agglomerative(linkage='ward')
        clust2.agglomerative(linkage='ward')
        
        label1 = clust1.pred_labels('ward')
        label2 = clust2.pred_labels('ward')
        
        
        if i == 3 and fig:
            names = np.unique(label1)
            figName = '{0}_{1}_on_{2}'.format(species, method1, method2)
            plot2d(X2, label1, names, figName, figName)

            names = np.unique(label2)
            figName = '{0}_{1}_on_{2}'.format(species, method2, method1)
            plot2d(X1, label2, names, figName, figName)
    
        print '{0}\t{1}\t{2}\t{3}\t{4}\n'.format(i, metrics.homogeneity_score(label1, label2),
                                                metrics.completeness_score(label1, label2),
                                                metrics.normalized_mutual_info_score(label1, label2),
                                                metrics.adjusted_rand_score(label1, label2))
def test_diffusion_embedding_two_components_no_diffusion_time(seed=36):
    """Test spectral embedding with two components"""
    random_state = np.random.RandomState(seed)
    n_sample = 100
    affinity = np.zeros(shape=[n_sample * 2,
                               n_sample * 2])
    # first component
    affinity[0:n_sample,
             0:n_sample] = np.abs(random_state.randn(n_sample, n_sample)) + 2
    # second component
    affinity[n_sample::,
             n_sample::] = np.abs(random_state.randn(n_sample, n_sample)) + 2
    # connection
    affinity[0, n_sample + 1] = 1
    affinity[n_sample + 1, 0] = 1
    affinity.flat[::2 * n_sample + 1] = 0
    affinity = 0.5 * (affinity + affinity.T)

    true_label = np.zeros(shape=2 * n_sample)
    true_label[0:n_sample] = 1
    geom_params = {'laplacian_method':'geometric'}
    se_precomp = SpectralEmbedding(n_components=1,
                                   random_state=np.random.RandomState(seed),
                                   eigen_solver = 'arpack',
                                   diffusion_maps = True,
                                   geom = geom_params)
    embedded_coordinate = se_precomp.fit_transform(affinity,
                                                   input_type='affinity')

    # thresholding on the first components using 0.
    label_ = np.array(embedded_coordinate.ravel() < 0, dtype="float")
    assert_equal(normalized_mutual_info_score(true_label, label_), 1.0)
def plotMI(dat, lab, width = 0.35, signed = 0):
	'''
	Draw a bar chart of the normalized MI between each X and Y
	'''
	X = dat.drop(lab, 1)
	Y = dat[[lab]].values
	cols = X.columns.values
	mis = []

	#Start by getting MI
	for c in cols:
		mis.append(skm.normalized_mutual_info_score(Y.ravel(), X[[c]].values.ravel()))

	#Get signs by correlation
	corrs = dat.corr()[lab]
	corrs = corrs[corrs.index != lab]
	df = pd.DataFrame(list(zip(mis, cols)), columns = ['MI', 'Lab'])
	df = pd.concat([df, pd.DataFrame(list(corrs), columns = ['corr'])], axis=1, join_axes=[df.index])


	
	if signed == 0:
		makeBar(df, 'MI', 'Lab', width)

	else:
		makeBarSigned(df, 'MI', 'Lab', width)
Example #10
File: tag_network.py Project: wtgme/ohsn
def compare_direct_undir():
    from sklearn import metrics
    g = gt.Graph.Read_GraphML('ed_tag.graphml')
    gt.net_stat(g)
    gu = gt.Graph.Read_GraphML('ed_tag_undir.graphml')
    gt.net_stat(gu)
    com = g.community_infomap(edge_weights='weight', vertex_weights='weight')
    comu1 = gu.community_infomap(edge_weights='weight', vertex_weights='weight')
    comu2 = gu.community_infomap(edge_weights='weight', vertex_weights='weight')
    mem = com.membership
    memu1 = comu1.membership
    memu2 = comu2.membership
    print metrics.adjusted_rand_score(mem, memu1)
    print metrics.normalized_mutual_info_score(mem, memu1)
    print metrics.adjusted_rand_score(memu2, memu1)
    print metrics.normalized_mutual_info_score(memu2, memu1)
def test_spectral_embedding_two_components(seed=36):
    """Test spectral embedding with two components"""
    random_state = np.random.RandomState(seed)
    n_sample = 100
    affinity = np.zeros(shape=[n_sample * 2,
                               n_sample * 2])
    # first component
    affinity[0:n_sample,
             0:n_sample] = np.abs(random_state.randn(n_sample, n_sample)) + 2
    # second component
    affinity[n_sample::,
             n_sample::] = np.abs(random_state.randn(n_sample, n_sample)) + 2
    # connection
    affinity[0, n_sample + 1] = 1
    affinity[n_sample + 1, 0] = 1
    affinity.flat[::2 * n_sample + 1] = 0
    affinity = 0.5 * (affinity + affinity.T)

    true_label = np.zeros(shape=2 * n_sample)
    true_label[0:n_sample] = 1

    se_precomp = SpectralEmbedding(n_components=1, affinity="precomputed",
                                   random_state=np.random.RandomState(seed))
    embedded_coordinate = se_precomp.fit_transform(affinity)
    # Some numpy versions are touchy with types
    embedded_coordinate = \
        se_precomp.fit_transform(affinity.astype(np.float32))
    # thresholding on the first components using 0.
    label_ = np.array(embedded_coordinate.ravel() < 0, dtype="float")
    assert_equal(normalized_mutual_info_score(true_label, label_), 1.0)
Example #12
def calculate_nmi_kmeans(H, correct_label):
    km = KMeans(H.shape[1])
    try:
        com = km.fit_predict(H)
    except:
        com = [0] * H.shape[0]
    nmi = normalized_mutual_info_score(com, correct_label)
    return nmi
def cluster_metrics(labels_1, labels_2):
    print("\n".join(
        [
            "Normalized Mutual Information: %f" % (normalized_mutual_info_score(labels_1, labels_2)),
            "Adjusted Rand Score: %f" % (adjusted_rand_score(labels_1, labels_2)),
            "Homogeneity: %f" % (homogeneity_score(labels_1, labels_2)),
            "Completeness: %f" % (completeness_score(labels_1, labels_2))
        ]
    ))
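A hedged usage sketch, assuming the four sklearn.metrics functions used above are imported at module level; two identical partitions with permuted IDs score 1.0 on all four measures.

cluster_metrics([0, 0, 1, 1], [1, 1, 0, 0])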
Example #14
def normalized_mutual_info_score_scorefunc(X, y):

    scores = []
    pvals = []
    for col in range(X.shape[1]):
        scores.append(normalized_mutual_info_score(X[:, col], y))
        pvals.append(1)

    return np.array(scores), np.array(pvals)
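The (scores, pvals) pair returned above matches what sklearn.feature_selection.SelectKBest expects from a score_func, so, as a hedged sketch (assuming discrete-valued features and the numpy/metrics imports the function itself relies on), it could be plugged in like this:

import numpy as np
from sklearn.feature_selection import SelectKBest

X = np.random.randint(0, 4, size=(200, 6))
y = np.random.randint(0, 2, size=200)
selector = SelectKBest(score_func=normalized_mutual_info_score_scorefunc, k=3)
X_new = selector.fit_transform(X, y)
print(selector.scores_)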
Example #15
def model_metrics(model, X, y, batch_size):
    loss_and_metrics = model.evaluate(X, y, batch_size=batch_size)
    predicted_classes = model.predict_classes(X, batch_size=batch_size)
    predicted_probas = model.predict_proba(X, batch_size=batch_size)

    accuracy = loss_and_metrics[1]
    roc_auc = roc_auc_score(y, predicted_probas)
    nmi = normalized_mutual_info_score(y, predicted_classes.flatten())
    return accuracy, roc_auc, nmi
Example #16
def calc(gr_truth, predicted):
    # precision, recall, fscore, _ = score(gr_truth, predicted, average='micro')
    # print('precision: {}'.format(precision))
    # print('recall: {}'.format(recall))
    # print('fscore: {}'.format(fscore))
    # print('jaccard: {}'.format(jaccard_similarity_score(gr_truth, predicted, normalize=True)))
    # print('mutual: {}'.format(mutual_info_score(gr_truth, predicted)))
    # print('mutual adj: {}'.format(adjusted_mutual_info_score(gr_truth, predicted)))
    # print('mutual norm: {}'.format(normalized_mutual_info_score(gr_truth, predicted)))
    return normalized_mutual_info_score(gr_truth, predicted)
Example #17
def observe_correlations(nrows=1000000):
    training_file = "data/train.csv"
    dataframe = pd.read_csv(training_file,nrows=nrows,header=0)
    info_variable=pd.DataFrame(index=dataframe.columns[2:])
    info_variable["levels"]=dataframe.iloc[:,2:].apply(lambda t: np.unique(t).shape[0])
    info_variable["Entropy"]=dataframe.iloc[:,2:].apply(lambda t: entropy(t))
    info_variable["MutualInformation"]=dataframe.iloc[:,2:].apply(lambda t: metrics.normalized_mutual_info_score(dataframe["click"],t))
    info_variable["clustering"]=cluster_var(dataframe)
    info_variable=info_variable.sort("clustering")
    return info_variable
def mRMR(dfin, target, adjusted = False, n_features_to_select = 10):
    '''
    :param dfin: dataframe of features
    :param target: target (0,1)
    :return:
    '''

    df = dfin.copy()

    final_features = []
    importance = []

    if n_features_to_select > df.shape[1]:
        n_features_to_select = df.shape[1]-1

    #Iteratively create a subset of final features the feature that maximizes
    # MI(C, X_j) - (1/(m-1) * SUM_i (MI(X_i, X_j))
    #where C = the ultimate target, X_i = the feature to test, and X_j = features already selected
    while len(final_features) < n_features_to_select:
        features = np.array([c for c in df.columns.tolist() if c not in final_features])

        if adjusted:
            mi = np.array([adjusted_mutual_info_score(df[f], target) for f in features])
            if len(final_features) > 0:
                mr = np.array([normalized_mutual_info_score(df[x_j], df[x_i]) for x_j in features for x_i in features])
            else:
                mr = np.zeros(len(features))
        else:
            mi = [normalized_mutual_info_score(df[f], target) for f in features]
            if len(final_features) > 0:
                mr = np.array([normalized_mutual_info_score(df[x_j], df[x_i]) for x_j in features for x_i in features])
            else:
                mr = np.zeros(len(features))

        if len(final_features) > 0:
            mr = (1./(len(features) - 1)) * mr.reshape(len(features), len(features)).sum(axis = 1)

        mrmr = mi - mr
        final_features.append(features[mrmr == max(mrmr)][0])
        importance.append(max(mrmr))

    return final_features, importance
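A hypothetical call to mRMR on a small synthetic frame, assuming pandas as pd, numpy as np and the sklearn mutual-information scorers are imported (the function requires them); 'signal' mostly tracks the target, so it should be selected first:

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
target = pd.Series(rng.randint(0, 2, size=300))
dfin = pd.DataFrame({
    'signal': (target + (rng.rand(300) < 0.1)) % 2,   # target with ~10% of labels flipped
    'noise1': rng.randint(0, 2, size=300),
    'noise2': rng.randint(0, 3, size=300),
})
feats, imp = mRMR(dfin, target, n_features_to_select=2)
print(feats, imp)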
def getClusterMetricString(method_name, labels_true, labels_pred):
    '''
    Creates a formatted string containing the method name and acc, nmi metrics - can be used for printing
    :param method_name: Name of the clustering method (just for printing)
    :param labels_true: True label for each sample
    :param labels_pred: Predicted label for each sample
    :return: Formatted string containing metrics and method name
    '''
    acc = cluster_acc(labels_true, labels_pred)
    nmi = metrics.normalized_mutual_info_score(labels_true, labels_pred)
    return '%-50s     %8.3f     %8.3f' % (method_name, acc, nmi)
Example #20
def compare_clusters(nn_clusters, tf_clusters):
    """prints some comparisons"""
    print('~~Adjusted mutual information: {}'.format(
        metrics.adjusted_mutual_info_score(nn_clusters.labels_,
                                           tf_clusters.labels_)))
    print('~~Normalized mutual information: {}'.format(
        metrics.normalized_mutual_info_score(nn_clusters.labels_,
                                             tf_clusters.labels_)))
    print('~~Adjusted Rand Index: {}'.format(
        metrics.adjusted_rand_score(nn_clusters.labels_,
                                    tf_clusters.labels_)))
def test_pipeline_spectral_clustering(seed=36):
    # Test using pipeline to do spectral clustering
    random_state = np.random.RandomState(seed)
    se_rbf = SpectralEmbedding(n_components=n_clusters, affinity="rbf", random_state=random_state)
    se_knn = SpectralEmbedding(
        n_components=n_clusters, affinity="nearest_neighbors", n_neighbors=5, random_state=random_state
    )
    for se in [se_rbf, se_knn]:
        km = KMeans(n_clusters=n_clusters, random_state=random_state)
        km.fit(se.fit_transform(S))
        assert_array_almost_equal(normalized_mutual_info_score(km.labels_, true_labels), 1.0, 2)
  def _augmented_update_medoid_ics_in_place(self, pdists, y_gt, cluster_ics,
                                            medoid_ics, loss_mult):
    for cluster_idx in range(self.n_clusters):
      # y_pred = self._get_cluster_ics(D, medoid_ics)
      # Don't prematurely do the assignment step.
      # Do this after we've updated all cluster medoids.
      y_pred = cluster_ics

      if sum(y_pred == cluster_idx) == 0:
        # Cluster is empty.
        continue

      curr_score = (
          -1.0 * np.sum(
              pdists[medoid_ics[cluster_idx], y_pred == cluster_idx]) +
          loss_mult * (1.0 - metrics.normalized_mutual_info_score(
              y_gt, y_pred)))

      pdist_in = pdists[y_pred == cluster_idx, :]
      pdist_in = pdist_in[:, y_pred == cluster_idx]

      all_scores_fac = np.sum(-1.0 * pdist_in, axis=1)
      all_scores_loss = []
      for i in range(y_pred.size):
        if y_pred[i] != cluster_idx:
          continue
        # remove this cluster's current centroid
        medoid_ics_i = medoid_ics[:cluster_idx] + medoid_ics[cluster_idx + 1:]
        # add this new candidate to the centroid list
        medoid_ics_i += [i]
        y_pred_i = self._get_cluster_ics(pdists, medoid_ics_i)
        all_scores_loss.append(loss_mult * (
            1.0 - metrics.normalized_mutual_info_score(y_gt, y_pred_i)))

      all_scores = all_scores_fac + all_scores_loss
      max_score_idx = np.argmax(all_scores)
      max_score = all_scores[max_score_idx]

      if max_score > curr_score:
        medoid_ics[cluster_idx] = np.where(
            y_pred == cluster_idx)[0][max_score_idx]
Example #23
File: examples.py Project: Procrat/som
def print_cluster_measures(som, test_data, test_labels):
    """Prints some clustering statistics."""

    pred_labels = som.predict(test_data)
    accuracy = metrics.accuracy_score(test_labels, pred_labels)
    adj_rand = metrics.adjusted_rand_score(test_labels, pred_labels)
    adj_mi = metrics.adjusted_mutual_info_score(test_labels, pred_labels)
    norm_mi = metrics.normalized_mutual_info_score(test_labels, pred_labels)
    print('  Accuracy:', accuracy)
    print('  Adjusted rand score:', adj_rand)
    print('  Adjusted mutual info:', adj_mi)
    print('  Normalized mutual info:', norm_mi)
Example #24
def compute_metrics(graph):
    results = []
    results.append(('kmeans',cluster_kmeans(G)))
    results.append(('agglo',cluster_agglomerative(G)))
    results.append(('spectral',cluster_spectral(G)))
    results.append(('affinity',cluster_affinity(G)))

    metric_results = [(name,
                metrics.normalized_mutual_info_score(groundTruth,x),
                metrics.adjusted_rand_score(groundTruth,x)) for name,x in results]

    return metric_results
def ceEvalMutual(cluster_runs, cluster_ensemble = None, verbose = False):
    """Compute a weighted average of the mutual information with the known labels, 
        the weights being proportional to the fraction of known labels.

    Parameters
    ----------
    cluster_runs : array of shape (n_partitions, n_samples)
        Each row of this matrix is such that the i-th entry corresponds to the
        cluster ID to which the i-th sample of the data-set has been classified
        by this particular clustering. Samples not selected for clustering
        in a given round are tagged with NaN.

    cluster_ensemble : array of shape (n_samples,), optional (default = None)
        The identity of the cluster to which each sample of the whole data-set
        belongs according to the consensus clustering.
 
    verbose : Boolean, optional (default = False)
        Specifies if status messages will be displayed
        on the standard output.

    Returns
    -------
    unnamed variable : float
        The weighted average of the mutual information between
        the consensus clustering and the many runs from the ensemble
        of independent clusterings on subsamples of the data-set.
    """

    if cluster_ensemble is None:
        return 0.0

    if reduce(operator.mul, cluster_runs.shape, 1) == max(cluster_runs.shape):
        cluster_runs = cluster_runs.reshape(1, -1)

    weighted_average_mutual_information = 0

    N_labelled_indices = 0

    for i in xrange(cluster_runs.shape[0]):
        labelled_indices = np.where(np.isfinite(cluster_runs[i]))[0]
        N = labelled_indices.size

        x = np.reshape(checkcl(cluster_ensemble[labelled_indices], verbose), newshape = N)
        y = np.reshape(checkcl(np.rint(cluster_runs[i, labelled_indices]), verbose), newshape = N)

        q = normalized_mutual_info_score(x, y)

        weighted_average_mutual_information += q * N
        N_labelled_indices += N

    return float(weighted_average_mutual_information) / N_labelled_indices
def maxNMI(labels,clust):
    max_NMI = 0.0
    
    for i in range(clust.num_leaves):
        clusters = clust[i]
        cluster_labels = np.zeros(len(labels))
        
        for index,cluster_IDs in enumerate(clusters):
            cluster_labels[cluster_IDs] = index
            
        NMI = metrics.normalized_mutual_info_score(labels,cluster_labels)
        if NMI > max_NMI:
            max_NMI = NMI
            
    return max_NMI
Example #27
def explore(a, b, ax, xlabel, ylabel):
    plt.scatter(a, b)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)

    mi = metrics.normalized_mutual_info_score(a, b)
    co = stats.pearsonr(a, b)

    text = "MI = {0:.6}".format(mi)
    text = text + ", Cor = {0:.6}".format(co[0])

    plt.text(0.95, 0.01, text,
             verticalalignment='bottom', horizontalalignment='right',
            transform=ax.transAxes,
            fontsize=15)
Example #28
def mutual_information(x, y, nbins=32, normalized=False):
    """
    Compute mutual information
    :param x: 1D numpy.array : flattened data from an image
    :param y: 1D numpy.array : flattened data from an image
    :param nbins: number of bins used to compute the contingency matrix (only used if normalized=False)
    :return: float, non-negative : mutual information
    """
    from sklearn.metrics import normalized_mutual_info_score, mutual_info_score
    if normalized:
        mi = normalized_mutual_info_score(x, y)
    else:
        c_xy = np.histogram2d(x, y, nbins)[0]
        mi = mutual_info_score(None, None, contingency=c_xy)
    # mi = adjusted_mutual_info_score(None, None, contingency=c_xy)
    return mi
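A hedged usage sketch with two correlated 1-D signals (flattened images would be passed the same way). nbins only matters on the unnormalized, histogram-based path; the normalized path hands the inputs straight to normalized_mutual_info_score, which treats them as discrete labels, so binarized data is used there:

import numpy as np

rng = np.random.RandomState(0)
x = rng.rand(10000)
y = x + 0.05 * rng.randn(10000)
print(mutual_information(x, y, nbins=32))                      # histogram-based MI
print(mutual_information(x > 0.5, y > 0.5, normalized=True))   # NMI on binarized data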
Example #29
    def evaluate(self, labels):
        result = tuple()
        # true_label provided
        if self.n_clusters is not None:
            for label in labels:
                ami = metrics.adjusted_mutual_info_score(self.y, label)
                nmi = metrics.normalized_mutual_info_score(self.y, label)
                vmes = metrics.v_measure_score(self.y, label)
                ari = metrics.adjusted_rand_score(self.y, label)
                result = result + (ami, nmi, vmes, ari)
            ### END - for label
        ### END - if self.y

        else:
            for label in labels:
                result = result + (metrics.silhouette_score(self.X, label), )
        return result
Example #30
def cluster_var(dataframe):
    pd.set_option('expand_frame_repr', False)
    names=[x for x in dataframe.columns if x not in ["click","id","device_id","device_ip"]]
    m=len(names)
    MI=pd.DataFrame(np.zeros(shape=(m,m)),index=names,columns=names)
    for i in range(m):
        for j in range(i+1,m):
            MI.iloc[i,j]=metrics.normalized_mutual_info_score(dataframe.loc[:,names[i]],dataframe.loc[:,names[j]])
    for i in range(m):
        for j in range(i+1):
            if j==i:
                MI.iloc[i,j]=1
            else:
                MI.iloc[i,j]=MI.iloc[j,i]
    clustering=KMeans(n_clusters=10).fit_predict(MI)
    clustering=pd.Series(clustering,index=names)
    return clustering
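A hypothetical call to cluster_var on a tiny synthetic frame, assuming pandas as pd, numpy as np, sklearn.cluster.KMeans and sklearn metrics are imported (the function needs them all). The columns named in its exclusion list are skipped and the remaining twelve are grouped into ten clusters via the pairwise NMI matrix:

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
frame = pd.DataFrame({"click": rng.randint(0, 2, 2000),
                      "id": np.arange(2000),
                      "device_id": rng.randint(0, 100, 2000),
                      "device_ip": rng.randint(0, 100, 2000)})
for k in range(12):
    frame["C%d" % k] = rng.randint(0, 5, 2000)
print(cluster_var(frame))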
Example #31
def myMechanism(n, e):
    a = readData()
    labels = np.zeros(shape=(958, 20))
    labels.dtype = 'int64'
    cluster_centers = np.zeros(shape=(n, 40))
    for i in range(20):
        # print(i)
        clf = KMeans(n_clusters=n, random_state=9)
        y_pred = clf.fit_predict(a[:, i, 2:4])
        labels[:, i] = clf.labels_
        cluster_centers[:, 2 * i:(2 * i + 2)] = clf.cluster_centers_
    newpaths = []
    for i in range(958):
        newpath = ""
        for j in range(20):
            string = "L" + str(labels[i, j])
            newpath += string
        newpaths.append(newpath)
    result = Counter(newpaths)
    newpathsdict = dict(result)

    # Randomly generate extra trajectories to pad the dictionary up to the required count
    while (len(newpathsdict) < 958):
        key = ""
        for i in range(20):
            string = "L" + str(random.randint(0, n - 1))
            key += string
        newpathsdict.setdefault(key, 0)

    # Pad the counts to a power of two, then run the Haar transform and reconstruction
    values = list(newpathsdict.values())
    a = int(math.log(len(values), 2))
    for index in range(int(math.pow(2, a + 1)) - len(values)):
        values.append(0)

    temp = buildHaarTreeList(values)
    # print(len(temp))
    noise = addNoise(len(temp), e)
    c = [temp[i] + noise[i] for i in range(len(temp))]
    noisecounts = rebuildHaarTreeList(c)

    i = 0
    newpathsdict2 = copy.deepcopy(newpathsdict)
    for key, value in newpathsdict.items():
        newpathsdict[key] = noisecounts[i]
        i = i + 1

    valueslist = sorted(list(newpathsdict.values()), reverse=True)
    newpathslist = list_sort_by_value(newpathsdict)

    truecounts = []
    for item in newpathslist:
        truecounts.append(newpathsdict2.get(item))

    # Isotonic regression on truecounts to enforce the (monotone) consistency constraint
    x = np.arange(958)
    y = np.array(truecounts)
    y_ = IsotonicRegression(increasing=False).fit_transform(x, y)

    # print(y.shape)
    NMI = metrics.normalized_mutual_info_score(y_, y)

    mapeyy = mape(y_, y)
    maeyy = metrics.mean_absolute_error(y, y_)
    # hausdorff_distance
    y.resize(1,958)
    y_.resize(1,958)
    # print(y.shape)
    hau_dis = hausdorff_distance(y, y_, distance="euclidean")


    return NMI, hau_dis,mapeyy,maeyy
        ## Expanding the encodings and the cluster centers.
        encodingsExpand = encodings.unsqueeze(1).expand(
            encodings.size(0), 10, 32).cuda()
        clusterCentersExpand = clusterCenters.clone().unsqueeze(0).expand(
            encodings.size(0), 10, 32).cuda()

        ## Computing the distances of the encodings from the cluster centers.
        distMat = torch.pow(encodingsExpand - clusterCentersExpand, 2).sum(2)

        ## Computing the cluster center label having the minimum distance from the particular image encoding.
        _, predClus = torch.min(distMat, dim=1)

        ## Adding the predictions to the global list.
        yPredAll.extend(list(predClus.data.cpu().numpy()))

        ## Adding the true labels to the global list.
        yTrueAll.extend(list(labelBatchV.data.cpu().numpy()))

## Computing the test NMI and Purity.
testPurity = purityScore(yPredAll, yTrueAll)
testNMI = normalized_mutual_info_score(yPredAll, yTrueAll)

print(testPurity)
print(testNMI)

# ## Writing data.
data = [str(int(argList.percentLabData * 100)), str(testPurity), str(testNMI)]
with open('./TestingResults/' + str(argList.dataSet) + '/Test.csv', 'a') as f:
    writer = csv.writer(f)
    writer.writerow(data)
Example #33
import numpy as np
from sklearn import metrics

file1 = file_name_ground_truth
labels_true = np.loadtxt(file1)
file2 = file_name_to_check
labels = np.loadtxt(file2)

n = 70000  # number of points
NMI = metrics.normalized_mutual_info_score(labels,
                                           labels_true,
                                           average_method='arithmetic')

clustered = (labels >= 0)
ratio_points_clustered = np.sum(clustered) / n
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
n_noise = list(labels).count(-1)
print("number of clusters: %d" % n_clusters)
print("number of noise: %d" % n_noise)
print("ratio of points clustered: %0.3f" % ratio_points_clustered)
print("NMI value: %0.3f" % NMI)
def f_mutual_info_score(var1, var2, var1type, var2type):
    values1, values2 = bin_variables(var1, var1type, var2, var2type)
    return metrics.normalized_mutual_info_score(values1, values2)
Example #35
def norm_mut_info(ground_truth, predicted):
    return metrics.normalized_mutual_info_score(ground_truth, predicted)
Example #36
n_clusters = n_classes(gt_labels)

shapelet_lengths = {}
for sz in [int(p * ts_len) for p in [.15, .3, .45]]:
    n_shapelets = int(numpy.log10(ts_len - sz) * ratio_n_shapelets)  # 2, 5, 8, 10
    shapelet_lengths[sz] = n_shapelets
print(dataset, shapelet_lengths, model_class.__name__)

m = model_class(shapelet_lengths, d=data.shape[2], print_loss_every=1000, ada_grad=True, niter=1000,
                print_approx_loss=True)
m.fit(data)
for ikiter in range(nkiter):
    m.partial_fit(data, 1000, (ikiter + 1) * 1000)
    model_fname = "%s%s_%dkiter_%s_%s.model" % (model_path, m.__class__.__name__, ikiter + 1, dataset, oar_job_id)
    m.dump_without_dists(model_fname)

model_fname = "%s%s_final_%s.model" % (model_path, m.__class__.__name__, dataset)
m.dump_without_dists(model_fname)
print("Saved model %s with approximate loss: %f (beta=%f)" % (model_fname, m._loss(data), m.beta))

data_shtr = numpy.empty((data.shape[0], sum(shapelet_lengths.values())))
for i in range(data.shape[0]):
    data_shtr[i] = m._shapelet_transform(data[i])

km = KMeans(n_clusters=n_clusters)
pred_labels = km.fit_predict(data_shtr)
print("Model=%s, dataset=%s, NMI(kmeans)=%f" % (m.__class__.__name__, dataset,
                                                normalized_mutual_info_score(gt_labels, pred_labels)))


def calc_nmi(a, b):
    a = np.around(a, decimals=2)
    b = np.around(b, decimals=2)
    return metrics.normalized_mutual_info_score(a, b)
Example #38
def bench_desom(X_train,
                y_train,
                dataset,
                map_size,
                encoder_dims,
                ae_weights=None):
    print('*** {} - desom with {} map and {} autoencoder (gamma={})***'.format(
        dataset, map_size, encoder_dims, gamma))

    desom = DESOM(encoder_dims=encoder_dims, map_size=map_size)
    save_dir = 'results/benchmark/desom-gamma{}_{}_{}_{}x{}'.format(
        gamma, dataset, optimizer, map_size[0], map_size[1])
    subprocess.run(['mkdir', '-p', save_dir])

    for run in range(n_runs):
        desom.initialize()
        desom.compile(gamma=gamma, optimizer=optimizer)
        if ae_weights is not None:
            desom.load_ae_weights(ae_weights)
        # Weights initialization by randomly sampling training points
        desom.init_som_weights(X_train)
        t0 = time.time()
        desom.fit(X_train, y_train, None, None, iterations, som_iterations,
                  eval_interval, save_epochs, batch_size, Tmax, Tmin, decay,
                  save_dir)
        dt = time.time() - t0
        print('Run {}/{} (took {:f} seconds)'.format(run + 1, n_runs, dt))
        y_pred = desom.predict(X_train)
        pur[run] = cluster_purity(y_train, y_pred)
        nmi[run] = normalized_mutual_info_score(y_train, y_pred)
        ari[run] = adjusted_rand_score(y_train, y_pred)
        acc[run] = cluster_acc(y_train, y_pred)
        duration[run] = dt
        if map_size[0] == 8:
            # Post clustering in latent space
            print('Post-clustering in latent space...')
            prototypes = desom.prototypes
            km_desom = KMeans(n_clusters=np.max(y_train),
                              n_jobs=-1).fit(prototypes)
            km_desom_pred = km_desom.predict(desom.encode(X_train))
            pur_clust[run] = cluster_purity(y_train, km_desom_pred)
            nmi_clust[run] = normalized_mutual_info_score(
                y_train, km_desom_pred)
            ari_clust[run] = adjusted_rand_score(y_train, km_desom_pred)
            acc_clust[run] = cluster_acc(y_train, km_desom_pred)

    name = '{} (K={})'.format(dataset, map_size[0] * map_size[1])
    results.at[name, 'pur'] = pur.mean()
    results.at[name, 'pur_std'] = pur.std()
    results.at[name, 'nmi'] = nmi.mean()
    results.at[name, 'nmi_std'] = nmi.std()
    results.at[name, 'ari'] = ari.mean()
    results.at[name, 'ari_std'] = ari.std()
    results.at[name, 'acc'] = acc.mean()
    results.at[name, 'acc_std'] = acc.std()
    results.at[name, 'duration'] = duration.mean()
    if map_size[0] == 8:
        # Post clustering
        results.at[name, 'pur_clust'] = pur_clust.mean()
        results.at[name, 'pur_clust_std'] = pur_clust.std()
        results.at[name, 'nmi_clust'] = nmi_clust.mean()
        results.at[name, 'nmi_clust_std'] = nmi_clust.std()
        results.at[name, 'ari_clust'] = ari_clust.mean()
        results.at[name, 'ari_clust_std'] = ari_clust.std()
        results.at[name, 'acc_clust'] = acc_clust.mean()
        results.at[name, 'acc_clust_std'] = acc_clust.std()

    print(results.loc[name])
Example #39
def fitmodel(X, pheno, estimator, parameters, modelname, method, performance, times):
    score_store = 0

    kfold = KFold(n_splits=5)
    for train_index, test_index in kfold.split(X, pheno):
        # time how long it takes to train each model type
        start = time.process_time()

        # split data into train/test sets
        X_train = X.iloc[train_index]
        y_train = pheno[train_index]
        X_test = X.iloc[test_index]
        y_test = pheno[test_index]

        # perform grid search to identify best hyper-parameters
        train_time = time.time()
        gs_clf = estimator
        gs_clf.fit(X_train, y_train)
        train_end = time.time() - train_time

        # predict resistance in test set
        predict_time = time.time()
        y_pred = gs_clf.predict(X_test)
        y_pred[y_pred < 0.5] = 0
        y_pred[y_pred > 0.5] = 1
        predict_end = time.time() - predict_time


        eval_time = time.time()
        score = balanced_accuracy_score(y_test, y_pred)
        balanced_accuracy_scoreval = balanced_accuracy_score(y_test, y_pred)
        average_precision_scoreval = average_precision_score(y_test, y_pred)
        brier_score_lossval =  brier_score_loss(y_test, y_pred)
        f1_scoreval =  f1_score(y_test, y_pred)
        precision_scoreval = precision_score(y_test, y_pred)
        recall_scoreval = recall_score(y_test, y_pred)
        jaccard_scoreval = jaccard_score(y_test, y_pred)
        roc_auc_scoreval = roc_auc_score(y_test, y_pred)
        adjusted_mutual_info_scoreval = adjusted_mutual_info_score(y_test, y_pred)
        adjusted_rand_scoreval = adjusted_rand_score(y_test, y_pred)
        completeness_scoreval = completeness_score(y_test, y_pred)
        fowlkes_mallows_scoreval = fowlkes_mallows_score(y_test, y_pred)
        homogeneity_scoreval = normalized_mutual_info_score(y_test, y_pred)
        v_measure_scoreval = v_measure_score(y_test, y_pred)
        explained_variance_scoreval = explained_variance_score(y_test, y_pred)
        max_errorval = max_error(y_test, y_pred)
        mean_absolute_errorval = mean_absolute_error(y_test, y_pred)
        mean_squared_errorval = mean_squared_error(y_test, y_pred)
        mean_squared_log_errorval = mean_squared_log_error(y_test, y_pred)
        median_absolute_errorval = median_absolute_error(y_test, y_pred)
        r2_scoreval = r2_score(y_test, y_pred)
        eval_end = time.time() - eval_time

        # Create classifiers
        gnb = model

        # #############################################################################
        # Plot calibration plots

        plt.figure(figsize=(10, 10))
        ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
        ax2 = plt.subplot2grid((3, 1), (2, 0))

        ax1.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")
        for clf, name in [
                          (gnb, '%s'%algo),

                          ]:
            clf.fit(X_train, y_train)
            if hasattr(clf, "predict_proba"):
                prob_pos = clf.predict_proba(X_test)[:, 1]
            else:  # use decision function
                prob_pos = clf.decision_function(X_test)
                prob_pos = \
                    (prob_pos - prob_pos.min()) / (prob_pos.max() - prob_pos.min())
            fraction_of_positives, mean_predicted_value = \
                calibration_curve(y_test, prob_pos, n_bins=10)

            ax1.plot(mean_predicted_value, fraction_of_positives, "s-",
                     label="%s" % (name,))

            ax2.hist(prob_pos, range=(0, 1), bins=10, label=name,
                     histtype="step", lw=2)

        ax1.set_ylabel("Fraction of positives")
        ax1.set_ylim([-0.05, 1.05])
        ax1.legend(loc="lower right")
        ax1.set_title('Calibration plots  (%s_%s)'% (algo,score))

        ax2.set_xlabel("Mean predicted value")
        ax2.set_ylabel("Count")
        ax2.legend(loc="upper center", ncol=2)

        plt.tight_layout()
        plt.savefig('%s_%s_Calibration_plots.svg' % (algo,score))

        performance = np.append(performance, score)
        method = np.append(method, modelname)
        times = np.append(times, (time.process_time() - start))


        # print("Confusion matrix for this fold")
        # print(algo, 'Accuracy Score:',score*100)
        print('###################################################')

        filename = '%s_Acc: %s.sav' % (algo, score)
        joblib.dump(gs_clf, filename)
        print(filename)

        to_write = "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s`,`%s\n" % (
        balanced_accuracy_scoreval, average_precision_scoreval, brier_score_lossval, f1_scoreval, precision_scoreval,
        recall_scoreval, jaccard_scoreval, roc_auc_scoreval, adjusted_mutual_info_scoreval, adjusted_rand_scoreval, completeness_scoreval,
        fowlkes_mallows_scoreval,
        homogeneity_scoreval, v_measure_scoreval, explained_variance_scoreval, max_errorval, mean_absolute_errorval,
        mean_squared_errorval, mean_squared_log_errorval, median_absolute_errorval, r2_scoreval, algo,train_end,predict_end,eval_end)

        Eval_file.write(str(to_write))

    return gs_clf, method, performance, times
Example #40
def k_means_results(name, A, B, x_label, y_label, colormap):
    X = A[0]
    y = A[1]
    X_test = B[0]
    y_test = B[1]
    h = .02
    n_clusters = 2
    k_means = KMeans(n_clusters=n_clusters)
    start = time.time()
    fit_results = k_means.fit(X)
    end = time.time()
    print 'Fit Time: ' + str(end - start)
    Y_kmeans = k_means.predict(X)

    y_pred = Y_kmeans
    y_true = y

    print 'Train Accuracy Score Default'
    print metrics.accuracy_score(y_true, y_pred)
    y_pred = map(flip, Y_kmeans)
    print 'Train Accuracy Score Flip Labels'
    print metrics.accuracy_score(y_true, y_pred)
    print 'Classification Report'
    print metrics.classification_report(y_true, y_pred)
    print 'Confusion Matrix'
    print metrics.confusion_matrix(y_true, y_pred)
    print 'Completeness Score'
    print metrics.completeness_score(y_true, y_pred)
    print 'Homogeneity Score'
    print metrics.homogeneity_score(y_true, y_pred)
    print 'Homogeneity Completeness V Measured'
    print metrics.homogeneity_completeness_v_measure(y_true, y_pred)
    print 'Mutual Information Score'
    print metrics.mutual_info_score(y_true, y_pred)
    print 'Normalized Mutual Info Score'
    print metrics.normalized_mutual_info_score(y_true, y_pred)
    print 'Silhouette Score'
    print metrics.silhouette_score(X, fit_results.labels_)
    print 'Silhouette Samples'
    print metrics.silhouette_samples(X, fit_results.labels_)
    print 'V Measure Score'
    print metrics.v_measure_score(y_true, y_pred)

    figure_identifier = plt.figure()
    colors = ['yellow', 'cyan']
    if colormap:
        cmap_light = ListedColormap(['#FF3EFA', '#AAFFAA'])
        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                             np.arange(y_min, y_max, h))
        Z = k_means.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
    for i in xrange(len(colors)):
        px = X[:, 0][Y_kmeans == i]
        py = X[:, 1][Y_kmeans == i]
        plt.scatter(px, py, c=colors[i])
    plt.scatter(fit_results.cluster_centers_[0, 0:1],
                fit_results.cluster_centers_[0, 1:2],
                s=100,
                linewidths=4,
                c='orange',
                marker='x')
    plt.scatter(fit_results.cluster_centers_[1, 0:1],
                fit_results.cluster_centers_[1, 1:2],
                s=100,
                linewidths=4,
                c='orange',
                marker='x')
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(name + ' Train Results')
    #    plt.show()
    plt.savefig('figures/' + name.replace(' ', '_') + '_Training_results.png')
    figure_identifier.clf()
    plt.close(figure_identifier)

    print_confusion_matrix('Train', Y_kmeans, y)
    figure_identifier = plt.figure()
    Y_kmeans = k_means.predict(X_test)
    y_pred = Y_kmeans
    y_true = y_test

    print 'Test Accuracy Score Default'
    print metrics.accuracy_score(y_true, y_pred)
    y_pred = map(flip, Y_kmeans)
    print 'Test Accuracy Score Flip Labels'
    print metrics.accuracy_score(y_true, y_pred)
    colors = ['yellow', 'cyan']
    if colormap:
        cmap_light = ListedColormap(['#FF3EFA', '#AAFFAA'])
        x_min, x_max = X_test[:, 0].min() - 1, X_test[:, 0].max() + 1
        y_min, y_max = X_test[:, 1].min() - 1, X_test[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                             np.arange(y_min, y_max, h))
        Z = k_means.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
    for i in xrange(len(colors)):
        px = X_test[:, 0][Y_kmeans == i]
        py = X_test[:, 1][Y_kmeans == i]
        plt.scatter(px, py, c=colors[i])
    plt.scatter(fit_results.cluster_centers_[0, 0:1],
                fit_results.cluster_centers_[0, 1:2],
                s=100,
                linewidths=4,
                c='orange',
                marker='x')
    plt.scatter(fit_results.cluster_centers_[1, 0:1],
                fit_results.cluster_centers_[1, 1:2],
                s=100,
                linewidths=4,
                c='orange',
                marker='x')
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(name + ' Test Results')
    #    plt.show()
    plt.savefig('figures/' + name.replace(' ', '_') + '_Test_results.png')
    print_confusion_matrix('Test', Y_kmeans, y_test)
    figure_identifier.clf()
    plt.close(figure_identifier)
Example #41
sess.run(init_op)
# Training cycle
# Fit training with Backpropagation using batch data.
for epoch in range(n_layers):
    miniData, _ = trainSet_cosine.next_batch(n_batch)
    _, new_cost = sess.run([train_optimizer, cost],
                           feed_dict={
                               x: miniData,
                               mode_train: True
                           })
    #------------------------- End of the Optimization ------------------------------
    # Save the results every 10 epochs.
    # Getting embedded codes and running K-Means.
    ae_codes_cos = sess.run(code, feed_dict={x: data_cos, mode_train: False})
    idx_cos = k_means_(ae_codes_cos, n_clusters)
    ae_nmi_cos = normalized_mutual_info_score(labels_cos, idx_cos)
    ae_nmi_cos = ae_nmi_cos * 100
    results_cos.append(ae_nmi_cos)
    steps_cos.append(epoch)
    loss_cost_cos.append(new_cost)
    print(
        "NMI Score for AE is: {:0.2f} and new cost is: {:0.2f} in {:d} step. ".
        format(ae_nmi_cos, new_cost, epoch))
warnings.filterwarnings('ignore')
plt.figure(figsize=(12, 3.5))
plt.subplot(1, 2, 1)
plt.ylim(-0.5, 100)
plt.plot(steps_cos,
         loss_cost_cos,
         label='Cost Training for Cosine Distance',
         marker='o')
Example #42
    def btnConvert_click(self):
        msgBox = QMessageBox()

        # Linkage
        Linkage = ui.cbLinkage.currentData()
        # Affinity
        Affinity = ui.cbAffinity.currentData()
        # Tree
        Tree = ui.cbTree.currentData()

        # NCluster
        try:
            NCluster = np.int32(ui.txtNCluster.text())
        except:
            msgBox.setText("Number of Cluster is wrong!")
            msgBox.setIcon(QMessageBox.Critical)
            msgBox.setStandardButtons(QMessageBox.Ok)
            msgBox.exec_()
            return False

        # Filter
        try:
            Filter = ui.txtFilter.text()
            if not len(Filter):
                Filter = None
            else:
                Filter = Filter.replace("\'", " ").replace(",", " ").replace(
                    "[", "").replace("]", "").split()
                Filter = np.int32(Filter)
        except:
            print("Filter is wrong!")
            return

        # OutFile
        OutFile = ui.txtOutFile.text()
        if not len(OutFile):
            msgBox.setText("Please enter out file!")
            msgBox.setIcon(QMessageBox.Critical)
            msgBox.setStandardButtons(QMessageBox.Ok)
            msgBox.exec_()
            return False

        OutData = dict()
        OutData["ModelAnalysis"] = "Agglomerative"

        # OutModel
        OutModel = ui.txtOutModel.text()
        if not len(OutModel):
            OutModel = None

        # InFile
        InFile = ui.txtInFile.text()
        if not len(InFile):
            msgBox.setText("Please enter input file!")
            msgBox.setIcon(QMessageBox.Critical)
            msgBox.setStandardButtons(QMessageBox.Ok)
            msgBox.exec_()
            return False
        if not os.path.isfile(InFile):
            msgBox.setText("Input file not found!")
            msgBox.setIcon(QMessageBox.Critical)
            msgBox.setStandardButtons(QMessageBox.Ok)
            msgBox.exec_()
            return False

        InData = io.loadmat(InFile)
        # Data
        if not len(ui.txtData.currentText()):
            msgBox.setText("Please enter Input Train Data variable name!")
            msgBox.setIcon(QMessageBox.Critical)
            msgBox.setStandardButtons(QMessageBox.Ok)
            msgBox.exec_()
            return False

        # Label
        if not len(ui.txtLabel.currentText()):
            msgBox.setText("Please enter Train Input Label variable name!")
            msgBox.setIcon(QMessageBox.Critical)
            msgBox.setStandardButtons(QMessageBox.Ok)
            msgBox.exec_()
            return False

        X = InData[ui.txtData.currentText()]
        L = InData[ui.txtLabel.currentText()][0]

        try:
            if Filter is not None:
                for fil in Filter:
                    # Remove Training Set
                    labelIndx = np.where(L == fil)[0]
                    L = np.delete(L, labelIndx, axis=0)
                    X = np.delete(X, labelIndx, axis=0)
                    print("Class ID = " + str(fil) + " is removed from data.")

            if ui.cbScale.isChecked():
                X = preprocessing.scale(X)
                print("Whole of data is scaled to N(0,1).")
        except:
            print("Cannot load data or label")
            return

        try:
            cls = AgglomerativeClustering(n_clusters=NCluster,
                                          affinity=Affinity,
                                          compute_full_tree=Tree,
                                          linkage=Linkage)
            print("Run Clustering ...")
            PeL = cls.fit_predict(X)
        except Exception as e:
            print(e)
            msgBox = QMessageBox()
            msgBox.setText(str(e))
            msgBox.setIcon(QMessageBox.Critical)
            msgBox.setStandardButtons(QMessageBox.Ok)
            msgBox.exec_()
            return

        OutData["predict"] = PeL

        if OutModel is not None:
            joblib.dump(cls, OutModel)
            print("Model is saved: " + OutModel)

        if ui.cbAverage.isChecked():
            acc = accuracy_score(L, PeL)
            OutData["Accuracy"] = acc
            print("Average                             {:5.2f}".format(acc *
                                                                       100))

        if ui.cbNMI.isChecked():
            NMI = normalized_mutual_info_score(L, PeL)
            OutData["NMI"] = NMI
            print("Normalized Mutual Information (NMI) {:7.6f}".format(NMI))

        if ui.cbRIA.isChecked():
            RIA = adjusted_rand_score(L, PeL)
            OutData["RIA"] = RIA
            print("Rand Index Adjusted (RIA)           {:7.6f}".format(RIA))

        if ui.cbAMI.isChecked():
            AMI = adjusted_mutual_info_score(L, PeL)
            OutData["AMI"] = AMI
            print("Adjusted Mutual Information (AMI)   {:7.6f}".format(AMI))

        print("Saving ...")
        io.savemat(OutFile, mdict=OutData)
        print("DONE.")
        msgBox.setText("Agglomerative Clustering is done.")
        msgBox.setIcon(QMessageBox.Information)
        msgBox.setStandardButtons(QMessageBox.Ok)
        msgBox.exec_()
Example #43
def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')
    print('Bröther may i have some self-lööps')
    n_nodes = FLAGS.n_nodes
    n_clusters = FLAGS.n_clusters
    train_size = FLAGS.train_size
    data_clean, data_dirty, labels = overlapping_gaussians(n_nodes, n_clusters)
    graph_clean = construct_knn_graph(data_clean).todense().A1.reshape(
        n_nodes, n_nodes)

    train_mask = np.zeros(n_nodes, dtype=np.bool)
    train_mask[np.random.choice(np.arange(n_nodes),
                                int(n_nodes * train_size),
                                replace=False)] = True
    test_mask = ~train_mask
    print(f'Data shape: {data_clean.shape}, graph shape: {graph_clean.shape}')
    print(f'Train size: {train_mask.sum()}, test size: {test_mask.sum()}')

    input_features = tf.keras.layers.Input(shape=(2, ))
    input_features_corrupted = tf.keras.layers.Input(shape=(2, ))
    input_graph = tf.keras.layers.Input((n_nodes, ))

    encoder = [GCN(64), GCN(32)]
    model = deep_graph_infomax(
        [input_features, input_features_corrupted, input_graph], encoder)

    def loss(model, x, y, training):
        _, y_ = model(x, training=training)
        return loss_object(y_true=y, y_pred=y_)

    def grad(model, inputs, targets):
        with tf.GradientTape() as tape:
            loss_value = loss(model, inputs, targets, training=True)
            for loss_internal in model.losses:
                loss_value += loss_internal
        return loss_value, tape.gradient(loss_value, model.trainable_variables)

    labels_dgi = tf.concat([tf.zeros([n_nodes, 1]), tf.ones([n_nodes, 1])], 0)
    loss_object = tf.keras.losses.BinaryCrossentropy(from_logits=True)
    optimizer = tf.keras.optimizers.Adam(FLAGS.learning_rate)

    for epoch in range(FLAGS.n_epochs):
        data_corrupted = data_dirty.copy()
        perc_shuffle = np.linspace(1, 0.05, FLAGS.n_epochs)[epoch]
        # perc_shuffle = 1
        rows_shuffle = np.random.choice(np.arange(n_nodes),
                                        int(n_nodes * perc_shuffle))
        data_corrupted_tmp = data_corrupted[rows_shuffle]
        np.random.shuffle(data_corrupted_tmp)
        data_corrupted[rows_shuffle] = data_corrupted_tmp
        loss_value, grads = grad(model,
                                 [data_dirty, data_corrupted, graph_clean],
                                 labels_dgi)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        print('epoch %d, loss: %0.4f, shuffle %0.2f%%' %
              (epoch, loss_value.numpy(), 100 * perc_shuffle))
    representations, _ = model([data_dirty, data_corrupted, graph_clean],
                               training=False)
    representations = representations.numpy()
    clf = LogisticRegression(solver='lbfgs', multi_class='multinomial')
    clf.fit(representations[train_mask], labels[train_mask])
    clusters = clf.predict(representations[test_mask])
    print(
        'NMI:',
        normalized_mutual_info_score(labels[test_mask],
                                     clusters,
                                     average_method='arithmetic'))
    print('Accuracy:', 100 * accuracy_score(labels[test_mask], clusters))
def nmi_score(labels_true, labels_pred):
    return metrics.normalized_mutual_info_score(labels_true, labels_pred)
                                       exch_mat=exch_mat,
                                       w_mat=w_mat,
                                       n_group=n_group,
                                       alpha=alpha,
                                       beta=beta,
                                       kappa=kappa,
                                       init_labels=known_labels)

        # Compute the groups
        algo_group_vec = np.argmax(result_matrix, 1) + 1

        # Restrained results
        rstr_real_group_vec = np.delete(real_group_vec,
                                        indices_for_known_label)
        rstr_algo_group_vec = np.delete(algo_group_vec,
                                        indices_for_known_label)

        # Compute nmi score
        nmi_vec.append(
            normalized_mutual_info_score(rstr_real_group_vec,
                                         rstr_algo_group_vec))

    # Writing results
    nmi_mean = np.mean(nmi_vec)
    nmi_std = np.std(nmi_vec)
    with open(results_file_name, "a") as output_file:
        output_file.write(
            f"{input_file},{sim_tag},{dist_option},{exch_mat_opt},{exch_range},{alpha},{beta},{kappa},"
            f"{known_label_ratio},{n_test},{nmi_mean},{nmi_std * 1.96 / np.sqrt(n_test)}\n"
        )
Example #46
        f.write('True Labels = ')
        f.write(str(labels))
        f.write('Clusters Obtained = ')
        f.write(str(clusters.tolist()))

        if len(clusters) == len(labels):

            f.write('Clusters Obtained = ' + str(np.asarray(labels)))

            f.write("\nResults\n")

            rand_index = metrics.adjusted_rand_score(labels, clusters)
            rand_indexes.append(rand_index)
            print 'rand_index = ' + str(rand_index)
            f.write("Rand Index = " + str(rand_index) + "\n")

            NMI_index = metrics.normalized_mutual_info_score(labels, clusters)
            nmi_indexes.append(NMI_index)
            print 'NMI_index = ' + str(NMI_index)
            f.write("NMI Index = " + str(NMI_index) + "\n")

    if rep > 1:
        f.write("\nFINAL RESULTS\n")
        f.write("Avg Rand Index = " + str(float(sum(rand_indexes)) / rep) +
                "\n")
        f.write("Std Rand Index = " + str(statistics.stdev(rand_indexes)) +
                "\n")
        f.write("Avg NMI Index = " + str(float(sum(nmi_indexes)) / rep) + "\n")
        f.write("Std NMI Index = " + str(statistics.stdev(nmi_indexes)) + "\n")
    f.close()
def mutualInformationOfSymbols(s1, s2, lenP):
    # lenP is unused; NMI is computed directly on the two symbol sequences
    return normalized_mutual_info_score(s1, s2)
digits = load_digits()
data = scale(digits.data)

n_samples, n_features = data.shape
n_digits = len(np.unique(digits.target))
labels_true = digits.target

sample_size = 300
# Compute clustering with MeanShift

# The following bandwidth can be automatically detected using
bandwidth = estimate_bandwidth(data, quantile=0.2, n_samples=500)

ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(data)
labels_pred = ms.labels_
cluster_centers = ms.cluster_centers_

labels_unique = np.unique(labels_pred)
n_clusters_ = len(labels_unique)

print("number of estimated clusters : %d" % n_clusters_)

print('Estimated number of clusters: %d' % n_clusters_)
print("Homogeneity: %0.3f" %
      metrics.homogeneity_score(labels_true, labels_pred))
print("Completeness: %0.3f" %
      metrics.completeness_score(labels_true, labels_pred))
print("NMI: %0.3f" % metrics.normalized_mutual_info_score(
    labels_true, labels_pred, average_method='arithmetic'))
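The average_method='arithmetic' argument pins NMI's normalization to the arithmetic mean of the two label entropies; in older scikit-learn releases the default was the geometric mean, so passing it explicitly keeps scores comparable across versions. A tiny sketch of the difference (illustrative labels only):

from sklearn.metrics import normalized_mutual_info_score

a = [0, 0, 1, 1, 2, 2]
b = [0, 0, 1, 1, 1, 2]
print(normalized_mutual_info_score(a, b, average_method='arithmetic'))
print(normalized_mutual_info_score(a, b, average_method='geometric'))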
示例#49
0
X = data

n_digits = len(np.unique(digits.target))
labels_true = digits.target

# #############################################################################

agg = AgglomerativeClustering(n_clusters=6, linkage="average")
agg.fit(X)
labels = agg.labels_
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)

print("number of estimated clusters : %d" % n_clusters_)
print("Normalized: %0.3f" %
      metrics.normalized_mutual_info_score(labels_true, labels))
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))

print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
# # #############################################################################
# Plot result
import matplotlib.pyplot as plt
from itertools import cycle

plt.figure(1)
plt.clf()

colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
X_2d = PCA(n_components=2).fit_transform(X)  # project to 2-D once, outside the loop
for k, col in zip(range(n_clusters_), colors):
    my_members = labels == k
    plt.plot(X_2d[my_members, 0], X_2d[my_members, 1], col + '.')
plt.show()
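The example above fixes linkage="average" and n_clusters=6; a short sketch of how one might compare linkage strategies on the same data by NMI (an illustrative scan we added, not part of the original project; it reuses X, n_digits and labels_true from above):

from sklearn.cluster import AgglomerativeClustering
from sklearn import metrics

for linkage in ('ward', 'complete', 'average', 'single'):
    pred = AgglomerativeClustering(n_clusters=n_digits, linkage=linkage).fit_predict(X)
    print('%-8s NMI: %0.3f' %
          (linkage, metrics.normalized_mutual_info_score(labels_true, pred)))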
示例#50
0
from sklearn import metrics
from cluster_acc import acc

nmi = metrics.normalized_mutual_info_score(train_label,my_dp.z)
ari = metrics.adjusted_rand_score(train_label,my_dp.z)
label_uni = len(np.unique(my_dp.z))
label_true_uni = len(np.unique(train_label))
    if label_uni <label_true_uni+1:
        acc_value = acc(my_dp.z,train_label)
    else:
        acc_value=0
示例#51
0
train_set, valid_set, test_set = cPickle.load(f)
f.close()

# perform SC on the test set
data_x, data_y = test_set

k = 12
nClass = 500
A = kneighbors_graph(data_x, k)
V = spectral_embedding(A, n_components=10, drop_first=False)
V = V + numpy.absolute(numpy.min(V))
#V = V/numpy.amax(V)
#
km_model = KMeans(n_clusters=nClass)
ypred = km_model.fit_predict(V)
nmi = metrics.normalized_mutual_info_score(data_y, ypred)
print('The NMI is: %.4f' % nmi)
#
V = numpy.float32(V)

f = gzip.open('EVD-test500.pkl.gz', 'wb')
cPickle.dump([(V, data_y), 0, 0], f, protocol=2)
f.close()
#sio.savemat('V_train_10.mat', {'train_x': V, 'train_y': data_y})

#sc = SpectralClustering(n_clusters = nClass, affinity = 'nearest_neighbors', n_neighbors = k)
#sc = SpectralClustering(n_clusters = 10, affinity = 'rbf',gamma = 1, n_neighbors = 10)

#data_x = data_x[0:1000]
#data_y = data_y[0:1000]
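The commented-out lines hint at a more direct route; a minimal sketch of scoring SpectralClustering end-to-end with NMI, using the same k and nClass as above (assumed to still be in scope, and likely slow for nClass this large):

from sklearn.cluster import SpectralClustering
from sklearn import metrics

sc = SpectralClustering(n_clusters=nClass, affinity='nearest_neighbors',
                        n_neighbors=k, assign_labels='kmeans')
ypred_sc = sc.fit_predict(data_x)
print('SpectralClustering NMI: %.4f'
      % metrics.normalized_mutual_info_score(data_y, ypred_sc))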
示例#52
0
def NMI(labels_real, labels_pred):
    nmi = metrics.normalized_mutual_info_score(labels_real, labels_pred)
    return nmi
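A tiny usage sketch showing two properties this wrapper inherits from NMI: it is symmetric in its arguments and unaffected by how clusters happen to be numbered (labels below are illustrative):

from sklearn import metrics

true_labels = [0, 0, 1, 1, 2, 2]
pred_labels = [5, 5, 9, 9, 7, 7]   # same partition, different cluster ids

print(metrics.normalized_mutual_info_score(true_labels, pred_labels))  # 1.0
print(metrics.normalized_mutual_info_score(pred_labels, true_labels))  # identical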
示例#53
0
    del sample
    for i in range(epoch):
        print '\rprocess %d out of %d' % (i, epoch),
        #hdp.process_groups(groups)
        hdp.process_groups(random.sample(groups, batchgroup))
    print '\nfinished'

    labels_true = []
    labels_pred = []
    for l, group in enumerate(groups):
        X = group.data.X
        Y = hdp.predict(X, group).tolist()
        labels_true.extend([l] * len(X))
        labels_pred.extend(Y)
    return labels_true, labels_pred


if __name__ == '__main__':
    fname = sys.argv[1]
    labels_true, labels_pred = hdpcluster(fname)
    print 'CN:\t', len(set(labels_pred))
    print 'ARI:\t', metrics.adjusted_rand_score(labels_true, labels_pred)
    #print 'MI:\t', metrics.mutual_info_score(labels_true, labels_pred)
    print 'AMI:\t', metrics.adjusted_mutual_info_score(labels_true,
                                                       labels_pred)
    print 'NMI:\t', metrics.normalized_mutual_info_score(
        labels_true, labels_pred)
    #print 'HOM:\t', metrics.homogeneity_score(labels_true, labels_pred)
    #print 'COM:\t', metrics.completeness_score(labels_true, labels_pred)
    print 'VME:\t', metrics.v_measure_score(labels_true, labels_pred)
示例#54
0
def K_Means():  #1
    global cluster
    global labels
    cluster = KMeans(n_clusters=110).fit_predict(text)
    print("K-Means NMI:%s" %
          (metrics.normalized_mutual_info_score(labels, cluster)))
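The function above relies on module-level globals (text, labels, cluster); a functionally equivalent sketch that takes its inputs as arguments and returns the score, which is easier to test (the name kmeans_nmi is ours):

from sklearn.cluster import KMeans
from sklearn import metrics

def kmeans_nmi(text, labels, n_clusters=110):
    """Cluster the vectorized documents and return (assignments, NMI)."""
    cluster = KMeans(n_clusters=n_clusters).fit_predict(text)
    return cluster, metrics.normalized_mutual_info_score(labels, cluster)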
示例#55
0
                    p0 = int(pairElement[0].replace('(', ''))
                    pic_new.putpixel((p1, p0), int(256 / (int(label) + 1)))
                    predict_matrix[p0, p1] = label

            index += 1

    pic_new.save(
        "../aftersegmentationimagenew/3000323mock-1000-10k5number5.jpg",
        "JPEG")

    predictList = []
    for i in range(lab_arr.shape[0]):
        for j in range(lab_arr.shape[1]):
            predictList.append(predict_matrix[i][j])
    regionslabel = loadlabel("../imagelabel/3000323.regions.txt")
    layerslabel = loadlabel("../imagelabel/3000323.layers.txt")
    surfaceslabel = loadlabel("../imagelabel/3000323.surfaces.txt")

    mockdata = getsuperpixelData("../mockimagedata/3000323mocksuperPixel.txt")
    pbmlabel = []
    pbmlabel.append(mocklabel)
    result, pbmvalue = computePBM(mockdata, pbmlabel)

    regions_nmi = normalized_mutual_info_score(regionslabel, predictList)
    layers_nmi = normalized_mutual_info_score(layerslabel, predictList)
    surfaces_nmi = normalized_mutual_info_score(surfaceslabel, predictList)

    print "PBM value: %s" % pbmvalue
    print "regions NMI: %s" % regions_nmi
    print "layers NMI: %s" % layers_nmi
    print "surfaces NMI: %s" % surfaces_nmi
示例#56
0
文件: cifar100.py 项目: DramaCow/ABC
                # validation
                if batch_id % val_freq == 0:
                    model.eval()
                    with torch.no_grad():
                        nmi = 0
                        nmi_known = 0
                        for i, (_, X, y, k) in enumerate(val_loader, 1):
                            X = X.to(device)
                            Ap = torch.sigmoid(model(X))
                            kp = (lib.eigengaps(Ap).argmax(-1) + 1).cpu()
                            Ap = Ap.cpu()

                            plabels = torch.tensor([SpectralClustering(int(kp[j]), affinity='precomputed').fit(Ap[j]).labels_ for j in range(batch_size)])
                            plabels_known = torch.tensor([SpectralClustering(int(k[j]), affinity='precomputed').fit(Ap[j]).labels_ for j in range(batch_size)])

                            nmi = nmi + (1 / i) * (np.mean([normalized_mutual_info_score(y[j], plabels[j]) for j in range(batch_size)]) - nmi)
                            nmi_known = nmi_known + (1 / i) * (np.mean([normalized_mutual_info_score(y[j], plabels_known[j]) for j in range(batch_size)]) - nmi_known)
                        writer.add_scalar('Training/ValidationNMI', nmi, val_step)
                        writer.add_scalar('Training/ValidationNMI_Known', nmi_known, val_step)
                        val_step += 1
                    model.train()

                # checkpoint
                if batch_id % checkpoint_freq == 0:
                    print('CHECKPOINT')
                    last_checkpoint = os.path.join(CHECKPOINTDIR, '%s_%d_%09d.ckpt' % (name, epoch, batch_id))
                    torch.save(model.state_dict(), last_checkpoint)
        print('DONE')

    # === evaluation ===
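The validation loop above keeps nmi as a running mean over batches via the incremental update m_i = m_{i-1} + (x_i - m_{i-1}) / i; a small stand-alone sketch of that update (illustrative values):

def running_mean(values):
    """Incremental mean; equivalent to sum(values) / len(values)."""
    mean = 0.0
    for i, x in enumerate(values, 1):
        mean = mean + (x - mean) / i
    return mean

print(running_mean([0.40, 0.50, 0.60]))  # 0.5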
示例#57
0
i = 44
filename = 'data-' + str(i) + '.pkl.gz'
K = (i + 1) * 4
# path = '/home/bo/Data/RCV1/Processed/'
path = 'data/RCV1/Processed/'
dataset = path + filename

#np.random.seed(seed = 1)
## perform KM

train_x, train_y = load_data(dataset)
km_model = KMeans(n_clusters=K, n_init=1)
results_KM = np.zeros((trials, 3))
for i in range(trials):
    ypred = km_model.fit_predict(train_x)
    nmi = metrics.normalized_mutual_info_score(train_y, ypred)
    ari = metrics.adjusted_rand_score(train_y, ypred)
    ac = acc(ypred, train_y)
    results_KM[i] = np.array([nmi, ari, ac])

KM_mean = np.mean(results_KM, axis=0)
KM_std = np.std(results_KM, axis=0)

# perform DCN

#   for RCV1
config_1 = {
    'Init': '',
    'lbd': 0.1,
    'beta': 1,
    'output_dir': 'RCV_results',
def NMI(true_labels, predict_labels):
    return normalized_mutual_info_score(true_labels, predict_labels)
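The KMeans loop above also reports acc(ypred, train_y), a clustering accuracy that first matches predicted cluster ids to true classes; a common way to compute it uses the Hungarian assignment from SciPy (a generic sketch, not necessarily the project's cluster_acc implementation; labels are assumed to be non-negative integers):

import numpy as np
from scipy.optimize import linear_sum_assignment

def cluster_accuracy(y_pred, y_true):
    """Best-match accuracy between cluster ids and class labels."""
    y_pred = np.asarray(y_pred)
    y_true = np.asarray(y_true)
    n = max(y_pred.max(), y_true.max()) + 1
    cost = np.zeros((n, n), dtype=np.int64)
    for p, t in zip(y_pred, y_true):
        cost[p, t] += 1
    row, col = linear_sum_assignment(-cost)   # maximize matched counts
    return cost[row, col].sum() / float(y_pred.size)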
示例#59
0
文件: evaluation.py 项目: jtpils/BCKM
def mutual_info(predict, truth):
    return metrics.normalized_mutual_info_score(truth, predict)
示例#60
0
def DisKmeans(db, update_interval=None):
    from sklearn.cluster import KMeans
    from sklearn.mixture import GMM
    from sklearn.lda import LDA
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import normalized_mutual_info_score
    from scipy.spatial.distance import cdist
    import cPickle
    from scipy.io import loadmat

    if db == 'mnist':
        N_class = 10
        batch_size = 100
        train_batch_size = 256
        X, Y = read_db(db + '_total', True)
        X = np.asarray(X, dtype=np.float64)
        Y = np.asarray(np.squeeze(Y), dtype=np.int32)
        N = X.shape[0]
        img = np.clip((X / 0.02), 0, 255).astype(np.uint8).reshape(
            (N, 28, 28, 1))
    elif db == 'stl':
        N_class = 10
        batch_size = 100
        train_batch_size = 256
        img = read_db('stl_img', False)[0]
        img = img.reshape((img.shape[0], 96, 96, 3))
        X, Y = read_db(db + '_total', True)
        X = np.asarray(X, dtype=np.float64)
        Y = np.asarray(np.squeeze(Y), dtype=np.int32)
        N = X.shape[0]
    elif db == 'reuters':
        N_class = 4
        batch_size = 100
        train_batch_size = 256
        Y = np.fromfile('reuters.npy', dtype=np.int64)
        N = Y.shape[0]
    elif db == 'reutersidf':
        N_class = 4
        batch_size = 100
        train_batch_size = 256
        Y = np.load('reutersidf.npy')
        N = Y.shape[0]
    elif db == 'reuters10k' or db == 'reutersidf10k':
        N_class = 4
        batch_size = 100
        train_batch_size = 256
        X, Y = read_db(db + '_total', True)
        X = np.asarray(X, dtype=np.float64)
        Y = np.asarray(np.squeeze(Y), dtype=np.int32)
        N = X.shape[0]

    tmm_alpha = 1.0
    total_iters = (N - 1) / train_batch_size + 1
    if not update_interval:
        update_interval = total_iters
    Y_pred = np.zeros((Y.shape[0]))
    iters = 0
    seek = 0
    dim = 10

    acc_list = []

    while True:
        write_net(db, dim, N_class, "'{:08}'".format(0))
        if iters == 0:
            write_db(np.zeros((N, N_class)), np.zeros((N, )), 'train_weight')
            ret, net = extract_feature(
                'net.prototxt', 'exp/' + db + '/save_iter_100000.caffemodel',
                ['output'], N, True, 0)
            feature = ret[0].squeeze()

            gmm_model = TMM(N_class)
            gmm_model.fit(feature)
            net.params['loss'][0].data[0,
                                       0, :, :] = gmm_model.cluster_centers_.T
            net.params['loss'][1].data[0, 0, :, :] = 1.0 / gmm_model.covars_.T
        else:
            ret, net = extract_feature('net.prototxt', 'init.caffemodel',
                                       ['output'], N, True, 0)
            feature = ret[0].squeeze()

            gmm_model.cluster_centers_ = net.params['loss'][0].data[0,
                                                                    0, :, :].T

        Y_pred_last = Y_pred
        Y_pred = gmm_model.predict(feature).squeeze()
        acc, freq = cluster_acc(Y_pred, Y)
        acc_list.append(acc)
        nmi = normalized_mutual_info_score(Y, Y_pred)
        print freq
        print freq.sum(axis=1)
        print 'acc: ', acc, 'nmi: ', nmi
        print(Y_pred != Y_pred_last).sum() * 1.0 / N
        if (Y_pred != Y_pred_last).sum() < 0.001 * N:
            print acc_list
            return acc, nmi
        time.sleep(1)

        write_net(db, dim, N_class, "'{:08}'".format(seek))
        weight = gmm_model.transform(feature)

        weight = (weight.T / weight.sum(axis=1)).T
        bias = (1.0 / weight.sum(axis=0))
        bias = N_class * bias / bias.sum()
        weight = (weight**2) * bias
        weight = (weight.T / weight.sum(axis=1)).T
        print weight[:10, :]
        write_db(weight, np.zeros((weight.shape[0], )), 'train_weight')

        net.save('init.caffemodel')
        del net

        with open('solver.prototxt', 'w') as fsolver:
            fsolver.write("""net: "net.prototxt"
base_lr: 0.01
lr_policy: "step"
gamma: 0.1
stepsize: 100000
display: 10
max_iter: %d
momentum: 0.9
weight_decay: 0.0000
snapshot: 100
snapshot_prefix: "exp/test/save"
snapshot_after_train:true
solver_mode: GPU
debug_info: false
sample_print: false
device_id: 0""" % update_interval)
        os.system(
            'caffe train --solver=solver.prototxt --weights=init.caffemodel')
        shutil.copyfile('exp/test/save_iter_%d.caffemodel' % update_interval,
                        'init.caffemodel')

        iters += 1
        seek = (seek + train_batch_size * update_interval) % N
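The re-weighting step inside the loop above (square the soft assignments, divide by the per-cluster soft frequency, then re-normalize each row) is the self-training target distribution used in DEC-style deep clustering; a compact NumPy sketch of just that transform, equivalent up to the final row normalization (the function name is ours):

import numpy as np

def target_distribution(q):
    """Sharpen soft assignments q (n_samples x n_clusters) into training targets."""
    q = q / q.sum(axis=1, keepdims=True)          # row-normalize
    f = q.sum(axis=0)                             # per-cluster soft frequency
    p = (q ** 2) / f
    return p / p.sum(axis=1, keepdims=True)       # row-normalize again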