def my_dist(x, y):
    # Bottleneck distance between two persistence diagrams
    x = [list(x[0][item]) for item in range(len(x))]
    y = [list(y[0][item]) for item in range(len(y))]

    return gudhi.bottleneck_distance(x, y)
Example #2
def test_basic_bottleneck():
    diag1 = [[2.7, 3.7], [9.6, 14.0], [34.2, 34.974], [3.0, float("Inf")]]
    diag2 = [[2.8, 4.45], [9.5, 14.1], [3.2, float("Inf")]]

    assert gudhi.bottleneck_distance(diag1, diag2) == 0.75
    assert gudhi.bottleneck_distance(diag1, diag2,
                                     0.1) == pytest.approx(0.75, abs=0.1)
    assert gudhi.hera.bottleneck_distance(diag1, diag2, 0) == 0.75
    assert gudhi.hera.bottleneck_distance(diag1, diag2,
                                          0.1) == pytest.approx(0.75, rel=0.1)
def runExperimentWasserstein(w):
    #preprocessing
    dowjones_lr = getLogReturns("DowJones.csv")
    nasdaq_lr = getLogReturns("Nasdaq.csv")
    russel_lr = getLogReturns("Russell2000.csv")
    sp500_lr = getLogReturns("SP500.csv")
    dates = getDates("DowJones.csv")[w - 1:]

    #point clouds
    pointClouds = computePointClouds(w, dowjones_lr, nasdaq_lr, russel_lr,
                                     sp500_lr)

    #diagrams
    dgms = computePersistenceDiagrams(pointClouds)

    #convert the gudhi format into an array
    converted_dgms = [convert_dgm(dgm, 1) for dgm in dgms]

    #list of consecutive distances
    consecutive_bot = [
        gudhi.bottleneck_distance(converted_dgms[i], converted_dgms[i + 1])
        for i in range(len(converted_dgms) - 1)
    ]
    consecutive_bot_smoothed = smooth_curve(consecutive_bot, 10)

    consecutive_wass = [
        gudhi.wasserstein.wasserstein_distance(np.array(converted_dgms[i]),
                                               np.array(converted_dgms[i + 1]))
        for i in range(len(converted_dgms) - 1)
    ]
    consecutive_wass_smoothed = smooth_curve(consecutive_wass, 10)

    #plot
    plt.plot(consecutive_bot[2000:3100])
    plt.title(
        "Consecutive bottleneck distance between persistence diagram before the 2000 financial crisis (w="
        + str(w) + ")" + "\n \n" + str(dates[2000]) + " to " +
        str(dates[3100]))
    plt.show()

    plt.plot(consecutive_bot_smoothed[2000:3100])
    plt.title(
        "Consecutive bottleneck distance (smoothed) between persistence diagram before the 2000 financial crisis (w="
        + str(w) + ")" + "\n \n" + str(dates[2000]) + " to " +
        str(dates[3100]))
    plt.show()

    plt.plot(consecutive_wass[2000:3100])
    plt.title(
        "Consecutive Wasserstein distance between persistence diagram before the 2000 financial crisis (w="
        + str(w) + ")" + "\n \n" + str(dates[2000]) + " to " +
        str(dates[3100]))
    plt.show()

    plt.plot(consecutive_wass_smoothed[2000:3100])
    plt.title(
        "Consecutive Wasserstein distance distance (smoothed) between persistence diagram before the 2000 financial crisis (w="
        + str(w) + ")" + "\n \n" + str(dates[2000]) + " to " +
        str(dates[3100]))
    plt.show()
Example #4
def bottleneck(diag1, diag2):
    # Split each diagram's points by homology dimension (0 or 1);
    # each entry of a diagram is a (dimension, (birth, death)) pair
    diag1_by_dim = [[], []]
    diag2_by_dim = [[], []]
    for point in diag1:
        diag1_by_dim[point[0]].append(point[1])
    for point in diag2:
        diag2_by_dim[point[0]].append(point[1])
    # Bottleneck distance between the 0-dimensional diagrams
    b0 = gd.bottleneck_distance(diag1_by_dim[0], diag2_by_dim[0])
    # Bottleneck distance between the 1-dimensional diagrams
    b1 = gd.bottleneck_distance(diag1_by_dim[1], diag2_by_dim[1])
    return array([b0, b1])
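
# A minimal usage sketch with hypothetical toy diagrams in GUDHI's
# persistence() output format, i.e. (dimension, (birth, death)) pairs;
# assumes `import gudhi as gd` and `from numpy import array` as above.
diag_a = [(0, (0.0, 1.0)), (0, (0.2, 0.5)), (1, (0.6, 0.9))]
diag_b = [(0, (0.0, 1.1)), (1, (0.5, 0.8))]
print(bottleneck(diag_a, diag_b))  # per-dimension distances [b0, b1]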
def save_pairwise_btnck_dist(dgms, save_path, dimension):
    """Computes the bottleneck distances between each pair of diagrams.
    
    Parameters
    ----------
    dgms : array
        Persistence diagrams.
    save_path : string
        Path of the directory where to save the .csv file containing pairwise 
        bottleneck distances.
    dimension : int
        Dimension of the persistence diagrams to consider.

    """
    # Number of observations
    nb_individuals = dgms.shape[0]

    # Initialisation of the matrix to save
    pairwise_btnck_dist = np.zeros((nb_individuals, nb_individuals))

    # Compute pairwise bottleneck distances between i and j
    for i in range(nb_individuals):
        diag_i = dgms[i]
        for j in range(i + 1, nb_individuals):
            diag_j = dgms[j]
            bottleneck_dist = gudhi.bottleneck_distance(diag_i, diag_j)
            pairwise_btnck_dist[i][j] = bottleneck_dist
            pairwise_btnck_dist[j][i] = bottleneck_dist

    # Save as .csv under the name save
    name_file = os.path.join(save_path, "pairwise_btnck_dist_dim{}.csv"\
                             .format(dimension))
    pd.DataFrame(pairwise_btnck_dist).to_csv(name_file)
    print("Successfully saved in {}!".format(name_file))
Example #6
def compute_distance(self):
    start_time = time.time()
    node_dist, sent_dist = self.distance_matrix()
    diag_node = self.compute_diagram(node_dist, homology_dim=1)
    diag_sent = self.compute_diagram(sent_dist, homology_dim=1)
    print("Filtration graph computed in: %.3f" % (time.time() - start_time))
    return min([gudhi.bottleneck_distance(x, y, e=0) for (x, y) in zip(diag_node, diag_sent)])

def computeBottleNeckDistance(self, instance, *args, **kwargs):
    import gudhi
    import numpy as np
    min_persistence = kwargs.get('min_persistence', 0)
    nPoints = kwargs.get('nPoints', self.nPoints)
    targetCluster = kwargs.get('targetCluster', [1])
    persistence = instance.computeSimplex(
        nPoints=nPoints, targetCluster=targetCluster)
    if persistence == []:
        return -1
    persistence.persistence(min_persistence=min_persistence)
    compareTo = self.computeSimplex(
        nPoints=nPoints, targetCluster=targetCluster)
    if compareTo == []:
        return -1
    compareTo.persistence(min_persistence=min_persistence)
    maxDistance = 0
    for dim in range(self.dimension):
        persistence_intervals = np.sqrt(
            persistence.persistence_intervals_in_dimension(dim))
        compareToIntervals = np.sqrt(
            compareTo.persistence_intervals_in_dimension(dim))
        bottleneck_distance = gudhi.bottleneck_distance(
            persistence_intervals, compareToIntervals)
        maxDistance = max(maxDistance, bottleneck_distance)
    return maxDistance
Example #8
def _bottleneck(self, diag1, diag2):
    """
    Compute the bottleneck distance between diag1 and diag2.

    :param diag1: persistence diagram for the shape data, in list form
    :param diag2: persistence diagram for the cup dataset with novelty data and shape data, in list form
    :return: bottleneck distance
    """
    return gudhi.bottleneck_distance(diag1, diag2, self.e)
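
# Note on the third argument: in gudhi.bottleneck_distance, `e` is the tolerated
# additive error. e=0 requests the exact (more expensive) computation, while a
# positive e trades precision for speed.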
Example #9
def get_pairwise_bottleneck_distance_matrix(persistence_diagrams):
    n = len(persistence_diagrams)  # size the matrix to the number of diagrams
    distance_matrix = np.zeros((n, n))
    for i, pd1 in enumerate(tqdm(persistence_diagrams)):
        for j, pd2 in enumerate(persistence_diagrams[i + 1:], i + 1):
            distance_matrix[i, j] = distance_matrix[j, i] = gd.bottleneck_distance(pd1, pd2)

    return distance_matrix
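
# A minimal usage sketch with hypothetical diagrams; assumes
# `import numpy as np`, `import gudhi as gd` and `from tqdm import tqdm`.
diagrams = [np.array([[0.0, 1.0]]), np.array([[0.0, 0.7], [0.2, 0.9]])]
print(get_pairwise_bottleneck_distance_matrix(diagrams))  # symmetric 2 x 2 matrix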
Example #10
    def transform(self, X):
        """
        Compute all bottleneck distances between the persistence diagrams that were stored after calling the fit() method, and a given list of (possibly different) persistence diagrams.

        Parameters:
            X (list of n x 2 numpy arrays): input persistence diagrams.

        Returns:
            Xfit (numpy array of shape (number of diagrams in **diagrams**) x (number of diagrams in X)): matrix of pairwise bottleneck distances.
        """
        num_diag1 = len(X)

        try:
            from gudhi import bottleneck_distance

            if len(self.diagrams_) == len(X) and np.all([
                    np.array_equal(self.diagrams_[i], X[i])
                    for i in range(len(X))
            ]):
                matrix = np.zeros((num_diag1, num_diag1))

                for i in range(num_diag1):
                    #sys.stdout.write( str(i*1.0 / num_diag1) + "\r")
                    for j in range(i + 1, num_diag1):
                        matrix[i, j] = bottleneck_distance(
                            X[i], X[j], self.epsilon)
                        matrix[j, i] = matrix[i, j]

            else:
                num_diag2 = len(self.diagrams_)
                matrix = np.zeros((num_diag1, num_diag2))

                for i in range(num_diag1):
                    #sys.stdout.write( str(i*1.0 / num_diag1) + "\r")
                    for j in range(num_diag2):
                        matrix[i, j] = bottleneck_distance(
                            X[i], self.diagrams_[j], self.epsilon)

            Xfit = matrix

            return Xfit
        except ImportError:
            print(
                "This function is not available, you may be missing gudhi bottleneck_distance."
            )
Example #11
def computeBottleneckDist(persis1, persis2, e=0):
    # gudhi's bottleneck distance takes lists of [birth, death] pairs,
    # without the homology dimension that persistence() prepends
    persis1list = [[pair[1][0], pair[1][1]] for pair in persis1]
    persis2list = [[pair[1][0], pair[1][1]] for pair in persis2]
    return gd.bottleneck_distance(persis1list, persis2list, e)
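
# A minimal usage sketch with hypothetical input in GUDHI's persistence()
# output format, i.e. (dim, (birth, death)) pairs; assumes `import gudhi as gd`.
p1 = [(0, (0.0, 1.0)), (1, (0.4, 0.8))]
p2 = [(0, (0.0, 1.2)), (1, (0.5, 0.7))]
print(computeBottleneckDist(p1, p2))          # exact (e=0)
print(computeBottleneckDist(p1, p2, e=0.05))  # approximate within 0.05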
Example #12
    def transform(self, X):
        """
        Compute all bottleneck distances between the persistence diagrams that were stored after calling the fit() method, and a given list of (possibly different) persistence diagrams.

        Parameters:
            X (list of n x 2 numpy arrays): input persistence diagrams.

        Returns:
            Xfit (numpy array of shape (number of diagrams in **diagrams**) x (number of diagrams in X)): matrix of pairwise bottleneck distances.
        """
        num_diag1 = len(X)

        if len(self.diagrams_) == len(X) and np.all(
            [np.array_equal(self.diagrams_[i], X[i]) for i in range(len(X))]):
            matrix = np.zeros((num_diag1, num_diag1))

            if USE_GUDHI:
                for i in range(num_diag1):
                    #sys.stdout.write( str(i*1.0 / num_diag1) + "\r")
                    for j in range(i + 1, num_diag1):
                        matrix[i, j] = bottleneck_distance(
                            X[i], X[j], self.epsilon)
                        matrix[j, i] = matrix[i, j]
            else:
                print("Gudhi required---returning null matrix")

        else:
            num_diag2 = len(self.diagrams_)
            matrix = np.zeros((num_diag1, num_diag2))

            if USE_GUDHI:
                for i in range(num_diag1):
                    #sys.stdout.write( str(i*1.0 / num_diag1) + "\r")
                    for j in range(num_diag2):
                        matrix[i, j] = bottleneck_distance(
                            X[i], self.diagrams_[j], self.epsilon)
            else:
                print("Gudhi required---returning null matrix")

        Xfit = matrix

        return Xfit
Example #13
def getDistMat(listDiag):
    n = len(listDiag)
    dist_mat = np.zeros((n, n))
    for i in range(n):
        for j in range(i):
            db = gd.bottleneck_distance(listDiag[i], listDiag[j])
            dist_mat[i, j] = db
            dist_mat[j, i] = db
    return dist_mat
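
# A minimal usage sketch with hypothetical diagrams; assumes
# `import numpy as np` and `import gudhi as gd`.
diags = [np.array([[0.0, 1.0]]),
         np.array([[0.0, 0.7], [0.2, 0.9]]),
         np.array([[0.1, 0.6]])]
print(getDistMat(diags))  # symmetric 3 x 3 matrix with zero diagonal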
Example #14
def compute_Bottleneck_distance(M, N, maxdim, dims):
    C = plot_persistence_diagram(M, maxdim)
    D = plot_persistence_diagram(N, maxdim)

    C1 = []
    for x, y in C:
        if sum(dims == x) > 0:
            C1.append(y)
    D1 = []
    for x, y in D:
        if sum(dims == x) > 0:
            D1.append(y)
    bottleneck_distance = gd.bottleneck_distance(C1, D1)
    return bottleneck_distance
def test_basic_bottleneck():
    diag1 = [[2.7, 3.7], [9.6, 14.0], [34.2, 34.974], [3.0, float("Inf")]]
    diag2 = [[2.8, 4.45], [9.5, 14.1], [3.2, float("Inf")]]

    assert gudhi.bottleneck_distance(diag1, diag2) == 0.75
    assert gudhi.bottleneck_distance(diag1, diag2,
                                     0.1) == pytest.approx(0.75, abs=0.1)
    assert gudhi.hera.bottleneck_distance(diag1, diag2, 0) == 0.75
    assert gudhi.hera.bottleneck_distance(diag1, diag2,
                                          0.1) == pytest.approx(0.75, rel=0.1)

    import numpy as np

    # Translating both diagrams along the diagonal should not affect the distance,
    # checks that negative numbers are not an issue
    diag1 = np.array(diag1) - 100
    diag2 = np.array(diag2) - 100

    assert gudhi.bottleneck_distance(diag1, diag2) == 0.75
    assert gudhi.bottleneck_distance(diag1, diag2,
                                     0.1) == pytest.approx(0.75, abs=0.1)
    assert gudhi.hera.bottleneck_distance(diag1, diag2, 0) == 0.75
    assert gudhi.hera.bottleneck_distance(diag1, diag2,
                                          0.1) == pytest.approx(0.75, rel=0.1)
Example #16
    def compute_distribution(self, X, N=100):
        """
        Compute a bootstrap distribution of bottleneck distances. More specifically, subsample the input point cloud or distance matrix, compute the Mapper with the same parameters on this subsample, and compare its extended persistence diagrams with the original ones.

        Parameters:
            X (numpy array of shape (num_points) x (num_coordinates) if point cloud and (num_points) x (num_points) if distance matrix): input point cloud or distance matrix.
            N (int): bootstrap iterations (default 100).

        Returns:
            distribution: list of bottleneck distance values.
        """
        num_pts, distribution = len(X), []
        dgm = self.compute_persistence_diagrams()

        for bootstrap_id in range(N):

            print(str(bootstrap_id) + "th iteration")

            # Randomly select points
            idxs = np.random.choice(num_pts, size=num_pts, replace=True)
            Xboot = X[idxs, :] if self.input == "point cloud" else X[
                idxs, :][:, idxs]
            f_boot, c_boot = self.filters[idxs, :], self.colors[idxs, :]
            Mboot = self.__class__(filters=f_boot,
                                   filter_bnds=self.filter_bnds,
                                   colors=c_boot,
                                   resolutions=self.resolutions,
                                   gains=self.gains,
                                   inp=self.input,
                                   clustering=self.clustering).fit(Xboot)

            # Compute the corresponding persistence diagrams
            dgm_boot = Mboot.compute_persistence_diagrams()

            # Compute the bottleneck distances between them and keep the maximum
            df = 0.
            for i in range(len(dgm)):
                npts, npts_boot = len(dgm[i]), len(dgm_boot[i])
                D1 = np.array([[dgm[i][pt][1][0], dgm[i][pt][1][1]]
                               for pt in range(npts) if dgm[i][pt][0] <= 1])
                D2 = np.array([[dgm_boot[i][pt][1][0], dgm_boot[i][pt][1][1]]
                               for pt in range(npts_boot)
                               if dgm_boot[i][pt][0] <= 1])
                bottle = gd.bottleneck_distance(D1, D2)
                df = max(df, bottle)
            distribution.append(df)

        return np.sort(distribution)
    def get_bottleneck_distance(self, w_size):
        """Compute bottleneck distance of persistence landscape

        Parameters:
            w_size: size of the windows for the landscapes computations

        Returns:
            bottleneck: bottleneck distance
        """
        last_df = self.load_dataset(f'w{w_size}_bottleneck')
        if last_df is not None:
            bottleneck = last_df['bottleneck'].values.reshape(-1)
            return bottleneck

        length = self.df.shape[0] - w_size
        bottleneck = np.zeros(self.df.shape[0])
        prev_diagram_b = None

        message = f"Compute the bottleneck distance for a window of {w_size} " \
                f"on {length} points\n"
        sys.stdout.write(message + "-" * (len(message) - 1) + '\n')
        sys.stdout.flush()
        pb = ProgressBar(total=length)

        for idx in range(length):
            array_window = self.df.iloc[idx:idx + w_size, :].values
            rips_complex = gd.RipsComplex(points=array_window)
            simplex_tree = rips_complex.create_simplex_tree(max_dimension=2)
            current_diagram = simplex_tree.persistence(min_persistence=0)

            current_diagram_b = []
            for i in range(len(current_diagram)):
                if current_diagram[i][0] == 1:
                    current_diagram_b.append(
                        [current_diagram[i][1][0], current_diagram[i][1][1]])

            if prev_diagram_b is not None:
                dist = gd.bottleneck_distance(current_diagram_b,
                                              prev_diagram_b)
                bottleneck[idx + w_size] = dist
            prev_diagram_b = current_diagram_b
            next(pb)

        df = pd.DataFrame({'bottleneck': bottleneck}, columns=['bottleneck'])
        self.save_dataset(df, f'w{w_size}_bottleneck')
        return bottleneck
Example #18
def get_pairwise_bottleneck(barcode_list, barcode_key,
                            diag_val=np.nan,label_dict=None,
                            return_trial_and_label_dist=False):
    """
    Return pairwise bottle-neck distance of barcode list 
    barcode_list: barcode list, each element is a dictionary containing many dimensions as keys
    barcode_key: barcode key to access homology dimension 
    diag_val: [default: `np.nan`] diagonal values for the pairwise matrix  
    label_dict: [default: None] label dictionary of indices, 
                if not None, will process mean distance across different classes/labels 
    return_trial_and_label_dist: [default: False] if True will return both trial and label distance matrices, 
                else will only return the label distances 
    """
    n = len(barcode_list)
    pw_BN = np.empty((n,n))
    pw_BN[:] = diag_val
    for i in tqdm(range(n-1),desc='processing %s' %(barcode_key)):
        bars_i = barcode_list[i][barcode_key]
        for j in range(i+1,n):
            bars_j = barcode_list[j][barcode_key]
            pw_BN[i,j] = gd.bottleneck_distance(bars_i,bars_j)
            pw_BN[j,i] = pw_BN[i,j] 
            
    if label_dict is None:
        return pw_BN
    
    keys = list(label_dict.keys())  
    num_keys = len(keys)
    pw_labels = np.zeros((num_keys,num_keys))
    for i in range(num_keys):
        ind_ith = label_dict[keys[i]]
        for j in range(i,num_keys):
            ind_jth = label_dict[keys[j]]
            pw_labels[i,j] = np.nanmean(pw_BN[ind_ith,:][:,ind_jth])
            pw_labels[j,i] = pw_labels[i,j]
    
    if return_trial_and_label_dist: 
        return {'trial_mat': pw_BN, 'label_mat': pw_labels, 'labels': keys}
    else:
        return {'label_mat': pw_labels, 'labels': keys}

def my_dist(x, y):
    x = [list(x[0][item]) for item in range(len(x))]
    y = [list(y[0][item]) for item in range(len(y))]

    return gudhi.bottleneck_distance(x, y)
Example #20
def bottleneck_distance_from_graph(graph0, graph1, weights0, weights1, down=True, shift=False):
    """
    Computes the bottleneck distance between graphs with node weights

    :param graph0, graph1: networkx Graph objects, nodes must be 0-indexed
    :param weights0, weights1: filtration values for nodes
    :param down: Whether to filter downwards from 1 by weight
    :param shift: Shift the barcodes so that the first class appears at 0
    """
    adjacency_graph0 = graph0
    adjacency_graph1 = graph1
    if min(list(graph0.nodes)+list(graph1.nodes)) != 0:
        raise ValueError("Graph must be 0-indexed!")
    #get a shift value if necessary
    if down and shift:
        shift0 = 1-max(weights0)
        shift1 = 1-max(weights1)
    elif shift:
        shift0 = 0-min(weights0)
        shift1 = 0-min(weights1)
    if not shift:
        shift0=0
        shift1=0
    #get weights in order and shifted
    weights0 = np.asarray(weights0, dtype=float)
    weights1 = np.asarray(weights1, dtype=float)
    if down:
        new_weights0 = 1 - weights0 + shift0
        new_weights1 = 1 - weights1 + shift1
    else:
        new_weights0 = weights0 + shift0
        new_weights1 = weights1 + shift1
    #generate filtered complex for partition0
    spCpx0 = gd.SimplexTree()
    for node in adjacency_graph0.nodes:
        spCpx0.insert([node])
    for edge in adjacency_graph0.edges:
        spCpx0.insert(list(edge))
    zero_skeleton = spCpx0.get_skeleton(0)
    for j in zero_skeleton:
        spCpx0.assign_filtration(
            j[0], filtration=new_weights0[j[0][0]])
    spCpx0.make_filtration_non_decreasing()
    #generate filtered complex for partition1
    spCpx1 = gd.SimplexTree()
    for node in adjacency_graph1.nodes:
        spCpx1.insert([node])
    for edge in adjacency_graph1.edges:
        spCpx1.insert(list(edge))
    zero_skeleton = spCpx1.get_skeleton(0)
    for j in zero_skeleton:
        spCpx1.assign_filtration(
            j[0], filtration=new_weights1[j[0][0]])
    spCpx1.make_filtration_non_decreasing()
    #compute persistent homology
    spCpx0.persistence()
    spCpx1.persistence()
    #compute the bottleneck distance between the dimension-0 diagrams
    I0 = spCpx0.persistence_intervals_in_dimension(0)
    I1 = spCpx1.persistence_intervals_in_dimension(0)

    return gd.bottleneck_distance(I0, I1)
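
# A minimal usage sketch on hypothetical toy graphs; assumes
# `import networkx as nx`, `import numpy as np` and `import gudhi as gd`.
g0 = nx.path_graph(4)       # nodes 0..3 in a line
g1 = nx.cycle_graph(4)      # nodes 0..3 in a cycle
w0 = [0.2, 0.9, 0.5, 0.7]   # one filtration value per node
w1 = [0.1, 0.8, 0.6, 0.4]
print(bottleneck_distance_from_graph(g0, g1, w0, w1, down=True, shift=False))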
Example #21
def evaluate_significance(dgm,
                          bnd,
                          X,
                          M,
                          func,
                          params,
                          topo_type="loop",
                          threshold=.9,
                          N=1000,
                          input="point cloud"):
    """
	Evaluate the statistical significance of each topological structure of a Mapper graph with bootstrap.

	Parameters:
		dgm (list of tuple (dimension, (vb, vd))): list containing the dimension and the coordinates of each topological structure.
		bnd (list of list of int): data points corresponding to each topological structure.
		X (numpy array of shape n x d if point cloud and n x n if distance matrix): input point cloud or distance matrix.
		M (mapper graph): Mapper (as computed by sklearn_tda).
		func (list): function used to compute the structures. It is either defined on the Mapper nodes (if func_type = "node") or on the input data (if func_type = "data"). If None, the function is computed with eccentricity.
		params (dictionary): parameters used to compute the original Mapper.
		topo_type (string): type of topological structure. Either "connected_components", "downbranch", "upbranch" or "loop".
		threshold (float): threshold on the statistical significance.
		N (int): number of bootstrap iterations.
		input (string): type of input data. Either "point cloud" or "distance matrix".

	Returns:
		dgmboot (list of tuple (dimension, (vb, vd))): subset of dgm with statistical significance above threshold.
		bndboot (list of list of int): subset of bnd with statistical significance above threshold.
	"""
    num_pts, distribution = len(X), []

    for bootstrap_id in range(N):

        # Randomly select points
        idxs = np.random.choice(num_pts, size=num_pts, replace=True)
        Xboot = X[idxs, :] if input == "point cloud" else X[idxs, :][:, idxs]
        f_boot = [func[i] for i in idxs]
        params_boot = {k: params[k] for k in params.keys()}
        params_boot["filters"] = params["filters"][idxs, :]
        params_boot["colors"] = params["colors"][idxs, :]
        Mboot = MapperComplex(**params_boot).fit(Xboot)

        # Compute the corresponding persistence diagrams
        dgm_boot, _ = compute_topological_features(Mboot,
                                                   func=f_boot,
                                                   func_type="data",
                                                   topo_type=topo_type)

        # Compute the bottleneck distances between them and keep the maximum
        npts, npts_boot = len(dgm), len(dgm_boot)
        D1 = np.array([[dgm[pt][1][0], dgm[pt][1][1]] for pt in range(npts)
                       if dgm[pt][0] <= 1])
        D2 = np.array([[dgm_boot[pt][1][0], dgm_boot[pt][1][1]]
                       for pt in range(npts_boot) if dgm_boot[pt][0] <= 1])
        bottle = gd.bottleneck_distance(D1, D2)
        distribution.append(bottle)

    distribution = np.sort(distribution)
    dist_thresh = distribution[int(threshold * len(distribution))]
    significant_idxs = [
        i for i in range(len(dgm))
        if dgm[i][1][1] - dgm[i][1][0] >= 2 * dist_thresh
    ]
    dgmboot, bndboot = [dgm[i] for i in significant_idxs
                        ], [bnd[i] for i in significant_idxs]
    return dgmboot, bndboot
Example #22
def compute_distance(x, y, homo_dim = 1):
    start_time = time.time()
    diag_x = compute_diagram(x, homo_dim=homo_dim)
    diag_y = compute_diagram(y, homo_dim=homo_dim)
    #print("Filteration graph: %.3f" % (time.time() - start_time))
    return min([gudhi.bottleneck_distance(x, y, e=0) for (x, y) in zip(diag_x, diag_y)])
Example #23
'''### Compute persistence diagrams ############################################################'''

corr_protein_1 = pd.read_csv(f'{dataDir}/1anf.corr_1.txt',
                             header=None,
                             delim_whitespace=True)
# corr_protein_1.head()
dist_protein_1 = 1 - np.abs(corr_protein_1.values)
rips_complex_1 = gd.RipsComplex(distance_matrix=dist_protein_1, max_edge_length=1.1)
simplex_tree_1 = rips_complex_1.create_simplex_tree(max_dimension=2)
diag_1 = simplex_tree_1.persistence()
gd.plot_persistence_diagram(diag_1)

'''### Compare persistence diagrams using bottleneck distance ###############'''
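# (simplex_tree_2 is assumed to have been built in the same way for a second protein)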

# 0-homologies
interv0_1 = simplex_tree_1.persistence_intervals_in_dimension(0)
interv0_2 = simplex_tree_2.persistence_intervals_in_dimension(0)
bot0 = gd.bottleneck_distance( interv0_1, interv0_2 )
# 1-homologies
interv1_1 = simplex_tree_1.persistence_intervals_in_dimension(1)
interv1_2 = simplex_tree_2.persistence_intervals_in_dimension(1)
bot1 = gd.bottleneck_distance( interv1_1 , interv1_2 )

mds = manifold.MDS(n_components=2, dissimilarity="precomputed")
config = mds.fit(M).embedding_

plt.scatter(config[0:7, 0], config[0:7, 1], color='red', label='closed')
plt.scatter(config[7:14, 0], config[7:14, 1], color='blue', label='open')
plt.legend(loc=1)
Example #24
# (excerpt of a longer GUDHI example: alpha_stree and rips_stree are assumed
# to have been built earlier from the same point cloud, and args.max_dimension
# to come from the script's command-line arguments)

message = "Number of simplices=" + repr(alpha_stree.num_simplices())
print(message)

alpha_stree.compute_persistence()

max_b_distance = 0.0
for dim in range(args.max_dimension):
    # Alpha persistence values need to be transformed because filtration
    # values are alpha square values
    alpha_intervals = np.sqrt(alpha_stree.persistence_intervals_in_dimension(dim))

    rips_intervals = rips_stree.persistence_intervals_in_dimension(dim)
    bottleneck_distance = gudhi.bottleneck_distance(
        rips_intervals, alpha_intervals
    )
    message = (
        "In dimension "
        + repr(dim)
        + ", bottleneck distance = "
        + repr(bottleneck_distance)
    )
    print(message)
    max_b_distance = max(bottleneck_distance, max_b_distance)

print("==============================================================")
message = "Bottleneck distance is " + repr(max_b_distance)
print(message)
Example #25
#Compute persistence in dimension 0 for all the files. Visualize the persistence diagrams for some of them
persistence = []
for i in range(0, len(dist_list)):
    rips_complex = gd.RipsComplex(distance_matrix=dist_list[i].values,
                                  max_edge_length=0.8)
    simplex_tree = rips_complex.create_simplex_tree(max_dimension=2)
    simplex_tree.persistence()
    persistence.append(simplex_tree.persistence_intervals_in_dimension(0))

#And compute all-to-all bottleneck distances. Note that this part will take a few seconds:
dist_mat = []
for i in range(0, len(persistence)):
    row = []
    for j in range(0, len(persistence)):
        row.append(gd.bottleneck_distance(persistence[i], persistence[j]))
    dist_mat.append(row)

#We will now use a dimension reduction method to
#visualize a configuration in  R^2  which almost
#matches with the matrix of bottleneck distances.
#For that purpose we will apply a Multidimensional
#Scaling method implemented in the scikit-learn library.

mds = manifold.MDS(n_components=2,
                   max_iter=3000,
                   eps=1e-9,
                   dissimilarity="precomputed",
                   n_jobs=1)
pos = mds.fit(dist_mat).embedding_
Example #26
def get_bottleneck_dist(b1, b2, e=0):
    '''Wrap gudhi's bottleneck_distance; e is the tolerated approximation error.'''
    return g.bottleneck_distance(b1, b2, e)
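
# A quick sanity check with two tiny hypothetical diagrams;
# assumes `import gudhi as g` and `import numpy as np`.
d1 = np.array([[0.0, 1.0], [0.5, 2.0]])
d2 = np.array([[0.0, 1.1]])
print(get_bottleneck_dist(d1, d2))          # exact distance (e=0)
print(get_bottleneck_dist(d1, d2, e=0.01))  # approximation within 0.01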
def test_basic_bottleneck():
    diag1 = [[2.7, 3.7], [9.6, 14.0], [34.2, 34.974], [3.0, float("Inf")]]
    diag2 = [[2.8, 4.45], [9.5, 14.1], [3.2, float("Inf")]]

    assert gudhi.bottleneck_distance(diag1, diag2, 0.1) == 0.8081763781405569
    assert gudhi.bottleneck_distance(diag1, diag2) == 0.75
Example #28
""" This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT.
    See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details.
    Author(s):       Vincent Rouvreau

    Copyright (C) 2016 Inria

    Modification(s):
      - YYYY/MM Author: Description of the modification
"""

__author__ = "Francois Godi, Vincent Rouvreau"
__copyright__ = "Copyright (C) 2016 Inria"
__license__ = "MIT"

diag1 = [[2.7, 3.7], [9.6, 14.0], [34.2, 34.974], [3.0, float("Inf")]]

diag2 = [[2.8, 4.45], [9.5, 14.1], [3.2, float("Inf")]]

message = "diag1=" + repr(diag1)
print(message)

message = "diag2=" + repr(diag2)
print(message)

message = "Bottleneck distance approximation=" + repr(
    gudhi.bottleneck_distance(diag1, diag2, 0.1))
print(message)

message = "Bottleneck distance exact value=" + repr(
    gudhi.bottleneck_distance(diag1, diag2))
print(message)
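
# For reference, the tests earlier on this page give 0.75 for the exact value
# and 0.8081763781405569 for the e=0.1 approximation of these two diagrams.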
Example #29
    Lope_embedding = np.stack([
        word_vectors[word] for word in vocab_embedding if word in vocabularyL
    ])
    k = np.min(
        [len(Quevedo_embedding),
         len(Gongora_embedding),
         len(Lope_embedding)])
    Quevedo_embedding = Quevedo_embedding[0:k]
    Gongora_embedding = Gongora_embedding[0:k]
    Lope_embedding = Lope_embedding[0:k]

    dgmsQ = r.ripser(Quevedo_embedding, metric="cosine")['dgms']
    dgmsG = r.ripser(Gongora_embedding, metric="cosine")['dgms']
    dgmsL = r.ripser(Lope_embedding, metric="cosine")['dgms']

    pcos1 = g.bottleneck_distance(dgmsG[0], dgmsQ[0])
    pcos2 = g.bottleneck_distance(dgmsG[0], dgmsL[0])
    pcos3 = g.bottleneck_distance(dgmsL[0], dgmsQ[0])
    pcos1s.append(pcos1)
    pcos2s.append(pcos2)
    pcos3s.append(pcos3)
    print("COSINE DISTANCE")
    print("Quevedo and Gongora: ", pcos1)
    print("Lope and Gongora: ", pcos2)
    print("Quevedo and Lope: ", pcos3)

print("MEAN COSINE DISTANCE")
print("Mean Quevedo and Gongora:", np.mean(pcos1s))
print("Mean Lope and Gongora:", np.mean(pcos2s))
print("Mean Quevedo and Lope:", np.mean(pcos3s))
def machine_learning(d, diagramas):
    dataset = pd.read_csv('prueba5.csv', delimiter=';', encoding='latin-1', keep_default_na=False)
    # y = dataset.iloc[:, 1].values

    test = sio.loadmat('matrix_total.mat')  # matrix of value

    X_m = test['matrix_total']
    y = X_m[:, 8]
    X_m = X_m[:, :8]

    from sklearn.model_selection import train_test_split

    X_train, X_test, y_train, y_test = train_test_split(diagramas, y, test_size=0.20)
    X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(X_m, y, test_size=0.20)

    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.naive_bayes import GaussianNB
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.svm import SVC
    N = len(X_train)
    D = np.zeros((N, N))
    M = len(X_test)
    D1 = np.zeros((M, M))

    for i in range(N):
        for j in range(i + 1, N):
            D[i, j] = my_dist(X_train[i], X_train[j])
            D[j, i] = D[i, j]

    for i in range(M):
        for j in range(i + 1, M):
            D1[i, j] = my_dist(X_test[i], X_test[j])
            D1[j, i] = D1[i, j]


    message = "Bottleneck distance approximation = " + '%.8f' % gudhi.bottleneck_distance(a_Del, a_Del2, 0.1)
    message = "Bottleneck distance value = " + '%.8f' % gudhi.bottleneck_distance(a_Del,a_Del2)
    gudhi.plot_persistence_diagram(a_Del2)


    # Create the fake data matrix: just the indices of the timeseries

    X = np.arange(N).reshape((N, 1))
    knn = KNeighborsClassifier(n_neighbors=5)

    # knn = KNeighborsClassifier(weights='distance', algorithm='ball_tree', metric='pyfunc')

    knn.fit(D, y_train)
    gnb = GaussianNB().fit(D, y_train)
    dtree_model = DecisionTreeClassifier(max_depth=2).fit(D, y_train)
    svm_model_linear = SVC(kernel='linear', C=1).fit(D, y_train)
    y_pred = knn.predict(D)
    gnb_predictions = gnb.predict(D)
    accuracy0 = knn.score(D, y_train)
    accuracy1 = gnb.score(D, y_train)
    accuracy2 = dtree_model.score(D, y_train)
    accuracy3 = svm_model_linear.score(D, y_train)

    from sklearn.metrics import classification_report, confusion_matrix

    print(confusion_matrix(y_train, y_pred))
    print(classification_report(y_train, y_pred))

    print(knn.kneighbors(D[0].reshape(1, -1)))
    print(knn.predict(D))

    # For evaluating the algorithm, confusion matrix, precision, recall and f1 score are the most commonly used

    from sklearn.metrics import classification_report, confusion_matrix

    print(confusion_matrix(y_train, y_pred))
    print(classification_report(y_train, y_pred))

    classifier.fit(X_train, y_train)

    y_pred = classifier.predict(X_test)

    from sklearn.metrics import classification_report, confusion_matrix

    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))

    from sklearn.neighbors import DistanceMetric

    dt = DistanceMetric.get_metric('pyfunc', func=my_dist)

    from sklearn.neighbors import NearestNeighbors

    nbrs = NearestNeighbors(n_neighbors=4, algorithm='auto', metric='pyfunc').fit(diagramas)
    distances, indices = nbrs.kneighbors(diagramas)

    NearestNeighbors(n_neighbors=4, algorithm='auto', metric='pyfunc')

    classifier.fit(diagramas, y)


    dataset = pd.read_csv('prueba5.csv', delimiter=';', encoding='latin-1', keep_default_na=False)

    dataset['Entropy'] = [float(dataset['Entropy'][item].replace(',', '.')) for item in range(len(dataset))]

    X = d.iloc[:, :-2].values
    y = dataset.iloc[:, 1].values
    from sklearn.model_selection import train_test_split

    # To create training and test splits, execute the following script
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

    # the above script splits the dataset into 80% training data and 20% test data
    # Feature scaling
    # before making any actual predictions, it is always a good practice to scale the features so that all of them can be uniformly
    # evaluated.
    # we are going to use the gradient descent algorithm

    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    scaler.fit(X_train)

    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    from sklearn.neighbors import KNeighborsClassifier

    classifier = KNeighborsClassifier(n_neighbors=40)
    classifier.fit(X_train, y_train)

    # The first step is to import KNeighborsClassifier from the library; in the second line the classifier is initialized with
    # one parameter, which is basically the value of K. There is no ideal value for K; it is selected after testing and evaluation

    # The final step is to make predictions on our test data. To do so, execute the following script

    y_pred = classifier.predict(X_test)

    # For evaluating the algorithm, confusion matrix, precision, recall and f1 score are the most commonly used

    from sklearn.metrics import classification_report, confusion_matrix

    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))

    # A way to help you find the best value of K is to plot the K values against the corresponding error rate for the dataset.
    # Let's first calculate the mean error for all the predicted values where K ranges from 1 to 40
    error = []

    for i in range(1, 40):
        knn = KNeighborsClassifier(n_neighbors=i)
        knn.fit(X_train, y_train)
        pred_i = knn.predict(X_test)
        error.append(np.mean(pred_i != y_test))

    plt.figure(figsize=(12, 6))
    plt.plot(range(1, 40), error, color='red', linestyle='dashed', marker='o',
             markerfacecolor='blue', markersize=10)
    plt.title('Error Rate K Value')
    plt.xlabel('K Value')
    plt.ylabel('Mean Error')

    # From the output we can see that the mean error is zero; the advice is to play with the values of K to see how it impacts the accuracy
    # of predictions.
    return