def test_dijkstra_bug_fix():
    X = np.array([[0., 0., 4.],
                  [1., 0., 2.],
                  [0., 5., 0.]])
    dist_FW = graph_shortest_path(X, directed=False, method='FW')
    dist_D = graph_shortest_path(X, directed=False, method='D')
    assert_array_almost_equal(dist_D, dist_FW)
def test_dijkstra():
    dist_matrix = generate_graph(20)
    for directed in (True, False):
        graph_D = graph_shortest_path(dist_matrix, directed, 'D')
        graph_py = floyd_warshall_slow(dist_matrix.copy(), directed)
        assert_array_almost_equal(graph_D, graph_py)
def isomap(X, n_neighbors, metric):
    """
    Based on sklearn, Author: Jake Vanderplas -- <*****@*****.**>
    License: BSD, (C) 2011
    """
    kng = kneighbors_graph(X, n_neighbors=n_neighbors, metric=metric)
    dist_matrix_ = graph_shortest_path(kng, method='auto', directed=False)
    kernel_pca_ = KernelPCA(n_components=2, kernel="precomputed",
                            eigen_solver='auto')
    # Turn geodesic distances into the kernel expected by precomputed KernelPCA.
    G = dist_matrix_ ** 2
    G *= -0.5
    return kernel_pca_.fit_transform(G)
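# A minimal usage sketch for the isomap() helper above; the dataset and import
# paths are illustrative, and graph_shortest_path was only public in older
# scikit-learn releases (since removed in favor of scipy.sparse.csgraph.shortest_path).
import numpy as np
from sklearn.datasets import make_swiss_roll
from sklearn.decomposition import KernelPCA
from sklearn.neighbors import kneighbors_graph
from sklearn.utils.graph import graph_shortest_path

X, _ = make_swiss_roll(n_samples=500, random_state=0)
embedding = isomap(X, n_neighbors=10, metric='euclidean')
print(embedding.shape)  # (500, 2)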
def _fit_transform(self, X):
    X = check_array(X)
    self.nbrs_.fit(X)
    self.training_data_ = self.nbrs_._fit_X
    self.kernel_pca_ = KernelPCA(n_components=self.n_components,
                                 kernel="precomputed",
                                 eigen_solver=self.eigen_solver,
                                 tol=self.tol, max_iter=self.max_iter)
    kng = kneighbors_graph(self.nbrs_, self.n_neighbors, mode='distance')
    self.dist_matrix_ = graph_shortest_path(kng,
                                            method=self.path_method,
                                            directed=False)
    G = self.dist_matrix_ ** 2
    G *= -0.5
    self.embedding_ = self.kernel_pca_.fit_transform(G)
import numpy as np
from numpy.testing import assert_array_almost_equal
from scipy import sparse
from scipy.sparse.csgraph import shortest_path
from sklearn.utils.graph import graph_shortest_path  # import path of older releases


def generate_graph(N=20):
    rng = np.random.RandomState(0)
    dist_matrix = rng.random_sample((N, N))
    dist_matrix = dist_matrix + dist_matrix.T
    i = (rng.randint(N, size=N * N // 2), rng.randint(N, size=N * N // 2))
    dist_matrix[i] = 0
    dist_matrix.flat[::N + 1] = 0
    return dist_matrix


dist_matrix = sparse.csr_matrix(generate_graph(20))

# auto
graph_sk = graph_shortest_path(dist_matrix, directed=False)
graph_sp = shortest_path(dist_matrix, directed=False)
assert_array_almost_equal(graph_sk, graph_sp)

# Floyd-Warshall
graph_sk = graph_shortest_path(dist_matrix, directed=False, method='FW')
graph_sp = shortest_path(dist_matrix, directed=False, method='FW')
assert_array_almost_equal(graph_sk, graph_sp)

# Dijkstra's
graph_sk = graph_shortest_path(dist_matrix, directed=False, method='D')
graph_sp = shortest_path(dist_matrix, directed=False, method='D')
assert_array_almost_equal(graph_sk, graph_sp)
def test_graph_shortest_path_deprecation():
    dist_matrix = generate_graph(20)

    with pytest.warns(FutureWarning, match="deprecated"):
        _ = graph_shortest_path(dist_matrix)
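# Since graph_shortest_path is deprecated, the same computation can be done
# directly with SciPy; a small sketch with an illustrative 3-point graph
# (unstored entries of the sparse matrix are treated as missing edges).
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import shortest_path

dist_matrix = csr_matrix(np.array([[0., 3., 0.],
                                   [3., 0., 1.],
                                   [0., 1., 0.]]))
geodesic = shortest_path(dist_matrix, method='D', directed=False)
print(geodesic)  # dense (3, 3) array of graph distances; no FutureWarning raised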
def gen_geo_dists(pc):
    graph = neighbors.kneighbors_graph(pc, 20, mode='distance',
                                       include_self=False)
    return graph_shortest_path(graph, directed=False)
def isomap(data, component=2, neighbor=50):
    data = calculate_dist(data, neighbor)
    graph = graph_shortest_path(data, directed=False)
    graph = -0.5 * (graph ** 2)
    return MDS(graph, component)
if title is not None:
    plt.title(title)

print("Computing Isomap embedding")
t0 = time()
n_jobs = None
nbrs_ = NearestNeighbors(n_neighbors=n_neighbors).fit(X)
training_data_ = nbrs_._fit_X
kernel_pca_ = KernelPCA(n_components=2, tol=0)
kng = kneighbors_graph(nbrs_, n_neighbors, mode='distance')
dist_matrix_ = graph_shortest_path(kng, directed=False)
dist_matrix_s = dist_matrix_.copy()
d = dist_matrix_s.max()
beta = d * d * 0.1
alpha = 0
for i in range(n_samples):
    for j in range(n_samples):
        d = dist_matrix_[i, j]
        if y[i] == y[j]:
            dist_matrix_s[i, j] = sqrt(1 - exp(-d * d / beta))
        else:
            dist_matrix_s[i, j] = sqrt(exp(d * d / beta)) - alpha
# for i in range(int(n_samples*p)):
#     for j in range(int(n_samples*p)):
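# The double loop above can be written as one vectorized step; a sketch
# assuming y is a 1-D label array aligned with the rows of dist_matrix_
# (same names as in the snippet above).
import numpy as np

same_class = (y[:, None] == y[None, :])
D2 = dist_matrix_ ** 2
dist_matrix_s = np.where(same_class,
                         np.sqrt(1.0 - np.exp(-D2 / beta)),   # pull same-class pairs together
                         np.sqrt(np.exp(D2 / beta)) - alpha)  # push different-class pairs apart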
def main(argv): parser = argparse.ArgumentParser(epilog="NOTE: it is important to have a smooth histogram for accurate fitting\n\n") parser.add_argument("filename", help="input filename") parser.add_argument("-m", "--metric" , type=str, help="define the scipy distance to be used (Default: euclidean or hamming for MSA)",default='euclidean') parser.add_argument("-x", "--matrix", help="if the input file contains already the complete upper triangle of a distance matrix (2 Formats: (idx_i idx_j distance) or simply distances list ) (Opt)", action="store_true") parser.add_argument("-k", "--n_neighbors", type=int, help="nearest_neighbors parameter (Default k=3)", default=3) parser.add_argument("-r", "--radius", type=float, help="use neighbor radius instead of nearest_neighbors (Opt)",default=0.) parser.add_argument("-b", "--n_bins", type=int, help="number of bins for distance histogram (Default 50)",default=50) parser.add_argument("-M", "--r_max", type=float, help="fix the value of distance distribution maximum in the fit (Opt, -1 force the standard fit, avoiding consistency checks)",default=0) parser.add_argument("-n", "--r_min", type=float, help="fix the value of shortest distance considered in the fit (Opt, -1 force the standard fit, avoiding consistency checks)",default=-10) parser.add_argument("-D", "--direct", help="analyze the direct (not graph) distances (Opt)", action="store_true") parser.add_argument("-I", "--projection", help="produce an Isomap projection using the first ID components (Opt)", action="store_true") args = parser.parse_args() input_f = args.filename me=args.metric n_neighbors = args.n_neighbors radius=args.radius+0 MSA=False n_bins = args.n_bins rmax=args.r_max mm=-10000 print '\nFile name: ', input_f #0 Reading input file f1 = open(input_f) data = [] data_line = [] labels = [] for line in f1: if line[0]==">" : MSA=True labels.append(line) if line[0]!=">" and MSA==True : data.append([ord(x) for x in line[:-1]]) data_line.append(line) elif line[0]!="#" and MSA==False : data.append([float(x) for x in line.split()]) data_line.append(line) f1.close() data = n.asarray(data) if MSA : me='hamming' if args.matrix : me='as from the input file' print 'Metric: ', me if radius>0. and (args.direct==False) : print 'Nearest Neighbors Radius:', radius elif (args.direct==False): print 'Nearest Neighbors number K: ', n_neighbors else : print 'Distance distribution are calculated based on the direct input-space distances ' if radius>0. : filename = str(input_f.split('.')[0])+'R'+str(radius) else : filename = str(input_f.split('.')[0])+'K'+str(n_neighbors) #0 #1 Computing geodesic distance on connected points of the input file and relative histogram if args.matrix : if data.shape[1] == 1 : dist_mat=distance.squareform(data.ravel()) mm=dist_mat.shape[1] elif data.shape[1] == 3 : mm=int(max(data[:,1])) dist_mat=n.zeros((mm,mm)) for i in range(0,data.shape[0]): dist_mat[int(data[i,0])-1,int(data[i,1])-1]=data[i,2] dist_mat[int(data[i,1])-1,int(data[i,0])-1]=data[i,2] else : print 'ERROR: The distances input is not in the right matrix format' ; sys.exit(2) print "\n# points: ", mm A=n.zeros((mm,mm)) rrr=[] if radius > 0. 
: for i in range(0,mm): ll=dist_mat[i] < radius A[i,ll]=dist_mat[i,ll] else : rrr=n.argsort(dist_mat) for i in range(0,mm): ll=rrr[i,0:n_neighbors+1] A[i,ll]=dist_mat[i,ll] radius = A.max() if args.direct : C=dist_mat else : C= graph_shortest_path(A,directed=False) else : print "\n# points, coordinates: ", data.shape if args.direct : C=distance.squareform(distance.pdist(data,me)); elif radius>0. : A = radius_neighbors_graph(data, radius,metric=me,mode='distance') C= graph_shortest_path(A,directed=False) else : A = kneighbors_graph(data, n_neighbors,metric=me,mode='distance') C= graph_shortest_path(A,directed=False) radius=A.max() C=n.asmatrix(C) connect=n.zeros(C.shape[0]) conn=n.zeros(C.shape[0]) for i in range(0,C.shape[0]) : conn_points=n.count_nonzero(C[i]) conn[i]=conn_points if conn_points > C.shape[0]/2. : connect[i]=1 else : C[i]=0 if n.count_nonzero(connect) > C.shape[0]/2. : print 'Number of connected points:', n.count_nonzero(connect), '(',100*n.count_nonzero(connect)/C.shape[0],'% )' else : print 'The neighbors graph is highly disconnected, increase K or Radius parameters' ; sys.exit(2) if n.count_nonzero(connect) < data.shape[0] : data_connect_file = open('connected_data_{0}.dat'.format(filename), "w") for i in range(0,C.shape[0]) : if connect[i]==1 : if MSA : data_connect_file.write(labels[i]) data_connect_file.write(data_line[i]) data_connect_file.close() indices = n.nonzero(n.triu(C,1)) dist_list = n.asarray( C[indices] )[-1] dist_file= open('dist_{0}.dat'.format(filename), "w") for i in range(0, len(dist_list)): dist_file.write("%s " % ((dist_list[i]))) dist_file.close() h=n.histogram(dist_list,n_bins) dx=h[1][1]-h[1][0] plt.figure(1) plt.plot(h[1][0:n_bins]+dx/2,h[0],'o-',label='histogram') plt.xlabel('r') plt.ylabel('N. counts') plt.legend() plt.savefig(filename+'_hist.png') distr_x = [] distr_y = [] avg=n.mean(dist_list) std=n.std(dist_list) if rmax> 0 : avg=rmax std=min(std,rmax) print '\nNOTE: You fixed r_max for the initial fitting, average will have the same value' else : mm=n.argmax(h[0]) rmax=h[1][mm]+dx/2 if args.r_max== -1 : print '\nNOTE: You forced r_max to the maximum of the distribution in the initial fitting, avoiding consistency checks with the average' avg=rmax std=min(std,rmax) if args.r_min>= 0 : print '\nNOTE: You fixed r_min for the initial fitting: r_min = ',args.r_min if args.r_min== -1 : print '\nNOTE: You forced r_min to the standard procedure in the initial fitting' print '\nDistances Statistics:' print 'Average, standard dev., n_bin, bin_size, r_max, r_NN_max:', avg , std, n_bins, dx, rmax, radius,'\n' #1 tmp=1000000 if(args.r_min>=0) : tmp=args.r_min elif(args.r_min==-1) : tmp=rmax-std if(n.fabs(rmax-avg)>std+2.*dx) : print 'ERROR: There is a problem with the r_max detection:' print ' usually either the histogram is not smooth enough (you may consider changing the n_bins with option -b)' print ' or r_max and r_avg are too distant and you may consider to fix the first detection of r_max with option -M' print ' or to change the neighbor parameter with (-r/-k)' plt.show() sys.exit() elif(rmax<= min(radius+dx,tmp)) : print 'ERROR: There is a problem with the r_max detection, it is shorter than the largest distance in the neighbors graph.' print ' You may consider to fix the first detection of r_max with option -M and/or the r_min with option -n to fix the fit range' print ' or to decrease the neighbors parameter with (-r/-k). 
For example It is possible to enforce the standard fit range with ' print ' r_min=r_max-2*sigma running option "-n -1"' plt.show() sys.exit() #2 Finding actual r_max and std. dev. to define fitting interval [rmin;rM] distr_x=h[1][0:n_bins]+dx/2 distr_y=h[0][0:n_bins] res= n.empty(25) left_distr_x = n.empty(n_bins) left_distr_y = n.empty(n_bins) left_distr_x= distr_x[n.logical_and(n.logical_and(distr_x[:]>rmax-std, distr_x[:]<rmax+std/2.0),distr_y[:]>0.000001)] left_distr_y= n.log(distr_y[n.logical_and(n.logical_and(distr_x[:]>rmax-std, distr_x[:]<rmax+std/2.0),distr_y[:]>0.000001)]) if(left_distr_y.shape[0]<4) : print('ERROR: Too few datapoints to fit the distribution:') print(' usually either the histogram is not smooth enough (you may consider changing the n_bins with option -b)') print(' or the distance distribution itself has some issue') plt.show() print('R, Dfit, Dmin', 'ERROR3' , '\n') sys.exit() coeff = n.polyfit(left_distr_x,left_distr_y,2,full='False') a0=coeff[0][0] b0=coeff[0][1] c0=coeff[0][2] rmax_old=rmax std_old=std rmax = -b0/a0/2.0 if(args.r_max>0) : rmax=args.r_max #if(args.r_max==-1) : rmax=avg #to be used in future in case of problem with Ymax if a0<0 and n.fabs(rmax-rmax_old)<std_old/2+dx : std=n.sqrt(-1/a0/2.) else: rmax=avg std=std_old left_distr_x= distr_x[n.logical_and(distr_y[:]>0.000001,n.logical_and(distr_x[:]>rmax-std, distr_x[:]<rmax+std/2.+dx))] left_distr_y= n.log(distr_y[n.logical_and(distr_y[:]>0.000001, n.logical_and(distr_x[:]>rmax-std, distr_x[:]<rmax+std/2.+dx))]) if(left_distr_y.shape[0]<4) : print('ERROR: Too few datapoints to fit the distribution:') print(' usually either the histogram is not smooth enough (you may consider changing the n_bins with option -b)') print(' or the distance distribution itself has some issue') plt.show() sys.exit() coeff = n.polyfit(left_distr_x,left_distr_y,2,full='False') a=coeff[0][0] b=coeff[0][1] c=coeff[0][2] rmax_old=rmax std_old=std if a<0. : rmax = -b/a/2. std=n.sqrt(-1/a/2.) # it was a0 rmin=max(rmax-2*std-dx/2,0.) if(args.r_min>=0) : rmin=args.r_min elif (rmin < radius and args.r_min!=-1) : rmin = radius print '\nWARNING: For internal consistency r_min has been fixed to the largest distance (r_NN_max) in the neighbors graph.' print ' It is possible to reset the standard definition of r_min=r_max-2*sigma running with option "-n -1" ' print ' or you can use -n to manually define a desired value (Example: -n 0.1)\n' rM=rmax+dx/4 if(n.fabs(rmax-rmax_old)>std_old/4+dx ) : #fit consistency check print '\nWARNING: The histogram is probably not smooth enough (you may try to change n_bin with -b), rmax is fixed to the value of first iteration\n' rmax=rmax_old a=a0 b=b0 c=c0 if(args.r_min>=0) : rmin=args.r_min elif (rmin < radius and args.r_min!=-1) : rmin = radius print '\nWARNING2: For internal consistency r_min has been fixed to the largest distance in the neighbors graph (r_NN_max).' 
print ' It is possible to reset the standard definition of r_min=r_max-2*sigma running with option "-n -1" ' print ' or you can use -n to manually define a desired value (Example: -n 0.1)\n' rM=rmax+dx/4 #2 #3 Gaussian Fitting to determine ratio R left_distr_x= distr_x[n.logical_and(n.logical_and(distr_x[:]>rmin,distr_x[:]<=rM),distr_y[:]>0.000001)]/rmax left_distr_y= n.log(distr_y[n.logical_and(n.logical_and(distr_x[:]>rmin,distr_x[:]<=rM),distr_y[:]>0.000001)])-(4*a*c-b**2)/4./a if(left_distr_y.shape[0]<4) : print('ERROR: Too few datapoints to fit the distribution:') print(' usually either the histogram is not smooth enough (you may consider changing the n_bins with option -b)') print(' or the distance distribution itself has some issue') plt.show() sys.exit() fit = curve_fit(func2,left_distr_x,left_distr_y) ratio=n.sqrt(fit[0][0]) y1=func2(left_distr_x,fit[0][0]) #3 #4 Geodesics D-Hypersphere Distribution Fitting to determine Dfit fit = curve_fit(func,left_distr_x,left_distr_y) Dfit=(fit[0][0])+1 y2=func(left_distr_x,fit[0][0],fit[0][1],fit[0][2]) #4 #5 Determination of Dmin D_file = open('D_residual_{0}.dat'.format(filename), "w") for D in range(1,26): y=(func(left_distr_x,D-1,1,0)) for i in range(0, len(y)): res[D-1] = n.linalg.norm((y)-(left_distr_y))/n.sqrt(len(y)) D_file.write("%s " % D) D_file.write("%s\n" % res[D-1]) Dmin = n.argmax(-res)+1 y=func(left_distr_x,Dmin-1,fit[0][1],0) #5 #6 Printing results print '\nFITTING PARAMETERS:' print 'rmax, std. dev., rmin', rmax,std,rmin print '\nFITTING RESULTS:' print 'R, Dfit, Dmin', ratio,Dfit,Dmin , '\n' if(Dmin == 1) : print 'NOTE: Dmin = 1 could indicate that the choice of the input parameters is not optimal or simply an underestimation of a 2D manifold\n' if(Dfit > 25) : print('NOTE: Dfit > 25 could indicate that the choice of the input parameters is not optimal or that the the distance distribution itself has some issue \n') fit_file= open('fit_{0}.dat'.format(filename), "w") for i in range(0, len(y)): fit_file.write("%s " % left_distr_x[i]) fit_file.write("%s " % ((left_distr_y[i]))) fit_file.write("%s " % ((y1[i]))) fit_file.write("%s " % ((y2[i]))) fit_file.write("%s\n" % ((y[i]))) fit_file.close() stat_file= open('statistics_{0}.dat'.format(filename), "w") statistics = str('# Npoints, rmax, standard deviation, R, D_fit, Dmin \n# \ {}, {}, {}, {}, {}, {}\n'.format(n.count_nonzero(connect),rmax,std,ratio,Dfit,Dmin)) stat_file.write("%s" % statistics) for i in range(0, len(distr_x)-2): if distr_y[i]>0.000001 : stat_file.write("%s " % distr_x[i]) stat_file.write("%s " % distr_y[i]) stat_file.write("%s\n" % n.log(distr_y[i])) stat_file.close() plt.figure(2) plt.plot(left_distr_x,left_distr_y,'o-',label=str(input_f.split('.')[0])) plt.plot(left_distr_x,y1,label='Gaussian fit for R ratio') plt.plot(left_distr_x,y2,label='D-Hypersphere Fit for D_fit') plt.plot(left_distr_x,y,label='D_min-Hypersphere Distribution') plt.xlabel('r/r$_{max}$') plt.ylabel('log p(r)/p(r$_{max}$)') plt.legend(loc=4) plt.savefig(str(input_f.split('.')[0])+'_fit.png') plt.figure(3) plt.plot(range(1,26),res,'o-',label=str(input_f.split('.')[0])+' D_min') plt.legend() plt.xlabel('D') plt.ylabel('RMDS') plt.show() plt.savefig(str(input_f.split('.')[0])+'_Dmin.png') #6 #7 Optional: Isomap projection if args.projection : from sklearn.decomposition import KernelPCA C2=(distance.squareform(dist_list))**2 C2=-.5*C2 obj_pj=KernelPCA(n_components=100,kernel="precomputed") proj=obj_pj.fit_transform(C2) n.savetxt('proj_'+str(input_f.split('.')[0])+'.dat',proj[:,0:Dmin+1]) 
print 'NOTE: it is important to have a smooth histogram for accurate fitting\n'
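# The geodesic-distance part of the script above (steps #0-#1) boils down to a
# k-NN graph plus all-pairs shortest paths; a condensed sketch under the same
# defaults (k=3, Euclidean metric), using the deprecated sklearn import the
# script itself relies on.
import numpy as np
from sklearn.neighbors import kneighbors_graph
from sklearn.utils.graph import graph_shortest_path

def geodesic_distance_histogram(data, n_neighbors=3, metric='euclidean', n_bins=50):
    # Neighbors graph with distance weights, then geodesic distances on it.
    A = kneighbors_graph(data, n_neighbors, metric=metric, mode='distance')
    C = graph_shortest_path(A, directed=False)
    # Each pair counted once (upper triangle); zeros mark disconnected pairs.
    dist_list = C[np.triu_indices_from(C, k=1)]
    dist_list = dist_list[dist_list > 0]
    return np.histogram(dist_list, n_bins)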
def isomap(data, output_dim, k):
    distance_matrix = d_mat(data, k)
    graph = graph_shortest_path(distance_matrix, directed=False, method='FW')
    return MDS_func(graph, output_dim)
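# The helpers d_mat and MDS_func are not part of the snippet; a plausible
# sketch of both (hypothetical implementations: a k-NN distance graph and
# classical MDS via eigendecomposition of the double-centered Gram matrix).
import numpy as np
from sklearn.neighbors import kneighbors_graph

def d_mat(data, k):
    # Sparse k-nearest-neighbour graph weighted by Euclidean distances.
    return kneighbors_graph(data, k, mode='distance')

def MDS_func(dist, output_dim):
    # Classical MDS on a dense distance matrix.
    n = dist.shape[0]
    J = np.eye(n) - np.ones((n, n)) / n          # centering matrix
    B = -0.5 * J @ (np.asarray(dist) ** 2) @ J   # double-centered Gram matrix
    eigvals, eigvecs = np.linalg.eigh(B)
    top = np.argsort(eigvals)[::-1][:output_dim]
    return eigvecs[:, top] * np.sqrt(np.maximum(eigvals[top], 0.0))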
def shortest_path():
    A = np.array([[0, 5, 0, 7],
                  [0, 0, 4, 2],
                  [3, 3, 0, 2],
                  [0, 0, 1, 0]])
    distance = graph_shortest_path(A, directed=True)
    print(distance)
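# For reference, with zeros treated as missing edges the directed result of
# the snippet above works out to the matrix below; the same values come from
# SciPy's shortest_path (aliased here to avoid shadowing the function above).
import numpy as np
from scipy.sparse.csgraph import shortest_path as csgraph_shortest_path

A = np.array([[0, 5, 0, 7],
              [0, 0, 4, 2],
              [3, 3, 0, 2],
              [0, 0, 1, 0]])
print(csgraph_shortest_path(A, directed=True))
# [[0. 5. 8. 7.]
#  [6. 0. 3. 2.]
#  [3. 3. 0. 2.]
#  [4. 4. 1. 0.]]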
def main(argv): parser = argparse.ArgumentParser( epilog= "NOTE: it is important to have a smooth histogram for accurate fitting\n\n" ) parser.add_argument("filename", help="input filename") parser.add_argument( "-m", "--metric", type=str, help= "define the scipy distance to be used (Default: euclidean or hamming for MSA)", default='euclidean') parser.add_argument( "-x", "--matrix", help= "if the input file contains already the complete upper triangle of a distance matrix (2 Formats: (idx_i idx_j distance) or simply distances list ) (Opt)", action="store_true") parser.add_argument("-k", "--n_neighbors", type=int, help="nearest_neighbors parameter (Default k=3)", default=3) parser.add_argument( "-r", "--radius", type=float, help="use neighbor radius instead of nearest_neighbors (Opt)") parser.add_argument( "-b", "--n_bins", type=int, help="number of bins for distance histogram (Default 50)", default=50) parser.add_argument( "-M", "--r_max", type=float, help="fix the value of distance distribution maximum in the fit (Opt)", default=0) parser.add_argument( "-n", "--r_min", type=float, help= "fix the value of shortest distance considered in the fit (Opt, -1 force the standard fit, avoiding consistency checks)", default=-10) parser.add_argument("-D", "--direct", help="analyze the direct (not graph) distances (Opt)", action="store_true") parser.add_argument( "-I", "--projection", help="produce an Isomap projection using the first ID components (Opt)", action="store_true") args = parser.parse_args() #print args input_f = args.filename me = args.metric n_neighbors = args.n_neighbors radius = args.radius MSA = False n_bins = args.n_bins rmax = args.r_max mm = -10000 print '\nFile name: ', input_f #0 Reading input file f1 = open(input_f) data = [] data_line = [] labels = [] for line in f1: if line[0] == ">": MSA = True labels.append(line) if line[0] != ">" and MSA == True: data.append([ord(x) for x in line[:-1]]) data_line.append(line) elif line[0] != "#" and MSA == False: data.append([float(x) for x in line.split()]) data_line.append(line) f1.close() data = n.asarray(data) if MSA: me = 'hamming' if args.matrix: me = 'as from the input file' print 'Metric: ', me if radius > 0. 
and (args.direct == False): print 'Nearest Neighbors Radius:', radius elif (args.direct == False): print 'Nearest Neighbors number K: ', n_neighbors else: print 'Distance distribution are calculated based on the direct input-space distances ' if radius > 0.: filename = str(input_f.split('.')[0]) + 'R' + str(radius) else: filename = str(input_f.split('.')[0]) + 'K' + str(n_neighbors) #0 #1 Computing geodesic distance on connected points of the input file and relative histogram if args.matrix: if data.shape[1] == 1: dist_mat = distance.squareform(data.ravel()) mm = dist_mat.shape[1] elif data.shape[1] == 3: mm = int(max(data[:, 1])) dist_mat = n.zeros((mm, mm)) for i in range(0, data.shape[0]): dist_mat[int(data[i, 0]) - 1, int(data[i, 1]) - 1] = data[i, 2] dist_mat[int(data[i, 1]) - 1, int(data[i, 0]) - 1] = data[i, 2] else: print 'ERROR: The distances input is not in the right matrix format' sys.exit(2) print "\n# points: ", mm A = n.zeros((mm, mm)) rrr = [] if args.direct: C = dist_mat if radius > 0.: for i in range(0, mm): ll = dist_mat[i] < radius A[i, ll] = dist_mat[i, ll] else: rrr = n.argsort(dist_mat) for i in range(0, mm): ll = rrr[i, 0:n_neighbors + 1] A[i, ll] = dist_mat[i, ll] radius = A.max() C = graph_shortest_path(A, directed=False) else: print "\n# points, coordinates: ", data.shape if args.direct: C = distance.squareform(distance.pdist(data, me)) elif radius > 0.: A = radius_neighbors_graph(data, radius, metric=me, mode='distance') C = graph_shortest_path(A, directed=False) else: A = kneighbors_graph(data, n_neighbors, metric=me, mode='distance') C = graph_shortest_path(A, directed=False) radius = A.max() C = n.asmatrix(C) connect = n.zeros(C.shape[0]) conn = n.zeros(C.shape[0]) for i in range(0, C.shape[0]): conn_points = n.count_nonzero(C[i]) conn[i] = conn_points if conn_points > C.shape[0] / 2.: connect[i] = 1 else: C[i] = 0 if n.count_nonzero(connect) > C.shape[0] / 2.: print 'Number of connected points:', n.count_nonzero( connect), '(', 100 * n.count_nonzero(connect) / C.shape[0], '% )' else: print 'The neighbors graph is highly disconnected, increase K or Radius parameters' sys.exit(2) if n.count_nonzero(connect) < data.shape[0]: data_connect_file = open('connected_data_{0}.dat'.format(filename), "w") for i in range(0, C.shape[0]): if connect[i] == 1: if MSA: data_connect_file.write(labels[i]) data_connect_file.write(data_line[i]) data_connect_file.close() indices = n.nonzero(n.triu(C, 1)) dist_list = n.asarray(C[indices])[-1] h = n.histogram(dist_list, n_bins) dx = h[1][1] - h[1][0] plt.figure(1) plt.plot(h[1][0:n_bins] + dx / 2, h[0], 'o-', label='histogram') plt.xlabel('r') plt.ylabel('N. 
counts') plt.legend() plt.savefig(filename + '_hist.png') distr_x = [] distr_y = [] avg = n.mean(dist_list) std = n.std(dist_list) if rmax > 0: avg = rmax std = min(std, rmax / 2) print '\nNOTE: You fixed r_max for the initial fitting, average will have the same value' else: mm = n.argmax(h[0]) rmax = h[1][mm] + dx / 2 if args.r_min >= 0: print '\nNOTE: You fixed r_min for the initial fitting: r_min = ', args.r_min if args.r_min == -1: print '\nNOTE: You forced r_min to the standard procedure in the initial fitting' print '\nDistances Statistics:' print 'Average, standard dev., n_bin, bin_size, r_max, r_NN_max:', avg, std, n_bins, dx, rmax, radius, '\n' #1 tmp = 1000000 if (args.r_min >= 0): tmp = args.r_min elif (args.r_min == -1): tmp = rmax - std if (n.fabs(rmax - avg) > std): print 'ERROR: There is a problem with the r_max detection:' print ' usually either the histogram is not smooth enough (you may consider changing the n_bins with option -b)' print ' or r_max and r_avg are too distant and you may consider to fix the first detection of r_max with option -M' print ' or to change the neighbor parameter with (-r/-k)' plt.show() sys.exit() elif (rmax <= min(radius + dx, tmp)): print 'ERROR: There is a problem with the r_max detection, it is shorter than the largest distance in the neighbors graph.' print ' You may consider to fix the first detection of r_max with option -M and/or the r_min with option -n to fix the fit range' print ' or to decrease the neighbors parameter with (-r/-k)' plt.show() sys.exit() #2 Finding actual r_max and std. dev. to define fitting interval [rmin;rM] distr_x = h[1][0:n_bins] + dx / 2 distr_y = h[0][0:n_bins] res = n.empty(25) left_distr_x = n.empty(n_bins) left_distr_y = n.empty(n_bins) left_distr_x = distr_x[n.logical_and(distr_x[:] > rmax - std, distr_x[:] < rmax + std / 2.0)] left_distr_y = n.log(distr_y[n.logical_and(distr_x[:] > rmax - std, distr_x[:] < rmax + std / 2.0)]) coeff = n.polyfit(left_distr_x, left_distr_y, 2, full='False') a0 = coeff[0][0] b0 = coeff[0][1] c0 = coeff[0][2] rmax = -b0 / a0 / 2.0 if (args.r_max > 0): rmax = args.r_max std = n.sqrt(-1 / a0 / 2.) left_distr_x = distr_x[n.logical_and(distr_x[:] > rmax - std, distr_x[:] < rmax + std / 2.)] left_distr_y = n.log(distr_y[n.logical_and(distr_x[:] > rmax - std, distr_x[:] < rmax + std / 2.)]) coeff = n.polyfit(left_distr_x, left_distr_y, 2, full='False') a = coeff[0][0] b = coeff[0][1] c = coeff[0][2] rmax_old = rmax std_old = std rmax = -b / a / 2. std = n.sqrt(-1 / a / 2.) # it was a0 rmin = max(rmax - 2 * n.sqrt(-1 / a / 2.) - dx / 2, 0.) if (args.r_min >= 0): rmin = args.r_min elif (rmin < radius and args.r_min != -1): rmin = radius print '\nWARNING: For internal consistency r_min has been fixed to the largest distance (r_NN_max) in the neighbors graph.' print ' It is possible to reset the standard definition of r_min=r_max-2*sigma running with option "-n -1" ' print ' or you can use -n to manually define a desired value (Example: -n 0.1)\n' rM = rmax + dx / 4 if (n.fabs(rmax - rmax_old) > std_old / 4): #fit consistency check print '\nWARNING: The histogram is probably not smooth enough (you may try to change n_bin with -b), rmax is fixed to the value of first iteration\n' #print rmax,rmax_old,std/4,std_old/4 rmax = rmax_old a = a0 b = b0 c = c0 if (args.r_min >= 0): rmin = args.r_min elif (rmin < radius and args.r_min != -1): rmin = radius print '\nWARNING2: For internal consistency r_min has been fixed to the largest distance in the neighbors graph (r_NN_max).' 
print ' It is possible to reset the standard definition of r_min=r_max-2*sigma running with option "-n -1" ' print ' or you can use -n to manually define a desired value (Example: -n 0.1)\n' rM = rmax + dx / 4 #2 #3 Gaussian Fitting to determine ratio R left_distr_x = distr_x[n.logical_and( n.logical_and(distr_x[:] > rmin, distr_x[:] <= rM), distr_y[:] > 0.000001)] / rmax left_distr_y = n.log(distr_y[n.logical_and( n.logical_and(distr_x[:] > rmin, distr_x[:] <= rM), distr_y[:] > 0.000001)]) - (4 * a * c - b**2) / 4. / a fit = curve_fit(func2, left_distr_x, left_distr_y) ratio = n.sqrt(fit[0][0]) y1 = func2(left_distr_x, fit[0][0]) #3 #4 Geodesics D-Hypersphere Distribution Fitting to determine Dfit fit = curve_fit(func, left_distr_x, left_distr_y) Dfit = (fit[0][0]) + 1 y2 = func(left_distr_x, fit[0][0], fit[0][1], fit[0][2]) #4 #5 Determination of Dmin D_file = open('D_residual_{0}.dat'.format(filename), "w") for D in range(1, 26): y = (func(left_distr_x, D - 1, 1, 0)) for i in range(0, len(y)): res[D - 1] = n.linalg.norm((y) - (left_distr_y)) / n.sqrt(len(y)) D_file.write("%s " % D) D_file.write("%s\n" % res[D - 1]) Dmin = n.argmax(-res) + 1 y = func(left_distr_x, Dmin - 1, fit[0][1], 0) #5 #6 Printing results print '\nFITTING PARAMETERS:' print 'rmax, std. dev., rmin', rmax, std, rmin print '\nFITTING RESULTS:' print 'R, Dfit, Dmin', ratio, Dfit, Dmin, '\n' if (Dmin == 1): print 'NOTE: Dmin = 1 could indicate that the choice of the input parameters is not optimal or simply an underestimation of a 2D manifold\n' fit_file = open('fit_{0}.dat'.format(filename), "w") for i in range(0, len(y)): fit_file.write("%s " % left_distr_x[i]) fit_file.write("%s " % ((left_distr_y[i]))) fit_file.write("%s " % ((y1[i]))) fit_file.write("%s " % ((y2[i]))) fit_file.write("%s\n" % ((y[i]))) fit_file.close() stat_file = open('statistics_{0}.dat'.format(filename), "w") statistics = str('# Npoints, rmax, standard deviation, R, D_fit, Dmin \n# \ {}, {}, {}, {}, {}, {}\n'.format(n.count_nonzero(connect), rmax, std, ratio, Dfit, Dmin)) stat_file.write("%s" % statistics) for i in range(0, len(distr_x) - 2): stat_file.write("%s " % distr_x[i]) stat_file.write("%s " % distr_y[i]) stat_file.write("%s\n" % n.log(distr_y[i])) stat_file.close() plt.figure(2) plt.plot(left_distr_x, left_distr_y, 'o-', label=str(input_f.split('.')[0])) plt.plot(left_distr_x, y1, label='Gaussian fit for R ratio') plt.plot(left_distr_x, y2, label='D-Hypersphere Fit for D_fit') plt.plot(left_distr_x, y, label='D_min-Hypersphere Distribution') plt.xlabel('r/r$_{max}$') plt.ylabel('log p(r)/p(r$_{max}$)') plt.legend(loc=4) plt.savefig(str(input_f.split('.')[0]) + '_fit.png') plt.figure(3) plt.plot(range(1, 26), res, 'o-', label=str(input_f.split('.')[0]) + ' D_min') plt.legend() plt.xlabel('D') plt.ylabel('RMDS') plt.show() plt.savefig(str(input_f.split('.')[0]) + '_Dmin.png') #6 #7 Optional: Isomap projection if args.projection: from sklearn.decomposition import KernelPCA C2 = (distance.squareform(dist_list))**2 C2 = -.5 * C2 obj_pj = KernelPCA(n_components=100, kernel="precomputed") proj = obj_pj.fit_transform(C2) n.savetxt('proj_' + str(input_f.split('.')[0]) + '.dat', proj[:, 0:Dmin]) print 'NOTE: it is important to have a smooth histogram for accurate fitting\n'