def est(self, n_pca):
    # dimension of the dataset
    N = self.aX.size()
    # Step 1: Create the current permuted dataset
    G_per = GraphSet()
    for i in range(N):
        G = copy.deepcopy(self.aX.X[i])
        G.permute(self.f[i])
        G_per.add(G)
        del G
    Mat = G_per.to_matrix_with_attr()
    # Standardizing the features
    if self.scale:
        Mat_scale = pd.DataFrame(scale(Mat), columns=Mat.columns)
    else:
        Mat_scale = Mat
    self.barycenter = np.mean(Mat_scale)
    print(self.barycenter)
    # Step 2: Compute the principal components of the aligned dataset
    pca = PCA(n_components=n_pca)
    scores = pca.fit_transform(Mat_scale)
    vals = pca.explained_variance_ratio_
    vecs = pd.DataFrame(pca.components_, columns=Mat_scale.columns)
    del Mat, Mat_scale, G_per
    return (vals, vecs, scores)
def est(self, n_pca, k, old_pca=None):
    # dimension of the dataset
    N = self.aX.size()
    # Step 1: Create the current permuted dataset
    G_per = GraphSet()
    for i in range(N):
        G = copy.deepcopy(self.aX.X[i])
        G.permute(self.f[i])
        G_per.add(G)
        del G
    Mat = G_per.to_matrix_with_attr()
    # Standardizing the features
    if self.scale:
        Mat_scale = pd.DataFrame(scale(Mat), columns=Mat.columns)
    else:
        Mat_scale = Mat
    # Step 2: Compute the principal components of the aligned dataset
    pca = PCA(n_components=n_pca)
    scores = pca.fit_transform(Mat_scale)
    vals = pca.explained_variance_ratio_
    vecs = pd.DataFrame(pca.components_, columns=Mat_scale.columns)
    self.pcas[k] = [pca, Mat_scale]
    self.barycenter = pd.Series(pca.mean_, index=Mat_scale.columns)
    if k > 0:
        # Compute the alignment error wrt the previous principal components
        Mat_along_old = pd.DataFrame(old_pca.inverse_transform(scores),
                                     columns=Mat_scale.columns)
        for i in range(N):
            x_along = Mat_along_old.iloc[i, :]
            X_curr_pca = self.give_me_a_network(x_along,
                                                n_a=self.aX.node_attr,
                                                e_a=self.aX.edge_attr)
            matchID = ID(self.distance)
            a = matchID.align(G_per.X[i], X_curr_pca)
            self.pcaold_error[i, k] = a.dis()
            del matchID, X_curr_pca, x_along, a
    # Compute the pca error:
    # project the data back from the scores along the current components
    Mat_along = pd.DataFrame(pca.inverse_transform(scores),
                             columns=Mat_scale.columns)
    for i in range(N):
        x_along = Mat_along.iloc[i, :]
        X_curr_pca = self.give_me_a_network(x_along,
                                            n_a=self.aX.node_attr,
                                            e_a=self.aX.edge_attr)
        matchID = ID(self.distance)
        a = matchID.align(G_per.X[i], X_curr_pca)
        self.pca_error[i, k] = a.dis()
        del matchID, X_curr_pca, x_along, a
    del Mat, Mat_scale, G_per
    return (vals, vecs, scores, pca)
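# Note: the PCA error above relies on sklearn's fit_transform / inverse_transform
# pair: inverse_transform maps scores back to feature space through the fitted
# components, so the per-observation reconstruction gap is the error measured
# before any re-matching. A minimal, self-contained sketch of that round trip
# (toy data, independent of the class above):
import numpy as np
from sklearn.decomposition import PCA

X_toy = np.random.default_rng(0).normal(size=(20, 5))  # 20 observations, 5 features
pca_toy = PCA(n_components=2)
scores_toy = pca_toy.fit_transform(X_toy)              # project onto 2 components
X_back = pca_toy.inverse_transform(scores_toy)         # map scores back to feature space
recon_error = ((X_toy - X_back) ** 2).sum(axis=1)      # one squared error per observation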
def align_G(self, g):
    # Align a single Graph, or every Graph in a GraphSet, wrt the center m_C.
    # Note: the original signature was align_G(self, *args), so the isinstance
    # checks on the argument tuple could never succeed; a single positional
    # argument is used instead.
    if isinstance(g, Graph):
        if self.m_C is None:
            return g
        a = self.m_matcher.align(g, self.m_C)
        return a.alignedSource()
    if isinstance(g, GraphSet):
        if self.m_C is None:
            return g
        new_a_set = GraphSet()
        i = 0
        # loop over the whole set (the original condition i == g.size() never iterated)
        while i < g.size():
            Gi = g.X[i]
            # add the aligned graph to the new graph set
            new_a_set.add(self.align_G(Gi))
            i += 1
        return new_a_set
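# Hedged usage sketch for align_G (assumes an estimator object `est_obj` of the
# class above whose center m_C and matcher m_matcher have already been set, e.g.
# by a previous estimation step; G_single and G_set are illustrative names):
# G_aligned = est_obj.align_G(G_single)     # one Graph aligned to the center
# set_aligned = est_obj.align_G(G_set)      # a whole GraphSet, element by element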
def variance(self):
    if self.aX is not None and self.aX.size() != 0:
        if self.var is not None:
            return self.var
        else:
            if not isinstance(self.mean, Graph):
                self.mean = self.align_and_est()
            n = self.aX.size()
            if self.m_dis is None:
                # the variance is computed as a distance between the mean and the sample
                align_X = GraphSet()
                for i in range(n):
                    G = copy.deepcopy(self.aX.X[i])
                    G.permute(self.f[i])
                    align_X.add(G)
                    del G
                self.m_dis = self.matcher.dis(align_X, self.mean)
            self.var = 0.0
            for i in range(n):
                self.var += self.m_dis[i]
            self.var = self.var / n
            return self.var
    else:
        print("Sample of graphs is empty")
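# A hedged note on variance(): it averages matcher.dis between the estimated
# mean and each aligned observation, i.e. var = (1/n) * sum_i dis(mean, G_i),
# the graph-space analogue of a sample variance (whether dis returns a squared
# distance depends on the chosen matcher). Usage sketch (object name illustrative):
# est_obj.align_and_est()       # estimate the mean first
# v = est_obj.variance()        # average mean-to-sample distance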
sys.path.append("C:\\Users\\Anna\\OneDrive - Politecnico di Milano\\Windows\\Polimi\\Ricerca\\Regression\\GraphSpace\\")
os.chdir('C:\\Users\\Anna\\OneDrive - Politecnico di Milano\\Windows\\Polimi\\Ricerca\\Regression\\Simulations\\DataSets')

from core import Graph
from core import GraphSet
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.cm as cm
# Regression model
from sklearn import linear_model, gaussian_process
import random

random.seed(3)

G = GraphSet(graph_type='undirected')
G.read_from_text("C:\\Users\\Anna\\OneDrive - Politecnico di Milano\\Windows\\Polimi\\Ricerca\\Regression\\Simulations\\DataSets\\GraphSet_CryptCorrMats.txt")

# plot the true and the predicted
# (r is assumed to be an already-fitted regression object exposing y_net_pred)
G_origin = r.y_net_pred.X[0]

# Network plot
# Go to networkx format
G_plot = G_origin.to_networkX(layer=0, node_too=True)
# Define the nodes positions
# or with networkx.layout https://networkx.github.io/documentation/stable/reference/drawing.html
pos = {0: [-0.16210871, 0.92931688],
       1: [0.36616978, 1.],
       2: [0.48415449, -0.48927315]}
# Initialize the colors as edges weights
colors = list(nx.get_edge_attributes(G_plot, 'weight').values())
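# The script stops after collecting the edge weights; a minimal sketch of how
# the plot could be completed with standard networkx/matplotlib calls (the
# colormap and styling choices are assumptions, not from the original script):
nx.draw(G_plot, pos=pos, with_labels=True,
        edge_color=colors, edge_cmap=plt.cm.viridis,  # color edges by weight
        node_color='lightgray', width=2.0)
plt.show()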
class ggr_aac(aligncompute):

    def __init__(self,
                 graphset,
                 matcher,
                 distance,
                 regression_model='OLS',
                 nr_iterations=100,
                 alpha=1e-10,
                 kernel=None,
                 restarts=0):
        # distance and matcher used to compute the alignment
        aligncompute.__init__(self, graphset, matcher)
        # distance used to compute the regression error
        self.distance = distance
        # nr of iterations of the algorithm
        self.nr_iterations = nr_iterations
        # indicate which type of regression model:
        # OLS (e.g. network on scalar regression problems)
        # GPR (e.g. network on time regression problems)
        self.model_type = regression_model
        if self.model_type == 'GPR':
            self.alpha = alpha
            self.restarts = restarts
            self.models = {}
            if kernel is None:
                # by default we select an exponential kernel
                # See the kernel section in the gaussian_process documentation
                # https://scikit-learn.org/stable/modules/gaussian_process.html#gp-kernels
                # Here we use: s * exp(-d(x1, x2)^2 / (2 * l^2))
                # - s is the parameter of the ConstantKernel
                # - l is the parameter of the RBF (radial basis function) kernel
                self.kernel = gaussian_process.kernels.ConstantKernel(1.0) * \
                    gaussian_process.kernels.RBF(1.0)
            else:
                self.kernel = kernel
        # Regression error for each iteration and each observation
        self.regression_error = {}  # pd.DataFrame(0, index=range(graphset.size()), columns=range(self.nr_iterations))
        self.postalignment_error = {}  # pd.DataFrame(0, index=range(graphset.size()), columns=range(self.nr_iterations))
        self.f_iteration = {}
        self.f_all = {}

    def align_and_est(self):
        # INITIALIZATION:
        # Select a Random Candidate:
        first_id = random.randint(0, self.aX.size() - 1)
        m_1 = self.aX.X[first_id]
        while m_1.n_nodes == 1:
            first_id = random.randint(0, self.aX.size() - 1)
            m_1 = self.aX.X[first_id]
        # Align all the points wrt the random candidate
        # (parallel version; see two_net_match at the end of the class)
        Parallel(n_jobs=10, require='sharedmem')(
            delayed(self.two_net_match)(m_1, i, first_id)
            for i in range(self.aX.size()))
        # Compute the first Generalized Geodesic Regression line
        E_1 = self.est(k=0)
        # Align the set wrt the geodesic
        Parallel(n_jobs=10, require='sharedmem')(
            delayed(self.align_pred)(E_1[1], i, 0)
            for i in range(self.aX.size()))
        # AAC iterative algorithm
        for k in range(1, self.nr_iterations):
            # Compute the current Generalized Geodesic Regression line
            E_2 = self.est(k)
            # Align the set wrt the geodesic
            Parallel(n_jobs=6, require='sharedmem')(
                delayed(self.align_pred)(E_2[1], i, k)
                for i in range(self.aX.size()))
            # Compute the step: the algorithmic step is the absolute difference
            # between the total regression errors of two consecutive iterations
            step_range = abs(
                sum([self.regression_error[i, k - 1]
                     for i in range(0, self.aX.size())]) -
                sum([self.regression_error[i, k]
                     for i in range(0, self.aX.size())]))
            if step_range < 0.05:
                self.model = E_2[0]
                if self.model_type == 'OLS':
                    # Return the coefficients
                    self.network_coef = GraphSet()
                    self.network_coef.add(
                        self.give_me_a_network(
                            pd.Series(data=E_2[0].intercept_.flatten(),
                                      index=self.variables_names),
                            self.aX.node_attr, self.aX.edge_attr,
                            s='Intercept'))
                    for i_th in range(E_2[0].coef_.shape[1]):
                        self.network_coef.add(
                            self.give_me_a_network(
                                pd.Series(data=E_2[0].coef_[:, i_th],
                                          index=self.variables_names),
                                self.aX.node_attr, self.aX.edge_attr,
                                s=str('beta' + str(i_th))))
                self.regression_error = pd.DataFrame.from_dict({
                    iteration: [self.regression_error[observation, iteration]
                                for observation in range(self.aX.size())]
                    for iteration in range(k + 1)})
                self.postalignment_error = pd.DataFrame.from_dict({
                    iteration: [self.postalignment_error[observation, iteration]
                                for observation in range(self.aX.size())]
                    for iteration in range(k + 1)})
                self.nr_iterations = k
                print("Step Range smaller than 0.05")
                return
            # else go on with the computation: update the new result and
            # restart from step 1.
            del E_1
            E_1 = E_2
            del E_2
        print("Maximum number of iterations reached.")
        # Return the result.
        # Note: E_2 is deleted at the end of every iteration, so after the loop
        # E_1 holds the last estimate and the else branch below is the one
        # normally taken.
        if 'E_2' in locals():
            self.model = E_2[0]
            if self.model_type == 'OLS':
                # Return the coefficients
                self.network_coef = GraphSet()
                self.network_coef.add(
                    self.give_me_a_network(
                        pd.Series(data=E_2[0].intercept_.flatten(),
                                  index=self.variables_names),
                        self.aX.node_attr, self.aX.edge_attr, s='Intercept'))
                for i_th in range(E_2[0].coef_.shape[1]):
                    self.network_coef.add(
                        self.give_me_a_network(
                            pd.Series(data=E_2[0].coef_[:, i_th],
                                      index=self.variables_names),
                            self.aX.node_attr, self.aX.edge_attr,
                            s=str('beta' + str(i_th))))
                self.regression_error = pd.DataFrame.from_dict({
                    iteration: [self.regression_error[observation, iteration]
                                for observation in range(self.aX.size())]
                    for iteration in range(self.nr_iterations)})
                self.postalignment_error = pd.DataFrame.from_dict({
                    iteration: [self.postalignment_error[observation, iteration]
                                for observation in range(self.aX.size())]
                    for iteration in range(self.nr_iterations)})
            else:
                # Return the prior and the posterior
                # ATTENTION: CHECK ON THE PRIOR WITH AASA
                self.y_post = E_2[1]
                self.y_post_std = E_2[2]
            del E_2, E_1
        else:
            self.model = E_1[0]
            if self.model_type == 'OLS':
                # Return the coefficients
                self.network_coef = GraphSet()
                self.network_coef.add(
                    self.give_me_a_network(
                        pd.Series(data=E_1[0].intercept_.flatten(),
                                  index=self.variables_names),
                        self.aX.node_attr, self.aX.edge_attr, s='Intercept'))
                for i_th in range(E_1[0].coef_.shape[1]):
                    self.network_coef.add(
                        self.give_me_a_network(
                            pd.Series(data=E_1[0].coef_[:, i_th],
                                      index=self.variables_names),
                            self.aX.node_attr, self.aX.edge_attr,
                            s=str('beta' + str(i_th))))
                self.regression_error = pd.DataFrame.from_dict({
                    iteration: [self.regression_error[observation, iteration]
                                for observation in range(self.aX.size())]
                    for iteration in range(self.nr_iterations)})
                self.postalignment_error = pd.DataFrame.from_dict({
                    iteration: [self.postalignment_error[observation, iteration]
                                for observation in range(self.aX.size())]
                    for iteration in range(self.nr_iterations)})
            else:
                # Return the prior and the posterior
                # ATTENTION: CHECK ON THE PRIOR WITH AASA
                self.y_post = E_1[1]
                self.y_post_std = E_1[2]
            del E_1

    # Align wrt a geodesic: the alignment wrt a geodesic aiming at predicting
    # data is an alignment wrt the prediction along the regression gamma(x_i)
    # and the data point itself y_i, i.e. find the optimal candidate y* in [y]
    # s.t. d(gamma(x), y) is minimum.
    def align_pred(self, y_pred, i, k):
        self.aX.get_node_attr()
        self.aX.get_edge_attr()
        # Parallel version (a sequential loop over i is equivalent).
        # Transform the estimation into a network to compute the network distances
        y_pred_net = self.give_me_a_network(y_pred.iloc[i], self.aX.node_attr,
                                            self.aX.edge_attr)
        # Regression error:
        match = ID(self.distance)
        self.regression_error[i, k] = match.dis(self.aX.X[i], y_pred_net)
        self.postalignment_error[i, k] = self.matcher.dis(self.aX.X[i], y_pred_net)
        self.f[i] = self.matcher.f
        del y_pred_net, match

    # Compute the generalized geodesic regression on the total space as a
    # regression of the aligned graph set
    def est(self, k):
        # Step 1: Create the current permuted dataset
        self.f_iteration[k] = self.f
        G_per = GraphSet()
        for i in range(self.aX.size()):
            G_temp = copy.deepcopy(self.aX.X[i])
            G_temp.permute(self.f[i])
            G_temp.s = copy.deepcopy(self.aX.X[i].s)
            G_per.add(G_temp)
            del G_temp
        del self.aX
        self.aX = copy.deepcopy(G_per)
        # Step 2: Transform it into a matrix
        y = G_per.to_matrix_with_attr()
        # parameter saved:
        self.variables_names = y.columns
        # Step 3: create the x vector of regressors
        x = pd.DataFrame(columns=range(len(G_per.X[0].s)),
                         index=range(y.shape[0]))
        for i in range(y.shape[0]):
            x.iloc[i] = [float(regressor) for regressor in G_per.X[i].s]
        self.regressor = x
        # Step 4: fit the chosen regression model
        # Ordinary Least Squares
        if self.model_type == 'OLS':
            # Create the linear regression object
            model = linear_model.LinearRegression()
            model.fit(x, y)
            along_geo_pred = pd.DataFrame(model.predict(x),
                                          columns=self.variables_names)
            self.f_all[k] = self.f
            return (model, along_geo_pred)
        # Gaussian Process
        elif self.model_type == 'GPR':
            along_geo_pred = pd.DataFrame(index=range(y.shape[0]),
                                          columns=self.variables_names)
            along_geo_pred_sd = pd.DataFrame(index=range(y.shape[0]),
                                             columns=self.variables_names)
            # We fit a different Gaussian process for every variable
            # (i.e. for every node or edge)
            for m in range(len(self.variables_names)):
                # Initialize the gaussian process
                model = gaussian_process.GaussianProcessRegressor(
                    kernel=self.kernel,
                    n_restarts_optimizer=self.restarts,
                    alpha=self.alpha)
                # Fitting the Gaussian Process means finding the correct hyperparameters
                model.fit(x, y.iloc[:, m])
                # Saving the model
                self.models[self.variables_names[m]] = model
                # Predict to compute the regression error (to compare with the alignment error)
                y_pred, y_std = model.predict(x, return_std=True)
                # save both the predicted y and the std, to estimate the posterior
                along_geo_pred.loc[:, self.variables_names[m]] = pd.Series(y_pred)
                along_geo_pred_sd.loc[:, self.variables_names[m]] = pd.Series(y_std)
                # Compute the error (this can be substituted with any error
                # function). Note: the original referenced an undefined
                # y_tr.iloc[:, 2]; the m-th fitted variable is used here instead.
                err_euclidean = (y.iloc[:, m] - y_pred).pow(2)
                err_weighted = [err_euclidean[i] / y_std[i]
                                for i in range(len(y_std))]
                # Note: this accumulation assumes regression_error is a DataFrame
                # (see the commented-out initialization in __init__); align_pred
                # later overwrites these entries with the matched distances.
                self.regression_error.iloc[:, k] += err_weighted
            # Return the per-variable posterior std table as well
            # (the original returned only the std of the last fitted variable)
            return (model, along_geo_pred, along_geo_pred_sd)
        else:
            raise Exception("Wrong regression model: select either OLS or GPR")

    # Given x_new, predict the corresponding graph:
    def predict(self, x_new, std=False):
        if not isinstance(x_new, pd.core.frame.DataFrame):
            print("The new observation should be a pandas dataframe of real values")
        self.y_vec_pred = self.model.predict(X=x_new)
        self.y_net_pred = GraphSet()
        for i in range(self.y_vec_pred.shape[0]):
            self.y_net_pred.add(
                self.give_me_a_network(
                    geo=pd.Series(data=self.y_vec_pred[i],
                                  index=self.variables_names),
                    n_a=self.aX.node_attr,
                    e_a=self.aX.edge_attr,
                    s=float(x_new.loc[i])))
        if std and self.model_type == 'GPR':
            # recompute the prediction keeping the standard deviation as well
            self.y_vec_pred, self.y_std_pred = self.model.predict(
                X=x_new, return_std=True)
            self.y_net_pred = GraphSet()
            for i in range(self.y_vec_pred.shape[0]):
                self.y_net_pred.add(
                    self.give_me_a_network(
                        geo=pd.Series(data=self.y_vec_pred[i],
                                      index=self.variables_names),
                        n_a=self.aX.node_attr,
                        e_a=self.aX.edge_attr,
                        s=float(x_new.loc[i])))

    # These functions are auxiliary functions to compute the ggr.
    # geo is a pd.Series; n_a and e_a are the number of node and edge attributes.
    def give_me_a_network(self, geo, n_a, e_a, s=None):
        ind = [re.findall(r'-?\d+\.?\d*', k) for k in geo.axes[0]]
        x_g = {}
        for i in range(len(ind)):
            if (len(ind[i]) > 2 and int(ind[i][0]) == int(ind[i][1])
                    and not (int(ind[i][0]), int(ind[i][1])) in x_g):
                x_g[int(ind[i][0]), int(ind[i][1])] = [
                    geo.loc[geo.axes[0][i + j]] for j in range(n_a)
                ]
            elif (len(ind[i]) > 2 and int(ind[i][0]) != int(ind[i][1])
                    and not (int(ind[i][0]), int(ind[i][1])) in x_g):
                x_g[int(ind[i][0]), int(ind[i][1])] = [
                    geo.loc[geo.axes[0][i + j]] for j in range(e_a)
                ]
            elif (len(ind[i]) == 2
                    and not (int(ind[i][0]), int(ind[i][1])) in x_g):
                x_g[int(ind[i][0]), int(ind[i][1])] = [geo.loc[geo.axes[0][i]]]
        geo_net = Graph(x=x_g, adj=None, s=s)
        return geo_net

    # Conformal prediction (not yet implemented)
    def align_est_and_predRegions(self, alpha):
        # Divide training and test.
        # The training set is saved in aX; X.s holds the regressors.
        # self.est and self.align_pred are the two functions for the estimation
        # of the coefficients; the coefficients can be extracted as
        # self.network_coef (a GraphSet).
        return 0

    # This function is used to parallelize the alignment procedure:
    # it receives the index of a graph and a target graph, matches them,
    # and stores the optimal permutation in f.
    def two_net_match(self, X2, i, first_id):
        if i == first_id:
            self.f[first_id] = range(self.aX.n_nodes)
        else:
            # Align X to Y
            self.matcher.the_dis(self.aX.X[i], X2)
            # Permutation of X to go closer to Y
            self.f[i] = self.matcher.f
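# Hedged usage sketch for ggr_aac, under the following assumptions: `matcher` is
# an alignment matcher object, hamming() and ID are the distance and identity
# matcher used elsewhere in this file, and the GraphSet G carries a scalar
# regressor in each X[i].s. Object and variable names are illustrative only:
# r = ggr_aac(G, matcher, hamming(), regression_model='OLS', nr_iterations=100)
# r.align_and_est()                         # AAC loop: align, fit, repeat
# x_new = pd.DataFrame({0: [1.5, 2.5]})     # new regressor values
# r.predict(x_new)                          # populates r.y_net_pred (a GraphSet)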
def align_and_est(self, max_iterations=200, eps=0.001):
    # Select a Random Candidate:
    first_id = random.randint(0, self.aX.size() - 1)
    m_1 = self.aX.X[first_id]
    self.f[first_id] = range(self.X.n_nodes)
    # maximum number of iterations of the mean AAC loop
    for self.k in range(max_iterations):
        print("\n start of iteration: " + str(self.k))
        for i in range(self.X.size()):
            # Align X to Y
            a = self.matcher.align(self.aX.X[i], m_1)
            # Permutation of X to go closer to Y
            self.f[i] = a.f
        m_2 = self.est(m_1)
        step_range = self.matcher.dis(m_1, m_2)
        if step_range < eps:
            self.mean = m_2
            # Update aX with the final permutations:
            Aligned = GraphSet()
            for i in range(self.X.size()):
                G = self.aX.X[i]
                G.permute(self.f[i])
                Aligned.add(G)
                del G
            self.aX = copy.deepcopy(Aligned)
            del Aligned
            print("Step Range smaller than " + str(eps))
            return
        else:
            del m_1
            m_1 = m_2
            del m_2
            # check here
            self.f.clear()
    print("Maximum number of iterations reached.")
    # Note: m_2 is deleted at the end of every iteration, so after the loop the
    # else branch below is the one normally taken.
    if 'm_2' in locals():
        self.mean = m_2
        # Update aX with the final permutations
        # (the original pre-added the unpermuted X[0] here, duplicating it;
        # all graphs are permuted in the loop instead)
        Aligned = GraphSet()
        for i in range(self.X.size()):
            G = self.aX.X[i]
            G.permute(self.f[i])
            Aligned.add(G)
            del G
        self.aX = copy.deepcopy(Aligned)
        del Aligned
        del m_2, m_1
    else:
        self.mean = m_1
        # Update aX with the final permutations
        # (same fix as above: every graph, X[0] included, is permuted)
        Aligned = GraphSet()
        for i in range(self.X.size()):
            G = self.aX.X[i]
            G.permute(self.f[i])
            Aligned.add(G)
            del G
        self.aX = copy.deepcopy(Aligned)
        del Aligned
        del m_1
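# Hedged usage sketch for the mean AAC (the estimator class name `mean_aac` and
# the matcher object are illustrative; `est` is assumed to return the candidate
# mean of the currently aligned set, as used in the loop above):
# m = mean_aac(G, matcher)
# m.align_and_est(max_iterations=200, eps=0.001)
# frechet_mean = m.mean        # the estimated mean graph
# aligned_set = m.aX           # the sample, permuted into alignment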
def align_and_est(self, n_comp, scale, s):
    # If True, scaling is applied to the GraphSet
    self.scale = scale
    # Range for the alignment wrt a geodesic
    self.s_min = s[0]
    self.s_max = s[1]
    # maximum number of iterations of the PCA AAC loop
    for k in range(100):
        # STEP 0: Align wrt a selected observation, compute the first pca
        if k == 0:
            self.f[0] = list(range(self.aX.n_nodes))
            # Align wrt the first observation (restored from the commented-out
            # "PREVIOUS" line: the original left m_1 undefined).
            # An alternative is to align wrt one of the minimum-size elements.
            m_1 = self.aX.X[0]
            for i in range(1, self.aX.size()):
                # Align X to Y
                a = self.matcher.align(self.aX.X[i], m_1)
                # Permutation of X to go closer to Y
                self.f[i] = a.f
            # Compute the first Principal Component in the first step
            E_1 = self.est(n_comp)
            continue
        # STEP 1: Align wrt the first principal component
        self.align_geo(E_1[1].loc[0, :])
        # STEP 2: Compute the principal components
        E_2 = self.est(n_comp)
        # STEP 3: Step range is the euclidean distance between the eigenvalues
        step_range = math.sqrt(sum([(a - b) ** 2
                                    for a, b in zip(E_2[0], E_1[0])]))
        if step_range < 0.01:
            # If small enough, we are converging: save and exit.
            self.e_val = E_2[0]
            self.scores = E_2[2]
            if n_comp == 1:
                self.e_vec = self.give_me_a_network(E_2[1].loc[0, :],
                                                    n_a=self.aX.node_attr,
                                                    e_a=self.aX.edge_attr)
            else:
                G_PCA = GraphSet()
                for n_pca in range(n_comp):
                    G_PCA.add(self.give_me_a_network(E_2[1].loc[n_pca, :],
                                                     n_a=self.aX.node_attr,
                                                     e_a=self.aX.edge_attr))
                self.e_vec = G_PCA
            self.barycenter_net = self.give_me_a_network(self.barycenter,
                                                         n_a=self.aX.node_attr,
                                                         e_a=self.aX.edge_attr)
            print("Step Range smaller than 0.01")
            return
        else:
            # Go on with the computation: update the new result and restart
            # from step 1.
            del E_1
            E_1 = E_2
            del E_2
    print("Maximum number of iterations reached.")
    # Return the result.
    # Note: E_2 is deleted at the end of every iteration, so after the loop the
    # else branch below is the one normally taken.
    if 'E_2' in locals():
        self.e_val = E_2[0]
        self.scores = E_2[2]
        self.barycenter_net = self.give_me_a_network(self.barycenter,
                                                     n_a=self.aX.node_attr,
                                                     e_a=self.aX.edge_attr)
        if n_comp == 1:
            self.e_vec = self.give_me_a_network(E_2[1].loc[0, :],
                                                n_a=self.aX.node_attr,
                                                e_a=self.aX.edge_attr)
        else:
            G_PCA = GraphSet()
            for n_pca in range(n_comp):
                G_PCA.add(self.give_me_a_network(E_2[1].loc[n_pca, :],
                                                 n_a=self.aX.node_attr,
                                                 e_a=self.aX.edge_attr))
            self.e_vec = G_PCA
            del G_PCA
        del E_2
    else:
        self.e_val = E_1[0]
        self.scores = E_1[2]
        self.barycenter_net = self.give_me_a_network(self.barycenter,
                                                     n_a=self.aX.node_attr,
                                                     e_a=self.aX.edge_attr)
        if n_comp == 1:
            self.e_vec = self.give_me_a_network(E_1[1].loc[0, :],
                                                n_a=self.aX.node_attr,
                                                e_a=self.aX.edge_attr)
        else:
            G_PCA = GraphSet()
            for n_pca in range(n_comp):
                G_PCA.add(self.give_me_a_network(E_1[1].loc[n_pca, :],
                                                 n_a=self.aX.node_attr,
                                                 e_a=self.aX.edge_attr))
            self.e_vec = G_PCA
            del G_PCA
        del E_1
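# Hedged usage sketch for the graph PCA (the estimator class name `gpca` and its
# constructor arguments are assumptions; n_comp, scale and s follow the
# signature above, and the attribute names are the ones set by the method):
# p = gpca(G, matcher, distance)
# p.align_and_est(n_comp=2, scale=True, s=[-1, 1])
# p.e_val             # explained variance ratios
# p.e_vec             # principal components as network(s)
# p.scores            # scores of each observation
# p.barycenter_net    # barycenter as a network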
x1[5, 2] = [1]
x2 = {}
x2[0, 0] = [1]
x2[1, 1] = [1]
x2[2, 2] = [1]
x2[3, 3] = [1]
x2[4, 4] = [1]
x2[5, 5] = [1]
x2[0, 1] = [1]
x2[1, 0] = [1]
x2[1, 2] = [1]
x2[2, 1] = [1]
x2[3, 4] = [1]
x2[4, 3] = [1]

# Create the Graph set:
G = GraphSet(graph_type='directed')
G.add(Graph(x=x1, s=[1, 2], adj=None))
G.add(Graph(x=x2, s=[2, 3], adj=None))

# Compute the hamming distance without matching the graphs
match = ID(hamming())
match.dis(G.X[0], G.X[1])

# 2) GRAPHS with Euclidean scalar and vector attributes on both nodes and edges
# Define the graphs:
x1 = {}
x1[0, 0] = [0.813, 0.630]
x1[1, 1] = [1.606, 2.488]
x1[2, 2] = [2.300, 0.710]
x1[3, 3] = [0.950, 1.616]
x1[4, 4] = [2.046, 1.560]