import numpy as np
from rescal import rescal_als

def predict_rescal_als(T, idx):
    A, R, _, _, _ = rescal_als(
        T, 10, maxIter=10, lambda_A=10, lambda_R=10, compute_fit=False)
    # Score links from every entity to entity idx under the last relation slice.
    P = np.dot(A, np.dot(R[-1], A[idx, :].T))
    nrm = np.linalg.norm(P)
    if nrm != 0:
        P = np.round(P / nrm, decimals=3)
    return P
def tensor_factorization(lang, r, n_iter):
    _log.info("start factorization")
    X = pkl_utils._load(config.TENSOR[lang])
    _log.info("data loading complete")
    A, R, _, _, _ = rescal_als(
        X, r, maxIter=n_iter, lambda_A=10, lambda_R=10, compute_fit=False)
    data_output = {'A': A, 'R': R}
    pkl_utils._save(config.RESCAL_OUTPUT[lang], data_output)
    _log.info("factorization complete")
from rescal import rescal_als

def get_embeddings(Tenc):
    # Factorize the encoded tensor at rank 250 and return the entity embeddings.
    feature_vec, R, fit, itr, exectimes = rescal_als(
        Tenc, 250, init='nvecs', conv=1e-2, lambda_A=0.1, lambda_R=0.1)
    return feature_vec
from numpy import dot, zeros
from rescal import rescal_als

def predict_rescal_als(T):
    A, R, _, _, _ = rescal_als(
        T, 100, init='nvecs', conv=1e-3, lambda_A=10, lambda_R=10)
    n = A.shape[0]
    P = zeros((n, n, len(R)))
    for k in range(len(R)):
        P[:, :, k] = dot(A, dot(R[k], A.T))
    return P
from numpy import dot, zeros
from rescal import rescal_als

def predict_rescal_als(T):
    A, R, f, itr, exectimes = rescal_als(
        T, 10, init='nvecs', lambda_A=10, lambda_R=10)
    n = A.shape[0]
    P = zeros((n, n, len(R)))
    for k in range(len(R)):
        P[:, :, k] = dot(A, dot(R[k], A.T))
    return A, P, R
from numpy import dot, zeros
from rescal import rescal_als

def predict_rescal_als(T, rank, lambda_A, lambda_R):
    A, R, _, _, _ = rescal_als(
        T, rank, init='nvecs', conv=1e-4, lambda_A=lambda_A, lambda_R=lambda_R,
        compute_fit=True)
    n = A.shape[0]
    P = zeros((n, n, len(R)))
    for k in range(len(R)):
        P[:, :, k] = dot(A, dot(R[k], A.T))
    return P
import numpy as np
from rescal import rescal_als

def predict_rescal_als(T, rank=100, lambda_A=10, lambda_R=10):
    '''T is a list of sparse 2d lil_matrices.'''
    A, R, _, _, _ = rescal_als(
        T, rank, init='nvecs', conv=1e-3, lambda_A=lambda_A, lambda_R=lambda_R)
    n = A.shape[0]
    P = np.zeros((n, n, len(R)))
    for k in range(len(R)):
        P[:, :, k] = np.dot(A, np.dot(R[k], A.T))
    return P
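A minimal usage sketch (my addition, not from any of the sources above): calling the predict_rescal_als variant directly above on a small synthetic tensor. The toy data, sizes, and seed are illustrative assumptions.

import numpy as np
from scipy.sparse import lil_matrix

# Hypothetical smoke test: two random binary relation slices over 20 entities.
rng = np.random.RandomState(0)
T_toy = [lil_matrix((rng.rand(20, 20) < 0.1).astype(float)) for _ in range(2)]
P = predict_rescal_als(T_toy, rank=5)
print(P.shape)  # (20, 20, 2): one reconstructed score matrix per relation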
def fit(self):
    data = self.frontend.data_ma
    K = self.expe.K
    data = [sp.sparse.csr_matrix(data)]
    A, R, fit, itr, exectimes = rescal_als(
        data, K, init='nvecs', lambda_A=10, lambda_R=10, maxIter=self.iterations)
    self.log.info('Rescal fit info : ')
    print('fit: %s; itr: %s, exectimes: %s' % (fit, itr, exectimes))
    self._theta = A
    self._phi = R
import numpy as np
import scipy as sp
import scipy.sparse
from rescal import rescal_als

def rescal(X, K):
    ## Load Matlab data and convert it to dense tensor format
    #T = loadmat('data/alyawarra.mat')['Rs']
    #X = [lil_matrix(T[:, :, k]) for k in range(T.shape[2])]
    X = [sp.sparse.csr_matrix(X)]
    A, R, fit, itr, exectimes = rescal_als(
        X, K, init='nvecs', lambda_A=10, lambda_R=10)
    # R is a list with one K x K core matrix; numpy broadcasts the list to a
    # 3-d array, so theta has shape (n, 1, n) for this single-slice input.
    theta = A.dot(R).dot(A.T)
    Y = 1 / (1 + np.exp(-theta))  # logistic link on the reconstruction
    Y = Y[:, 0, :]
    # Threshold the link probabilities at 0.5.
    Y[Y <= 0.5] = 0
    Y[Y > 0.5] = 1
    #Y = sp.stats.bernoulli.rvs(Y)
    return Y
def fit(self, frontend):
    self._init(frontend)
    K = self.expe.K
    y = frontend.adj()
    data = [y]
    A, R, fit, itr, exectimes = rescal_als(
        data, K, init='nvecs', lambda_A=10, lambda_R=10)
    self._theta = A
    self._phi = R[0]
    self.log.info('rescal fit info: %s; itr: %s, exectimes: %s'
                  % (fit, itr, exectimes))
    self.compute_measures()
    if self.expe.get('_write'):
        self.write_current_state(self)
def execute_rescal(input_tensor, rank, useNeedTypeSlice=True,
                   useConnectionSlice=True, init='nvecs', conv=1e-4,
                   lambda_A=0, lambda_R=0, lambda_V=0):
    temp_tensor = input_tensor.getSliceMatrixList()
    if not useNeedTypeSlice:
        _log.info('Do not use needtype slice for RESCAL')
        del temp_tensor[SparseTensor.NEED_TYPE_SLICE]
    if not useConnectionSlice:
        _log.info('Do not use connection slice for RESCAL')
        del temp_tensor[SparseTensor.CONNECTION_SLICE]
    _log.info('start rescal processing ...')
    _log.info('config: init=%s, conv=%f, lambda_A=%f, lambda_R=%f, lambda_V=%f'
              % (init, conv, lambda_A, lambda_R, lambda_V))
    _log.info('Datasize: %d x %d x %d | Rank: %d' %
              (temp_tensor[0].shape + (len(temp_tensor),) + (rank,)))
    A, R, _, _, _ = rescal_als(
        temp_tensor, rank, init=init, conv=conv, lambda_A=lambda_A,
        lambda_R=lambda_R, lambda_V=lambda_V, compute_fit=True)
    _log.info('rescal stopped processing')
    return A, R
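A hedged follow-up sketch (my addition, not from the original module): one way to turn the A and R that execute_rescal returns into link scores. The helper name and the slice index are illustrative assumptions.

import numpy as np

def score_slice(A, R, k):
    # Entry [i, j] of the reconstructed slice scores the triple (i, relation k, j).
    return np.dot(A, np.dot(R[k], A.T))

# e.g. (hypothetical): A, R = execute_rescal(input_tensor, rank=50)
# scores = score_slice(A, R, 0)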
def netCreate(X, r, minEdges, sampleMethod='deterministic', rescal_lambda_A=10,
              rescal_lambda_R=10, plotting=True, layout='spring', graphScale=1.0):
    """Wrapper for sktensor.rescal_als.

    Create a network by the given sampleMethod from hierarchical clustering of
    the RESCAL-ALS tensor factorization of the singular-values matrix A(A^T).
    Input: X is a list of sktensor.csr_matrices [X[k] for k in relationships];
    each X_k is a frontal slice of the adjacency tensor (i.e., the adjacency
    matrix of one relationship type).
    Return: {'cluster': hm, 'graph': g, 'linkage': hm['linkage'],
             'theta': theta, 'A': A, 'Z': Z}
    """
    #import logging
    #from rescal import rescal_als
    #import numpy as np
    #import pandas as pd
    #import networkx as nx
    #import matplotlib.pyplot as plt
    #from scipy.spatial.distance import pdist, squareform
    # Set logging to INFO to see RESCAL information
    logging.basicConfig(level=logging.INFO)
    # Decompose tensor using RESCAL-ALS
    A, R, fit, itr, exectimes = rescal_als(
        X, r, init='nvecs', lambda_A=rescal_lambda_A, lambda_R=rescal_lambda_R)
    # make the AAT matrix
    AAT = np.dot(A, A.T)
    # heatmap
    hm = heatmap(AAT)
    plt.suptitle(r'A(A^T) HAC for Induced Rank = %s, $\lambda_{A}$ = %s, '
                 r'$\lambda_{R}$ = %s' % (r, rescal_lambda_A, rescal_lambda_R),
                 fontweight='bold', fontsize=14)
    # remove negative values; copy so AAT itself is not modified in place
    AATnn = AAT.copy()
    AATnn[AATnn < 0] = 0
    # remove upper triangle; k = -1 keeps only entries below the diagonal
    AATnn = np.tril(AATnn, k=-1)
    # reproducibility
    np.random.seed(1)
    # network sampling
    if sampleMethod == 'Bernoulli':
        # shortcut: normalize by the largest AAT (non-negative) value for
        # separate Bernoulli draws instead of summing over all of AAT for a
        # multinomial draw, which is harder to transfer back and forth
        # between vector and triangular matrix
        theta = AATnn / AATnn.max()
        # NETWORK SAMPLE ALGORITHM: randomly sample ties in the network
        # adjacency matrix one element at a time (Bernoulli shortcut)
        # instead of taking a multinomial sample of the entire adjacency
        n = np.shape(theta)[0]
        m = np.shape(theta)[1]
        Z = np.zeros((n, m))
        # use dependent row,col permutations to randomly select elements ij
        # to sample after the first full pass through the matrix
        while np.sum(Z) < minEdges:
            shuffledRows = np.arange(1, n)  # up to n rows
            np.random.shuffle(shuffledRows)  # first shuffle rows
            for i in shuffledRows:
                # for a given row, shuffle the lower-triangle columns j of row i
                shuffledCols = np.arange(i)  # up to (i-1) cols, i.e., lower triangle
                np.random.shuffle(shuffledCols)
                for j in shuffledCols:
                    if Z[i, j] < 1:
                        Z[i, j] = np.random.binomial(n=1, p=theta[i, j], size=1)
                    if np.sum(Z) >= minEdges:
                        break
                if np.sum(Z) >= minEdges:
                    break
    elif sampleMethod == 'multinomial':
        # NETWORK SAMPLING ALGORITHM
        # problem: doesn't sufficiently cluster the resulting network
        draws = int(np.ceil(minEdges * 1.2))
        dist = pdist(A)  # what matrix to use: pdist(A) or just tril(AAT) directly?
        invdist = dist
        invdist[invdist != 0] = 1 / invdist[invdist != 0]  # prevent division by 0
        thetavec = invdist / np.sum(invdist)
        theta = squareform(thetavec)
        # multinomial sample
        n = np.shape(theta)[0]
        Z = np.zeros((n, n))
        samp = sampleLinks(q=thetavec, edgesToDraw=1, draws=draws)
        while np.sum(samp) < minEdges:
            draws = int(np.ceil(draws * 1.1))  # increase number of draws and try again
            samp = sampleLinks(q=thetavec, edgesToDraw=1, draws=draws)
        Z[np.tril_indices_from(Z, k=-1)] = samp
    elif sampleMethod == 'deterministic':
        theta = AATnn / AATnn.max()
        n = np.shape(AATnn)[0]
        sv = AATnn[np.tril_indices_from(AATnn, k=-1)]  # pull singular values from triangle
        cutOff = topNEdges(data=sv, minEdges=minEdges, n=n)['cutOff']
        Z = np.zeros((n, n))
        Z[np.where(AATnn >= cutOff)] = 1
    else:
        print('No valid sampleMethod selected. Please choose "Bernoulli", '
              '"multinomial", or "deterministic".')

    # NETWORK
    # Create networkx graph from Z
    g = nx.Graph()
    # add nodes with colors of group
    for n in np.arange(np.shape(hm['corder'])[0] - 1):
        g.add_node(hm['corder'][n], color=hm['group'][n])
    nodeColorList = list(nx.get_node_attributes(g, 'color').values())
    # add edges with weight of theta (probability the link exists)
    cardE = len(np.where(Z == 1)[1])
    edgeList = [(np.where(Z == 1)[0][i], np.where(Z == 1)[1][i])
                for i in np.arange(cardE)]
    # scaled link prob Pr(Z[i,j]=1) * weight
    edgeWeightList = theta[np.where(Z == 1)] * (2 / max(theta[np.where(Z == 1)]))
    for e in np.arange(len(edgeList) - 1):
        g.add_edge(edgeList[e][0], edgeList[e][1], weight=edgeWeightList[e])

    # NODE SIZES
    # 1. cluster linkage importance
    #nodesizelist = cluster['linkage'] * (400 / max(cluster['linkage']))
    # 2. betweenness centrality (wide range of sizes; very small on periphery)
    #nodesizelist = np.asarray(list(nx.betweenness_centrality(G, normalized=False).values())) * (400 / max(list(nx.betweenness_centrality(G, normalized=False).values())))
    # 3. degree (smaller range of sizes; easier to see on the periphery)
    nodeSizeList = np.asarray(list(g.degree().values())) * (350 / max(
        list(g.degree().values())))  # scaled so the largest is size 350

    if plotting:
        # reproducibility
        np.random.seed(1)
        #bc = nx.betweenness_centrality(g)
        E = len(nx.edges(g))
        V = len(g)
        k = round(E / V, 3)
        #size = np.array(list(bc.values())) * 1000  # replacing the hierarchical magnitude hm['corder']
        fignx = plt.figure(figsize=(10, 10))
        # use heatmap color groupings to color nodes and heatmap magnitudes to size nodes
        if layout == 'spring':
            nx.draw(g, pos=nx.spring_layout(g, scale=graphScale),
                    node_color=nodeColorList, node_size=nodeSizeList,
                    width=edgeWeightList)
        elif layout == 'fruchterman':
            nx.draw(g, pos=nx.fruchterman_reingold_layout(g, scale=graphScale),
                    node_color=nodeColorList, node_size=nodeSizeList,
                    width=edgeWeightList)
        else:
            print('Please indicate a valid layout.')
            #nx.graphviz_layout(g, prog=graphProg)
        plt.title('Network Created from Induced Rank = %s \n V = %s, E = %s, '
                  '<k> = %s' % (r, V, E, k), fontweight='bold', fontsize=14)
        # plot log degree sequence
        degree_sequence = sorted(nx.degree(g).values(), reverse=True)
        fig3 = plt.figure(figsize=(10, 5))
        plt.loglog(degree_sequence)
        plt.title('Log Degree Distribution', fontweight='bold', fontsize=14)

    return {'cluster': hm, 'graph': g, 'linkage': hm['linkage'],
            'theta': theta, 'A': A, 'Z': Z}
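A hypothetical call sketch for netCreate (my addition, not from the original file). It assumes the helper functions heatmap, topNEdges, and sampleLinks referenced in the body are defined in the same module; the data and parameters below are illustrative only.

import numpy as np
from scipy.sparse import csr_matrix

# Three random sparse relation slices over 30 nodes (illustrative data only).
rng = np.random.RandomState(1)
X = [csr_matrix((rng.rand(30, 30) < 0.1).astype(float)) for _ in range(3)]
net = netCreate(X, r=5, minEdges=40, sampleMethod='deterministic', plotting=False)
print(net['Z'].sum(), 'links in the deterministic sample')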
import logging
from scipy.io.matlab import loadmat
from scipy.sparse import lil_matrix
from rescal import rescal_als

# Set logging to INFO to see RESCAL information
logging.basicConfig(level=logging.INFO)

# Load Matlab data and convert it to dense tensor format
T = loadmat('data/alyawarradata.mat')['Rs']
X = [lil_matrix(T[:, :, k]) for k in range(T.shape[2])]

# Decompose tensor using RESCAL-ALS
A, R, fit, itr, exectimes = rescal_als(X, 100, init='nvecs', lambda_A=10, lambda_R=10)
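A short follow-up sketch (my assumption, not part of the original example script): reconstructing one frontal slice from the learned factors and listing the strongest predicted links in it.

import numpy as np

# Reconstruct relation slice 0: P0[i, j] scores the link i -> j.
P0 = np.dot(A, np.dot(R[0], A.T))
# The five strongest predicted links, highest score first (illustrative).
flat = np.argsort(P0, axis=None)[::-1][:5]
print(list(zip(*np.unravel_index(flat, P0.shape))))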
if not os.path.exists(dest):
    os.makedirs(dest, exist_ok=True)
for nt in range(n_test):
    print(nt)
    file_name = os.path.join(
        dest, 'init_%.3f_rescal_n_dim_%d_%d.txt' % (p_obs, n_dim, nt))
    if not os.path.exists(file_name):
        seq = list()
        with open('../data/%s/train_%.3f.pkl' % (dataset, p_obs), 'rb') as f:
            mask = pickle.load(f)
        X = [csr_matrix(mask[k]) for k in range(n_relation)]
        for i in range(budget):
            try:
                A, R, f, itr, exectimes = rescal.rescal_als(X, n_dim)
            except Exception:
                # Fall back to random factors when the factorization fails.
                A = np.random.random([n_entity, n_dim])
                R = np.random.random([n_relation, n_dim, n_dim])
            _X = np.zeros_like(T)
            for k in range(T.shape[0]):
                _X[k] = np.dot(np.dot(A, R[k]), A.T)
            find = False
            while not find:
                _X[mask == 1] = -99999999
                next_idx = np.unravel_index(_X.argmax(), _X.shape)
                mask[next_idx] = 1
                seq.append(next_idx)
                if T[next_idx] == 1:
                    find = True  # stop once a positive triple is queried
    except ValueError:
        print(' NO_PATH ', idx_to_name[e])

# This is the induced subgraph with the deleted edges.
# We can do RESCAL on the dependency matrix of this induced subgraph.
my_subgraph = g.induced_subgraph(cmp_22147)
my_subgraph_adj = lil_matrix(
    (my_subgraph.vcount(), my_subgraph.vcount()), dtype='uint8')
for eel in my_subgraph.get_edgelist():
    my_subgraph_adj[eel[0], eel[1]] = 1
T = [lil_matrix(my_subgraph_adj)]
A, R, _, _, _ = rescal_als(
    T, 10, init='nvecs', conv=1e-3, lambda_A=1, lambda_R=1)
idx_of_org_in_cmp_22147 = cmp_22147.index(org)
als_prediction = np.dot(np.dot(A[idx_of_org_in_cmp_22147], R[0]), A.T)
# Now, based on this ranking, find out what the rank of the true
# employees would be.
als_pred_of_employees_removed = [als_prediction[cmp_22147.index(e)]
                                 for e in employees_to_remove]
sorted_als_pred = sorted(
    filter(lambda x: cmp_22147_mask[x[0]], enumerate(als_prediction)),
    reverse=True, key=lambda x: x[1])
sorted_als_pred_idx = [e[0] for e in sorted_als_pred]
sorted_als_pred_val = [e[1] for e in sorted_als_pred]
sparse = [lil_matrix((len(entities), len(entities))) for r in relations]
ind = 0
for rel in relations:
    for s in rel:
        for t in rel[s]:
            sparse[ind][e2i[s], e2i[t]] = rel[s][t]
    ind += 1
#plt.show()
fname = '{}{}.pkl'.format(sys.argv[1], sys.argv[2])
if os.path.isfile(fname):
    # Pickle files must be opened in binary mode.
    A, R, fit, itr, exectimes = pickle.load(open(fname, 'rb'))
else:
    A, R, fit, itr, exectimes = rescal_als(
        sparse, int(sys.argv[2]), init='nvecs', conv=1e-6, lambda_A=1, lambda_R=1)
    pickle.dump([A, R, fit, itr, exectimes], open(fname, 'wb'))
n = A.shape[0]
P = zeros((n, n, len(R)))
for k in range(len(R)):
    P[:, :, k] = dot(A, dot(R[k], A.T))
print(A.shape)
print(len(R), R[0].shape)
plt.matshow(P[:, :, 0])
#plt.show()
A_ = np.mean(A, axis=0)
e2vec = {}
for e in a2col:
    e2vec[e] = A[e2i[e], :]
print(e2vec)
    assert len(entity) == 14951
    rel = data_table[1].unique()
    assert len(rel) == 1345
    entity_n = len(entity)
    # Map each entity name to its row/column index in the tensor.
    entity_index = {v: k for k, v in enumerate(entity)}
    data_table.columns = ["head", "rel", "tail"]
    # Packing it into a three-indices tensor.
    print("Packaging Tensors...")
    for r in rel:
        tensor_slice = np.zeros([entity_n, entity_n])
        rel_dataFrame = data_table[data_table["rel"].str.contains(r)]
        for idx, row in rel_dataFrame.iterrows():
            head_index = entity_index[row[0]]
            tail_index = entity_index[row[2]]
            tensor_slice[head_index, tail_index] = 1
        tensor_slice = lil_matrix(tensor_slice)
        rel_mat.append(tensor_slice)
    return rel_mat


if __name__ == "__main__":
    X = read_data("./FB15k/freebase_mtr100_mte100-train.txt")
    print("Training starts.")
    A, R, fit, itr, exectimes = rescal_als(
        X, 100, init='nvecs', lambda_A=10, lambda_R=10)
def run_rescal(graph_dir, out_dir):
    try_makedirs(out_dir)
    T, node_dict, edge_dict = read_tensor_from_graph(graph_dir)
    #T, node_dict, edge_dict = create_test_tensor()
    print('Samples from T[0] and T[1]:')
    import random
    t1_keys = list(T[0][1].keys())
    random.shuffle(t1_keys)
    print('Relation for T[0]:', T[0][0])
    print('Number of keys for T[0]:', len(t1_keys))
    for i in range(10):
        print('  ', t1_keys[i])
    t2_keys = list(T[1][1].keys())
    random.shuffle(t2_keys)
    print('Relation for T[1]:', T[1][0])
    print('Number of keys for T[1]:', len(t2_keys))
    for i in range(10):
        print('  ', t2_keys[i])
    print('Size of intersection:', len(set(t1_keys).intersection(set(t2_keys))))

    print('Running RESCAL code')
    rank = 20
    maxIter = 100
    conv = 1e-5
    lambda_A = 0
    lambda_R = 0
    A, R, _, _, _ = rescal_als(
        [t[1].copy() for t in T],
        rank,
        init='nvecs',
        conv=conv,
        lambda_A=lambda_A,
        lambda_R=lambda_R,
        maxIter=maxIter,
    )
    print('A shape:', A.shape)
    print('R[0] shape:', R[0].shape)
    print('R size:', len(R))

    print('\nWriting A and R to disk')
    a_matrix_file = out_dir + 'a_matrix.tsv'
    r_matrix_file = out_dir + 'r_matrix.tsv'
    out = open(a_matrix_file, 'w')
    for n in range(node_dict.current_index - 1):
        node = node_dict.getString(n + 1)
        out.write(node)
        for j in range(rank):
            out.write('\t')
            out.write(str(A[n, j]))
        out.write('\n')
    out.close()
    out = open(r_matrix_file, 'w')
    for index, r in enumerate(R):
        relation = T[index][0]
        out.write(relation)
        out.write('\n')
        for i in range(rank):
            for j in range(rank):
                out.write(str(r[i, j]))
                if j < rank - 1:
                    out.write('\t')
            out.write('\n')
        out.write('\n')
    out.close()
    return A, R, rank, edge_dict, node_dict