import json

import numpy as np

# NOTE: Graph is assumed to be defined elsewhere in this project. It must
# provide the add_nodes, link_all_industry, link_all_nodes and return_links
# methods used below.


def get_industry_matrix(nameOfFile, size=504):
    # Build the industry-similarity score matrix for the first `size`
    # companies (defaults to 504 when no size is supplied).
    matrix = []
    num = size

    # Collecting data from industry.json. This file gives the details of
    # each company's industry.
    with open(nameOfFile) as data_file:
        data = json.load(data_file)

    g = Graph()
    keys = list(data.keys())[:num]

    # Creating nodes
    for company in keys:
        g.add_nodes(company, data[company])

    # Adding these nodes to a graph by considering the links. To create the
    # links and make meaningful comparisons, we use semantic analysis with
    # NLP (spaCy): each company's industry description is compared against
    # every other company's, and links are created according to those
    # similarity values.
    g.link_all_industry()

    # Creating the matrix of link counts between every pair of companies.
    for i in range(num):
        matrix.append([])
        print(str(i) + "\t" + keys[i])
        for j in range(num):
            a = g.return_links(keys[i], keys[j])
            if a is None:
                a = []
            matrix[i].append(len(a))

    # Allotting the scores according to the number of links to a given
    # company: divide each row by its diagonal entry.
    mat = np.array(matrix, dtype=float)
    for i in range(num):
        for j in range(num):
            if i != j and mat[i][i] != 0:
                mat[i][j] /= mat[i][i]

    for i in range(num):
        mat[i][i] = 1
        mat[i] = norm(mat[i], i, num)

    # The industry matrix contributes 40% to the final combined matrix.
    mat = np.multiply(mat, 0.4)
    return mat
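# A minimal sketch of the kind of semantic comparison described above, using
# spaCy's vector-based Doc.similarity(). This is illustrative only, not the
# Graph class's actual implementation: industry_similarity is a hypothetical
# helper, and it assumes the en_core_web_md model (which ships with word
# vectors) is installed.
def industry_similarity(desc_a, desc_b):
    import spacy

    nlp = spacy.load("en_core_web_md")
    # Doc.similarity() returns the cosine similarity of the two documents'
    # averaged word vectors.
    return nlp(desc_a).similarity(nlp(desc_b))

# e.g. industry_similarity("software and cloud services",
#                          "enterprise cloud computing") yields a high score,
# while unrelated industries score lower.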
# normalize: rescale a row's off-diagonal entries so they sum to 1, leaving
# the diagonal (identity) entry untouched.
def norm(array, identity_index, arr_len):
    length = 0
    for i in range(arr_len):
        if i == identity_index:
            continue
        length += array[i]
    if length == 0:
        return array
    for i in range(arr_len):
        if i == identity_index:
            continue
        array[i] = array[i] / length
    return array
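# Quick illustration of norm() on a made-up 3-entry row with the diagonal at
# index 0: the diagonal is skipped and the remaining entries are rescaled to
# sum to 1.
#
#   >>> norm(np.array([1.0, 2.0, 6.0]), 0, 3)
#   array([1.  , 0.25, 0.75])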
def get_linkage_matrix(nameOfFile, size=504):
    # Build the link-overlap score matrix for the first `size` companies
    # (defaults to 504 when no size is supplied).
    matrix = []
    num = size

    # Collecting data from links.json. This file holds all the Wikipedia
    # links going out of each company's page.
    with open(nameOfFile) as data_file:
        data = json.load(data_file)

    g = Graph()
    keys = list(data.keys())[:num]

    # Creating nodes
    for company in keys:
        g.add_nodes(company, data[company])

    # Adding these nodes to a graph by considering the common links. Here we
    # check for direct equivalence, unlike the industry graph, which uses
    # semantic comparison via NLP techniques.
    g.link_all_nodes()

    for i in range(num):
        matrix.append([])
        for j in range(num):
            a = g.return_links(keys[i], keys[j])
            if a is None:
                a = []
            matrix[i].append(len(a))

    # Allotting the scores according to the number of links to a given
    # company: divide each row by its diagonal entry.
    mat = np.array(matrix, dtype=float)
    for i in range(num):
        for j in range(num):
            if i != j and mat[i][i] != 0:
                mat[i][j] /= mat[i][i]

    # Insert 1's on the diagonal, then normalize each row.
    for i in range(num):
        mat[i][i] = 1
        mat[i] = norm(mat[i], i, num)

    # The linkage matrix contributes 60% to the final combined matrix.
    mat = np.multiply(mat, 0.6)
    return mat
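# The 0.4 / 0.6 weights above sum to 1, which suggests the two matrices are
# meant to be added into one combined score matrix. A minimal sketch of that
# combination under that assumption; get_combined_matrix is a hypothetical
# helper (not part of the original module), and the file names are the ones
# the comments above mention.
def get_combined_matrix(size=504):
    industry = get_industry_matrix("industry.json", size)  # already weighted by 0.4
    linkage = get_linkage_matrix("links.json", size)       # already weighted by 0.6
    return industry + linkage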