def testmethod_4(self):
    dic = PageRank.relative_font_size("youtube")
    result = PageRank.search("youtube")
    if result == [-1]:
        self.assertNotEqual(len(dic), 1, "fail")
    else:
        self.assertNotEqual(len(dic), 0, "fail")
def testmethod_1(self):
    dic = PageRank.hits("zhu")
    result = PageRank.search("zhu")
    if result == [-1]:
        self.assertNotEqual(len(dic), 1, "fail")
    else:
        self.assertNotEqual(len(dic), 0, "fail")
def test_PR():
    IGgraph1 = PageRank.getIGraph(
        'MATCH (n:`andra-user`)<-[r:`andra-from`]-(t1:`andra-tweet`)-[re:`andra-retweet`]->(t2:`andra-tweet`)-[rel:`andra-from`]->(p:`andra-user`) WHERE t2.language="fr" RETURN n as nodeFrom,p as nodeTo',
        grapheNeo, "192.168.1.75:7474", "neo4j", "pass4dbse")
    IGgraph2 = PageRank.getIGraph(
        'MATCH (n:`andra-tweet`)-[re:`andra-retweet`]->(p:`andra-tweet`) RETURN n as nodeFrom,p as nodeTo',
        grapheNeo, "192.168.1.75:7474", "neo4j", "pass4dbse")
    assert type(IGgraph1) == igraph.Graph
    assert len(IGgraph1.vs) != 0
    assert len(IGgraph1.es) != 0
    assert type(IGgraph2) == igraph.Graph
    assert len(IGgraph2.vs) != 0
    assert len(IGgraph2.es) != 0
    PRank1 = PageRank.Rank(IGgraph1, 10)
    PRank2 = PageRank.Rank(IGgraph2, 10)
    assert type(PRank1) == dict
    assert len(PRank1['classement']) == 10
    assert type(PRank1['classement']) == list
    assert type(PRank1['resPR']) == list
    assert len(PRank1['resPR']) != 0
    assert type(PRank2) == dict
    assert len(PRank2['classement']) == 10
    assert type(PRank2['classement']) == list
    assert type(PRank2['resPR']) == list
    assert len(PRank2['resPR']) != 0
    print "PAGERANK OK"
def pesquisar(self):
    self.tela.wm_title('Googlis!')
    self.tela.wm_minsize(width=500, height=250)
    tkinter.Label(self.tela, text=self.texto, font=self.fonte).pack(side='top')
    ##### IMAGE
    img = tkinter.PhotoImage(file='googlis.png')
    logo = tkinter.Button(self.tela, image=img)
    logo.image = img
    logo.place(x=190, y=30)
    ##### BUTTONS
    busca = tkinter.Entry(self.tela)
    busca.place(x=150, y=150, width=200)
    botao_1 = tkinter.Button(
        self.tela, text='Pesquisa Googlis!',
        command=lambda: PageRank.PageRank(busca.get()).rank_it())
    botao_2 = tkinter.Button(self.tela, text='Estou com sorte!')
    botao_1.place(x=130, y=200)
    botao_2.place(x=270, y=200)
def main():
    data = read()
    M = data2matrix(data)
    v = PageRank.pagerank(M, 0.001, 0.85)
    funcDict = {}
    for i in xrange(counter):
        funcDict[id2func[i]] = v[i][0]
    funcDict = OrderedDict(sorted(funcDict.items(), key=lambda t: -t[1]))
    with open(os.path.join(resultFolder, resultFile), 'w') as pageRankFile:
        json.dump(funcDict, pageRankFile, indent=4)
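# The main() above calls PageRank.pagerank(M, 0.001, 0.85) on a column-stochastic link
# matrix M; the module itself is not shown here. A minimal power-iteration sketch with
# the same (matrix, tolerance, damping) signature could look like the following. This is
# an illustrative assumption for readers, not the project's actual implementation.
import numpy as np

def pagerank_sketch(M, eps=0.001, d=0.85):
    """Power iteration on a column-stochastic matrix M until the L1 change drops below eps."""
    n = M.shape[1]
    v = np.ones((n, 1)) / n                 # start from the uniform distribution
    teleport = (1 - d) / n                  # rank mass contributed by random jumps
    while True:
        v_next = d * (M @ v) + teleport     # damped link-following plus teleportation
        if np.abs(v_next - v).sum() < eps:  # converged
            return v_next
        v = v_next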
class SearchEngine:
    def __init__(self):
        indexer = Indexer()
        self.graph = Graph()
        self.crawler = Crawler({"http://mysql12.f4.htw-berlin.de/crawl/d01.html",
                                "http://mysql12.f4.htw-berlin.de/crawl/d06.html",
                                "http://mysql12.f4.htw-berlin.de/crawl/d08.html"},
                               self.graph, indexer)
        self.crawler.crawl()
        self.scorer = Scorer(indexer.index, indexer.documents)
        self.pageRank = PageRank(self.graph)
        self.pageRank.calc()

    def search(self, string, scoreOnly=False):
        query = string.split()
        scores = self.scorer.scoreQuery(query)
        if scoreOnly:
            results = scores
        else:
            results = {}
            for url, score in scores.items():
                results[url] = score * self.graph.get_document(url).rank
        sortedResults = sorted(results.items(), key=operator.itemgetter(1), reverse=True)
        for res in sortedResults:
            print(res)

    def printPageRanks(self):
        print('Page ranks:')
        print(' d01 - d02 - d03 - d04 - d05 - d06 - d07 - d08')
        # print the rounded rank of documents d01..d08 in order, separated by ' - '
        urls = ["http://mysql12.f4.htw-berlin.de/crawl/d0%d.html" % i for i in range(1, 9)]
        ranks = [str(round(self.graph.get_document(url).rank, 4)) for url in urls]
        print(' - '.join(ranks), end='\n\n')
def results():
    if request.method == 'POST':
        MAX_RESULTS = 20
        searchQuery = request.form['SearchQuery']  # get the text entered by the user
        results = PageRank.getRelevantResults(searchQuery, Table_InvertedIndex,
                                              Table_Webpages, MAX_RESULTS)
        return render_template("results.html", results=results)  # pass the list of results for rendering
def run(self, alpha, save_dir=""):
    pg = PageRank(self.itr, self.error, alpha)
    author_resvec = pg.run(self.author_mat, self.author_init, self.author_len)
    self.author.ScoreToName("Author", author_resvec, save_dir + "author_page_rank.txt")
    author_resmap = self.author.getResult()
    paper_resvec = pg.run(self.paper_mat, self.paper_init, self.paper_len)
    self.paper.ScoreToName("self.paper", paper_resvec, save_dir + "paper_page_rank.txt")
    paper_resmap = self.paper.getResult()
    self.venue_resvec = pg.run(self.venue_mat, self.venue_init, self.venue_len)
    self.venue.ScoreToVenue(self.venue_resvec, save_dir + "venue_rank.txt")
    venue_resmap = self.venue.getResult()
    return author_resmap, paper_resmap, venue_resmap
def run(self):
    self.output.config(state=NORMAL)
    self.output.delete(1.0, END)
    try:
        result, htmlPageNames = PageRank.PageRank(float(self.aEntry.get()), currentFile)
        for item in reversed(range(len(result))):
            self.output.insert('1.0', htmlPageNames[item] + ': ' +
                               str(round(float(result[item]), 3)) + '\n')
        # self.output.insert('1.0', result)
        self.output.insert('1.0', 'Importance of pages:' + '\n')
    except:
        self.output.insert('1.0', 'Invalid input.')
    self.output.config(state=DISABLED)
def links(query):
    text = []
    # Wipe out the results from the previous query so they do not carry over into this one.
    f = open('./results.txt', 'w')
    # Insert the styling tag that renders results in FreeSans at 1.2 times the original size, then close the file.
    f.write('<p style="font-family:FreeSans;font-size:120%;color:black">')
    f.close()
    PageRank.search(query)  # computes the results for the query and stores them in "results.txt"
    f = open("./results.txt")
    f2 = open('./numbers.txt')  # contains the total number of results for the user's query
    # Read the results file line by line into the list 'text', one element per line.
    for line in f:
        word = line.strip()
        text.append(word)
    f.close()
    # Read the number of results into the variable 'word2'.
    for line in f2:
        word2 = line.strip()
    f2.close()
    html = ('<p style="font-family:FreeSans;font-size:120%;color:black">' +
            "Your query '" + query + "' returned the following " + word2 + " result(s):<br/>")
    if query.find(' ') == 1:
        html = html + "NOTE: Since you typed MORE THAN ONE keyword, the following results may not be accurate.<br/>"
    html = html + "<br/>"
    html = html + "<br/>".join(text)
    return html  # displays the results to the user
def PageRank(self):
    # Specify the parameters of the PageRank algorithm
    iterations = 20
    initial_pr = 1.0
    if self.dataBaseConnection.cursor():
        # Fetch all links from the persistent file, rank them with PageRank,
        # and store the result in rankedList
        self.cursor.execute('SELECT * FROM Links;')
        myData = self.cursor.fetchall()
        rankedList = PageRank.page_rank(myData, iterations, initial_pr)
        for x in rankedList:
            # Use INSERT OR REPLACE to prevent duplicates
            self.cursor.execute(
                """INSERT OR REPLACE INTO PageRank (doc_id, rank) VALUES('%s', '%s');"""
                % (x, rankedList[x]))
        self.dataBaseConnection.commit()
def experiment_lambda_rounds(airports_hash, airports_sink, results, init_function, init_type):
    for i in range(5, 10):
        lambda_value = i * 0.1
        time1 = time.time()
        init_function(airports_hash)
        iterations = pr.compute_page_ranks(
            airports_hash=airports_hash,
            airports_sink=airports_sink,
            lambda_value=lambda_value,
            init_ranks=init_function)
        time2 = time.time()
        t = time2 - time1
        if init_type not in results:
            results[init_type] = {}
        lambda_str = '{:.1f}'.format(lambda_value)
        results[init_type][lambda_str] = {}
        results[init_type][lambda_str]['iterations'] = iterations
        results[init_type][lambda_str]['time'] = t
        results[init_type][lambda_str]['ranks'] = extract_results_airports(airports_hash)
def __init__(self, layers=None, interLayers=None, weights=None):
    """
    Construct a Mulet with a list of individual layers.
    weights represent the inter-layer edges between different layers.
    :return: None
    """
    if layers is not None:
        self.layers = layers
    if weights is not None:
        self.weights = weights
    if interLayers is not None:
        self.interLayers = interLayers
    self.getGenericGraphfromLayers()
    self.detCumulativeIntraLayerAcceptance()
    self.updateCumulativeAcceptance()
    self.detCumulativeInterLayerAcceptance()
    self.detCumulativeIntraLayerRejectance()
    self.updateCumulativeRejectance()
    self.detCumulativeInterLayerRejectance()
    finIp = InfluencePassivity(filename=None)
    finIp.InfluencePassivityAlgorithm(mygraph=self.g, Avals=self.A_delta, Rvals=self.R_delta)
    with open('/Users/rashmijrao/Documents/IP-master/A1/Scripts2/NS_Final/Influences.json', 'w') as outfile:
        json.dump(finIp.I, outfile)
    with open('/Users/rashmijrao/Documents/IP-master/A1/Scripts2/NS_Final/Passivities.json', 'w') as outfile:
        json.dump(finIp.P, outfile)
    print('Sum I:::', str(max(finIp.I.values())))
    print('Sum P:::', str(max(finIp.P.values())))
    pr = PageRank(directional=True)
    pr.modifyGraph(self.g)
    pr.pageRankAlgorithm(m=10)
    with open('/Users/rashmijrao/Documents/IP-master/A1/Scripts2/NS_Final/Authority.json', 'w') as outfile:
        json.dump(pr.a, outfile)
    with open('/Users/rashmijrao/Documents/IP-master/A1/Scripts2/NS_Final/Hub.json', 'w') as outfile:
        json.dump(pr.h, outfile)
    print('pagerank a:::', max(pr.a.values()))
    print('pagerank h:::', max(pr.h.values()))
def testmethod_5(self):
    result = PageRank.trim_url("http://en.wikipedia.org/wiki/Hotmail")
    self.assertEqual(result, "wikipedia.org")
    (authority, hubness) = HITS.HITS(graph)
    tEnd = time.time()
    timeCost = tEnd - tStart
    with open("output.txt", 'w', encoding='UTF-8') as f:
        f.write("time cost: %f\n" % timeCost)
        f.write("HITS\n")
        f.write(" authority, hubness\n")
        for key in graph:
            f.write("%s: " % key)
            f.write("%f, " % authority[key])
            f.write("%f\n" % hubness[key])
        f.close()
elif method == 'pagerank':
    damp = 0.15
    tStart = time.time()
    pageRank = PageRank.PageRank(graph, damp)
    tEnd = time.time()
    timeCost = tEnd - tStart
    with open("output.txt", 'w', encoding='UTF-8') as f:
        f.write("time cost: %f\n" % timeCost)
        f.write("PageRank\n")
        for key in pageRank:
            f.write("%s: " % key)
            f.write("%f\n" % pageRank[key])
        f.close()
elif method == 'simrank':
    c = 0.8  # decay factor
    tStart = time.time()
    simMatrix = SimRank.SimRank(graph, c)
    tEnd = time.time()
    timeCost = tEnd - tStart
import networkx as nx
import sys

sys.path.append('..')
import PageRank

G = nx.DiGraph()
G.add_nodes_from([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
G.add_edge(1, 2)
G.add_edge(1, 4)
G.add_edge(1, 3)
G.add_edge(2, 3)
G.add_edge(4, 3)
G.add_edge(1, 5)
G.add_edge(2, 1)
G.add_edge(5, 2)
G.add_edge(3, 4)
G.add_edge(2, 4)

PR = PageRank.PageRank(G)
PR.constructDispersionMatrix(G)
print(PR.getPageRank())
print(sum(PR.getPageRank()))
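# A possible cross-check for the script above: networkx ships its own PageRank, so the
# custom implementation can be compared against it on the same graph. This is a suggested
# sanity check, not part of the original script; nx.pagerank returns a node->score dict
# whose values sum to 1, so the two results should roughly agree in ranking order.
reference = nx.pagerank(G, alpha=0.85)
print(sorted(reference.items(), key=lambda kv: kv[1], reverse=True))  # nodes ranked by reference score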
def main():
    WebCrawler.main()
    Indexer.main()
    PageRank.main()
        return 1
    return count

for s in range(len(sentences)):
    if s > 0:
        adjMax[s][s - 1] += findRelation(sentences[s]) * 0
    for ss in range(len(sentences)):
        if s != ss:
            adjMax[s][ss] += findOverlap(sentences[s], sentences[ss])

# for m in adjMax:
#     print " ".join([str(i) for i in m])

G = np.array(adjMax)
rank = PageRank.pageRank(G, s=0.5)
l = []
ind = 0
for r in rank:
    l.append((r, ind))
    ind += 1
l.sort()
l.reverse()
print words_used
# print all important sentences
for k in l:
    print sentences[k[1]]
    print k[0]
    print "==============="
def test_calc(self):
    self.assertTrue(P.get_weight())
'''
import numpy as np

data = open('E:\\learning\\WebDataMining\\hw\\aan\\release\\2014\\venue_test.txt')

# patterns for the venue
p1 = r"(?<=id\s=\s{).+?(?=})"
pattern1 = re.compile(p1)
p2 = r"(?<=venue\s=\s{).+?(?=})"
pattern2 = re.compile(p2)

# venue
venue = MapToMatrix('E:\\learning\\WebDataMining\\hw\\aan\\release\\2014\\')
venue.FileToList_Reg('venue_test.txt', pattern1, pattern2)
venue_len = venue.getLen()
venue_mat = venue.SwitchMap('paper_venue_test.txt')
venue_init = np.ones(venue_len)
pg = PageRank(50, 0.0001, 0.6)
venue_resvec = pg.run(venue_mat, venue_init, venue_len)
venue.ScoreToVenue(venue_resvec, "exp_venue_rank.txt")
venue_resmap = venue.getResult()
venue_topn = topN(3, venue_resmap)
sum_error = compareDiff(venue_topn, venue_topn)
print sum_error
# alpha=0.6
# topn=10
# print "when alpha=%f, the top %d is \n" % (alpha, topn)
def __init__(self, fichier, findParametre=False, metrique="FMesure", tailleTrain=0.65, verbose=False):
    """
    Initializes EvalAllIRModel.

    :type fichier: String
    :param fichier: the file where the queries are stored
    :type findParametre: boolean
    :param findParametre: enables parameter optimization
    :type metrique: String
    :param metrique: the metric to use
    :type tailleTrain: float
    :param tailleTrain: the proportion of the queries used for training
    :type verbose: boolean
    :param verbose: enables verbose mode
    """
    collection = Parser.Parser.buildDocCollectionSimple(fichier + '.txt', pageRank=True)
    self.collectionQry = Parser.Parser.buildQueryCollection(fichier)
    self.train = dict()
    self.verbose = verbose
    self.print_verbose("Initialization")
    if findParametre:
        self.separeTrainTest(tailleTrain)
    index = IndexerSimple.IndexerSimple()
    index.indexation(collection)
    self.weighter = [
        Weighter.Weighter1(index),
        Weighter.Weighter2(index),
        Weighter.Weighter3(index),
        Weighter.Weighter4(index),
        Weighter.Weighter5(index)
    ]
    modelIR = [IRModel.Vectoriel, IRModel.Jelinek_Mercer, IRModel.Okapi]
    model = []
    for w in self.weighter:
        for m in range(len(modelIR)):
            if m == 1:  # Jelinek_Mercer model
                jelinek = modelIR[m](w)
                if findParametre:
                    jelinek.findParametreOptimaux(np.arange(0, 1.4, 0.1), self.train, metrique)
                model.append(jelinek)
            elif m == 2:  # Okapi model
                okapi = modelIR[m](w)
                if findParametre:
                    okapi.findParametreOptimaux(np.arange(0, 0.5, 0.1),
                                                np.arange(1.5, 2, 0.1),
                                                self.train, metrique)
                model.append(okapi)
            else:  # Vectoriel model: there are no parameters to optimize
                model.append(modelIR[m](w))
                model.append(modelIR[m](w, True))
    if findParametre:
        self.print_verbose("All Jelinek_Mercer and Okapi models are trained")
    self.model = []
    for m in model:
        self.model.append(EvalIRModel(self.collectionQry, m))
        pr = PageRank.PageRank(m.getWeighter(), m)
        if findParametre:
            pr.findParametreOptimaux(np.arange(0.85, 0.95, 0.05), self.train, metrique)
        self.model.append(EvalIRModel(self.collectionQry, pr))
    if findParametre:
        self.print_verbose("All PageRank models are trained")
    self.print_verbose("Initialization complete")
def leave_one_out(function, diseaseGeneFilePath, PPI_Network, param):
    print("Starting leaveOneOut function")

    # building list of disease genes
    diseaseGeneFile = open(diseaseGeneFilePath, 'r')
    allDiseaseGenes = diseaseGeneFile.read().splitlines()
    diseaseGeneFile.close()
    # print(allDiseaseGenes)
    numDiseaseGenes = len(allDiseaseGenes)
    rankThreshhold = 150
    numGenesNotFound = 0
    degree_list = []  # remove after graph is made (kate)
    in_out_list = []  # remove after graph is made (kate)
    graph_nodes = list(PPI_Network.nodes())

    startVector = load_start_vector(diseaseGeneFilePath, PPI_Network)
    startVector = (numDiseaseGenes / (numDiseaseGenes - 1)) * startVector

    # skipping
    for index, skipGene in enumerate(allDiseaseGenes):
        # find the skip gene in the start vector, make it zero
        index = graph_nodes.index(skipGene)
        node_degree = PPI_Network.degree(skipGene)  # remove after graph is made (kate)
        degree_list.append(node_degree)  # remove after graph is made (kate)
        newStartVector = startVector.copy()
        newStartVector[index] = 0
        # startVector[index] = 0
        priors_vector = np.zeros(PPI_Network.number_of_nodes())
        if function == pr.page_rank:
            priors_file_path = find_priors_file(diseaseGeneFilePath)
            priors_vector = pr.load_priors(priors_file_path, PPI_Network)
            priors_vector[index] = 0

        # run algorithm using modified disease gene file
        startTime = time.time()
        output = []
        if function == pr.page_rank:
            output = function(PPI_Network, newStartVector, priors_vector, param)
        else:
            print("sum of start vector:", np.sum(startVector))
            output = function(PPI_Network, newStartVector, param)
        endTime = time.time()
        print("finished algorithm. Time elapsed:", endTime - startTime)

        # find the predicted probability of the omitted gene and add it to the current sum
        startTime = time.time()
        foundGene = False
        for i in range(rankThreshhold):
            if output[i][0] == skipGene:
                foundGene = True
                print("Found the gene: ", skipGene, "at rank: ", i)
                in_out_list.append(1)  # remove after graph is made (kate)
                break
        if not foundGene:
            numGenesNotFound += 1
            in_out_list.append(-1)  # remove after graph is made (kate)
        endTime = time.time()

    # write the results of leave one out to a file
    disease_name = diseaseGeneFilePath.split(".")[0]
    output_name = "leave_one_out_1" + disease_name[5:]
    if function == pr.page_rank:
        output_name = output_name + "_pr.tsv"
    elif function == dk.diffusion_kernel:
        output_name = output_name + "_dk.tsv"
    elif function == rwr.random_walk:
        output_name = output_name + "_rwr.tsv"
    with open(output_name, "w") as output:
        for i in range(len(allDiseaseGenes)):
            output_string = allDiseaseGenes[i] + "\t" + str(degree_list[i]) + "\t" + str(in_out_list[i]) + "\n"
            output.write(output_string)

    print("------------------------\n"
          "Finished running algorithm with all disease genes left out\n"
          "Calculating mean squared difference")
    print("Num genes not found for this run of leave one out: ", numGenesNotFound)

    # Find average of all squared differences
    percentCorrectlyRankedGenes = 1 - numGenesNotFound / numDiseaseGenes
    return percentCorrectlyRankedGenes
def graphAnalyzer(graph, kmeans=False):
    """Arguments: the path to a .gml graph file, and a boolean: if True, cluster with
    scikit-learn's KMeans, otherwise use our dbscan algorithm.
    Ranks and clusters the nodes in order to return the highest-PageRank page in the
    three biggest clusters. Also draws a graph to visualize the clustering."""
    G = nx.read_gml(graph)
    G = removeIsolatedNodes(G)  # removing meaningless nodes
    G.remove_node(list(G.nodes)[0])

    # ----------------------------------- PageRank Computation --------------------------------------
    # creating a PageRank object
    pr = PageRank.PageRank(G)
    pr.constructDispersionMatrix(G)
    pr = pr.getPageRank()

    # ----------------------------------- Clustering Computation --------------------------------------
    # constructing the network layout
    forceatlas2 = fa2.ForceAtlas2(
        # Behavior alternatives
        outboundAttractionDistribution=False,  # Dissuade hubs
        linLogMode=False,  # NOT IMPLEMENTED
        adjustSizes=False,  # Prevent overlap (NOT IMPLEMENTED)
        edgeWeightInfluence=0,
        # Performance
        jitterTolerance=.01,  # Tolerance
        barnesHutOptimize=True,
        barnesHutTheta=1.2,
        multiThreaded=False,  # NOT IMPLEMENTED
        # Tuning
        scalingRatio=1,
        strongGravityMode=True,
        gravity=200,
        # Log
        verbose=True)
    pos = forceatlas2.forceatlas2_networkx_layout(G, pos=None, iterations=1000)

    if kmeans:
        # converting positions into a list of np.array
        pos_list = [np.array([elt[0], elt[1]]) for key, elt in pos.items()]
        # clustering the nodes according to the kmeans algorithm
        clusters = Kmeans.kmeans(pos_list, 8, 0.01, 300)
    else:
        pos = {key: np.array([elt[0], elt[1]]) for key, elt in pos.items()}
        pos_transf = dbscan.transf(pos)  # changing the position format to be usable by DBSCAN
        clusters = dbscan.dbscan(pos_transf, 40, 20)  # clustering

    cluster_with_pr = associatingPageRankToNode(pr, clusters)
    # sorting each cluster according to the PageRank result
    for key, value in cluster_with_pr.items():
        cluster_with_pr[key] = sorted(value, key=lambda item: (item[1], item[0]))

    # rendering the suggested pages and their PageRank
    print("\nThe recommended pages are the following:")
    for key, value in cluster_with_pr.items():
        try:
            node_index = value[-1][0]  # retrieving the node index
            title_node = re.search(r'titles=(.*?)\&',
                                   list(G.nodes())[node_index])  # getting the title of the Wikipedia page
            print("•", title_node.group(1), "- with a page rank of ", value[-1][1])
        except IndexError:
            pass

    # ----------------------------------- Graph Creation --------------------------------------
    # each node within a cluster has the same color
    get_colors = lambda n: list(map(lambda i: "#" + "%06x" % random.randint(0, 0xFFFFFF), range(n)))
    colors = get_colors(len(clusters.keys()) + 1)
    node_color = ['black' for _ in range(len(G.nodes()))]
    for key, value in clusters.items():
        for elt in value:
            node_color[elt] = colors[key]
    nx.draw(G.to_undirected(), pos, node_size=2, width=.05, edge_color='grey', node_color=node_color)
    plt.savefig("graph_with_layout.png")
if __name__ == '__main__':
    if len(sys.argv) != 2:
        print "usage: ./TopicSensitivePageRank.py <file to read json data(tweets)>"
        sys.exit(1)
    filename = sys.argv[1]
    f = file(filename, "r")
    tweets = f.readlines()
    # final_list = findMostPopularWords(tweets)
    dict_buckets = staticDictOfBuckets()
    tagged_dict = taggedUsers_dict(tweets, dict_buckets)
    # print tagged_dict
    pageRank = PageRank()
    global_dict = pageRank.create_dictionary(tweets)
    idToUserMap = pageRank.map_IDtoUsername(tweets)
    updated_dict = update_dictionary(global_dict, tagged_dict)
    for tag in updated_dict:
        print tag, len(updated_dict[tag])
    # print updated_dict_social
    final_dict_social = pageRank.update_dictionary(updated_dict['technology'])
    print final_dict_social
    final_list_social = sorted(final_dict_social.items(), key=lambda x: x[1], reverse=True)
__author__ = 'Ariel'

import numpy as np
import time
import readHelper
import writeHelper
import PageRank

start_time = time.time()

# get teleportation matrix
m = readHelper.getSparseMatrix('transition.txt', True)

# global PageRank
globalPR, outGPR = PageRank.pagerank(m, 0.1)

# out-line link injection for topic-sensitive PageRank
topic = readHelper.getSparseMatrix('doc-topics.txt', False).transpose()
tspr, outTSPR = PageRank.topicSensitivePageRank(m, topic, 0.25, 0.65)

# query topic-sensitive PageRank
queryTopic, queryDistr = readHelper.getDistro('query-topic-distro.txt')
outQTSPR = PageRank.OnlineTopicSensitivePR(outTSPR, queryDistr[queryTopic[(2, 2)]])
queryTopicPR = PageRank.OnlineTopicSensitivePR(tspr, queryDistr)

# user topic-sensitive PageRank
userTopic, userDistr = readHelper.getDistro('user-topic-distro.txt')
outPTSPR = PageRank.OnlineTopicSensitivePR(outTSPR, userDistr[userTopic[(2, 2)]])
userTopicPR = PageRank.OnlineTopicSensitivePR(tspr, userDistr)
if test_matrix:
    P = np.array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                  [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
                  [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                  [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                  [0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0],
                  [0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0],
                  [0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0],
                  [0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0],
                  [0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0],
                  [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
                  [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]])
    node_count = P.shape[0]
    x = np.ones(node_count)
    weights, iter_count = PageRank.power_iter_matrix(x, graph=P, epsilon=1e-20, d=0.85, max_iter=1000)
    print("Weights:")
    print(weights)
    print("Iteration count:")
    print(iter_count)
else:
    P = None
    # P = {(1,2), (2,1), (3,0), (3,1), (4,3), (4,1), (4,5), (5,1), (5,4), (6,1), (6,4), (7,1), (7,4), (8,1), (8,4), (9,4), (10,4)}
    # node_count = 11
    with open(file_name, 'r') as f:
        line_count = sum(1 for line in f)
        P = np.zeros(line_count, dtype='int32, int32')
        f.seek(0)
        for i, line in enumerate(f):
def summarise(filepath, co_ref=1, page_rank=True, debug_output=True, num_words=200, overlap=True):
    if not os.path.isdir("stanford-corenlp"):
        print >> sys.stderr, "Please put the Stanford CoreNLP package into the stanford-corenlp directory."
        quit()
    filename = filepath.split("/")[-1]
    if not os.path.isfile("stanford-corenlp/" + filename + ".xml"):
        shutil.copyfile(filepath, "stanford-corenlp/" + filename)
        if os.name == "nt":
            os.system("cd stanford-corenlp && java -cp stanford-corenlp-3.2.0.jar;stanford-corenlp-3.2.0-models.jar;xom.jar;joda-time.jar;jollyday.jar -Xmx3g edu.stanford.nlp.pipeline.StanfordCoreNLP -file " + filename)
        else:
            os.system("cd stanford-corenlp && java -cp stanford-corenlp-3.2.0.jar:stanford-corenlp-3.2.0-models.jar:xom.jar:joda-time.jar:jollyday.jar -Xmx3g edu.stanford.nlp.pipeline.StanfordCoreNLP -file " + filename)
    sentences, coref = splitAndParse.splitSentencesAndParse("stanford-corenlp/" + filename + ".xml")
    if debug_output:
        print "coref", coref
    adjMax = [[0 for s in sentences] for s in sentences]
    if co_ref:
        for co in coref:
            dic = {}
            for s in co:
                if s in dic:
                    dic[s] += 1
                else:
                    dic[s] = 1
            for s in dic:
                for ss in dic:
                    if s != ss:
                        if co_ref == 1:
                            adjMax[s][ss] += (dic[s] + 0.0) / dic[ss]
                        elif co_ref == 2:
                            adjMax[s][ss] += (dic[ss] + 0.0) / dic[s]
                        elif co_ref == 3:
                            adjMax[s][ss] += dic[ss] + dic[s]
                        elif co_ref == 4:
                            adjMax[s][ss] += dic[ss] * dic[s]
                        elif co_ref == 5:
                            adjMax[s][ss] += (dic[ss] + dic[s]) * 2
                        elif co_ref == 6:
                            adjMax[s][ss] += (dic[ss] + dic[s]) * 5
    words_used = set()
    if debug_output:
        print "before overlap"
        for m in adjMax:
            print " ".join([str(i) for i in m])
    if overlap:
        for s in range(len(sentences)):
            for ss in range(len(sentences)):
                if s != ss:
                    adjMax[s][ss] += findOverlap(sentences[s], sentences[ss], words_used)
    if debug_output:
        print "after overlap"
        for m in adjMax:
            print " ".join([str(i) for i in m])
    l = []
    scores = []
    if page_rank:
        G = np.array(adjMax)
        scores = PageRank.zeroToOne(G, s=0.5)
    else:
        scores = [sum(row) for row in adjMax]
    ind = 0
    for s in scores:
        l.append((s, ind))
        ind += 1
    l.sort()
    l.reverse()
    if debug_output:
        print words_used
        # print all important sentences
        for k in l:
            print sentences[k[1]]
            print k[0]
            print "==============="
    best_first = cutoff_words(l, sentences, 200, adjMax)
    by_order = []
    for i in range(0, len(best_first)):
        by_order.append((l[i][1], best_first[i]))
    by_order.sort()
    return [s[1] for s in by_order]
from PageRank import *
import time

start_time = time.time()

##data_file = "Datasets/sx-mathoverflow.txt"
data_file = "Datasets/Wiki-Vote.txt"
##data_file = "Datasets/test.txt"

is_page_no_zero_indexed = False
epsilon = 0.00001
max_iterations = 10
beta = 0.85
display_network_after_each_iteration = True
max_no_of_nodes_to_show = 20

PgRank = PageRank(data_file, is_page_no_zero_indexed, max_iterations, beta, epsilon,
                  display_network_after_each_iteration)

print("Rank Vector:")
for i in PgRank.rank_vector[:max_no_of_nodes_to_show]:
    print(i)
PgRank.display_network(PgRank.rank_vector, max_no_of_nodes_to_show)

if is_page_no_zero_indexed:
    teleport_set = [i for i in PgRank.matrix if i % 500 == 0]
else:
    teleport_set = [i + 1 for i in PgRank.matrix if i % 500 == 0]

topic_specific_rank_vector = PgRank.topic_specific_page_rank(teleport_set)
print("Topic Specific Rank Vector:")
for i in topic_specific_rank_vector[:max_no_of_nodes_to_show]:
    print(i)

end_time = time.time()
def testmethod_1(self):
    result = PageRank.search("youtube")
    self.assertNotEqual(len(result), 0, "fail")
listS5, best_avg5 = clique.find_densest_subgraph(G)
end5 = time.clock()
time5 = end5 - beg5
L4 = []
for x in range(0, len(listS5)):
    for y in listS5[x]:
        L4.append(y)
L4 = list(set(L4))
# print L4
recall_Clique = len(list(golden_keywords.intersection(L4))) / float(len(golden_keywords))
precision_Clique = len(list(golden_keywords.intersection(L4))) / float(len(L4))

# PageRank
beg7 = time.clock()
list7 = pr.find_densest_subgraph(G, 10)
end7 = time.clock()
time7 = end7 - beg7
temp = [list7[i][0] for i in range(len(list7))]
recall_PageRank = len(list(golden_keywords.intersection(temp))) / float(len(golden_keywords))
precision_PageRank = len(list(golden_keywords.intersection(temp))) / float(len(temp))

# Using the function from Networkx: K-Cores
beg4 = time.clock()
G.remove_edges_from(G.selfloop_edges())
S4 = nx.k_core(G)
end4 = time.clock()
time4 = end4 - beg4
temp = S4.nodes()
def test_influence():
    resIGFG = DetectionCommunautes.DetectionComIG(gTwitter, "FastGreedy")
    resIGIM = DetectionCommunautes.DetectionComIG(gTwitter, "InfoMap")
    resSNAPBC = DetectionCommunautes.DetectionComSNAP(gTwitter, "BigClam", path + 'snap/', path,
                                                      'outputGraph.txt', 'outputAlgo.txt')
    resSNAPIM = DetectionCommunautes.DetectionComSNAP(gTwitter, "InfoMap", path + 'snap/', path,
                                                      'outputGraph.txt', 'outputAlgo.txt')
    resSNAPCPM = DetectionCommunautes.DetectionComSNAP(gTwitter, "CPM", path + 'snap/', path,
                                                       'outputGraph.txt', 'outputAlgo.txt')
    NomComFG = InfluenceCommunautes.NomsCommunautes(resIGFG['membership'], gTwitter)
    NomComIM = InfluenceCommunautes.NomsCommunautes(resIGIM['membership'], gTwitter)
    assert type(NomComFG['comNoms']) == dict
    assert type(NomComFG['comNode']) == dict
    assert len(NomComFG['comNoms']) != 0
    assert len(NomComFG['comNode']) != 0
    assert type(NomComIM['comNoms']) == dict
    assert type(NomComIM['comNode']) == dict
    assert len(NomComIM['comNoms']) != 0
    assert len(NomComIM['comNode']) != 0
    IGgraph1 = PageRank.getIGraph(
        'MATCH (n:`andra-user`)<-[r:`andra-from`]-(t1:`andra-tweet`)-[re:`andra-retweet`]->(t2:`andra-tweet`)-[rel:`andra-from`]->(p:`andra-user`) WHERE t2.language="fr" RETURN n as nodeFrom,p as nodeTo',
        grapheNeo, "192.168.1.75:7474", "neo4j", "pass4dbse")
    PRank1 = PageRank.Rank(IGgraph1, 10)
    InfluFG = InfluenceCommunautes.InfluenceCommunautes(
        grapheNeo, gTwitter, PRank1['resPR'], NomComFG['comNode'], 0.2, 10)
    InfluIM = InfluenceCommunautes.InfluenceCommunautes(
        grapheNeo, gTwitter, PRank1['resPR'], NomComIM['comNode'], 0.2, 10)
    InfluBC = InfluenceCommunautes.InfluenceCommunautes(
        grapheNeo, gTwitter, PRank1['resPR'], resSNAPBC['comCodes'], 0.2, 10)
    InfluIMS = InfluenceCommunautes.InfluenceCommunautes(
        grapheNeo, gTwitter, PRank1['resPR'], resSNAPIM['comCodes'], 0.2, 10)
    InfluCPM = InfluenceCommunautes.InfluenceCommunautes(
        grapheNeo, gTwitter, PRank1['resPR'], resSNAPCPM['comCodes'], 0.2, 10)
    assert type(InfluFG) == dict
    assert type(InfluIM) == dict
    assert type(InfluBC) == dict
    assert type(InfluIMS) == dict
    assert type(InfluCPM) == dict
    InfluHTFG = InfluenceCommunautes.InfluenceHashtags(grapheNeo, NomComFG['comNode'], 0.2)
    InfluHTIM = InfluenceCommunautes.InfluenceHashtags(grapheNeo, NomComIM['comNode'], 0.2)
    InfluHTBC = InfluenceCommunautes.InfluenceHashtags(grapheNeo, resSNAPBC['comCodes'], 0.2)
    InfluHTIMS = InfluenceCommunautes.InfluenceHashtags(grapheNeo, resSNAPIM['comCodes'], 0.2)
    InfluHTCPM = InfluenceCommunautes.InfluenceHashtags(grapheNeo, resSNAPCPM['comCodes'], 0.2)
    assert type(InfluHTFG) == dict
    assert type(InfluHTIM) == dict
    assert type(InfluHTBC) == dict
    assert type(InfluHTIMS) == dict
    assert type(InfluHTCPM) == dict
    assert len(InfluHTFG) == len(NomComFG['comNode'])
    assert len(InfluHTIM) == len(NomComIM['comNode'])
    assert len(InfluHTBC) == len(resSNAPBC['comNoms'])
    assert len(InfluHTIMS) == len(resSNAPIM['comNoms'])
    assert len(InfluHTCPM) == len(resSNAPCPM['comNoms'])
    InflueTweeFG = InfluenceCommunautes.InfluenceTweets(NomComFG['comNode'])
    InflueTweeIM = InfluenceCommunautes.InfluenceTweets(NomComIM['comNode'])
    InflueTweeBC = InfluenceCommunautes.InfluenceTweets(resSNAPBC['comCodes'])
    InflueTweeIMS = InfluenceCommunautes.InfluenceTweets(resSNAPIM['comCodes'])
    InflueTweeCPM = InfluenceCommunautes.InfluenceTweets(resSNAPCPM['comCodes'])
    assert type(InflueTweeFG) == dict
    assert type(InflueTweeIM) == dict
    assert type(InflueTweeBC) == dict
    assert type(InflueTweeIMS) == dict
    assert type(InflueTweeCPM) == dict
    assert len(InflueTweeFG) == len(NomComFG['comNode'])
    assert len(InflueTweeIM) == len(NomComIM['comNode'])
    assert len(InflueTweeBC) == len(resSNAPBC['comNoms'])
    assert len(InflueTweeIMS) == len(resSNAPIM['comNoms'])
    assert len(InflueTweeCPM) == len(resSNAPCPM['comNoms'])
    print "INFLUENCE OK"
    meanDegree = (2 * numberOfEdges) / numberOfNodes
    return meanDegree / (numberOfNodes - 1)


# deleted 5,6,7,11
# 8=5, 9=6, 10=7, 12=8, 13=9, 14=10, 15=11, 16=12, 17=13, 18=14
Friends = [[2], [1, 4, 9, 11], [4], [2, 3], [6], [5], [8], [7, 9, 10], [2, 8, 10],
           [8, 9], [2, 12], [11], [14], [13]]

# deleted 5,6,7,11
# 8=5, 9=6, 10=7, 12=8, 13=9, 14=10, 15=11, 16=12, 17=13, 18=14
FirstNames = [[2, 4, 8], [1, 4, 9, 11], [4], [1, 2, 3, 13, 14], [6], [5], [8, 13],
              [1, 7, 9, 10], [2, 8, 10], [8, 9, 14], [2, 12], [11], [4, 7, 14], [4, 10, 13]]

# deleted 1,5,8,9,11,15,16
# 2=1, 3=2, 4=3, 6=4, 7=5, 10=6, 12=7, 13=8, 14=9, 17=10, 18=11
HaveClass = [[3, 8], [3], [1, 2, 10, 11], [5], [4], [7], [6, 8], [1, 7, 11], [11],
             [3, 11], [3, 8, 9, 10]]

# deleted 3,5,6,7,11,14,15,16
# 4=3, 8=4, 9=5, 10=6, 12=7, 13=8, 17=9, 18=10
SocialEvents = [[2], [1], [9], [5], [4], [7, 9], [6, 8], [7], [3, 6, 10], [9]]

# print(HW1.out_degree(Friends))
# print(HW1.in_degree(Friends))
PR.PageRank(Friends)
PR.PageRank(FirstNames)
PR.PageRank(HaveClass)
PR.PageRank(SocialEvents)
def testmethod_6(self):
    result = PageRank.search("ruosyguweryiotgryu")
    self.assertEqual(len(result), 1, "fail")
def testmethod_3(self):
    result = PageRank.search("shit")
    self.assertNotEqual(len(result), 0, "fail")
print("Loading tf-idf") tfidf = calculateTFIDF(invertedIndex, maxCount, N) # doIndexingTfIDF(tfidf) # doIndexingPreprocessedWord() # doIndexingDocuments() # doIndexingPR() # tfidf = fetchIndexingTfIDF(tfidf) # cleanWords = fetchIndexingPreprocessedWord() # docCounterList = fetchIndexingDocuments() # pageranks = fetchIndexingPR() # calculate pagerank print("Loading pagerank") pageranks = pr.compute(linksDocs) # load any embeddings of choice print("Loading word embeddings") embeddings_dict = utils.load_glove() # embeddings_dict = gensim.models.KeyedVectors.load_word2vec_format('Embeddings\GoogleNews-vectors-negative300.bin', binary=True) #list of predefined queries queries = [ "Professors who teach NLP at UIC", "Student organizations at uic", "Student Orientation at UIC", "How has coronavirus affected UIC", "Centers for Cultural Understanding and Social Change" ] # press enter to validate pre-defined queries # otherwise input your query query = input(
def testmethod_5(self):
    result = PageRank.search("")
    self.assertEqual(len(result), 1, "fail")
def main(): print("Starting AUROC..") #Get file path choices pathToPPINetworkFile = sys.argv[1] #pathToPPINetworkFile = 'Data/9606.protein.links.v11.0.txt' # Get output vectors from each algorithm PPI_Network = compute_if_not_cached(loader.load_graph, pathToPPINetworkFile, fileName=pathToPPINetworkFile) ground_truth_files = [ 'Data/MalaCard-protein-Endometriosis.diseasegenes.tsv', 'Data/MalaCard-protein-ischaemic-stroke.diseasegenes.tsv', 'Data/MalaCard-protein-lymphoma.diseasegenes.tsv' ] file_paths = [ 'Data/endometriosis-proteins.diseasegenes.tsv', 'Data/lymphoma-proteins.diseasegenes.tsv', 'Data/ischaemic-proteins.diseasegenes.tsv' ] prior_paths = [ 'Data/endometriosis-proteins.priors.tsv', 'Data/lymphoma-proteins.priors.tsv', 'Data/ischaemic-proteins.priors.tsv' ] names = ['endometriosis', 'lymphoma', 'ischaemic'] for i in range(1, 3): # building ground truth ground_truth_vec = [] with open(ground_truth_files[i], 'r') as input_file: input_file = input_file.readlines() for line in input_file: protein = line.rstrip('\n') ground_truth_vec.append(protein) gene_file = open(file_paths[i], 'r') file_contents = list(gene_file.readlines()) # print(file_contents) for line in file_contents: protein = line.rstrip('\n') if protein not in ground_truth_vec: ground_truth_vec.append(protein) gene_file.close() print(ground_truth_vec) # building start and priors vector start_vector = loader.load_start_vector(file_paths[i], PPI_Network) priors_vector = pr.load_priors(prior_paths[i], PPI_Network) #getting output from algorithms start_time = time.time() output_RWR = rwr.random_walk(PPI_Network, start_vector) end_time = time.time() print("time for rwr:", end_time - start_time) start_time = time.time() output_PR = pr.page_rank(PPI_Network, start_vector, priors_vector) end_time = time.time() print("time for pr:", end_time - start_time) start_time = time.time() output_DK = dk.diffusion_kernel(PPI_Network, start_vector) end_time = time.time() print("time for dk:", end_time - start_time) #building roc curves start_time = time.time() name = "rwr-" + names[i] rwr_curve = roc_curve(output_RWR, ground_truth_vec, name) end_time = time.time() print("time for roc curve, rwr:", end_time - start_time) start_time = time.time() name = "pr-" + names[i] pr_curve = roc_curve(output_PR, ground_truth_vec, name) end_time = time.time() print("time for roc curve, pr:", end_time - start_time) start_time = time.time() start_time = time.time() name = "dk-" + names[i] dk_curve = roc_curve(output_DK, ground_truth_vec, name) end_time = time.time() print("time for roc curve, dk:", end_time - start_time) file_path = 'Results/' + names[i] + 'roc_curve.png' plt.ylabel('TPR') plt.xlabel('FPR') plt.title(names[i]) plt.legend(loc='lower right') plt.savefig(file_path) #moved from roc_curve plt.clf() #moved from roc_curve print("Plots have been saved as png files in the Results folder.")