def runFootball():
    """Run PageRank on NCAA_football.csv."""
    graph = PageRank()
    with open('NCAA_football.csv') as f:
        for line in f:
            columns = line.split(',')
            team1 = columns[0].strip()
            value1 = int(columns[1])
            team2 = columns[2].strip()
            value2 = int(columns[3])
            if value1 > value2:
                # Loser links to winner.
                graph.addEdge(team2, team1)
            elif value1 < value2:
                graph.addEdge(team1, team2)
            else:
                # Tie: link both ways.
                graph.addEdge(team2, team1)
                graph.addEdge(team1, team2)
    graph.printGraph()
    iterations, ranks = graph.getPageRank()
    print("Number of iterations:", iterations)
    print(returnSorted(ranks))
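# NOTE (editor's sketch, not from the original sources): runFootball() above and
# runStates() below assume a graph-based PageRank class with addEdge, printGraph
# and getPageRank, plus a returnSorted helper, none of which are shown here.
# A minimal, hypothetical version of that interface using the standard damped
# power iteration might look like this:
class PageRank:
    def __init__(self, damping=0.85, tol=1e-6):
        self.damping = damping
        self.tol = tol
        self.edges = {}  # node -> set of nodes it links to

    def addEdge(self, source, target):
        self.edges.setdefault(source, set()).add(target)
        self.edges.setdefault(target, set())

    def printGraph(self):
        for node, targets in sorted(self.edges.items()):
            print(node, "->", sorted(targets))

    def getPageRank(self):
        nodes = list(self.edges)
        n = len(nodes)
        ranks = {v: 1.0 / n for v in nodes}
        iterations = 0
        while True:
            iterations += 1
            new_ranks = {}
            for v in nodes:
                # Contributions from every node u that links to v.
                incoming = sum(ranks[u] / len(self.edges[u])
                               for u in nodes if v in self.edges[u])
                new_ranks[v] = (1 - self.damping) / n + self.damping * incoming
            delta = sum(abs(new_ranks[v] - ranks[v]) for v in nodes)
            ranks = new_ranks
            if delta < self.tol:
                return iterations, ranks


def returnSorted(ranks):
    # Highest-ranked node first.
    return sorted(ranks.items(), key=lambda item: item[1], reverse=True)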
def test_calculate_page_rank(self):
    pagerank = PageRank(self.graph, 0.85, 0.0001)
    pagerank_dict = pagerank.run()
    for k, v in pagerank_dict.items():
        print(k, v)
    assert True
def main():
    google_file = "web-Google.txt"
    simple_test = "web-Matvii.txt"
    sparse_matrix = create_sparse_matrix(simple_test)
    pr = PageRank(sparse_matrix)
    pr.init_weights()
    print("Initial weights\n", pr.weights)
    pr.calculate_page_rank(5)
    print("Weights after 5 iterations\n", pr.weights)
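# NOTE (editor's sketch): create_sparse_matrix is not shown above. Assuming the
# input is an edge list in the SNAP web-Google.txt style ("# ..." comment lines
# followed by "FromNodeId<TAB>ToNodeId" pairs) and that PageRank expects a
# column-stochastic matrix, a minimal version could look like this:
import numpy as np
from scipy.sparse import csr_matrix


def create_sparse_matrix(path):
    rows, cols = [], []
    with open(path) as f:
        for line in f:
            if line.startswith('#'):
                continue
            src, dst = map(int, line.split())
            rows.append(dst)   # entry (dst, src): a link from src to dst
            cols.append(src)
    n = max(max(rows), max(cols)) + 1
    m = csr_matrix((np.ones(len(rows)), (rows, cols)), shape=(n, n))
    out_degree = np.asarray(m.sum(axis=0)).ravel()
    out_degree[out_degree == 0] = 1.0  # avoid division by zero for dangling nodes
    return m.multiply(1.0 / out_degree).tocsr()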
def test_pagerank(self):
    M = mat([[0,     1 / 2, 0, 0],
             [1 / 3, 0,     0, 1 / 2],
             [1 / 3, 0,     1, 1 / 2],
             [1 / 3, 1 / 2, 0, 0]])
    R = array([1 / 4, 1 / 4, 1 / 4, 1 / 4]).reshape(-1, 1)
    pr = PageRank(M, R, damping=0.8, max_iter=100)
    R = pr.fit()
    R_true = array([15 / 148, 19 / 148, 95 / 148, 19 / 148]).reshape(-1, 1)
    # Compare mean absolute error; a mean of signed differences could pass
    # even when the estimate is wrong.
    assert mean(abs(R - R_true)) < 1e-3
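# NOTE (editor's sketch): the PageRank class exercised by test_pagerank is not
# shown. A minimal implementation consistent with that constructor, and whose
# fixed point R = d*M*R + (1-d)/N reproduces the expected R_true above, is:
import numpy as np


class PageRank:
    def __init__(self, M, R, damping=0.85, max_iter=100, tol=1e-8):
        self.M = M              # column-stochastic link matrix
        self.R = R              # initial rank column vector
        self.damping = damping
        self.max_iter = max_iter
        self.tol = tol

    def fit(self):
        n = self.M.shape[0]
        R = self.R
        teleport = (1 - self.damping) / n * np.ones((n, 1))
        for _ in range(self.max_iter):
            R_next = self.damping * self.M @ R + teleport
            if np.mean(np.abs(R_next - R)) < self.tol:
                return R_next
            R = R_next
        return R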
def main(args):
    summarizer = {
        'tfidf': TfIdf(),
        'cluster': Cluster(),
        'svd': SVD(),
        'pagerank': PageRank()
    }[args['alg']]
    summarizer.initialize(args['tf'], args['df'])
    summary = summarizer.summarize(args['doc'])
    for s in summary:
        print(s)
def calculateSolution(dswa, method, gt_solution):
    # Baseline approach
    if method == 'pagerank':
        pagerank_approach = PageRank(dswa)
        print("Calculating solution...")
        calculated_solution = pagerank_approach.returnSolution(5)
    # Unsupervised approach
    elif method == 'unsupervised':
        unsupervisedApproach = Unsupervised_Approach(dswa)
        print("Calculating solution...")
        calculated_solution = unsupervisedApproach.returnSolution()
    print('--- Solution ---')
    print(calculated_solution)
    return calculated_solution
def runStates():
    """Run PageRank on stateborders.csv."""
    graph = PageRank()
    with open('stateborders.csv') as f:
        for line in f:
            columns = line.split(',')
            left = columns[0].strip('"')
            right = columns[2].strip('"')
            graph.addEdge(left, right)
    graph.printGraph()
    iterations, ranks = graph.getPageRank()
    print("Number of iterations:", iterations)
    print(returnSorted(ranks))
def rank_author():
    ac_net = AuthorCitationNetwork()
    ac_net_m, ac_net_list = ac_net.make_matrix()
    print("Calculating PageRank...")
    ac_net_pgr = PageRank(ac_net_m)
    ac_net_pr = ac_net_pgr.caculate("author_iter.txt", m_d=1e-5)
    ac_net_results = list(zip(ac_net_list, ac_net_pr.tolist()))
    ac_net_results.sort(key=itemgetter(1), reverse=True)
    results = []
    for result in ac_net_results:
        results.append(str(result[0][1]) + " " + str(result[1][0]) + "\n")
    print("Writing results...")
    with open("2014/results/author.txt", 'w') as f:
        # Write one "author score" line per entry (not str() of the whole list).
        f.writelines(results)
def rank_paper():
    pp_net = PaperCitationNetwork()
    pp_net_m, pp_net_list = pp_net.make_matrix()
    print("Calculating PageRank...")
    pp_net_pgr = PageRank(pp_net_m)
    pp_net_pr = pp_net_pgr.caculate("paper_iter.txt", m_d=1e-7)
    pp_net_results = list(zip(pp_net_list, pp_net_pr.tolist()))
    pp_net_results.sort(key=itemgetter(1))
    results = []
    for result in pp_net_results:
        results.append(str(result[0][1]) + " " + str(result[1][0]) + "\n")
    print("Writing results...")
    with open("2014/results/paper.txt", 'w') as f:
        # Write one "paper score" line per entry (not str() of the whole list).
        f.writelines(results)
def rank_venue():
    vn_net = VenueCitationNetwork()
    vn_net_m, vn_net_list = vn_net.make_matrix()
    print("Calculating PageRank...")
    vn_net_pgr = PageRank(vn_net_m)
    vn_net_pr = vn_net_pgr.caculate("venue_iter.txt", m_d=1e-7)
    vn_list = []
    for vn in vn_net_list:
        vn_list.append(vn[0:2])
    vn_net_results = list(zip(vn_list, vn_net_pr.tolist()))
    vn_net_results.sort(key=itemgetter(1), reverse=True)
    results = []
    for result in vn_net_results:
        results.append(str(result[0][0]) + " " + str(result[0][1]) + " "
                       + str(result[1][0]) + "\n")
    print("Writing results...")
    with open("2014/results/venue.txt", 'w') as f:
        f.writelines(results)
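# NOTE (editor's sketch): the PageRank.caculate() method called by rank_author,
# rank_paper and rank_venue is not shown (the misspelled name is kept as it
# appears in the calling code). A plausible minimal version is a power iteration
# on a column-stochastic citation matrix that logs the per-iteration change to
# the given file and stops once the change falls below m_d:
import numpy as np


class PageRank:
    def __init__(self, matrix, damping=0.85):
        self.matrix = matrix    # column-stochastic citation matrix
        self.damping = damping

    def caculate(self, log_file, m_d=1e-5, max_iter=1000):
        n = self.matrix.shape[0]
        rank = np.full((n, 1), 1.0 / n)
        with open(log_file, 'w') as log:
            for i in range(max_iter):
                new_rank = (self.damping * self.matrix @ rank
                            + (1 - self.damping) / n)
                diff = np.abs(new_rank - rank).sum()
                log.write("iteration {}: delta={}\n".format(i + 1, diff))
                rank = new_rank
                if diff < m_d:
                    break
        return rank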
def sample_generation(args):
    # Preprocessing step
    print("Numpy Version Check")
    print(np.__version__)
    print("Scipy Version Check")
    print(scipy.__version__)
    data_dicts = preprocessing(transition_matrix_path=args.transition_matrix,
                               doc_topics_path=args.document_topic,
                               user_topic_path=args.user_topic_interest,
                               query_topic_path=args.query_topic_relation,
                               search_relevance_path=args.search_relevance)
    # GPR, PTSPR, QTSPR construction
    if args.pagerank == "gpr":
        pr = PageRank(trans_matrix=data_dicts['transition_matrix'],
                      dampening_factor=args.dampening_factor)
    elif args.pagerank == "ptspr" or args.pagerank == "qtspr":
        pr = TopicSensitivePageRank(
            trans_matrix=data_dicts['transition_matrix'],
            topic_matrix=data_dicts['doc_topic_matrix'],
            dampening_factor=args.dampening_factor,
            topic_factor=args.topic_factor)
    pr.converge()
    if args.pagerank == "gpr":
        np.savetxt("GPR.txt", pr.ranked_vector, delimiter=" ")
    elif args.pagerank == "ptspr":
        # PTSPR weights the per-topic rank vectors by the user's topic interests
        # (the original code wrote this to QTSPR-U2Q1.txt; the file names below
        # are matched to the branch they belong to).
        topic_prob = data_dicts['user_topic_probs']["2-1"]
        vector = (pr.ranked_matrix * topic_prob.reshape(12, 1)).view(
            np.ndarray).squeeze()
        np.savetxt("PTSPR-U2Q1.txt", vector, delimiter=" ")
    elif args.pagerank == "qtspr":
        # QTSPR weights the per-topic rank vectors by the query's topic distribution.
        topic_prob = data_dicts['query_topic_probs']["2-1"]
        vector = (pr.ranked_matrix * topic_prob.reshape(12, 1)).view(
            np.ndarray).squeeze()
        np.savetxt("QTSPR-U2Q1.txt", vector, delimiter=" ")
    print("===================== END =====================")
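# NOTE (editor's sketch): in topic-sensitive PageRank the score vector for a
# user or query is a convex combination of the per-topic rank vectors,
#     r = sum_k p(topic_k) * r_k,
# which is what the reshape(12, 1) product above computes, assuming
# pr.ranked_matrix is (num_docs x 12) with one column per topic. A tiny,
# self-contained illustration with made-up numbers:
import numpy as np

ranked_matrix = np.array([[0.5, 0.1],   # hypothetical: 3 docs, 2 topics
                          [0.3, 0.2],
                          [0.2, 0.7]])
topic_prob = np.array([0.25, 0.75])     # hypothetical p(topic | query)
combined = ranked_matrix @ topic_prob.reshape(2, 1)
print(combined.ravel())                 # per-document topic-sensitive scores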
def __init__(self):
    self.db = Datasource()
    self.pagerank = PageRank()
def __init__(self):
    self.pagerank = PageRank()
    self.ranking = None
    self.res = None
    lista = list(self.pagerank.autores)
    lista.sort()
    self.AutorList = lista
    self.raiz = Tk()
    self.raiz.geometry('950x500')
    self.raiz.title('Buscador')
    # Text box where the results are shown.
    self.tinfo = scrolledtext.ScrolledText(self.raiz, width=50, height=30)
    self.tinfo.grid(column=0, row=6)
    # Entry box where the query is typed.
    self.tentry = Entry(self.raiz, width=40)
    self.tentry.grid(column=0, row=5)
    # Search button.
    self.binfo = ttk.Button(self.raiz, text='Buscar', command=self.verinfo)
    self.binfo.grid(column=1, row=5)
    # Custom-search button.
    self.bper = ttk.Button(self.raiz, text='Búsqueda personalizada',
                           command=self.verper)
    self.bper.grid(column=1, row=3)
    # Button that shows the initial ranking.
    self.bpag = ttk.Button(self.raiz, text='Mostrar Ranking inicial',
                           command=self.verpag)
    self.bpag.grid(column=2, row=3)
    # Exit button.
    self.bsalir = ttk.Button(self.raiz, text='Salir', command=self.raiz.destroy)
    self.bsalir.grid(column=2, row=5)
    # Drop-down for choosing the search site.
    self.variable = tk.StringVar(self.raiz)
    self.variable.set(self.OptionList[0])
    opt = tk.OptionMenu(self.raiz, self.variable, *self.OptionList)
    opt.config(width=30, font=('Helvetica', 12))
    opt.grid(column=0, row=1)
    # Drop-down for choosing the author.
    self.variable2 = tk.StringVar(self.raiz)
    self.variable2.set(self.AutorList[0])
    opt2 = ttk.Combobox(self.raiz, textvariable=self.variable2,
                        values=self.AutorList)
    opt2.config(width=30, font=('Helvetica', 12))
    opt2.grid(column=0, row=3)
    # Drop-down for choosing the model.
    self.variable3 = tk.StringVar(self.raiz)
    self.variable3.set(self.OptionMod[0])
    opt3 = tk.OptionMenu(self.raiz, self.variable3, *self.OptionMod)
    opt3.config(width=30, font=('Helvetica', 12))
    opt3.grid(column=2, row=1)
    # Drop-down for choosing whether to show the weights.
    self.variable4 = tk.StringVar(self.raiz)
    self.variable4.set(self.OptionRank[0])
    opt4 = tk.OptionMenu(self.raiz, self.variable4, *self.OptionRank)
    opt4.config(width=30, font=('Helvetica', 12))
    opt4.grid(column=1, row=1)
    # Shows the selected option of the search-site drop-down.
    self.labelTest = tk.Label(text="", font=('Helvetica', 12), fg='red')
    self.labelTest.grid(column=0, row=2)
    self.variable.trace("w", self.callback)
    # Shows the selected option of the author drop-down.
    self.labelTest2 = tk.Label(text="", font=('Helvetica', 12), fg='red')
    self.labelTest2.grid(column=0, row=4)
    self.variable2.trace("w", self.callback2)
    # Shows the selected option of the model drop-down.
    self.labelTest3 = tk.Label(text="", font=('Helvetica', 12), fg='red')
    self.labelTest3.grid(column=2, row=2)
    self.variable3.trace("w", self.callback3)
    # Shows the selected option for displaying the weights.
    self.labelTest4 = tk.Label(text="", font=('Helvetica', 12), fg='red')
    self.labelTest4.grid(column=1, row=2)
    self.variable4.trace("w", self.callback4)
    self.tentry.focus_set()
    self.raiz.mainloop()
def search_init(self):
    jieba.initialize()
    self.pagerank = PageRank()
def main(args):
    # Preprocessing step
    data_dicts = preprocessing(transition_matrix_path=args.transition_matrix,
                               doc_topics_path=args.document_topic,
                               user_topic_path=args.user_topic_interest,
                               query_topic_path=args.query_topic_relation,
                               search_relevance_path=args.search_relevance)
    # GPR, PTSPR, QTSPR construction
    if args.pagerank == "gpr":
        pr = PageRank(trans_matrix=data_dicts['transition_matrix'],
                      dampening_factor=args.dampening_factor)
    elif args.pagerank == "ptspr" or args.pagerank == "qtspr":
        pr = TopicSensitivePageRank(
            trans_matrix=data_dicts['transition_matrix'],
            topic_matrix=data_dicts['doc_topic_matrix'],
            dampening_factor=args.dampening_factor,
            topic_factor=args.topic_factor)
    pr_start = time.time()
    pr.converge()
    pr_end = time.time()
    print("Power iteration - {} required time: {:.3f} seconds".format(
        args.pagerank, pr_end - pr_start))
    pr_result = []
    for query_ID in data_dicts['search_relevance_score'].keys():
        candidate_indices, retrieval_scores = \
            data_dicts['search_relevance_score'][query_ID]
        user_topic_prob = data_dicts['user_topic_probs'][query_ID]
        query_topic_prob = data_dicts['query_topic_probs'][query_ID]
        if args.pagerank == "gpr":
            pr_indices, pr_scores = pr.ranking(candidate_indices,
                                               retrieval_scores,
                                               criterion=args.criterion)
        elif args.pagerank == "ptspr":
            pr_indices, pr_scores = pr.ranking(candidate_indices,
                                               retrieval_scores,
                                               user_topic_prob,
                                               criterion=args.criterion)
        elif args.pagerank == "qtspr":
            pr_indices, pr_scores = pr.ranking(candidate_indices,
                                               retrieval_scores,
                                               query_topic_prob,
                                               criterion=args.criterion)
        for idx in range(len(candidate_indices)):
            # Build one result line: query ID, "Q0", doc ID, rank, score, run tag.
            fields = [query_ID, "Q0", str(pr_indices[idx] + 1),
                      str(idx + 1), str(pr_scores[idx]), args.cfg]
            pr_result.append(" ".join(fields))
    pr_result_text = "\n".join(pr_result)
    with open(args.pagerank + "_" + args.cfg + ".txt", "w") as f:
        f.write(pr_result_text)
    pr_end = time.time()
    print("total {} required time: {:.3f} seconds".format(
        args.pagerank, pr_end - pr_start))
    print("===================== END =====================")
def __init__(self):
    util.log("Loading data set...")
    self.newsgroup_data = fetch_20newsgroups(remove=('headers', 'footers'))
    # print(data_set.target.shape)     # categories per document
    # print(data_set.filenames.shape)  # filenames per document
    # Drop near-empty documents. Filtering into a new list avoids the bug of
    # removing items from a list while iterating over it.
    self.newsgroup_data.data = [doc for doc in self.newsgroup_data.data
                                if len(doc) >= 5]
    self.newsgroup_frame = pd.DataFrame.from_dict(
        {'text': self.newsgroup_data.data})
    self.tfidf_matrix = TfIdfMatrix.from_data_set(self.newsgroup_data.data)
    self.inverted_index = InvertedIndex.from_tf_idf_matrix(self.tfidf_matrix)
    util.log("Clustering...")
    self.kmeans = KMeans(tfidf=self.tfidf_matrix.get_matrix(), k=100,
                         max_iterations=30, random_initial=False)
    try:
        self.kmeans.load_cluster_vector('cluster_vector.pkl')
    except FileNotFoundError:
        self.kmeans.do_magic()
        self.kmeans.store_cluster_vector('cluster_vector.pkl')
    util.log("Finished.")
    r = self.kmeans.vector.ravel()
    u = np.unique(self.kmeans.vector)
    print(u)
    try:
        self.adjacency_matrix = pkl.load(open('adjacency_matrix.pkl', "rb"))
    except FileNotFoundError:
        self.adjacency_matrix = AdjacencyMatrix.from_cluster_and_tf_idf_matrix(
            r, self.tfidf_matrix)
        with open('adjacency_matrix.pkl', 'wb') as f:
            pkl.dump(self.adjacency_matrix, f)
    try:
        pr = PageRank(pickle='pr.pkl')
    except FileNotFoundError:
        util.log("No precomputed PageRank...")
        util.log("Calculating PR...")
        pr = PageRank(adjacency_matrix=self.adjacency_matrix.get_matrix(),
                      alpha=0.85, converge=0.00001)
        util.log("Finished PR")
        pr.store_rank_vector('pr.pkl')
    self.pr_vector = pr.get_pagerank(normalized=True)