예제 #1
0
파일: main.py 프로젝트: Arkkraa/466-project
def runFootball():
    """Run PageRank on NCAA_football.csv.

    Each CSV row holds two teams and their scores.  An edge is added from
    the loser to the winner (ties link both ways), then the graph and the
    sorted ranking are printed.
    """
    graph = PageRank()

    # 'with' guarantees the file is closed even if a row fails to parse
    # (the original left the handle open).
    with open('NCAA_football.csv') as f:
        for line in f:
            columns = line.split(',')
            team1 = columns[0].strip()
            value1 = int(columns[1])
            team2 = columns[2].strip()
            value2 = int(columns[3])

            if value1 > value2:
                # team1 won: edge points loser -> winner
                graph.addEdge(team2, team1)
            elif value1 < value2:
                graph.addEdge(team1, team2)
            else:
                # Tie: link both directions.
                graph.addEdge(team2, team1)
                graph.addEdge(team1, team2)

    graph.printGraph()
    iterations, ranks = graph.getPageRank()
    print("Number of iterations:", iterations)
    print(returnSorted(ranks))
예제 #2
0
    def test_calculate_page_rank(self):
        """Smoke-test PageRank.run(): it must complete and yield a dict."""
        # damping factor 0.85, convergence threshold 0.0001
        pagerank = PageRank(self.graph, 0.85, 0.0001)
        pagerank_dict = pagerank.run()

        # Dump the scores for manual inspection (py2 'print k, v' converted
        # to the py3 print function used elsewhere in this file).
        for k, v in pagerank_dict.items():
            print(k, v)

        # Reaching this point without an exception is the whole test.
        assert True
예제 #3
0
def main():
    """Build the sparse transition matrix for the small test graph and run
    five PageRank iterations, printing the weights before and after."""
    # The full data set ("web-Google.txt") exists too; the small test file
    # is used here so the run finishes quickly.  (The original also bound
    # the unused name google_file.)
    input_file = "web-Matvii.txt"
    sparse_matrix = create_sparse_matrix(input_file)
    pr = PageRank(sparse_matrix)
    pr.init_weights()
    print("Initial weights\n", pr.weights)
    pr.calculate_page_rank(5)
    print("Weights after 5 iterations\n", pr.weights)
예제 #4
0
 def test_pagerank(self):
     """PageRank on a 4-node graph must converge to the known solution."""
     # Column-stochastic transition matrix of the 4-page example graph.
     M = mat([[0, 1 / 2, 0, 0], [1 / 3, 0, 0, 1 / 2], [1 / 3, 0, 1, 1 / 2],
              [1 / 3, 1 / 2, 0, 0]])
     # Uniform initial rank vector (column shape).
     R = array([1 / 4, 1 / 4, 1 / 4, 1 / 4]).reshape(-1, 1)
     pr = PageRank(M, R, damping=0.8, max_iter=100)
     R = pr.fit()
     # Expected stationary distribution for damping=0.8.
     R_true = array([15 / 148, 19 / 148, 95 / 148, 19 / 148]).reshape(-1, 1)
     # Compare the *absolute* error: the mean of signed differences can
     # cancel to ~0 even for a badly wrong result (original bug).
     assert mean(abs(R - R_true)) < 1e-3
예제 #5
0
def main(args):
    """Summarize args['doc'] with the algorithm selected by args['alg'].

    Raises KeyError for an unknown algorithm name.
    """
    # Map names to classes (not instances) so only the selected summarizer
    # is constructed; the original eagerly built all four.
    summarizer = {
        'tfidf': TfIdf,
        'cluster': Cluster,
        'svd': SVD,
        'pagerank': PageRank,
    }[args['alg']]()

    summarizer.initialize(args['tf'], args['df'])
    summary = summarizer.summarize(args['doc'])

    for s in summary:
        # The trailing ',' after print(...) was a Python-2 leftover
        # (it merely built a throwaway tuple in py3); dropped.
        print(s)
예제 #6
0
def calculateSolution(dswa, method, gt_solution):
    """Compute and print a solution for *dswa* with the chosen *method*.

    method: 'pagerank' (baseline) or 'unsupervised'.
    gt_solution: accepted for interface compatibility; not used here.
    Returns the calculated solution.
    Raises ValueError for an unknown method (previously this crashed later
    with a NameError on the unbound 'calculated_solution').
    """
    if method == 'pagerank':
        # Baseline approach
        pagerank_approach = PageRank(dswa)
        print("Calculation solution...")
        calculated_solution = pagerank_approach.returnSolution(5)
    elif method == 'unsupervised':
        # Unsupervised approach
        unsupervisedApproach = Unsupervised_Approach(dswa)
        print("Calculation solution...")
        calculated_solution = unsupervisedApproach.returnSolution()
    else:
        raise ValueError("unknown method: {!r}".format(method))

    print('--- Solution ---')
    print(calculated_solution)
    return calculated_solution
예제 #7
0
파일: main.py 프로젝트: Arkkraa/466-project
def runStates():
    """Run PageRank on stateborders.csv (state border adjacency graph)."""
    graph = PageRank()

    # 'with' guarantees the file is closed even on error
    # (the original left the handle open).
    with open('stateborders.csv') as f:
        for line in f:
            columns = line.split(',')
            # Fields are quoted in the CSV; strip the surrounding quotes.
            left = columns[0].strip('"')
            right = columns[2].strip('"')
            graph.addEdge(left, right)

    graph.printGraph()
    iterations, ranks = graph.getPageRank()
    print("Number of iterations:", iterations)
    print(returnSorted(ranks))
예제 #8
0
def rank_author():
    """Rank authors via PageRank on the author-citation network and write
    one "name  score" line per author to 2014/results/author.txt."""
    ac_net = AuthorCitationNetwork()
    ac_net_m, ac_net_list = ac_net.make_matrix()
    print("Caculate pagerank...")
    ac_net_pgr = PageRank(ac_net_m)
    ac_net_pr = ac_net_pgr.caculate("author_iter.txt", m_d=1e-5)
    ac_net_results = list(zip(ac_net_list, ac_net_pr.tolist()))
    ac_net_results.sort(key=itemgetter(1), reverse=True)  # best score first
    results = []
    for result in ac_net_results:
        # result[0][1]: second field of the author entry (presumably the
        # name — confirm against make_matrix); result[1][0]: the score.
        results.append(str(result[0][1]) + "  " + str(result[1][0]) + "\n")
    print("Writing results...")
    # 'with' closes/flushes the file (the original leaked the handle), and
    # writelines(results) writes one line per entry — writelines(str(results))
    # dumped the list's repr instead.
    with open("2014/results/author.txt", 'w') as f:
        f.writelines(results)
예제 #9
0
def rank_paper():
    """Rank papers via PageRank on the paper-citation network and write
    one "id  score" line per paper to 2014/results/paper.txt."""
    pp_net = PaperCitationNetwork()
    pp_net_m, pp_net_list = pp_net.make_matrix()
    print("Caculating pagerank...")
    pp_net_pgr = PageRank(pp_net_m)
    pp_net_pr = pp_net_pgr.caculate("paper_iter.txt", m_d=1e-7)
    pp_net_results = list(zip(pp_net_list, pp_net_pr.tolist()))
    # NOTE(review): sorts ascending, unlike rank_author/rank_venue which
    # pass reverse=True — confirm whether that is intentional.
    pp_net_results.sort(key=itemgetter(1))
    results = []
    for result in pp_net_results:
        # result[0][1]: second field of the paper entry; result[1][0]: score.
        results.append(str(result[0][1]) + "  " + str(result[1][0]) + "\n")
    print("Writing results...")
    # 'with' closes/flushes the file (the original leaked the handle), and
    # writelines(results) writes one line per entry — writelines(str(results))
    # dumped the list's repr instead.
    with open("2014/results/paper.txt", 'w') as f:
        f.writelines(results)
예제 #10
0
def rank_venue():
    """Rank venues via PageRank on the venue-citation network and write the
    results to 2014/results/venue.txt."""
    vn_net = VenueCitationNetwork()
    vn_net_m, vn_net_list = vn_net.make_matrix()
    print("Caculating pagerank...")
    vn_net_pgr = PageRank(vn_net_m)
    vn_net_pr = vn_net_pgr.caculate("venue_iter.txt", m_d=1e-7)
    # Keep only the first two fields of each venue entry.
    vn_list = [vn[0:2] for vn in vn_net_list]
    vn_net_results = list(zip(vn_list, vn_net_pr.tolist()))
    vn_net_results.sort(key=itemgetter(1), reverse=True)  # best score first
    results = []
    for result in vn_net_results:
        # Two venue fields, then the score.
        results.append(
            str(result[0][0]) + "  " + str(result[0][1]) + "   " +
            str(result[1][0]) + "\n")
    # Progress message moved *before* the write (it used to print after the
    # file was already written); 'with' closes the previously-leaked handle.
    print("Writing results...")
    with open("2014/results/venue.txt", 'w') as f:
        f.writelines(results)
def sample_generation(args):
    """Run one PageRank variant and save a sample ranked vector to disk.

    args.pagerank selects 'gpr', 'ptspr' or 'qtspr'; for the topic-sensitive
    variants the sample uses the user/query key "2-1".
    Raises ValueError for an unknown variant.
    """
    # Preprocessing step
    print("Numpy Version Check")
    print(np.__version__)
    print("Scipy Version Check")
    print(scipy.__version__)
    data_dicts = preprocessing(transition_matrix_path=args.transition_matrix,
                               doc_topics_path=args.document_topic,
                               user_topic_path=args.user_topic_interest,
                               query_topic_path=args.query_topic_relation,
                               search_relevance_path=args.search_relevance)

    # GPR, PTSPR, QTSPR construction
    if args.pagerank == "gpr":
        pr = PageRank(trans_matrix=data_dicts['transition_matrix'],
                      dampening_factor=args.dampening_factor)
    elif args.pagerank in ("ptspr", "qtspr"):
        pr = TopicSensitivePageRank(
            trans_matrix=data_dicts['transition_matrix'],
            topic_matrix=data_dicts['doc_topic_matrix'],
            dampening_factor=args.dampening_factor,
            topic_factor=args.topic_factor)
    else:
        # Previously fell through to an unbound 'pr' (NameError).
        raise ValueError(
            "unknown pagerank variant: {!r}".format(args.pagerank))

    pr.converge()

    if args.pagerank == "gpr":
        np.savetxt("GPR.txt", pr.ranked_vector, delimiter=" ")
    elif args.pagerank == "ptspr":
        # Personalized (user-topic) variant -> PTSPR output file.  The
        # original saved this branch to "QTSPR-U2Q1.txt" and the qtspr
        # branch to "PTSPR-U2Q1.txt" — the filenames were swapped.
        topic_prob = data_dicts['user_topic_probs']["2-1"]
        vector = (pr.ranked_matrix * topic_prob.reshape(12, 1)).view(
            np.ndarray).squeeze()
        np.savetxt("PTSPR-U2Q1.txt", vector, delimiter=" ")
    elif args.pagerank == "qtspr":
        # Query-topic variant -> QTSPR output file.
        topic_prob = data_dicts['query_topic_probs']["2-1"]
        vector = (pr.ranked_matrix * topic_prob.reshape(12, 1)).view(
            np.ndarray).squeeze()
        np.savetxt("QTSPR-U2Q1.txt", vector, delimiter=" ")
    print("===================== END =====================")
 def __init__(self):
     # Data-access layer and the PageRank engine this object relies on.
     self.db = Datasource()
     self.pagerank = PageRank()
예제 #13
0
    def __init__(self):
        """Build the Tkinter search GUI backed by a PageRank index and start
        its main loop (this constructor blocks until the window is closed)."""
        self.pagerank = PageRank()
        self.ranking = None
        self.res = None

        # Sorted author names feeding the author combobox.
        lista = list(self.pagerank.autores)
        lista.sort()
        self.AutorList = lista

        self.raiz = Tk()
        self.raiz.geometry('950x500')

        self.raiz.title('Buscador')

        # Cell where the results are shown.
        self.tinfo = scrolledtext.ScrolledText(self.raiz, width=50, height=30)
        self.tinfo.grid(column = 0, row = 6)

        # Cell where the query is typed.
        self.tentry = Entry(self.raiz, width=40)
        self.tentry.grid(column = 0, row = 5)

        # Search button.
        self.binfo = ttk.Button(self.raiz, text='Buscar',                      command=self.verinfo)
        self.binfo.grid(column = 1, row =5)

        # Personalized-search button.
        self.bper = ttk.Button(self.raiz, text='Búsqueda personalizada',        command=self.verper)
        self.bper.grid(column = 1, row =3)

        # Button to show the initial ranking.
        self.bpag = ttk.Button(self.raiz, text='Mostrar Ranking inicial',        command=self.verpag)
        self.bpag.grid(column = 2, row =3)

        # Exit button.
        self.bsalir = ttk.Button(self.raiz, text='Salir',
                                 command=self.raiz.destroy)
        self.bsalir.grid(column = 2, row = 5)

        # Drop-down to choose the search site.
        # NOTE(review): self.OptionList is not defined in this method —
        # presumably a class attribute; verify it exists on the class.
        self.variable = tk.StringVar(self.raiz)
        self.variable.set(self.OptionList[0])
        opt = tk.OptionMenu(self.raiz, self.variable, *self.OptionList)
        opt.config(width=30, font=('Helvetica', 12))
        opt.grid(column = 0, row = 1)

        # Drop-down to choose the author.
        self.variable2 = tk.StringVar(self.raiz)
        self.variable2.set(self.AutorList[0])
        opt2 = ttk.Combobox(self.raiz, textvariable = self.variable2, values = self.AutorList)
        opt2.config(width=30, font=('Helvetica', 12))
        opt2.grid(column = 0, row = 3)

        # Drop-down to choose the model.
        self.variable3 = tk.StringVar(self.raiz)
        self.variable3.set(self.OptionMod[0])
        opt3 = tk.OptionMenu(self.raiz, self.variable3, *self.OptionMod)
        opt3.config(width=30, font=('Helvetica', 12))
        opt3.grid(column = 2, row = 1)

        # Drop-down to choose whether to show the weights.
        # (The local name opt3 is intentionally reused; only the widget's
        # grid placement matters after configuration.)
        self.variable4 = tk.StringVar(self.raiz)
        self.variable4.set(self.OptionRank[0])
        opt3 = tk.OptionMenu(self.raiz, self.variable4, *self.OptionRank)
        opt3.config(width=30, font=('Helvetica', 12))
        opt3.grid(column = 1, row = 1)

        # Label mirroring the selected search site (updated via trace).
        self.labelTest = tk.Label(text="", font=('Helvetica', 12), fg='red')
        self.labelTest.grid(column = 0, row = 2)
        self.variable.trace("w", self.callback)

        # Label mirroring the selected author.
        self.labelTest2 = tk.Label(text="", font=('Helvetica', 12), fg='red')
        self.labelTest2.grid(column = 0, row = 4)
        self.variable2.trace("w", self.callback2)

        # Label mirroring the selected model.
        self.labelTest3 = tk.Label(text="", font=('Helvetica', 12), fg='red')
        self.labelTest3.grid(column = 2, row = 2)
        self.variable3.trace("w", self.callback3)

        # Label mirroring the show-weights choice.
        self.labelTest4 = tk.Label(text="", font=('Helvetica', 12), fg='red')
        self.labelTest4.grid(column = 1, row = 2)
        self.variable4.trace("w", self.callback4)

        self.tentry.focus_set()
        self.raiz.mainloop()
예제 #14
0
 def search_init(self):
     """Initialise search dependencies: the jieba tokenizer and PageRank."""
     jieba.initialize()
     self.pagerank = PageRank()
예제 #15
0
def main(args):
    """Run the selected PageRank variant, re-rank each query's retrieval
    candidates, and write a "<variant>_<cfg>.txt" run file.

    args.pagerank selects 'gpr', 'ptspr' or 'qtspr'.
    Raises ValueError for an unknown variant.
    """
    # Preprocessing step
    data_dicts = preprocessing(transition_matrix_path=args.transition_matrix,
                               doc_topics_path=args.document_topic,
                               user_topic_path=args.user_topic_interest,
                               query_topic_path=args.query_topic_relation,
                               search_relevance_path=args.search_relevance)

    # GPR, PTSPR, QTSPR construction
    if args.pagerank == "gpr":
        pr = PageRank(trans_matrix=data_dicts['transition_matrix'],
                      dampening_factor=args.dampening_factor)
    elif args.pagerank in ("ptspr", "qtspr"):
        pr = TopicSensitivePageRank(
            trans_matrix=data_dicts['transition_matrix'],
            topic_matrix=data_dicts['doc_topic_matrix'],
            dampening_factor=args.dampening_factor,
            topic_factor=args.topic_factor)
    else:
        # Previously fell through and crashed later on the unbound 'pr'.
        raise ValueError(
            "unknown pagerank variant: {!r}".format(args.pagerank))

    pr_start = time.time()
    pr.converge()
    pr_end = time.time()
    print("Power iteration - {} required time: {:.3f}seconds".format(
        args.pagerank, pr_end - pr_start))

    pr_result = []
    for query_ID in data_dicts['search_relevance_score'].keys():
        candidate_indices, retrieval_scores = data_dicts[
            'search_relevance_score'][query_ID]
        user_topic_prob = data_dicts['user_topic_probs'][query_ID]
        query_topic_prob = data_dicts['query_topic_probs'][query_ID]

        # The topic-sensitive variants take an extra topic distribution
        # (user-based for ptspr, query-based for qtspr) when re-ranking.
        if args.pagerank == "gpr":
            pr_indices, pr_scores = pr.ranking(candidate_indices,
                                               retrieval_scores,
                                               criterion=args.criterion)
        elif args.pagerank == "ptspr":
            pr_indices, pr_scores = pr.ranking(candidate_indices,
                                               retrieval_scores,
                                               user_topic_prob,
                                               criterion=args.criterion)
        else:  # "qtspr" — unknown variants were rejected above
            pr_indices, pr_scores = pr.ranking(candidate_indices,
                                               retrieval_scores,
                                               query_topic_prob,
                                               criterion=args.criterion)

        for idx in range(len(candidate_indices)):
            # TREC-style run line: qid Q0 docno rank score run_tag.
            # (Replaces the original's awkward temp = [[]] construction.)
            fields = [
                query_ID,
                "Q0",
                str(pr_indices[idx] + 1),  # document ids are 1-based here
                str(idx + 1),
                str(pr_scores[idx]),
                args.cfg,
            ]
            pr_result.append(" ".join(fields))

    pr_result_text = "\n".join(pr_result)

    with open(args.pagerank + "_" + args.cfg + ".txt", "w") as f:
        f.write(pr_result_text)

    pr_end = time.time()
    print("total {} required time : {:.3f}seconds".format(
        args.pagerank, pr_end - pr_start))
    print("===================== END =====================")
예제 #16
0
    def __init__(self):
        """Load the 20-newsgroups corpus, build the TF-IDF matrix, inverted
        index and k-means clusters, then compute (or load a cached)
        PageRank vector over the cluster adjacency graph."""
        util.log("Loading data set...")
        self.newsgroup_data = fetch_20newsgroups(remove=('headers', 'footers'))

        # Drop near-empty documents.  The original called .remove() on the
        # list *while iterating it*, which silently skips elements after
        # each removal; rebuilding the list inspects every document once.
        self.newsgroup_data.data = [
            doc for doc in self.newsgroup_data.data if len(doc) >= 5
        ]

        self.newsgroup_frame = pd.DataFrame.from_dict(
            {'text': self.newsgroup_data.data})

        self.tfidf_matrix = TfIdfMatrix.from_data_set(self.newsgroup_data.data)

        self.inverted_index = InvertedIndex.from_tf_idf_matrix(
            self.tfidf_matrix)

        util.log("Clustering...")
        self.kmeans = KMeans(tfidf=self.tfidf_matrix.get_matrix(),
                             k=100,
                             max_iterations=30,
                             random_initial=False)

        # Reuse a cached cluster assignment when one exists on disk.
        try:
            self.kmeans.load_cluster_vector('cluster_vector.pkl')
        except FileNotFoundError:
            self.kmeans.do_magic()
            self.kmeans.store_cluster_vector('cluster_vector.pkl')

        util.log("Finished.")

        cluster_vector = self.kmeans.vector.ravel()
        # Show which cluster ids were actually assigned.
        print(np.unique(self.kmeans.vector))

        # Adjacency matrix: load from cache, else build and cache it.
        # ('with' closes the handle the original pkl.load(open(...)) leaked.)
        try:
            with open('adjacency_matrix.pkl', 'rb') as f:
                self.adjacency_matrix = pkl.load(f)
        except FileNotFoundError:
            self.adjacency_matrix = AdjacencyMatrix.from_cluster_and_tf_idf_matrix(
                cluster_vector, self.tfidf_matrix)
            with open('adjacency_matrix.pkl', 'wb') as f:
                pkl.dump(self.adjacency_matrix, f)

        # PageRank: load a precomputed vector, else compute from scratch.
        try:
            pr = PageRank(pickle='pr.pkl')
        except FileNotFoundError:
            util.log("No precomputed PageRank...")
            util.log("Calculating PR...")
            pr = PageRank(adjacency_matrix=self.adjacency_matrix.get_matrix(),
                          alpha=0.85,
                          converge=0.00001)
        util.log("Finished PR")
        pr.store_rank_vector('pr.pkl')

        self.pr_vector = pr.get_pagerank(normalized=True)