Example #1
 def testmethod_4(self):
     dic = PageRank.relative_font_size("youtube")
     result = PageRank.search("youtube")
     if result == [-1]:
         self.assertNotEqual(len(dic), 1, "fail")
     else:
         self.assertNotEqual(len(dic), 0, "fail")
Example #2
 def testmethod_1(self):
     dic = PageRank.hits("zhu")
     result = PageRank.search("zhu")
     if result == [-1]:
         self.assertNotEqual(len(dic), 1, "fail")
     else:
         self.assertNotEqual(len(dic), 0, "fail")
Example #3
def test_PR():

    IGgraph1=PageRank.getIGraph("MATCH (n:`andra-user`)<-[r:`andra-from`]-(t1:`andra-tweet`)-[re:`andra-retweet`]->(t2:`andra-tweet`)-[rel:`andra-from`]->(p:`andra-user`) WHERE t2.language="'"fr"'" RETURN n as nodeFrom,p as nodeTo",\
            grapheNeo,"192.168.1.75:7474","neo4j", "pass4dbse")
    IGgraph2=PageRank.getIGraph("MATCH (n:`andra-tweet`)-[re:`andra-retweet`]->(p:`andra-tweet`) RETURN n as nodeFrom,p as nodeTo",\
            grapheNeo,"192.168.1.75:7474","neo4j", "pass4dbse")

    assert type(IGgraph1) == igraph.Graph
    assert len(IGgraph1.vs) != 0
    assert len(IGgraph1.es) != 0

    assert type(IGgraph2) == igraph.Graph
    assert len(IGgraph2.vs) != 0
    assert len(IGgraph2.es) != 0

    PRank1 = PageRank.Rank(IGgraph1, 10)
    PRank2 = PageRank.Rank(IGgraph2, 10)

    assert type(PRank1) == dict
    assert len(PRank1['classement']) == 10
    assert type(PRank1['classement']) == list
    assert type(PRank1['resPR']) == list
    assert len(PRank1['resPR']) != 0
    assert type(PRank2) == dict
    assert len(PRank2['classement']) == 10
    assert type(PRank2['classement']) == list
    assert type(PRank2['resPR']) == list
    assert len(PRank2['resPR']) != 0
    print "PAGERANK OK"
Example #4
    def pesquisar(self):
        self.tela.wm_title('Googlis!')
        self.tela.wm_minsize(width=500, height=250)
        tkinter.Label(self.tela, text=self.texto,
                      font=self.fonte).pack(side='top')

        #####  IMAGE
        img = tkinter.PhotoImage(file='googlis.png')
        logo = tkinter.Button(self.tela, image=img)
        logo.image = img
        logo.place(x=190, y=30)

        #####  BUTTONS
        busca = tkinter.Entry(self.tela)
        busca.place(x=150, y=150, width=200)

        botao_1 = tkinter.Button(
            self.tela,
            text='Pesquisa Googlis!',
            command=lambda: PageRank.PageRank(busca.get()).rank_it())

        botao_2 = tkinter.Button(self.tela, text='Estou com sorte!')

        botao_1.place(x=130, y=200)
        botao_2.place(x=270, y=200)
Example #5
def main():
    data = read()
    M = data2matrix(data)
    v = PageRank.pagerank(M, 0.001, 0.85)
    funcDict = {}
    for i in xrange(counter):
        funcDict[id2func[i]] = v[i][0]
    funcDict = OrderedDict(sorted(funcDict.items(), key=lambda t: -t[1]))
    with open(os.path.join(resultFolder, resultFile), 'w') as pageRankFile:
        json.dump(funcDict, pageRankFile, indent=4)
Example #6
class SearchEngine:

  def __init__(self):
    indexer      = Indexer()
    self.graph   = Graph()
    self.crawler = Crawler({"http://mysql12.f4.htw-berlin.de/crawl/d01.html",
                            "http://mysql12.f4.htw-berlin.de/crawl/d06.html",
                            "http://mysql12.f4.htw-berlin.de/crawl/d08.html"},
                            self.graph, indexer)
    self.crawler.crawl()
    self.scorer = Scorer(indexer.index, indexer.documents)

    self.pageRank = PageRank(self.graph)
    self.pageRank.calc()


  def search (self, string, scoreOnly = False):
    query  = string.split()
    scores = self.scorer.scoreQuery(query)

    if scoreOnly:
      results = scores
    else:
      results = {}
      for url, score in scores.items():
        results[url] = score * self.graph.get_document(url).rank

    sortedResults = sorted(results.items(), key=operator.itemgetter(1), reverse = True)
    for res in sortedResults:
      print(res)

  def printPageRanks(self):
    print('Page ranks:')
    print('  d01  -   d02  -   d03  -   d04  -   d05  -   d06  -   d07  -   d08')
    print(round(self.graph.get_document("http://mysql12.f4.htw-berlin.de/crawl/d01.html").rank, 4), end = ' - ')
    print(round(self.graph.get_document("http://mysql12.f4.htw-berlin.de/crawl/d02.html").rank, 4), end = ' - ')
    print(round(self.graph.get_document("http://mysql12.f4.htw-berlin.de/crawl/d03.html").rank, 4), end = ' - ')
    print(round(self.graph.get_document("http://mysql12.f4.htw-berlin.de/crawl/d04.html").rank, 4), end = ' - ')
    print(round(self.graph.get_document("http://mysql12.f4.htw-berlin.de/crawl/d05.html").rank, 4), end = ' - ')
    print(round(self.graph.get_document("http://mysql12.f4.htw-berlin.de/crawl/d06.html").rank, 4), end = ' - ')
    print(round(self.graph.get_document("http://mysql12.f4.htw-berlin.de/crawl/d07.html").rank, 4), end = ' - ')
    print(round(self.graph.get_document("http://mysql12.f4.htw-berlin.de/crawl/d08.html").rank, 4), end = '\n\n')
Example #7
def results():

    if request.method == 'POST':
        MAX_RESULTS = 20
        searchQuery = request.form['SearchQuery']  #get text entered
        results = PageRank.getRelevantResults(searchQuery, Table_InvertedIndex,
                                              Table_Webpages, MAX_RESULTS)

        return render_template(
            "results.html",
            results=results)  #pass list of results for rendering
Example #8
    def run(self, alpha, save_dir=""):
        pg = PageRank(self.itr, self.error, alpha)

        author_resvec = pg.run(self.author_mat, self.author_init,
                               self.author_len)
        self.author.ScoreToName("Author", author_resvec,
                                save_dir + "author_page_rank.txt")
        author_resmap = self.author.getResult()

        paper_resvec = pg.run(self.paper_mat, self.paper_init, self.paper_len)
        self.paper.ScoreToName("self.paper", paper_resvec,
                               save_dir + "paper_page_rank.txt")
        paper_resmap = self.paper.getResult()

        self.venue_resvec = pg.run(self.venue_mat, self.venue_init,
                                   self.venue_len)
        self.venue.ScoreToVenue(self.venue_resvec, save_dir + "venue_rank.txt")
        venue_resmap = self.venue.getResult()

        return author_resmap, paper_resmap, venue_resmap
Example #9
 def run(self):
     self.output.config(state=NORMAL)
     self.output.delete(1.0, END)
     try:
         result, htmlPageNames = PageRank.PageRank(float(self.aEntry.get()), currentFile)
         for item in reversed(range(len(result))):
             self.output.insert('1.0', htmlPageNames[item] + ': ' + str(round(float(result[item]), 3)) + '\n')
         # self.output.insert('1.0', result)
         self.output.insert('1.0', 'Importance of pages:' + '\n')
     except:
         self.output.insert('1.0', 'Invalid input.')
     self.output.config(state=DISABLED)
Example #10
def links(query):
    text = []
    f = open('./results.txt', 'w')  # wipe out results from the previous query so they do not carry over into a new one
    f.write('<p style="font-family:FreeSans;font-size:120%;color:black">')
    f.close()  # write the markup that displays the results in FreeSans at 1.2 times the original size, then close the file
    PageRank.search(query)  # call the search function in PageRank.py, which computes and stores the results in "results.txt"
    f = open("./results.txt")
    f2 = open('./numbers.txt')  # contains the total number of results for the user's query
    for line in f:  # read the results file and copy its contents into a list named 'text', one element per line
        word = line.strip()
        text.append(word)
    f.close()
    for line in f2:  # read the number of results and save it in a variable named 'word2'
        word2 = line.strip()
    f2.close()
    html = '<p style="font-family:FreeSans;font-size:120%;color:black">' + "Your query '" + query + "' returned the following " + word2 + " result(s):<br/>"
    if ' ' in query:  # more than one keyword was entered
        html = html + "NOTE: Since you typed MORE THAN ONE keyword, the following results may not be accurate.<br/>"
    html = html + "<br/>"
    html = html + "<br/>".join(text)
    return html  # return the results for display to the user
Example #11
  def __init__(self):
    indexer      = Indexer()
    self.graph   = Graph()
    self.crawler = Crawler({"http://mysql12.f4.htw-berlin.de/crawl/d01.html",
                            "http://mysql12.f4.htw-berlin.de/crawl/d06.html",
                            "http://mysql12.f4.htw-berlin.de/crawl/d08.html"},
                            self.graph, indexer)
    self.crawler.crawl()
    self.scorer = Scorer(indexer.index, indexer.documents)

    self.pageRank = PageRank(self.graph)
    self.pageRank.calc()
Example #12
 def PageRank(self):
     #Specify parameters of Page Rank algorithm  
     iterations = 20
     initial_pr = 1.0
   
     if self.dataBaseConnection.cursor():
     
         #Fetch all links from persistent file, sort them using PageRank and store them into rankedList
         self.cursor.execute('SELECT * FROM Links;')
         myData = self.cursor.fetchall()
         rankedList = PageRank.page_rank(myData, iterations, initial_pr)
         
         for x in rankedList:
             self.cursor.execute( """INSERT OR REPLACE INTO PageRank (doc_id, rank)  VALUES('%s', '%s');""" %  ( x,  rankedList[x]) ) # Use INSERT OR REPLACE to prevent duplicate
             self.dataBaseConnection.commit() 
Example #13
def experiment_lambda_rounds(airports_hash, airports_sink, results, init_function, init_type):
    for i in range(5, 10):
        lambda_value = i*0.1
        time1 = time.time()
        init_function(airports_hash)
        iterations = pr.compute_page_ranks(
            airports_hash=airports_hash, airports_sink=airports_sink, lambda_value=lambda_value, init_ranks=init_function)
        time2 = time.time()
        t = time2 - time1
        if init_type not in results:
            results[init_type] = {}
        lambda_str = '{:.1f}'.format(lambda_value)
        results[init_type][lambda_str] = {}
        results[init_type][lambda_str]['iterations'] = iterations
        results[init_type][lambda_str]['time'] = t
        results[init_type][lambda_str]['ranks'] = extract_results_airports(airports_hash)
Example #14
    def __init__(self, layers=None, interLayers=None, weights=None):
        """
        Construct a Mulet with a list of individual layers.
        weights represent the inter layer edges between different layers.
        :return: None
        """
        if layers is not None:
            self.layers = layers
        if weights is not None:
            self.weights = weights
        if interLayers is not None:
            self.interLayers = interLayers
        self.getGenericGraphfromLayers()
        self.detCumulativeIntraLayerAcceptance()
        self.updateCumulativeAcceptance()

        self.detCumulativeInterLayerAcceptance()
        self.detCumulativeIntraLayerRejectance()
        self.updateCumulativeRejectance()

        self.detCumulativeInterLayerRejectance()
        finIp = InfluencePassivity(filename=None)

        finIp.InfluencePassivityAlgorithm(mygraph=self.g,
                                          Avals=self.A_delta,
                                          Rvals=self.R_delta)
        with open(
                '/Users/rashmijrao/Documents/IP-master/A1/Scripts2/NS_Final/Influences.json',
                'w') as outfile:
            json.dump(finIp.I, outfile)
        with open(
                '/Users/rashmijrao/Documents/IP-master/A1/Scripts2/NS_Final/Passivities.json',
                'w') as outfile:
            json.dump(finIp.P, outfile)

        print('Sum I:::', str(max((finIp.I.values()))))
        print('Sum P:::', str(max((finIp.P.values()))))

        pr = PageRank(directional=True)
        pr.modifyGraph(self.g)
        pr.pageRankAlgorithm(m=10)

        with open(
                '/Users/rashmijrao/Documents/IP-master/A1/Scripts2/NS_Final/Authority.json',
                'w') as outfile:
            json.dump(pr.a, outfile)
        with open(
                '/Users/rashmijrao/Documents/IP-master/A1/Scripts2/NS_Final/Hub.json',
                'w') as outfile:
            json.dump(pr.h, outfile)

        print('pagerank a:::', max(pr.a.values()))
        print('pagerank h:::', max(pr.h.values()))
Example #15
 def testmethod_5(self):
     result = PageRank.trim_url("http://en.wikipedia.org/wiki/Hotmail") 
     self.assertEqual(result, "wikipedia.org")
Example #16
     (authority, hubness) = HITS.HITS(graph)
     tEnd = time.time()
     timeCost = tEnd - tStart
     with open("output.txt", 'w', encoding='UTF-8') as f:
         f.write("time cost: %f\n" % timeCost)
         f.write("HITS\n")
         f.write("   authority, hubness\n")
         for key in graph:
             f.write("%s: " % key)
             f.write("%f, " % authority[key])
             f.write("%f\n" % hubness[key])
     f.close()
 elif method == 'pagerank':
     damp = 0.15
     tStart = time.time()
     pageRank = PageRank.PageRank(graph, damp)
     tEnd = time.time()
     timeCost = tEnd - tStart
     with open("output.txt", 'w', encoding='UTF-8') as f:
         f.write("time cost: %f\n" % timeCost)
         f.write("PageRank\n")
         for key in pageRank:
             f.write("%s: " % key)
             f.write("%f\n" % pageRank[key])
     f.close()
 elif method == 'simrank':
     c = 0.8  #decay factor
     tStart = time.time()
     simMatrix = SimRank.SimRank(graph, c)
     tEnd = time.time()
     timeCost = tEnd - tStart
Example #17
import networkx as nx
import sys
sys.path.append('..')
import PageRank

G = nx.DiGraph()

G.add_nodes_from([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])

G.add_edge(1, 2)
G.add_edge(1, 4)
G.add_edge(1, 3)
G.add_edge(2, 3)
G.add_edge(4, 3)
G.add_edge(1, 5)
G.add_edge(2, 1)
G.add_edge(5, 2)
G.add_edge(3, 4)
G.add_edge(2, 4)

PR = PageRank.PageRank(G)
PR.constructDispersionMatrix(G)
print(PR.getPageRank())
print(sum(PR.getPageRank()))
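
As a quick cross-check (not part of the original example), the same graph can also be scored with networkx's built-in PageRank. The damping factor alpha=0.85 is an assumption here, since the custom PageRank class above does not expose its parameters:

# Hedged sanity check against networkx's own implementation.
nx_scores = nx.pagerank(G, alpha=0.85)  # alpha=0.85 is assumed, not read from PageRank.py
print(sorted(nx_scores.items(), key=lambda kv: kv[1], reverse=True))
print(sum(nx_scores.values()))  # like sum(PR.getPageRank()) above, this should be close to 1.0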
Example #18
def main():
    WebCrawler.main()
    Indexer.main()
    PageRank.main()
Example #19
        return 1
    return count


for s in range(len(sentences)):
    if s > 0:
        adjMax[s][s - 1] += findRelation(sentences[s]) * 0
    for ss in range(len(sentences)):
        if s != ss:
            adjMax[s][ss] += findOverlap(sentences[s], sentences[ss])

# for m in adjMax:
#     print " ".join([str(i) for i in m])

G = np.array(adjMax)
rank = PageRank.pageRank(G, s=0.5)

l = []
ind = 0
for r in rank:
    l.append((r, ind))
    ind += 1

l.sort()
l.reverse()
print words_used
# print all important sentences
for k in l:
    print sentences[k[1]]
    print k[0]
    print "==============="
Example #20
 def test_calc(self):
     self.assertTrue(P.get_weight())
Example #21
import re
import numpy as np

data = open(
    'E:\\learning\\WebDataMining\\hw\\aan\\release\\2014\\venue_test.txt')

# pattern of the venue
p1 = r"(?<=id\s=\s{).+?(?=})"
pattern1 = re.compile(p1)
p2 = r"(?<=venue\s=\s{).+?(?=})"
pattern2 = re.compile(p2)

# venue
venue = MapToMatrix('E:\\learning\\WebDataMining\\hw\\aan\\release\\2014\\')
venue.FileToList_Reg('venue_test.txt', pattern1, pattern2)
venue_len = venue.getLen()
venue_mat = venue.SwitchMap('paper_venue_test.txt')
venue_init = np.ones(venue_len)

pg = PageRank(50, 0.0001, 0.6)
venue_resvec = pg.run(venue_mat, venue_init, venue_len)

venue.ScoreToVenue(venue_resvec, "exp_venue_rank.txt")
venue_resmap = venue.getResult()
venue_topn = topN(3, venue_resmap)

sum_error = compareDiff(venue_topn, venue_topn)
print sum_error
# #
# alpha=0.6
# topn=10
# print "when alpha=%f, the top %d is \n"%(alpha,topn)
Example #22
    def __init__(self,
                 fichier,
                 findParametre=False,
                 metrique="FMesure",
                 tailleTrain=0.65,
                 verbose=False):
        """
           Permet d'initialiser EvalAllIRModel
           
        :type fichier: String 
        :param fichier: le fichier ou sont stocker les queries, 
            
        :type findParametre: boolean
        :param findParametre: boolean pour activer l'optimisation des parametres
        
        :type metrique: String
        :param metrique: La metrique que l'on veux utiliser  
                
        :type tailleTrain: float
        :param tailleTrain: la proportion de train sur l'ensemble des queries

        :type verbose: boolean
        :param verbose: boolean pour activer le mode verbeux
        """

        collection = Parser.Parser.buildDocCollectionSimple(fichier + '.txt',
                                                            pageRank=True)
        self.collectionQry = Parser.Parser.buildQueryCollection(fichier)
        self.train = dict()
        self.verbose = verbose

        self.print_verbose("Initialisation")

        if findParametre:
            self.separeTrainTest(tailleTrain)

        index = IndexerSimple.IndexerSimple()
        index.indexation(collection)

        self.weighter = [
            Weighter.Weighter1(index),
            Weighter.Weighter2(index),
            Weighter.Weighter3(index),
            Weighter.Weighter4(index),
            Weighter.Weighter5(index)
        ]

        modelIR = [IRModel.Vectoriel, IRModel.Jelinek_Mercer, IRModel.Okapi]

        model = []

        for w in self.weighter:
            for m in range(len(modelIR)):

                if m == 1:  # for the Jelinek_Mercer model
                    jelinek = modelIR[m](w)
                    if findParametre:
                        jelinek.findParametreOptimaux(np.arange(0, 1.4, 0.1),
                                                      self.train, metrique)
                    model.append(jelinek)
                elif m == 2:  # for the Okapi model
                    okapi = modelIR[m](w)
                    if findParametre:
                        okapi.findParametreOptimaux(np.arange(0, 0.5, 0.1),
                                                    np.arange(1.5, 2, 0.1),
                                                    self.train, metrique)
                    model.append(okapi)
                else:  # for the Vectoriel model
                    model.append(
                        modelIR[m](w))  # there is no parameter to optimise
                    model.append(modelIR[m](w, True))

        if findParametre:
            self.print_verbose(
                "Tout les models Jelinek_Mercer et Okapi sont entrainer")
        self.model = []

        for m in model:
            self.model.append(EvalIRModel(self.collectionQry, m))

            pr = PageRank.PageRank(m.getWeighter(), m)
            if findParametre:
                pr.findParametreOptimaux(np.arange(0.85, 0.95, 0.05),
                                         self.train, metrique)

            self.model.append(EvalIRModel(self.collectionQry, pr))
        if findParametre:
            self.print_verbose("Tout les models PageRank sont entrainer")

        self.print_verbose("Initialisation terminer")
Example #23
def leave_one_out(function, diseaseGeneFilePath, PPI_Network, param):
    print("Starting leaveOneOut function")

    # building list of disease genes
    diseaseGeneFile = open(diseaseGeneFilePath, 'r')
    allDiseaseGenes = diseaseGeneFile.read().splitlines()
    diseaseGeneFile.close()
    # print(allDiseaseGenes)
    numDiseaseGenes = len(allDiseaseGenes)
    rankThreshhold = 150

    numGenesNotFound = 0

    degree_list = []  #remove after graph is made (kate)
    in_out_list = []  #remove after graph is made (kate)

    graph_nodes = list(PPI_Network.nodes())
    startVector = load_start_vector(diseaseGeneFilePath, PPI_Network)
    startVector = (numDiseaseGenes / (numDiseaseGenes - 1)) * startVector
    # skipping
    for index, skipGene in enumerate(allDiseaseGenes):

        # find the skip gene in the start vector, make it zero
        index = graph_nodes.index(skipGene)
        node_degree = PPI_Network.degree(
            skipGene)  #remove after graph is made (kate)
        degree_list.append(node_degree)  #remove after graph is made (kate)
        newStartVector = startVector.copy()
        newStartVector[index] = 0
        #   startVector[index] = 0
        priors_vector = np.zeros(PPI_Network.number_of_nodes())
        if function == pr.page_rank:
            priors_file_path = find_priors_file(diseaseGeneFilePath)
            priors_vector = pr.load_priors(priors_file_path, PPI_Network)
            priors_vector[index] = 0

        #run algorithm using modified disease gene file
        startTime = time.time()
        output = []
        if function == pr.page_rank:
            output = function(PPI_Network, newStartVector, priors_vector,
                              param)
        else:
            print("sum of start vector:", np.sum(startVector))
            output = function(PPI_Network, newStartVector, param)
        endTime = time.time()
        print("finished algorithm. Time elapsed:", endTime - startTime)

        #find the predicted probability of the omitted gene and add it to the current sum
        startTime = time.time()
        foundGene = False
        for i in range(rankThreshhold):
            if output[i][0] == skipGene:
                foundGene = True
                print("Found the gene: ", skipGene, "at rank: ", i)
                in_out_list.append(1)  #remove after graph is made (kate)
                break
        if not foundGene:
            numGenesNotFound += 1
            in_out_list.append(-1)  #remove after graph is made (kate)

        endTime = time.time()

    # write the results of leave one out to a file
    disease_name = diseaseGeneFilePath.split(".")[0]
    output_name = "leave_one_out_1" + disease_name[5:]
    if function == pr.page_rank:
        output_name = output_name + "_pr.tsv"
    elif function == dk.diffusion_kernel:
        output_name = output_name + "_dk.tsv"
    elif function == rwr.random_walk:
        output_name = output_name + "_rwr.tsv"
    with open(output_name, "w") as output:
        for i in range(len(allDiseaseGenes)):
            output_string = allDiseaseGenes[i] + "\t" + str(
                degree_list[i]) + "\t" + str(in_out_list[i]) + "\n"
            output.write(output_string)

    print(
        "------------------------\nFinished running algorithm with all disease genes left out\nCalculating mean squared difference"
    )
    print("Num genes not found for this run of leave one out: ",
          numGenesNotFound)
    #Find average of all squared differences
    percentCorrectlyRankedGenes = 1 - numGenesNotFound / numDiseaseGenes
    return percentCorrectlyRankedGenes
Example #24
def graphAnalyzer(graph, kmeans=False):
    """Arguments: the path to a .gml graph file, and a boolean: if True, cluster with scikit-learn KMeans,
    otherwise use our dbscan algorithm.
    Runs PageRank and clusters the nodes in order to return the highest-ranked page in the three biggest clusters.
    It also draws a graph to visualize the clustering."""
    G = nx.read_gml(graph)
    G = removeIsolatedNodes(G)  # removing meaningless nodes
    G.remove_node(list(G.nodes)[0])

    # ----------------------------------- PageRank Computation --------------------------------------

    # creating a PageRank object
    pr = PageRank.PageRank(G)
    pr.constructDispersionMatrix(G)
    pr = pr.getPageRank()

    # ----------------------------------- Clustering Computation --------------------------------------

    # constructing network layout
    forceatlas2 = fa2.ForceAtlas2(
        # Behavior alternatives
        outboundAttractionDistribution=False,  # Dissuade hubs
        linLogMode=False,  # NOT IMPLEMENTED
        adjustSizes=False,  # Prevent overlap (NOT IMPLEMENTED)
        edgeWeightInfluence=0,

        # Performance
        jitterTolerance=.01,  # Tolerance
        barnesHutOptimize=True,
        barnesHutTheta=1.2,
        multiThreaded=False,  # NOT IMPLEMENTED

        # Tuning
        scalingRatio=1,
        strongGravityMode=True,
        gravity=200,
        # Log
        verbose=True)

    pos = forceatlas2.forceatlas2_networkx_layout(G, pos=None, iterations=1000)

    if kmeans:
        # converting positions into a list of np.array
        pos_list = [np.array([elt[0], elt[1]]) for key, elt in pos.items()]

        # clustering the nodes according to the kmeans algorithm
        clusters = Kmeans.kmeans(pos_list, 8, 0.01, 300)

    else:
        pos = {key: np.array([elt[0], elt[1]]) for key, elt in pos.items()}
        pos_transf = dbscan.transf(
            pos)  # changing position format to be able to use it in DBSCAN
        clusters = dbscan.dbscan(pos_transf, 40, 20)  # clustering

    cluster_with_pr = associatingPageRankToNode(pr, clusters)

    # sorting each cluster according to page rank result
    for key, value in cluster_with_pr.items():
        cluster_with_pr[key] = sorted(value,
                                      key=lambda item: (item[1], item[0]))

    # rendering the suggested pages and their page rank
    print("\nThe recommanded pages are the following :")
    for key, value in cluster_with_pr.items():
        try:
            node_index = value[-1][0]  # retrieving the node index
            title_node = re.search(
                r'titles=(.*?)\&',
                list(G.nodes())
                [node_index])  # getting the title of the Wikipedia page
            print("•", title_node.group(1), "- with a page rank of ",
                  value[-1][1])
        except IndexError:
            pass

    # ----------------------------------- Graph Creation --------------------------------------

    # each node within a cluster has the same color
    get_colors = lambda n: list(
        map(lambda i: "#" + "%06x" % random.randint(0, 0xFFFFFF), range(n)))

    colors = get_colors(len(clusters.keys()) + 1)
    node_color = ['black' for _ in range(len(G.nodes()))]
    for key, value in clusters.items():
        for elt in value:
            node_color[elt] = colors[key]

    nx.draw(G.to_undirected(),
            pos,
            node_size=2,
            width=.05,
            edge_color='grey',
            node_color=node_color)
    plt.savefig("graph_with_layout.png")
Example #25
if __name__ == '__main__':

    if len(sys.argv) != 2:
        print "usage: ./TopicSensitivePageRank.py <file to read json data(tweets)>"
        sys.exit(1)

    filename = sys.argv[1]
    f = file(filename, "r")
    tweets = f.readlines()

    #final_list = findMostPopularWords(tweets)

    dict_buckets = staticDictOfBuckets()
    tagged_dict = taggedUsers_dict(tweets, dict_buckets)
    #print tagged_dict
    pageRank = PageRank()

    global_dict = pageRank.create_dictionary(tweets)
    idToUserMap = pageRank.map_IDtoUsername(tweets)
    updated_dict = update_dictionary(global_dict, tagged_dict)

    for tag in updated_dict:
        print tag, len(updated_dict[tag])

    #print updated_dict_social
    final_dict_social = pageRank.update_dictionary(updated_dict['technology'])
    print final_dict_social

    final_list_social = sorted(final_dict_social.items(),
                               key=lambda x: x[1],
                               reverse=True)
Example #26
__author__ = 'Ariel'

import numpy as np
import time
import readHelper
import writeHelper
import PageRank


start_time = time.time()

# get transition matrix
m = readHelper.getSparseMatrix('transition.txt',True)

# global PageRank
globalPR, outGPR = PageRank.pagerank(m, 0.1)

# out-line link injection for topic sensitive PageRank
topic = readHelper.getSparseMatrix('doc-topics.txt', False).transpose()
tspr, outTSPR = PageRank.topicSensitivePageRank(m, topic, 0.25, 0.65)

# query topic sensitive PageRank
queryTopic, queryDistr = readHelper.getDistro('query-topic-distro.txt')
outQTSPR = PageRank.OnlineTopicSensitivePR(outTSPR, queryDistr[queryTopic[(2,2)]])
queryTopicPR = PageRank.OnlineTopicSensitivePR(tspr, queryDistr)

# user topic sensitive PageRank
userTopic, userDistr = readHelper.getDistro('user-topic-distro.txt')
outPTSPR = PageRank.OnlineTopicSensitivePR(outTSPR, userDistr[userTopic[(2,2)]])
userTopicPR = PageRank.OnlineTopicSensitivePR(tspr, userDistr)
Example #27
if test_matrix:
    P = np.array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                  [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
                  [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                  [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                  [0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0],
                  [0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0],
                  [0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0],
                  [0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0],
                  [0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0],
                  [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
                  [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]])
    node_count = P.shape[0]
    x = np.ones(node_count)
    weights, iter_count = PageRank.power_iter_matrix(x, graph=P, epsilon=1e-20, d=0.85, max_iter=1000)

    print("Weights:")
    print(weights)
    print("Iteration count:")
    print(iter_count)
else:

    P = None
    # P = {(1,2), (2,1), (3,0), (3,1), (4,3), (4,1), (4,5), (5,1), (5,4), (6,1), (6,4), (7,1), (7,4), (8,1), (8,4), (9,4), (10,4)}
    # node_count = 11
    with open(file_name, 'r') as f:
        line_count = sum(1 for line in f)
        P = np.zeros(line_count, dtype='int32, int32')
        f.seek(0)
        for i, line in enumerate(f):
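
The snippet above is cut off in the source. For reference, here is a minimal, self-contained power-iteration sketch over the same kind of dense 0/1 adjacency matrix; it illustrates the general technique only and is not the actual PageRank.power_iter_matrix implementation, whose internals are not shown:

import numpy as np

def power_iter_sketch(adj, d=0.85, epsilon=1e-20, max_iter=1000):
    # Illustrative PageRank power iteration; adj[i, j] == 1 means there is a link i -> j.
    n = adj.shape[0]
    out_deg = adj.sum(axis=1)
    # Row-normalise; rows with no out-links (dangling nodes) spread their mass uniformly.
    rows = np.where(out_deg[:, None] > 0, adj / np.maximum(out_deg, 1)[:, None], 1.0 / n)
    M = rows.T  # column-stochastic transition matrix
    r = np.ones(n) / n
    for it in range(1, max_iter + 1):
        r_next = d * (M @ r) + (1 - d) / n
        if np.abs(r_next - r).sum() < epsilon:
            return r_next, it
        r = r_next
    return r, max_iter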
Example #28
def summarise(filepath,
              co_ref=1,
              page_rank=True,
              debug_output=True,
              num_words=200,
              overlap=True):
    if not os.path.isdir("stanford-corenlp"):
        print >> sys.stderr, "Please put the Stanford CoreNLP package into the stanford-corenlp directory."
        quit()

    filename = filepath.split("/")[-1]
    if not os.path.isfile("stanford-corenlp/" + filename + ".xml"):
        shutil.copyfile(filepath, "stanford-corenlp/" + filename)
        if os.name == "nt":
            os.system(
                "cd stanford-corenlp && java -cp stanford-corenlp-3.2.0.jar;stanford-corenlp-3.2.0-models.jar;xom.jar;joda-time.jar;jollyday.jar -Xmx3g edu.stanford.nlp.pipeline.StanfordCoreNLP -file "
                + filename)
        else:
            os.system(
                "cd stanford-corenlp && java -cp stanford-corenlp-3.2.0.jar:stanford-corenlp-3.2.0-models.jar:xom.jar:joda-time.jar:jollyday.jar -Xmx3g edu.stanford.nlp.pipeline.StanfordCoreNLP -file "
                + filename)

    sentences, coref = splitAndParse.splitSentencesAndParse(
        "stanford-corenlp/" + filename + ".xml")

    if debug_output:
        print "coref", coref

    adjMax = [[0 for s in sentences] for s in sentences]

    if co_ref:
        for co in coref:
            dic = {}
            for s in co:
                if s in dic:
                    dic[s] += 1
                else:
                    dic[s] = 1
            for s in dic:
                for ss in dic:
                    if s != ss:
                        if co_ref == 1:
                            adjMax[s][ss] += (dic[s] + 0.0) / dic[ss]
                        elif co_ref == 2:
                            adjMax[s][ss] += (dic[ss] + 0.0) / dic[s]
                        elif co_ref == 3:
                            adjMax[s][ss] += dic[ss] + dic[s]
                        elif co_ref == 4:
                            adjMax[s][ss] += dic[ss] * dic[s]
                        elif co_ref == 5:
                            adjMax[s][ss] += (dic[ss] + dic[s]) * 2
                        elif co_ref == 6:
                            adjMax[s][ss] += (dic[ss] + dic[s]) * 5

    words_used = set()

    if debug_output:
        print "before overlap"
        for m in adjMax:
            print " ".join([str(i) for i in m])

    if overlap:
        for s in range(len(sentences)):
            for ss in range(len(sentences)):
                if s != ss:
                    adjMax[s][ss] += findOverlap(sentences[s], sentences[ss],
                                                 words_used)

    if debug_output:
        print "after overlap"
        for m in adjMax:
            print " ".join([str(i) for i in m])
    l = []
    scores = []

    if page_rank:
        G = np.array(adjMax)
        scores = PageRank.zeroToOne(G, s=0.5)
    else:
        scores = [sum(row) for row in adjMax]

    ind = 0
    for s in scores:
        l.append((s, ind))
        ind += 1

    l.sort()
    l.reverse()
    if debug_output:
        print words_used
        # print all important sentences
        for k in l:
            print sentences[k[1]]
            print k[0]
            print "==============="

    best_first = cutoff_words(l, sentences, num_words, adjMax)
    by_order = []
    for i in range(0, len(best_first)):
        by_order.append((l[i][1], best_first[i]))
    by_order.sort()
    return [s[1] for s in by_order]
Example #29
from PageRank import *
import time

start_time = time.time()
##data_file = "Datasets/sx-mathoverflow.txt"
data_file = "Datasets/Wiki-Vote.txt"
##data_file = "Datasets/test.txt"
is_page_no_zero_indexed = False
epsilon = 0.00001
max_iterations = 10
beta = 0.85
display_network_after_each_iteration = True
max_no_of_nodes_to_show = 20

PgRank = PageRank(data_file, is_page_no_zero_indexed, max_iterations, beta, epsilon, display_network_after_each_iteration)
print("Rank Vector:")
for i in PgRank.rank_vector[:max_no_of_nodes_to_show]:
    print(i)
    
PgRank.display_network(PgRank.rank_vector, max_no_of_nodes_to_show)    
    
if(is_page_no_zero_indexed):
    teleport_set = [i for i in PgRank.matrix if i%500==0]
else:
    teleport_set = [i+1 for i in PgRank.matrix if i%500==0]
topic_specific_rank_vector = PgRank.topic_specific_page_rank(teleport_set)
print("Topic Specific Rank Vector:")
for i in topic_specific_rank_vector[:max_no_of_nodes_to_show]:
    print(i)

end_time = time.time()
Example #30
 def testmethod_1(self):
     result = PageRank.search("youtube")
     self.assertNotEqual(len(result),0,"fail")
Example #31
listS5, best_avg5 = clique.find_densest_subgraph(G)
end5 = time.clock()
time5 = end5 - beg5
L4 = []
for x in range(0, len(listS5)):
    for y in listS5[x]:
        L4.append(y)
L4 = list(set(L4))
#print L4
recall_Clique = len(list(golden_keywords.intersection(L4))) / float(
    len(golden_keywords))
precision_Clique = len(list(golden_keywords.intersection(L4))) / float(len(L4))

# PageRank
beg7 = time.clock()
list7 = pr.find_densest_subgraph(G, 10)
end7 = time.clock()
time7 = end7 - beg7
temp = [list7[i][0] for i in range(len(list7))]
recall_PageRank = len(list(golden_keywords.intersection(temp))) / float(
    len(golden_keywords))
precision_PageRank = len(list(golden_keywords.intersection(temp))) / float(
    len(temp))

#Using the function from Networkx K-Cores
beg4 = time.clock()
G.remove_edges_from(G.selfloop_edges())
S4 = nx.k_core(G)
end4 = time.clock()
time4 = end4 - beg4
temp = S4.nodes()
Example #32
def test_influence():

    resIGFG = DetectionCommunautes.DetectionComIG(gTwitter, "FastGreedy")
    resIGIM = DetectionCommunautes.DetectionComIG(gTwitter, "InfoMap")
    resSNAPBC = DetectionCommunautes.DetectionComSNAP(gTwitter, "BigClam",
                                                      path + 'snap/', path,
                                                      'outputGraph.txt',
                                                      'outputAlgo.txt')
    resSNAPIM = DetectionCommunautes.DetectionComSNAP(gTwitter, "InfoMap",
                                                      path + 'snap/', path,
                                                      'outputGraph.txt',
                                                      'outputAlgo.txt')
    resSNAPCPM = DetectionCommunautes.DetectionComSNAP(gTwitter, "CPM",
                                                       path + 'snap/', path,
                                                       'outputGraph.txt',
                                                       'outputAlgo.txt')

    NomComFG = InfluenceCommunautes.NomsCommunautes(resIGFG['membership'],
                                                    gTwitter)
    NomComIM = InfluenceCommunautes.NomsCommunautes(resIGIM['membership'],
                                                    gTwitter)

    assert type(NomComFG['comNoms']) == dict
    assert type(NomComFG['comNode']) == dict
    assert len(NomComFG['comNoms']) != 0
    assert len(NomComFG['comNode']) != 0
    assert type(NomComIM['comNoms']) == dict
    assert type(NomComIM['comNode']) == dict
    assert len(NomComIM['comNoms']) != 0
    assert len(NomComIM['comNode']) != 0

    IGgraph1=PageRank.getIGraph("MATCH (n:`andra-user`)<-[r:`andra-from`]-(t1:`andra-tweet`)-[re:`andra-retweet`]->(t2:`andra-tweet`)-[rel:`andra-from`]->(p:`andra-user`) WHERE t2.language="'"fr"'" RETURN n as nodeFrom,p as nodeTo",\
            grapheNeo,"192.168.1.75:7474","neo4j", "pass4dbse")
    PRank1 = PageRank.Rank(IGgraph1, 10)

    InfluFG = InfluenceCommunautes.InfluenceCommunautes(
        grapheNeo, gTwitter, PRank1['resPR'], NomComFG['comNode'], 0.2, 10)
    InfluIM = InfluenceCommunautes.InfluenceCommunautes(
        grapheNeo, gTwitter, PRank1['resPR'], NomComIM['comNode'], 0.2, 10)
    InfluBC = InfluenceCommunautes.InfluenceCommunautes(
        grapheNeo, gTwitter, PRank1['resPR'], resSNAPBC['comCodes'], 0.2, 10)
    InfluIMS = InfluenceCommunautes.InfluenceCommunautes(
        grapheNeo, gTwitter, PRank1['resPR'], resSNAPIM['comCodes'], 0.2, 10)
    InfluCPM = InfluenceCommunautes.InfluenceCommunautes(
        grapheNeo, gTwitter, PRank1['resPR'], resSNAPCPM['comCodes'], 0.2, 10)

    assert type(InfluFG) == dict
    assert type(InfluIM) == dict
    assert type(InfluBC) == dict
    assert type(InfluIMS) == dict
    assert type(InfluCPM) == dict

    InfluHTFG = InfluenceCommunautes.InfluenceHashtags(grapheNeo,
                                                       NomComFG['comNode'],
                                                       0.2)
    InfluHTIM = InfluenceCommunautes.InfluenceHashtags(grapheNeo,
                                                       NomComIM['comNode'],
                                                       0.2)
    InfluHTBC = InfluenceCommunautes.InfluenceHashtags(grapheNeo,
                                                       resSNAPBC['comCodes'],
                                                       0.2)
    InfluHTIMS = InfluenceCommunautes.InfluenceHashtags(
        grapheNeo, resSNAPIM['comCodes'], 0.2)
    InfluHTCPM = InfluenceCommunautes.InfluenceHashtags(
        grapheNeo, resSNAPCPM['comCodes'], 0.2)

    assert type(InfluHTFG) == dict
    assert type(InfluHTIM) == dict
    assert type(InfluHTBC) == dict
    assert type(InfluHTIMS) == dict
    assert type(InfluHTCPM) == dict
    assert len(InfluHTFG) == len(NomComFG['comNode'])
    assert len(InfluHTIM) == len(NomComIM['comNode'])
    assert len(InfluHTBC) == len(resSNAPBC['comNoms'])
    assert len(InfluHTIMS) == len(resSNAPIM['comNoms'])
    assert len(InfluHTCPM) == len(resSNAPCPM['comNoms'])

    InflueTweeFG = InfluenceCommunautes.InfluenceTweets(NomComFG['comNode'])
    InflueTweeIM = InfluenceCommunautes.InfluenceTweets(NomComIM['comNode'])
    InflueTweeBC = InfluenceCommunautes.InfluenceTweets(resSNAPBC['comCodes'])
    InflueTweeIMS = InfluenceCommunautes.InfluenceTweets(resSNAPIM['comCodes'])
    InflueTweeCPM = InfluenceCommunautes.InfluenceTweets(
        resSNAPCPM['comCodes'])

    assert type(InflueTweeFG) == dict
    assert type(InflueTweeIM) == dict
    assert type(InflueTweeBC) == dict
    assert type(InflueTweeIMS) == dict
    assert type(InflueTweeCPM) == dict
    assert len(InflueTweeFG) == len(NomComFG['comNode'])
    assert len(InflueTweeIM) == len(NomComIM['comNode'])
    assert len(InflueTweeBC) == len(resSNAPBC['comNoms'])
    assert len(InflueTweeIMS) == len(resSNAPIM['comNoms'])
    assert len(InflueTweeCPM) == len(resSNAPCPM['comNoms'])

    print "INFLUENCE OK"
Example #33
        meanDegree = (2 * numberOfEdges) / numberOfNodes
        return meanDegree / (numberOfNodes - 1)


#deleted 5,6,7,11
#8=5, 9=6, 10=7, 12=8, 13=9, 14=10, 15=11, 16=12, 17=13, 18=14
Friends = [[2], [1, 4, 9, 11], [4], [2, 3], [6], [5], [8], [7, 9, 10],
           [2, 8, 10], [8, 9], [2, 12], [11], [14], [13]]

#deleted 5,6,7,11
# 8=5,9=6,10=7,12=8,13=9,14=10,15=11,16=12,17=13,18=14
FirstNames = [[2, 4, 8], [1, 4, 9, 11], [4], [1, 2, 3, 13, 14], [6], [5],
              [8, 13], [1, 7, 9, 10], [2, 8, 10], [8, 9, 14], [2, 12], [11],
              [4, 7, 14], [4, 10, 13]]

#deleted 1,5,8,9,11,15,16
# 2=1, 3=2, 4=3, 6=4, 7=5, 10=6, 12=7, 13=8, 14=9, 17=10, 18=11
HaveClass = [[3, 8], [3], [1, 2, 10, 11], [5], [4], [7], [6, 8], [1, 7, 11],
             [11], [3, 11], [3, 8, 9, 10]]

#deleted 3,5,6,7,11,14,15,16
# 4=3,8=4,9=5,10=6,12=7,13=8,17=9,18=10
SocialEvents = [[2], [1], [9], [5], [4], [7, 9], [6, 8], [7], [3, 6, 10], [9]]

#print(HW1.out_degree(Friends))
#print(HW1.in_degree(Friends))
PR.PageRank(Friends)
PR.PageRank(FirstNames)
PR.PageRank(HaveClass)
PR.PageRank(SocialEvents)
Example #34
 def testmethod_6(self):
     result = PageRank.search("ruosyguweryiotgryu")
     self.assertEqual(len(result), 1, "fail")
Example #35
 def testmethod_3(self):
     result = PageRank.search("shit")
     self.assertNotEqual(len(result),0,"fail")
Example #36
    print("Loading tf-idf")
    tfidf = calculateTFIDF(invertedIndex, maxCount, N)

    #     doIndexingTfIDF(tfidf)
    #     doIndexingPreprocessedWord()
    #     doIndexingDocuments()
    #     doIndexingPR()

    #     tfidf = fetchIndexingTfIDF(tfidf)
    #     cleanWords = fetchIndexingPreprocessedWord()
    #     docCounterList = fetchIndexingDocuments()
    #     pageranks = fetchIndexingPR()

    # calculate pagerank
    print("Loading pagerank")
    pageranks = pr.compute(linksDocs)

    # load any embeddings of choice
    print("Loading word embeddings")
    embeddings_dict = utils.load_glove()
    #     embeddings_dict = gensim.models.KeyedVectors.load_word2vec_format('Embeddings\GoogleNews-vectors-negative300.bin', binary=True)

    #list of predefined queries
    queries = [
        "Professors who teach NLP at UIC", "Student organizations at uic",
        "Student Orientation at UIC", "How has coronavirus affected UIC",
        "Centers for Cultural Understanding and Social Change"
    ]
    # press enter to validate pre-defined queries
    # otherwise input your query
    query = input(
Example #37
 def testmethod_5(self):
     result = PageRank.search("") 
     self.assertEqual(len(result), 1, "fail") 
Example #38
def main():
    print("Starting AUROC..")
    #Get file path choices
    pathToPPINetworkFile = sys.argv[1]
    #pathToPPINetworkFile = 'Data/9606.protein.links.v11.0.txt'

    # Get output vectors from each algorithm

    PPI_Network = compute_if_not_cached(loader.load_graph,
                                        pathToPPINetworkFile,
                                        fileName=pathToPPINetworkFile)
    ground_truth_files = [
        'Data/MalaCard-protein-Endometriosis.diseasegenes.tsv',
        'Data/MalaCard-protein-ischaemic-stroke.diseasegenes.tsv',
        'Data/MalaCard-protein-lymphoma.diseasegenes.tsv'
    ]
    file_paths = [
        'Data/endometriosis-proteins.diseasegenes.tsv',
        'Data/lymphoma-proteins.diseasegenes.tsv',
        'Data/ischaemic-proteins.diseasegenes.tsv'
    ]
    prior_paths = [
        'Data/endometriosis-proteins.priors.tsv',
        'Data/lymphoma-proteins.priors.tsv',
        'Data/ischaemic-proteins.priors.tsv'
    ]
    names = ['endometriosis', 'lymphoma', 'ischaemic']

    for i in range(1, 3):
        # building ground truth
        ground_truth_vec = []
        with open(ground_truth_files[i], 'r') as input_file:
            input_file = input_file.readlines()
            for line in input_file:
                protein = line.rstrip('\n')
                ground_truth_vec.append(protein)
        gene_file = open(file_paths[i], 'r')
        file_contents = list(gene_file.readlines())
        # print(file_contents)
        for line in file_contents:
            protein = line.rstrip('\n')
            if protein not in ground_truth_vec:
                ground_truth_vec.append(protein)
        gene_file.close()
        print(ground_truth_vec)
        # building start and priors vector

        start_vector = loader.load_start_vector(file_paths[i], PPI_Network)
        priors_vector = pr.load_priors(prior_paths[i], PPI_Network)

        #getting output from algorithms
        start_time = time.time()
        output_RWR = rwr.random_walk(PPI_Network, start_vector)
        end_time = time.time()
        print("time for rwr:", end_time - start_time)
        start_time = time.time()
        output_PR = pr.page_rank(PPI_Network, start_vector, priors_vector)
        end_time = time.time()
        print("time for pr:", end_time - start_time)

        start_time = time.time()
        output_DK = dk.diffusion_kernel(PPI_Network, start_vector)
        end_time = time.time()
        print("time for dk:", end_time - start_time)

        #building roc curves

        start_time = time.time()
        name = "rwr-" + names[i]
        rwr_curve = roc_curve(output_RWR, ground_truth_vec, name)
        end_time = time.time()
        print("time for roc curve, rwr:", end_time - start_time)

        start_time = time.time()
        name = "pr-" + names[i]
        pr_curve = roc_curve(output_PR, ground_truth_vec, name)
        end_time = time.time()
        print("time for roc curve, pr:", end_time - start_time)
        start_time = time.time()

        start_time = time.time()
        name = "dk-" + names[i]
        dk_curve = roc_curve(output_DK, ground_truth_vec, name)
        end_time = time.time()
        print("time for roc curve, dk:", end_time - start_time)
        file_path = 'Results/' + names[i] + 'roc_curve.png'
        plt.ylabel('TPR')
        plt.xlabel('FPR')
        plt.title(names[i])
        plt.legend(loc='lower right')
        plt.savefig(file_path)  #moved from roc_curve
        plt.clf()  #moved from roc_curve
        print("Plots have been saved as png files in the Results folder.")